[Scummvm-git-logs] scummvm master -> da99ff1efdf80d4a564ea3efca10855f325dccfd

sev- noreply at scummvm.org
Sun Jun 14 19:59:33 UTC 2026


This automated email contains information about 20 new commits which have been
pushed to the 'scummvm' repo located at https://api.github.com/repos/scummvm/scummvm .

Summary:
84056c68ba SCUMM: Create base-costume-optimised.cpp
55c6fddccf SCUMM: Optimise only paintCelByleRLECommon & byleRLEDecode
4e469cf33d SCUMM: Introduce BaseCostumeRenderer::byleRLEDecodeFast
8b49d1b5a0 SCUMM: byleRLEDecodeFast: Don't calculate masking for transparent color
041c9eb7d6 SCUMM: byleRLEDecodeFast: Check for height == 0 every repLen pixels
546fa65d45 SCUMM: byleRLEDecodeFast: Remove linesToSkip kludge
c5e04bec37 SCUMM: byleRLEDecodeFast: Introduce 10 identical copies
27a71d8195 SCUMM: byleRLEDecodeFast: Remove dead code
f0011e4729 SCUMM: byleRLEDecodeFast: Optimise non-scaled modes
ad29b12637 SCUMM: byleRLEDecodeFast: Move Y clipping to caller
907150addb SCUMM: byleRLEDecodeFast: Optimise non-scaled modes even more
783339e94c SCUMM: byleRLEDecodeFast: Proper Y clipping in scaled modes
2390a071b9 SCUMM: byleRLEDecodeFast: Fast path for Mode1
4f5361a2f7 SCUMM: byleRLEDecodeFast: m68k assembly functions
16b6d2a0ae SCUMM: byleRLEDecodeFast: Implement HE>=90 Mode3
ff472311ac SCUMM: byleRLEDecodeFast: Prefer non-exported labels with .L
de32e66a2a SCUMM: byleRLEDecodeFast: Optimise lastColumnX code
fded2d4203 SCUMM: byleRLEDecodeFast: Introduce specialised Mode0&1 Y clipping functions
7a854a8bd7 SCUMM: Optimise BaseCostumeRenderer::skipCelLines
da99ff1efd SCUMM: Rewrite asmDrawStripToScreen into m68k assembly


Commit: 84056c68ba0a79adb875fd7ab3c7033c5a0a16ba
    https://github.com/scummvm/scummvm/commit/84056c68ba0a79adb875fd7ab3c7033c5a0a16ba
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: Create base-costume-optimised.cpp

This is a duplicate of base-costume.cpp and is enabled by
SCUMM_OPTIMISED_CODE. Currently enabled only for m68k-atari-mint* builds
(Atari Lite, Atari Full, FireBee).

Changed paths:
  A engines/scumm/base-costume-optimised.cpp
    configure
    engines/scumm/base-costume.cpp
    engines/scumm/module.mk


diff --git a/configure b/configure
index b91d60bdb76..b454d2dcf41 100755
--- a/configure
+++ b/configure
@@ -3886,6 +3886,7 @@ if test -n "$_host"; then
 			#	# --disable-debug
 			#	append_var LDFLAGS "-s"
 			#fi
+			append_var DEFINES "-DSCUMM_OPTIMISED_CODE"
 
 			# auto -> no
 			if test "$_optimizations" = "yes"; then
diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
new file mode 100644
index 00000000000..17c19a38767
--- /dev/null
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -0,0 +1,456 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#include "scumm/base-costume.h"
+#include "scumm/util.h"
+
+namespace Scumm {
+
+#ifdef SCUMM_OPTIMISED_CODE
+byte BaseCostumeRenderer::drawCostume(const VirtScreen &vs, int numStrips, const Actor *a, bool drawToBackBuf) {
+	int i;
+	byte result = 0;
+
+	_out = vs;
+	if (drawToBackBuf)
+		_out.setPixels(vs.getBackPixels(0, 0));
+	else
+		_out.setPixels(vs.getPixels(0, 0));
+
+	_actorX += _vm->_virtscr[kMainVirtScreen].xstart & 7;
+	_out.w = _out.pitch / _vm->_bytesPerPixel;
+	// We do not use getBasePtr here because the offset to pixels never used
+	// _vm->_bytesPerPixel, but it seems unclear why.
+	_out.setPixels((byte *)_out.getPixels() - (_vm->_virtscr[kMainVirtScreen].xstart & 7));
+
+	_numStrips = numStrips;
+
+	if (_vm->_game.version <= 1) {
+		_xMove = 0;
+		_yMove = 0;
+	} else if (_vm->_game.features & GF_OLD_BUNDLE) {
+		_xMove = -72;
+		_yMove = -100;
+	} else {
+		_xMove = _yMove = 0;
+	}
+	for (i = 0; i < 16; i++)
+		result |= drawLimb(a, i);
+	return result;
+}
+
+byte BaseCostumeRenderer::paintCelByleRLECommon(
+	int xMoveCur,
+	int yMoveCur,
+	int numColors,
+	int scaletableSize,
+	bool amiOrPcEngCost,
+	bool c64Cost,
+	ByleRLEData &compData,
+	bool &decode) {
+
+	bool actorIsScaled;
+	int i, j;
+	int linesToSkip = 0, startScaleIndexX, startScaleIndexY;
+	Common::Rect rect;
+	int step;
+	byte drawFlag = 1;
+
+	// Setup color decoding variables
+	if (numColors == 32) {
+		compData.mask = 7;
+		compData.shr = 3;
+	} else if (numColors == 64) {
+		compData.mask = 3;
+		compData.shr = 2;
+	} else {
+		compData.mask = 15;
+		compData.shr = 4;
+	}
+
+	actorIsScaled = (_scaleX != 0xFF) || (_scaleY != 0xFF);
+
+	compData.boundsRect.left = 0;
+	compData.boundsRect.top = 0;
+	compData.boundsRect.right = _out.w;
+	compData.boundsRect.bottom = _out.h;
+
+	if (actorIsScaled) {
+
+		/* Scale direction */
+		compData.scaleXStep = -1;
+		if (xMoveCur < 0) {
+			xMoveCur = -xMoveCur;
+			compData.scaleXStep = 1;
+		}
+
+		if (_drawActorToRight) {
+			/* Adjust X position */
+			startScaleIndexX = j = (scaletableSize - xMoveCur) & compData.scaleIndexMask;
+			for (i = 0; i < xMoveCur; i++) {
+				if (compData.scaleTable[j++ & compData.scaleIndexMask] < _scaleX)
+					compData.x -= compData.scaleXStep;
+			}
+
+			rect.left = rect.right = compData.x;
+
+			j = startScaleIndexX;
+			for (i = 0; i < _width; i++) {
+				if (rect.right < 0) {
+					linesToSkip++;
+					startScaleIndexX = j;
+				}
+				if (compData.scaleTable[j++ & compData.scaleIndexMask] < _scaleX)
+					rect.right++;
+			}
+		} else {
+			/* No mirror */
+			/* Adjust X position */
+			startScaleIndexX = j = (scaletableSize + xMoveCur) & compData.scaleIndexMask;
+			for (i = 0; i < xMoveCur; i++) {
+				if (compData.scaleTable[j-- & compData.scaleIndexMask] < _scaleX)
+					compData.x += compData.scaleXStep;
+			}
+
+			rect.left = rect.right = compData.x;
+
+			j = startScaleIndexX;
+			for (i = 0; i < _width; i++) {
+				if (rect.left >= compData.boundsRect.right) {
+					startScaleIndexX = j;
+					linesToSkip++;
+				}
+				if (compData.scaleTable[j-- & compData.scaleIndexMask] < _scaleX)
+					rect.left--;
+			}
+		}
+
+		if (linesToSkip)
+			linesToSkip--;
+
+		step = -1;
+		if (yMoveCur < 0) {
+			yMoveCur = -yMoveCur;
+			step = -step;
+		}
+
+		startScaleIndexY = j = (scaletableSize - yMoveCur) & compData.scaleIndexMask;
+		for (i = 0; i < yMoveCur; i++) {
+			if (compData.scaleTable[j++ & compData.scaleIndexMask] < _scaleY)
+				compData.y -= step;
+		}
+
+		rect.top = rect.bottom = compData.y;
+
+		j = startScaleIndexY;
+		for (i = 0; i < _height; i++) {
+			if (compData.scaleTable[j++ & compData.scaleIndexMask] < _scaleY)
+				rect.bottom++;
+		}
+	} else {
+		if (!_drawActorToRight)
+			xMoveCur = -xMoveCur;
+
+		compData.x += xMoveCur;
+		compData.y += yMoveCur;
+
+		if (_drawActorToRight) {
+			rect.left = compData.x;
+			rect.right = compData.x + _width;
+		} else {
+			rect.left = compData.x - _width;
+			rect.right = compData.x;
+		}
+
+		rect.top = compData.y;
+		rect.bottom = rect.top + _height;
+
+		startScaleIndexX = scaletableSize;
+		startScaleIndexY = scaletableSize;
+	}
+
+	compData.scaleXIndex = startScaleIndexX;
+	compData.scaleYIndex = startScaleIndexY;
+	compData.skipWidth = _width;
+	compData.scaleXStep = _drawActorToRight ? 1 : -1;
+
+	markAsDirty(rect, compData, decode);
+	if (!decode)
+		return 0;
+
+	if (rect.top >= compData.boundsRect.bottom || rect.bottom <= compData.boundsRect.top) {
+		decode = false;
+		return 0;
+	}
+
+	if (rect.left >= compData.boundsRect.right || rect.right <= compData.boundsRect.left) {
+		decode = false;
+		return 0;
+	}
+
+	compData.repLen = 0;
+
+	if (_drawActorToRight) {
+		if (!actorIsScaled)
+			linesToSkip = compData.boundsRect.left - compData.x;
+		if (linesToSkip > 0) {
+			if (!amiOrPcEngCost && !c64Cost) {
+				compData.skipWidth -= linesToSkip;
+				skipCelLines(compData, linesToSkip);
+				compData.x = compData.boundsRect.left;
+			}
+		} else {
+			linesToSkip = rect.right - compData.boundsRect.right;
+			if (linesToSkip <= 0) {
+				drawFlag = 2;
+			} else {
+				compData.skipWidth -= linesToSkip;
+			}
+		}
+	} else {
+		if (!actorIsScaled) {
+			if (_akosRendering)
+				linesToSkip = rect.right - compData.boundsRect.right + 1;
+			else
+				linesToSkip = rect.right - compData.boundsRect.right;
+		}
+		if (linesToSkip > 0) {
+			if (!amiOrPcEngCost && !c64Cost) {
+				compData.skipWidth -= linesToSkip;
+				skipCelLines(compData, linesToSkip);
+				compData.x = compData.boundsRect.right - 1;
+			}
+		} else {
+			// V1 games uses 8 x 8 pixels for actors
+			if (c64Cost)
+				linesToSkip = (compData.boundsRect.left - 8) - rect.left;
+			else
+				linesToSkip = (compData.boundsRect.left - 1) - rect.left;
+			if (linesToSkip <= 0)
+				drawFlag = 2;
+			else
+				compData.skipWidth -= linesToSkip;
+		}
+	}
+
+	if (compData.skipWidth <= 0) {
+		decode = false;
+		return 0;
+	}
+
+	if (rect.left < compData.boundsRect.left)
+		rect.left = compData.boundsRect.left;
+
+	if (rect.top < compData.boundsRect.top)
+		rect.top = compData.boundsRect.top;
+
+	if (rect.top > compData.boundsRect.bottom)
+		rect.top = compData.boundsRect.bottom;
+
+	if (rect.bottom > compData.boundsRect.bottom)
+		rect.bottom = compData.boundsRect.bottom;
+
+	if (_drawTop > rect.top)
+		_drawTop = rect.top;
+	if (_drawBottom < rect.bottom)
+		_drawBottom = rect.bottom;
+
+	if (!_akosRendering && (_height + rect.top >= 256)) {
+		decode = false;
+		return 2;
+	}
+
+	compData.destPtr = (byte *)_out.getBasePtr(compData.x, compData.y);
+
+	return drawFlag;
+}
+
+void BaseCostumeRenderer::byleRLEDecode(ByleRLEData &compData, int16 actorHitX, int16 actorHitY, bool *actorHitResult, const uint8 *xmap) {
+	const byte *src = _srcPtr;
+	byte *dst = compData.destPtr;
+
+	byte len = compData.repLen;
+	uint16 color = compData.repColor;
+
+	// reset every column
+	int lastColumnX = -1;
+	int y = compData.y;
+	uint16 height = _height;
+	int scaleIndexY = compData.scaleYIndex;
+
+	byte maskbit = revBitMask(compData.x & 7);
+	const byte *mask = compData.maskPtr + compData.x / 8;
+
+	if (len)
+		goto StartPos;
+
+	do {
+		len = *src++;
+		color = len >> compData.shr;
+		len &= compData.mask;
+		if (!len)
+			len = *src++;
+
+		do {
+			if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+				if (actorHitResult) {
+					if (color && y == actorHitY && compData.x == actorHitX) {
+						*actorHitResult = true;
+						return;
+					}
+				} else {
+					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+						|| (compData.x < compData.boundsRect.left || compData.x >= compData.boundsRect.right)
+						|| (compData.maskPtr && (*mask & maskbit));
+					bool skipColumn = false;
+
+					if (color && !masked) {
+						uint16 pcolor;
+
+						if (!_akosRendering) {
+							if (_shadowMode & 0x20) {
+								pcolor = _shadowTable[*dst];
+							} else {
+								pcolor = _palette[color];
+								if (pcolor == 13 && _shadowTable)
+									pcolor = _shadowTable[*dst];
+							}
+						} else {
+							pcolor = _palette[color];
+
+							if (_shadowMode == 1) {
+								if (pcolor == 13) {
+									// In shadow mode 1 skipColumn works more or less the same way as in shadow
+									// mode 3. It is only ever checked and applied if pcolor is 13.
+									skipColumn = (lastColumnX == compData.x);
+									pcolor = _shadowTable[*dst];
+								}
+							} else if (_shadowMode == 2) {
+								error("AkosRenderer::byleRLEDecode(): shadowMode 2 not implemented."); // TODO
+							} else if (_shadowMode == 3) {
+								if (_vm->_game.features & GF_16BIT_COLOR) {
+									// I add the column skip here, too, although I don't know whether it always
+									// applies. But this is the only way to prevent recursive shading of pixels.
+									// This might need more fine tuning...
+									skipColumn = (lastColumnX == compData.x);
+									uint16 srcColor = (pcolor >> 1) & 0x7DEF;
+									uint16 dstColor = (READ_UINT16(dst) >> 1) & 0x7DEF;
+									pcolor = srcColor + dstColor;
+								} else if (_vm->_game.heversion >= 90) {
+									// I add the column skip here, too, although I don't know whether it always
+									// applies. But this is the only way to prevent recursive shading of pixels.
+									// This might need more fine tuning...
+									skipColumn = (lastColumnX == compData.x);
+									pcolor = (pcolor << 8) + *dst;
+									pcolor = xmap[pcolor];
+								} else if (pcolor < 8) {
+									// This mode is used in COMI. The column skip only takes place when the shading
+									// is actually applied (for pcolor < 8). The skip avoids shading of pixels that
+									// already have been shaded.
+									skipColumn = (lastColumnX == compData.x);
+									pcolor = (pcolor << 8) + *dst;
+									pcolor = _shadowTable[pcolor];
+								}
+							}
+						}
+						if (!skipColumn) {
+							if (_vm->_bytesPerPixel == 2) {
+								WRITE_UINT16(dst, pcolor);
+							} else {
+								*dst = pcolor;
+							}
+						}
+					}
+				}
+				dst += _out.pitch;
+				mask += _numStrips;
+				y++;
+			}
+			if (!--height) {
+				if (!--compData.skipWidth)
+					return;
+				height = _height;
+				y = compData.y;
+
+				scaleIndexY = compData.scaleYIndex;
+				lastColumnX = compData.x;
+
+				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					compData.x += compData.scaleXStep;
+					if (compData.x < compData.boundsRect.left || compData.x >= compData.boundsRect.right)
+						return;
+					maskbit = revBitMask(compData.x & 7);
+					compData.destPtr += compData.scaleXStep * _vm->_bytesPerPixel;
+				}
+
+				// From MONKEY1 EGA disasm: we only increment by 1.
+				// This accurately produces the original wonky scaling
+				// for the floppy editions of Monkey Island 1.
+				// Also valid for all other v4 games (this code is
+				// also in the executable for LOOM CD).
+				if (_vm->_game.version == 4) {
+					compData.scaleXIndex = (compData.scaleXIndex + 1) & compData.scaleIndexMask;
+				} else {
+					compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+				}
+
+				dst = compData.destPtr;
+				mask = compData.maskPtr + compData.x / 8;
+			}
+		StartPos:;
+		} while (--len);
+	} while (true);
+}
+
+void BaseCostumeRenderer::skipCelLines(ByleRLEData &compData, int num) {
+	num *= _height;
+
+	do {
+		compData.repLen = *_srcPtr++;
+		compData.repColor = compData.repLen >> compData.shr;
+		compData.repLen &= compData.mask;
+
+		if (!compData.repLen)
+			compData.repLen = *_srcPtr++;
+
+		do {
+			if (!--num)
+				return;
+		} while (--compData.repLen);
+	} while (true);
+}
+
+bool ScummEngine::isCostumeInUse(int cost) const {
+	Actor *a;
+
+	if (_roomResource != 0)
+		for (int i = 1; i < _numActors; i++) {
+			a = derefActor(i);
+			if (a->isInCurrentRoom() && a->_costume == cost)
+				return true;
+		}
+
+	return false;
+}
+#endif
+
+} // End of namespace Scumm
diff --git a/engines/scumm/base-costume.cpp b/engines/scumm/base-costume.cpp
index e4106fa343d..9f47d3fc864 100644
--- a/engines/scumm/base-costume.cpp
+++ b/engines/scumm/base-costume.cpp
@@ -25,6 +25,7 @@
 
 namespace Scumm {
 
+#ifndef SCUMM_OPTIMISED_CODE
 byte BaseCostumeRenderer::drawCostume(const VirtScreen &vs, int numStrips, const Actor *a, bool drawToBackBuf) {
 	int i;
 	byte result = 0;
@@ -450,5 +451,6 @@ bool ScummEngine::isCostumeInUse(int cost) const {
 
 	return false;
 }
+#endif
 
 } // End of namespace Scumm
diff --git a/engines/scumm/module.mk b/engines/scumm/module.mk
index c85694c0f0b..819efda0e67 100644
--- a/engines/scumm/module.mk
+++ b/engines/scumm/module.mk
@@ -4,6 +4,7 @@ MODULE_OBJS := \
 	actor.o \
 	akos.o \
 	base-costume.o \
+	base-costume-optimised.o \
 	bomp.o \
 	boxes.o \
 	camera.o \


Commit: 55c6fddccf2be886c33ac7de68ef6856aee48dbf
    https://github.com/scummvm/scummvm/commit/55c6fddccf2be886c33ac7de68ef6856aee48dbf
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: Optimise only paintCelByleRLECommon & byleRLEDecode

Other functions are shared with base-costume.cpp.

Changed paths:
    engines/scumm/base-costume-optimised.cpp
    engines/scumm/base-costume.cpp


diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index 17c19a38767..e1567f33331 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -26,38 +26,6 @@
 namespace Scumm {
 
 #ifdef SCUMM_OPTIMISED_CODE
-byte BaseCostumeRenderer::drawCostume(const VirtScreen &vs, int numStrips, const Actor *a, bool drawToBackBuf) {
-	int i;
-	byte result = 0;
-
-	_out = vs;
-	if (drawToBackBuf)
-		_out.setPixels(vs.getBackPixels(0, 0));
-	else
-		_out.setPixels(vs.getPixels(0, 0));
-
-	_actorX += _vm->_virtscr[kMainVirtScreen].xstart & 7;
-	_out.w = _out.pitch / _vm->_bytesPerPixel;
-	// We do not use getBasePtr here because the offset to pixels never used
-	// _vm->_bytesPerPixel, but it seems unclear why.
-	_out.setPixels((byte *)_out.getPixels() - (_vm->_virtscr[kMainVirtScreen].xstart & 7));
-
-	_numStrips = numStrips;
-
-	if (_vm->_game.version <= 1) {
-		_xMove = 0;
-		_yMove = 0;
-	} else if (_vm->_game.features & GF_OLD_BUNDLE) {
-		_xMove = -72;
-		_yMove = -100;
-	} else {
-		_xMove = _yMove = 0;
-	}
-	for (i = 0; i < 16; i++)
-		result |= drawLimb(a, i);
-	return result;
-}
-
 byte BaseCostumeRenderer::paintCelByleRLECommon(
 	int xMoveCur,
 	int yMoveCur,
@@ -420,37 +388,6 @@ void BaseCostumeRenderer::byleRLEDecode(ByleRLEData &compData, int16 actorHitX,
 		} while (--len);
 	} while (true);
 }
-
-void BaseCostumeRenderer::skipCelLines(ByleRLEData &compData, int num) {
-	num *= _height;
-
-	do {
-		compData.repLen = *_srcPtr++;
-		compData.repColor = compData.repLen >> compData.shr;
-		compData.repLen &= compData.mask;
-
-		if (!compData.repLen)
-			compData.repLen = *_srcPtr++;
-
-		do {
-			if (!--num)
-				return;
-		} while (--compData.repLen);
-	} while (true);
-}
-
-bool ScummEngine::isCostumeInUse(int cost) const {
-	Actor *a;
-
-	if (_roomResource != 0)
-		for (int i = 1; i < _numActors; i++) {
-			a = derefActor(i);
-			if (a->isInCurrentRoom() && a->_costume == cost)
-				return true;
-		}
-
-	return false;
-}
 #endif
 
 } // End of namespace Scumm
diff --git a/engines/scumm/base-costume.cpp b/engines/scumm/base-costume.cpp
index 9f47d3fc864..a43f2d0302b 100644
--- a/engines/scumm/base-costume.cpp
+++ b/engines/scumm/base-costume.cpp
@@ -25,7 +25,6 @@
 
 namespace Scumm {
 
-#ifndef SCUMM_OPTIMISED_CODE
 byte BaseCostumeRenderer::drawCostume(const VirtScreen &vs, int numStrips, const Actor *a, bool drawToBackBuf) {
 	int i;
 	byte result = 0;
@@ -58,6 +57,7 @@ byte BaseCostumeRenderer::drawCostume(const VirtScreen &vs, int numStrips, const
 	return result;
 }
 
+#ifndef SCUMM_OPTIMISED_CODE
 byte BaseCostumeRenderer::paintCelByleRLECommon(
 	int xMoveCur,
 	int yMoveCur,
@@ -420,6 +420,7 @@ void BaseCostumeRenderer::byleRLEDecode(ByleRLEData &compData, int16 actorHitX,
 		} while (--len);
 	} while (true);
 }
+#endif
 
 void BaseCostumeRenderer::skipCelLines(ByleRLEData &compData, int num) {
 	num *= _height;
@@ -451,6 +452,5 @@ bool ScummEngine::isCostumeInUse(int cost) const {
 
 	return false;
 }
-#endif
 
 } // End of namespace Scumm


Commit: 4e469cf33d812b368d459eb8b629f79d4ed1df95
    https://github.com/scummvm/scummvm/commit/4e469cf33d812b368d459eb8b629f79d4ed1df95
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: Introduce BaseCostumeRenderer::byleRLEDecodeFast

Use this code path if all of the following conditions hold:

- _vm->_bytesPerPixel == 1

- (!_akosRendering || _shadowMode != 3 || _vm->_game.heversion < 90) (the HE >= 90 variant of shadow mode 3 is to be implemented later)

- !(_vm->_game.features & GF_16BIT_COLOR)

- actorHitResult == NULL

- compData.maskPtr != NULL

Otherwise use original BaseCostumeRenderer::byleRLEDecode code.

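The fast version also contains two functional changes:

- Monkey 1 EGA (like every other v4 game) uses proper scaling, i.e. the scale index advances by scaleXStep instead of always by 1

- double-shadowing is not done in the classic renderer, i.e. the lastColumnX check is now applied there as well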

Changed paths:
    engines/scumm/base-costume-optimised.cpp
    engines/scumm/base-costume.cpp
    engines/scumm/base-costume.h


diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index e1567f33331..c268fd0d326 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -252,7 +252,14 @@ byte BaseCostumeRenderer::paintCelByleRLECommon(
 	return drawFlag;
 }
 
-void BaseCostumeRenderer::byleRLEDecode(ByleRLEData &compData, int16 actorHitX, int16 actorHitY, bool *actorHitResult, const uint8 *xmap) {
+enum class ShadowMode : int {
+	Mode0,
+	Mode1,
+	Mode3,
+	Classic
+};
+
+void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 	const byte *src = _srcPtr;
 	byte *dst = compData.destPtr;
 
@@ -268,6 +275,19 @@ void BaseCostumeRenderer::byleRLEDecode(ByleRLEData &compData, int16 actorHitX,
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
 
+	ShadowMode shadowMode = ShadowMode::Mode0;
+	if (!_akosRendering) {
+		if (_shadowMode & 0x20)
+			shadowMode = ShadowMode::Classic;
+		else
+			shadowMode = ShadowMode::Mode1;
+	} else {
+		if (_shadowMode == 1)
+			shadowMode = ShadowMode::Mode1;
+		else if (_shadowMode == 3)
+			shadowMode = ShadowMode::Mode3;
+	}
+
 	if (len)
 		goto StartPos;
 
@@ -280,73 +300,43 @@ void BaseCostumeRenderer::byleRLEDecode(ByleRLEData &compData, int16 actorHitX,
 
 		do {
 			if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
-				if (actorHitResult) {
-					if (color && y == actorHitY && compData.x == actorHitX) {
-						*actorHitResult = true;
-						return;
-					}
-				} else {
-					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-						|| (compData.x < compData.boundsRect.left || compData.x >= compData.boundsRect.right)
-						|| (compData.maskPtr && (*mask & maskbit));
-					bool skipColumn = false;
-
-					if (color && !masked) {
-						uint16 pcolor;
-
-						if (!_akosRendering) {
-							if (_shadowMode & 0x20) {
-								pcolor = _shadowTable[*dst];
-							} else {
-								pcolor = _palette[color];
-								if (pcolor == 13 && _shadowTable)
-									pcolor = _shadowTable[*dst];
-							}
+				const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+				|| (compData.x < compData.boundsRect.left || compData.x >= compData.boundsRect.right)
+					|| (*mask & maskbit);
+				if (color && !masked) {
+					uint16 pcolor;
+
+					switch(shadowMode) {
+					case ShadowMode::Mode0:
+						*dst = _palette[color];
+						break;
+
+					case ShadowMode::Classic:
+						if (lastColumnX != compData.x)
+							*dst = _shadowTable[*dst];
+						break;
+
+					case ShadowMode::Mode1:
+						pcolor = _palette[color];
+						if (pcolor == 13 && _shadowTable) {
+							if (lastColumnX != compData.x)
+								*dst = _shadowTable[*dst];
 						} else {
-							pcolor = _palette[color];
-
-							if (_shadowMode == 1) {
-								if (pcolor == 13) {
-									// In shadow mode 1 skipColumn works more or less the same way as in shadow
-									// mode 3. It is only ever checked and applied if pcolor is 13.
-									skipColumn = (lastColumnX == compData.x);
-									pcolor = _shadowTable[*dst];
-								}
-							} else if (_shadowMode == 2) {
-								error("AkosRenderer::byleRLEDecode(): shadowMode 2 not implemented."); // TODO
-							} else if (_shadowMode == 3) {
-								if (_vm->_game.features & GF_16BIT_COLOR) {
-									// I add the column skip here, too, although I don't know whether it always
-									// applies. But this is the only way to prevent recursive shading of pixels.
-									// This might need more fine tuning...
-									skipColumn = (lastColumnX == compData.x);
-									uint16 srcColor = (pcolor >> 1) & 0x7DEF;
-									uint16 dstColor = (READ_UINT16(dst) >> 1) & 0x7DEF;
-									pcolor = srcColor + dstColor;
-								} else if (_vm->_game.heversion >= 90) {
-									// I add the column skip here, too, although I don't know whether it always
-									// applies. But this is the only way to prevent recursive shading of pixels.
-									// This might need more fine tuning...
-									skipColumn = (lastColumnX == compData.x);
-									pcolor = (pcolor << 8) + *dst;
-									pcolor = xmap[pcolor];
-								} else if (pcolor < 8) {
-									// This mode is used in COMI. The column skip only takes place when the shading
-									// is actually applied (for pcolor < 8). The skip avoids shading of pixels that
-									// already have been shaded.
-									skipColumn = (lastColumnX == compData.x);
-									pcolor = (pcolor << 8) + *dst;
-									pcolor = _shadowTable[pcolor];
-								}
-							}
+							*dst = pcolor;
 						}
-						if (!skipColumn) {
-							if (_vm->_bytesPerPixel == 2) {
-								WRITE_UINT16(dst, pcolor);
-							} else {
-								*dst = pcolor;
+						break;
+
+					case ShadowMode::Mode3:
+						pcolor = _palette[color];
+						if (pcolor < 8) {
+							if (lastColumnX != compData.x) {
+								pcolor = (pcolor << 8) + *dst;
+								*dst = _shadowTable[pcolor];
 							}
+						} else {
+							*dst = pcolor;
 						}
+						break;
 					}
 				}
 				dst += _out.pitch;
@@ -367,19 +357,10 @@ void BaseCostumeRenderer::byleRLEDecode(ByleRLEData &compData, int16 actorHitX,
 					if (compData.x < compData.boundsRect.left || compData.x >= compData.boundsRect.right)
 						return;
 					maskbit = revBitMask(compData.x & 7);
-					compData.destPtr += compData.scaleXStep * _vm->_bytesPerPixel;
+					compData.destPtr += compData.scaleXStep;
 				}
 
-				// From MONKEY1 EGA disasm: we only increment by 1.
-				// This accurately produces the original wonky scaling
-				// for the floppy editions of Monkey Island 1.
-				// Also valid for all other v4 games (this code is
-				// also in the executable for LOOM CD).
-				if (_vm->_game.version == 4) {
-					compData.scaleXIndex = (compData.scaleXIndex + 1) & compData.scaleIndexMask;
-				} else {
-					compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
-				}
+				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
 
 				dst = compData.destPtr;
 				mask = compData.maskPtr + compData.x / 8;
diff --git a/engines/scumm/base-costume.cpp b/engines/scumm/base-costume.cpp
index a43f2d0302b..cfe12d5a481 100644
--- a/engines/scumm/base-costume.cpp
+++ b/engines/scumm/base-costume.cpp
@@ -283,8 +283,18 @@ byte BaseCostumeRenderer::paintCelByleRLECommon(
 
 	return drawFlag;
 }
+#endif
 
 void BaseCostumeRenderer::byleRLEDecode(ByleRLEData &compData, int16 actorHitX, int16 actorHitY, bool *actorHitResult, const uint8 *xmap) {
+#ifdef SCUMM_OPTIMISED_CODE
+	if ((_vm->_bytesPerPixel == 1) &&
+		(!_akosRendering || _shadowMode != 3 || (!(_vm->_game.features & GF_16BIT_COLOR) && _vm->_game.heversion < 90)) &&
+		(actorHitResult == NULL) &&
+		(compData.maskPtr != NULL)) {
+		byleRLEDecodeFast(compData);
+		return;
+	}
+#endif
 	const byte *src = _srcPtr;
 	byte *dst = compData.destPtr;
 
@@ -420,7 +430,6 @@ void BaseCostumeRenderer::byleRLEDecode(ByleRLEData &compData, int16 actorHitX,
 		} while (--len);
 	} while (true);
 }
-#endif
 
 void BaseCostumeRenderer::skipCelLines(ByleRLEData &compData, int num) {
 	num *= _height;
diff --git a/engines/scumm/base-costume.h b/engines/scumm/base-costume.h
index e568c075d45..9ffa861c5af 100644
--- a/engines/scumm/base-costume.h
+++ b/engines/scumm/base-costume.h
@@ -173,7 +173,9 @@ protected:
 		bool &decode);
 
 	void byleRLEDecode(ByleRLEData &compData, int16 actorHitX = 0, int16 actorHitY = 0, bool *actorHitResult = nullptr, const uint8 *xmap = nullptr);
-
+#ifdef SCUMM_OPTIMISED_CODE
+	void byleRLEDecodeFast(ByleRLEData &compData);
+#endif
 	void skipCelLines(ByleRLEData &compData, int num);
 
 private:


Commit: 8b49d1b5a03177eeeac2de43830b8cfd220f2be0
    https://github.com/scummvm/scummvm/commit/8b49d1b5a03177eeeac2de43830b8cfd220f2be0
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: byleRLEDecodeFast: Don't calculate masking for transparent color

Changed paths:
    engines/scumm/base-costume-optimised.cpp


diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index c268fd0d326..422582777e1 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -300,51 +300,54 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 
 		do {
 			if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
-				const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-				|| (compData.x < compData.boundsRect.left || compData.x >= compData.boundsRect.right)
-					|| (*mask & maskbit);
-				if (color && !masked) {
-					uint16 pcolor;
-
-					switch(shadowMode) {
-					case ShadowMode::Mode0:
-						*dst = _palette[color];
-						break;
-
-					case ShadowMode::Classic:
-						if (lastColumnX != compData.x)
-							*dst = _shadowTable[*dst];
-						break;
-
-					case ShadowMode::Mode1:
-						pcolor = _palette[color];
-						if (pcolor == 13 && _shadowTable) {
+				if (color) {
+					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+					|| (compData.x < compData.boundsRect.left || compData.x >= compData.boundsRect.right)
+						|| (*mask & maskbit);
+
+					if (!masked) {
+						uint16 pcolor;
+
+						switch(shadowMode) {
+						case ShadowMode::Mode0:
+							*dst = _palette[color];
+							break;
+
+						case ShadowMode::Classic:
 							if (lastColumnX != compData.x)
 								*dst = _shadowTable[*dst];
-						} else {
-							*dst = pcolor;
-						}
-						break;
-
-					case ShadowMode::Mode3:
-						pcolor = _palette[color];
-						if (pcolor < 8) {
-							if (lastColumnX != compData.x) {
-								pcolor = (pcolor << 8) + *dst;
-								*dst = _shadowTable[pcolor];
+							break;
+
+						case ShadowMode::Mode1:
+							pcolor = _palette[color];
+							if (pcolor == 13 && _shadowTable) {
+								if (lastColumnX != compData.x)
+									*dst = _shadowTable[*dst];
+							} else {
+								*dst = pcolor;
+							}
+							break;
+
+						case ShadowMode::Mode3:
+							pcolor = _palette[color];
+							if (pcolor < 8) {
+								if (lastColumnX != compData.x) {
+									pcolor = (pcolor << 8) + *dst;
+									*dst = _shadowTable[pcolor];
+								}
+							} else {
+								*dst = pcolor;
 							}
-						} else {
-							*dst = pcolor;
+							break;
 						}
-						break;
 					}
 				}
 				dst += _out.pitch;
 				mask += _numStrips;
 				y++;
 			}
-			if (!--height) {
-				if (!--compData.skipWidth)
+			if (--height == 0) {
+				if (--compData.skipWidth == 0)
 					return;
 				height = _height;
 				y = compData.y;


Commit: 041c9eb7d616fff8680d5efcb633195cc1068d6d
    https://github.com/scummvm/scummvm/commit/041c9eb7d616fff8680d5efcb633195cc1068d6d
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: byleRLEDecodeFast: Check for height == 0 every repLen pixels

Instead of every pixel.

Changed paths:
    engines/scumm/base-costume-optimised.cpp


diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index 422582777e1..80be54069b9 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -288,8 +288,11 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 			shadowMode = ShadowMode::Mode3;
 	}
 
-	if (len)
+	byte batch;
+	if (len) {
+		--len;
 		goto StartPos;
+	}
 
 	do {
 		len = *src++;
@@ -299,54 +302,61 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 			len = *src++;
 
 		do {
-			if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
-				if (color) {
-					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-					|| (compData.x < compData.boundsRect.left || compData.x >= compData.boundsRect.right)
-						|| (*mask & maskbit);
-
-					if (!masked) {
-						uint16 pcolor;
-
-						switch(shadowMode) {
-						case ShadowMode::Mode0:
-							*dst = _palette[color];
-							break;
-
-						case ShadowMode::Classic:
-							if (lastColumnX != compData.x)
-								*dst = _shadowTable[*dst];
-							break;
-
-						case ShadowMode::Mode1:
-							pcolor = _palette[color];
-							if (pcolor == 13 && _shadowTable) {
+			batch = height < len ? (byte)height : len;
+			len -= batch;
+			height -= batch;
+
+			do {
+				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+					if (color) {
+						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+						|| (compData.x < compData.boundsRect.left || compData.x >= compData.boundsRect.right)
+							|| (*mask & maskbit);
+
+						if (!masked) {
+							uint16 pcolor;
+
+							switch(shadowMode) {
+							case ShadowMode::Mode0:
+								*dst = _palette[color];
+								break;
+
+							case ShadowMode::Classic:
 								if (lastColumnX != compData.x)
 									*dst = _shadowTable[*dst];
-							} else {
-								*dst = pcolor;
-							}
-							break;
-
-						case ShadowMode::Mode3:
-							pcolor = _palette[color];
-							if (pcolor < 8) {
-								if (lastColumnX != compData.x) {
-									pcolor = (pcolor << 8) + *dst;
-									*dst = _shadowTable[pcolor];
+								break;
+
+							case ShadowMode::Mode1:
+								pcolor = _palette[color];
+								if (pcolor == 13 && _shadowTable) {
+									if (lastColumnX != compData.x)
+										*dst = _shadowTable[*dst];
+								} else {
+									*dst = pcolor;
+								}
+								break;
+
+							case ShadowMode::Mode3:
+								pcolor = _palette[color];
+								if (pcolor < 8) {
+									if (lastColumnX != compData.x) {
+										pcolor = (pcolor << 8) + *dst;
+										*dst = _shadowTable[pcolor];
+									}
+								} else {
+									*dst = pcolor;
 								}
-							} else {
-								*dst = pcolor;
+								break;
 							}
-							break;
 						}
 					}
+					dst += _out.pitch;
+					mask += _numStrips;
+					y++;
 				}
-				dst += _out.pitch;
-				mask += _numStrips;
-				y++;
-			}
-			if (--height == 0) {
+			} while (--batch);
+
+			if (height == 0) {
 				if (--compData.skipWidth == 0)
 					return;
 				height = _height;
@@ -369,7 +379,7 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 				mask = compData.maskPtr + compData.x / 8;
 			}
 		StartPos:;
-		} while (--len);
+		} while (len > 0);
 	} while (true);
 }
 #endif


Commit: 546fa65d4549f62ecdf9debca145717f29e1e81a
    https://github.com/scummvm/scummvm/commit/546fa65d4549f62ecdf9debca145717f29e1e81a
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: byleRLEDecodeFast: Remove linesToSkip kludge

The original SCUMM code had a bug in the startScaleIndexX calculation in
scaled mode: in the loop's last iteration, it wasn't updated to the
final value but was left lower by 1.

The linesToSkip-- kludge was implemented in SCUMM to work around it, but it
introduced another bug: x could become < 0 when entering byleRLEDecode
(which was addressed in ScummVM by commit 863bed3f to fix
https://bugs.scummvm.org/ticket/2700).

With the kludge out of the way, the initial xMasked is not needed. Also,
clipping coordinates are now deterministic, so we get a clearer picture
of the changes needed to the dirty rectangle.

IMPORTANT: this causes a visual change compared to the original SCUMM. Actors
on the left and right edges are no longer shifted by one column, i.e. the
correct(ed) code draws one column less on each edge. Visible e.g. in
Full Throttle's initial screen when the actor leaves and then returns
to the scene with the trash bin.

Changed paths:
    engines/scumm/base-costume-optimised.cpp


diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index 80be54069b9..4ceae657b76 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -38,7 +38,7 @@ byte BaseCostumeRenderer::paintCelByleRLECommon(
 
 	bool actorIsScaled;
 	int i, j;
-	int linesToSkip = 0, startScaleIndexX, startScaleIndexY;
+	int linesToSkip = 0, trailingLinesToSkip = 0, startScaleIndexX, startScaleIndexY;
 	Common::Rect rect;
 	int step;
 	byte drawFlag = 1;
@@ -63,12 +63,11 @@ byte BaseCostumeRenderer::paintCelByleRLECommon(
 	compData.boundsRect.bottom = _out.h;
 
 	if (actorIsScaled) {
-
 		/* Scale direction */
 		compData.scaleXStep = -1;
 		if (xMoveCur < 0) {
 			xMoveCur = -xMoveCur;
-			compData.scaleXStep = 1;
+			compData.scaleXStep = -compData.scaleXStep;
 		}
 
 		if (_drawActorToRight) {
@@ -83,13 +82,15 @@ byte BaseCostumeRenderer::paintCelByleRLECommon(
 
 			j = startScaleIndexX;
 			for (i = 0; i < _width; i++) {
-				if (rect.right < 0) {
+				if (rect.right < compData.boundsRect.left) {
 					linesToSkip++;
-					startScaleIndexX = j;
+				} else if (rect.right >= compData.boundsRect.right) {
+					trailingLinesToSkip++;
 				}
 				if (compData.scaleTable[j++ & compData.scaleIndexMask] < _scaleX)
 					rect.right++;
 			}
+			startScaleIndexX += linesToSkip;
 		} else {
 			/* No mirror */
 			/* Adjust X position */
@@ -104,17 +105,16 @@ byte BaseCostumeRenderer::paintCelByleRLECommon(
 			j = startScaleIndexX;
 			for (i = 0; i < _width; i++) {
 				if (rect.left >= compData.boundsRect.right) {
-					startScaleIndexX = j;
 					linesToSkip++;
+				} else if (rect.left < compData.boundsRect.left) {
+					trailingLinesToSkip++;
 				}
 				if (compData.scaleTable[j-- & compData.scaleIndexMask] < _scaleX)
 					rect.left--;
 			}
+			startScaleIndexX -= linesToSkip;
 		}
 
-		if (linesToSkip)
-			linesToSkip--;
-
 		step = -1;
 		if (yMoveCur < 0) {
 			yMoveCur = -yMoveCur;
@@ -144,9 +144,18 @@ byte BaseCostumeRenderer::paintCelByleRLECommon(
 		if (_drawActorToRight) {
 			rect.left = compData.x;
 			rect.right = compData.x + _width;
+
+			linesToSkip = compData.boundsRect.left - compData.x;
+			trailingLinesToSkip = rect.right - compData.boundsRect.right;
 		} else {
 			rect.left = compData.x - _width;
 			rect.right = compData.x;
+
+			linesToSkip = rect.right - compData.boundsRect.right + 1;
+			if (c64Cost)
+				trailingLinesToSkip = (compData.boundsRect.left - 8) - rect.left;
+			else
+				trailingLinesToSkip = (compData.boundsRect.left - 1) - rect.left;
 		}
 
 		rect.top = compData.y;
@@ -161,7 +170,29 @@ byte BaseCostumeRenderer::paintCelByleRLECommon(
 	compData.skipWidth = _width;
 	compData.scaleXStep = _drawActorToRight ? 1 : -1;
 
-	markAsDirty(rect, compData, decode);
+	// All the important 'rect' values (scale sequence = 'compData.scaleTable[i] < _scaleX' result).
+	//
+	// rendering dir | scaled | scale sequence | drawn columns | dirty columns          | ideal fix
+	// --------------+--------+----------------+---------------+------------------------+----------------
+	// left-to-right | no     | N/A            | 10, 11, 12    | [10, 13) = 10, 11, 12  | none needed
+	// left-to-right | yes    | T, T, T        | 10, 11, 12    | [10, 13) = 10, 11, 12  | none needed
+	// left-to-right | yes    | T, F, T        | 10, 11        | [10, 12) = 10, 11      | none needed
+	// left-to-right | yes    | T, T, F        | 10, 11, 12    | [10, 12) = 10, 11      | right++
+	// right-to-left | no     | N/A            | 10,  9,  8    | [ 7, 10) =  9,  8,  7  | left++, right++
+	// right-to-left | yes    | T, T, T        | 10,  9,  8    | [ 7, 10) =  9,  8,  7  | left++, right++
+	// right-to-left | yes    | T, F, T        | 10,  9        | [ 8, 10) =  9,  8      | left++, right++
+	// right-to-left | yes    | T, T, F        | 10,  9,  8    | [ 8, 10) =  9,  8      | right++
+	//
+	// Considering how complex it would be to handle all left/right adjustments precisely, go with what the old
+	// costume renderer did: just add +1 to the right. That sometimes extends the dirty rect by one or two
+	// columns but definitely fixes all edge cases with zero effort.
+	Common::Rect dirtyRect = rect;
+	if (_akosRendering) {
+		// ClassicCostumeRenderer::markAsDirty already does that
+		dirtyRect.right++;
+	}
+
+	markAsDirty(dirtyRect, compData, decode);
 	if (!decode)
 		return 0;
 
@@ -177,49 +208,22 @@ byte BaseCostumeRenderer::paintCelByleRLECommon(
 
 	compData.repLen = 0;
 
-	if (_drawActorToRight) {
-		if (!actorIsScaled)
-			linesToSkip = compData.boundsRect.left - compData.x;
-		if (linesToSkip > 0) {
-			if (!amiOrPcEngCost && !c64Cost) {
-				compData.skipWidth -= linesToSkip;
-				skipCelLines(compData, linesToSkip);
-				compData.x = compData.boundsRect.left;
-			}
-		} else {
-			linesToSkip = rect.right - compData.boundsRect.right;
-			if (linesToSkip <= 0) {
-				drawFlag = 2;
-			} else {
-				compData.skipWidth -= linesToSkip;
-			}
-		}
-	} else {
-		if (!actorIsScaled) {
-			if (_akosRendering)
-				linesToSkip = rect.right - compData.boundsRect.right + 1;
-			else
-				linesToSkip = rect.right - compData.boundsRect.right;
-		}
-		if (linesToSkip > 0) {
-			if (!amiOrPcEngCost && !c64Cost) {
-				compData.skipWidth -= linesToSkip;
-				skipCelLines(compData, linesToSkip);
-				compData.x = compData.boundsRect.right - 1;
-			}
-		} else {
-			// V1 games uses 8 x 8 pixels for actors
-			if (c64Cost)
-				linesToSkip = (compData.boundsRect.left - 8) - rect.left;
-			else
-				linesToSkip = (compData.boundsRect.left - 1) - rect.left;
-			if (linesToSkip <= 0)
-				drawFlag = 2;
-			else
-				compData.skipWidth -= linesToSkip;
+	if (linesToSkip > 0) {
+		if (!amiOrPcEngCost && !c64Cost) {
+			compData.skipWidth -= linesToSkip;
+			skipCelLines(compData, linesToSkip);
+			compData.x = _drawActorToRight ? compData.boundsRect.left : (compData.boundsRect.right - 1);
 		}
 	}
 
+	if (trailingLinesToSkip > 0) {
+		compData.skipWidth -= trailingLinesToSkip;
+	}
+
+	if (linesToSkip <= 0 && trailingLinesToSkip <= 0) {
+		drawFlag = 2;
+	}
+
 	if (compData.skipWidth <= 0) {
 		decode = false;
 		return 0;
@@ -306,11 +310,12 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 			len -= batch;
 			height -= batch;
 
+			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
+
 			do {
 				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
 					if (color) {
 						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-						|| (compData.x < compData.boundsRect.left || compData.x >= compData.boundsRect.right)
 							|| (*mask & maskbit);
 
 						if (!masked) {
@@ -367,8 +372,6 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 
 				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
 					compData.x += compData.scaleXStep;
-					if (compData.x < compData.boundsRect.left || compData.x >= compData.boundsRect.right)
-						return;
 					maskbit = revBitMask(compData.x & 7);
 					compData.destPtr += compData.scaleXStep;
 				}


Commit: c5e04bec37cdc2a0faf6e276b32648d097a8def1
    https://github.com/scummvm/scummvm/commit/c5e04bec37cdc2a0faf6e276b32648d097a8def1
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: byleRLEDecodeFast: Introduce 10 identical copies

For easier tracking of what has changed.

Changed paths:
    engines/scumm/base-costume-optimised.cpp


diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index 4ceae657b76..3317bdea9ce 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -263,7 +263,1127 @@ enum class ShadowMode : int {
 	Classic
 };
 
-void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
+void ByleRLEDecode_Mode0(
+	BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const ShadowMode shadowMode,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette) {
+
+	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+
+	const byte *src = _srcPtr;
+	byte *dst = compData.destPtr;
+
+	byte len = compData.repLen;
+	uint16 color = compData.repColor;
+
+	// reset every column
+	int lastColumnX = -1;
+	int y = compData.y;
+	uint16 height = _height;
+	int scaleIndexY = compData.scaleYIndex;
+
+	byte maskbit = revBitMask(compData.x & 7);
+	const byte *mask = compData.maskPtr + compData.x / 8;
+
+	byte batch;
+	if (len) {
+		--len;
+		goto StartPos;
+	}
+
+	do {
+		len = *src++;
+		color = len >> compData.shr;
+		len &= compData.mask;
+		if (!len)
+			len = *src++;
+
+		do {
+			batch = height < len ? (byte)height : len;
+			len -= batch;
+			height -= batch;
+
+			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
+
+			do {
+				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+					if (color) {
+						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+							|| (*mask & maskbit);
+
+						if (!masked) {
+							uint16 pcolor;
+
+							switch(shadowMode) {
+							case ShadowMode::Mode0:
+								*dst = _palette[color];
+								break;
+
+							case ShadowMode::Classic:
+								if (lastColumnX != compData.x)
+									*dst = _shadowTable[*dst];
+								break;
+
+							case ShadowMode::Mode1:
+								pcolor = _palette[color];
+								if (pcolor == 13 && _shadowTable) {
+									if (lastColumnX != compData.x)
+										*dst = _shadowTable[*dst];
+								} else {
+									*dst = pcolor;
+								}
+								break;
+
+							case ShadowMode::Mode3:
+								pcolor = _palette[color];
+								if (pcolor < 8) {
+									if (lastColumnX != compData.x) {
+										pcolor = (pcolor << 8) + *dst;
+										*dst = _shadowTable[pcolor];
+									}
+								} else {
+									*dst = pcolor;
+								}
+								break;
+							}
+						}
+					}
+					dst += pitch;
+					mask += _numStrips;
+					y++;
+				}
+			} while (--batch);
+
+			if (height == 0) {
+				if (--compData.skipWidth == 0)
+					return;
+				height = _height;
+				y = compData.y;
+
+				scaleIndexY = compData.scaleYIndex;
+				lastColumnX = compData.x;
+
+				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					compData.x += compData.scaleXStep;
+					maskbit = revBitMask(compData.x & 7);
+					compData.destPtr += compData.scaleXStep;
+				}
+
+				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+
+				dst = compData.destPtr;
+				mask = compData.maskPtr + compData.x / 8;
+			}
+		StartPos:;
+		} while (len > 0);
+	} while (true);
+}
+
+void ByleRLEDecode_Mode1(
+	BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const ShadowMode shadowMode,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette) {
+
+	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+
+	const byte *src = _srcPtr;
+	byte *dst = compData.destPtr;
+
+	byte len = compData.repLen;
+	uint16 color = compData.repColor;
+
+	// reset every column
+	int lastColumnX = -1;
+	int y = compData.y;
+	uint16 height = _height;
+	int scaleIndexY = compData.scaleYIndex;
+
+	byte maskbit = revBitMask(compData.x & 7);
+	const byte *mask = compData.maskPtr + compData.x / 8;
+
+	byte batch;
+	if (len) {
+		--len;
+		goto StartPos;
+	}
+
+	do {
+		len = *src++;
+		color = len >> compData.shr;
+		len &= compData.mask;
+		if (!len)
+			len = *src++;
+
+		do {
+			batch = height < len ? (byte)height : len;
+			len -= batch;
+			height -= batch;
+
+			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
+
+			do {
+				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+					if (color) {
+						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+							|| (*mask & maskbit);
+
+						if (!masked) {
+							uint16 pcolor;
+
+							switch(shadowMode) {
+							case ShadowMode::Mode0:
+								*dst = _palette[color];
+								break;
+
+							case ShadowMode::Classic:
+								if (lastColumnX != compData.x)
+									*dst = _shadowTable[*dst];
+								break;
+
+							case ShadowMode::Mode1:
+								pcolor = _palette[color];
+								if (pcolor == 13 && _shadowTable) {
+									if (lastColumnX != compData.x)
+										*dst = _shadowTable[*dst];
+								} else {
+									*dst = pcolor;
+								}
+								break;
+
+							case ShadowMode::Mode3:
+								pcolor = _palette[color];
+								if (pcolor < 8) {
+									if (lastColumnX != compData.x) {
+										pcolor = (pcolor << 8) + *dst;
+										*dst = _shadowTable[pcolor];
+									}
+								} else {
+									*dst = pcolor;
+								}
+								break;
+							}
+						}
+					}
+					dst += pitch;
+					mask += _numStrips;
+					y++;
+				}
+			} while (--batch);
+
+			if (height == 0) {
+				if (--compData.skipWidth == 0)
+					return;
+				height = _height;
+				y = compData.y;
+
+				scaleIndexY = compData.scaleYIndex;
+				lastColumnX = compData.x;
+
+				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					compData.x += compData.scaleXStep;
+					maskbit = revBitMask(compData.x & 7);
+					compData.destPtr += compData.scaleXStep;
+				}
+
+				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+
+				dst = compData.destPtr;
+				mask = compData.maskPtr + compData.x / 8;
+			}
+		StartPos:;
+		} while (len > 0);
+	} while (true);
+}
+
+void ByleRLEDecode_Mode3(
+	BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const ShadowMode shadowMode,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette) {
+
+	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+
+	const byte *src = _srcPtr;
+	byte *dst = compData.destPtr;
+
+	byte len = compData.repLen;
+	uint16 color = compData.repColor;
+
+	// reset every column
+	int lastColumnX = -1;
+	int y = compData.y;
+	uint16 height = _height;
+	int scaleIndexY = compData.scaleYIndex;
+
+	byte maskbit = revBitMask(compData.x & 7);
+	const byte *mask = compData.maskPtr + compData.x / 8;
+
+	byte batch;
+	if (len) {
+		--len;
+		goto StartPos;
+	}
+
+	do {
+		len = *src++;
+		color = len >> compData.shr;
+		len &= compData.mask;
+		if (!len)
+			len = *src++;
+
+		do {
+			batch = height < len ? (byte)height : len;
+			len -= batch;
+			height -= batch;
+
+			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
+
+			do {
+				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+					if (color) {
+						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+							|| (*mask & maskbit);
+
+						if (!masked) {
+							uint16 pcolor;
+
+							switch(shadowMode) {
+							case ShadowMode::Mode0:
+								*dst = _palette[color];
+								break;
+
+							case ShadowMode::Classic:
+								if (lastColumnX != compData.x)
+									*dst = _shadowTable[*dst];
+								break;
+
+							case ShadowMode::Mode1:
+								pcolor = _palette[color];
+								if (pcolor == 13 && _shadowTable) {
+									if (lastColumnX != compData.x)
+										*dst = _shadowTable[*dst];
+								} else {
+									*dst = pcolor;
+								}
+								break;
+
+							case ShadowMode::Mode3:
+								pcolor = _palette[color];
+								if (pcolor < 8) {
+									if (lastColumnX != compData.x) {
+										pcolor = (pcolor << 8) + *dst;
+										*dst = _shadowTable[pcolor];
+									}
+								} else {
+									*dst = pcolor;
+								}
+								break;
+							}
+						}
+					}
+					dst += pitch;
+					mask += _numStrips;
+					y++;
+				}
+			} while (--batch);
+
+			if (height == 0) {
+				if (--compData.skipWidth == 0)
+					return;
+				height = _height;
+				y = compData.y;
+
+				scaleIndexY = compData.scaleYIndex;
+				lastColumnX = compData.x;
+
+				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					compData.x += compData.scaleXStep;
+					maskbit = revBitMask(compData.x & 7);
+					compData.destPtr += compData.scaleXStep;
+				}
+
+				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+
+				dst = compData.destPtr;
+				mask = compData.maskPtr + compData.x / 8;
+			}
+		StartPos:;
+		} while (len > 0);
+	} while (true);
+}
+
+void ByleRLEDecode_Classic(
+	BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const ShadowMode shadowMode,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette) {
+
+	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+
+	const byte *src = _srcPtr;
+	byte *dst = compData.destPtr;
+
+	byte len = compData.repLen;
+	uint16 color = compData.repColor;
+
+	// reset every column
+	int lastColumnX = -1;
+	int y = compData.y;
+	uint16 height = _height;
+	int scaleIndexY = compData.scaleYIndex;
+
+	byte maskbit = revBitMask(compData.x & 7);
+	const byte *mask = compData.maskPtr + compData.x / 8;
+
+	byte batch;
+	if (len) {
+		--len;
+		goto StartPos;
+	}
+
+	do {
+		len = *src++;
+		color = len >> compData.shr;
+		len &= compData.mask;
+		if (!len)
+			len = *src++;
+
+		do {
+			batch = height < len ? (byte)height : len;
+			len -= batch;
+			height -= batch;
+
+			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
+
+			do {
+				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+					if (color) {
+						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+							|| (*mask & maskbit);
+
+						if (!masked) {
+							uint16 pcolor;
+
+							switch(shadowMode) {
+							case ShadowMode::Mode0:
+								*dst = _palette[color];
+								break;
+
+							case ShadowMode::Classic:
+								if (lastColumnX != compData.x)
+									*dst = _shadowTable[*dst];
+								break;
+
+							case ShadowMode::Mode1:
+								pcolor = _palette[color];
+								if (pcolor == 13 && _shadowTable) {
+									if (lastColumnX != compData.x)
+										*dst = _shadowTable[*dst];
+								} else {
+									*dst = pcolor;
+								}
+								break;
+
+							case ShadowMode::Mode3:
+								pcolor = _palette[color];
+								if (pcolor < 8) {
+									if (lastColumnX != compData.x) {
+										pcolor = (pcolor << 8) + *dst;
+										*dst = _shadowTable[pcolor];
+									}
+								} else {
+									*dst = pcolor;
+								}
+								break;
+							}
+						}
+					}
+					dst += pitch;
+					mask += _numStrips;
+					y++;
+				}
+			} while (--batch);
+
+			if (height == 0) {
+				if (--compData.skipWidth == 0)
+					return;
+				height = _height;
+				y = compData.y;
+
+				scaleIndexY = compData.scaleYIndex;
+				lastColumnX = compData.x;
+
+				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					compData.x += compData.scaleXStep;
+					maskbit = revBitMask(compData.x & 7);
+					compData.destPtr += compData.scaleXStep;
+				}
+
+				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+
+				dst = compData.destPtr;
+				mask = compData.maskPtr + compData.x / 8;
+			}
+		StartPos:;
+		} while (len > 0);
+	} while (true);
+}
+
+void ByleRLEDecode_Scaled_Mode0(
+	BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const ShadowMode shadowMode,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette) {
+
+	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+
+	const byte *src = _srcPtr;
+	byte *dst = compData.destPtr;
+
+	byte len = compData.repLen;
+	uint16 color = compData.repColor;
+
+	// reset every column
+	int lastColumnX = -1;
+	int y = compData.y;
+	uint16 height = _height;
+	int scaleIndexY = compData.scaleYIndex;
+
+	byte maskbit = revBitMask(compData.x & 7);
+	const byte *mask = compData.maskPtr + compData.x / 8;
+
+	byte batch;
+	if (len) {
+		--len;
+		goto StartPos;
+	}
+
+	do {
+		len = *src++;
+		color = len >> compData.shr;
+		len &= compData.mask;
+		if (!len)
+			len = *src++;
+
+		do {
+			batch = height < len ? (byte)height : len;
+			len -= batch;
+			height -= batch;
+
+			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
+
+			do {
+				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+					if (color) {
+						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+							|| (*mask & maskbit);
+
+						if (!masked) {
+							uint16 pcolor;
+
+							switch(shadowMode) {
+							case ShadowMode::Mode0:
+								*dst = _palette[color];
+								break;
+
+							case ShadowMode::Classic:
+								if (lastColumnX != compData.x)
+									*dst = _shadowTable[*dst];
+								break;
+
+							case ShadowMode::Mode1:
+								pcolor = _palette[color];
+								if (pcolor == 13 && _shadowTable) {
+									if (lastColumnX != compData.x)
+										*dst = _shadowTable[*dst];
+								} else {
+									*dst = pcolor;
+								}
+								break;
+
+							case ShadowMode::Mode3:
+								pcolor = _palette[color];
+								if (pcolor < 8) {
+									if (lastColumnX != compData.x) {
+										pcolor = (pcolor << 8) + *dst;
+										*dst = _shadowTable[pcolor];
+									}
+								} else {
+									*dst = pcolor;
+								}
+								break;
+							}
+						}
+					}
+					dst += pitch;
+					mask += _numStrips;
+					y++;
+				}
+			} while (--batch);
+
+			if (height == 0) {
+				if (--compData.skipWidth == 0)
+					return;
+				height = _height;
+				y = compData.y;
+
+				scaleIndexY = compData.scaleYIndex;
+				lastColumnX = compData.x;
+
+				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					compData.x += compData.scaleXStep;
+					maskbit = revBitMask(compData.x & 7);
+					compData.destPtr += compData.scaleXStep;
+				}
+
+				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+
+				dst = compData.destPtr;
+				mask = compData.maskPtr + compData.x / 8;
+			}
+		StartPos:;
+		} while (len > 0);
+	} while (true);
+}
+
+void ByleRLEDecode_Scaled_Mode0_SMask(
+	BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const ShadowMode shadowMode,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette) {
+
+	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+
+	const byte *src = _srcPtr;
+	byte *dst = compData.destPtr;
+
+	byte len = compData.repLen;
+	uint16 color = compData.repColor;
+
+	// reset every column
+	int lastColumnX = -1;
+	int y = compData.y;
+	uint16 height = _height;
+	int scaleIndexY = compData.scaleYIndex;
+
+	byte maskbit = revBitMask(compData.x & 7);
+	const byte *mask = compData.maskPtr + compData.x / 8;
+
+	byte batch;
+	if (len) {
+		--len;
+		goto StartPos;
+	}
+
+	do {
+		len = *src++;
+		color = len >> compData.shr;
+		len &= compData.mask;
+		if (!len)
+			len = *src++;
+
+		do {
+			batch = height < len ? (byte)height : len;
+			len -= batch;
+			height -= batch;
+
+			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
+
+			do {
+				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+					if (color) {
+						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+							|| (*mask & maskbit);
+
+						if (!masked) {
+							uint16 pcolor;
+
+							switch(shadowMode) {
+							case ShadowMode::Mode0:
+								*dst = _palette[color];
+								break;
+
+							case ShadowMode::Classic:
+								if (lastColumnX != compData.x)
+									*dst = _shadowTable[*dst];
+								break;
+
+							case ShadowMode::Mode1:
+								pcolor = _palette[color];
+								if (pcolor == 13 && _shadowTable) {
+									if (lastColumnX != compData.x)
+										*dst = _shadowTable[*dst];
+								} else {
+									*dst = pcolor;
+								}
+								break;
+
+							case ShadowMode::Mode3:
+								pcolor = _palette[color];
+								if (pcolor < 8) {
+									if (lastColumnX != compData.x) {
+										pcolor = (pcolor << 8) + *dst;
+										*dst = _shadowTable[pcolor];
+									}
+								} else {
+									*dst = pcolor;
+								}
+								break;
+							}
+						}
+					}
+					dst += pitch;
+					mask += _numStrips;
+					y++;
+				}
+			} while (--batch);
+
+			if (height == 0) {
+				if (--compData.skipWidth == 0)
+					return;
+				height = _height;
+				y = compData.y;
+
+				scaleIndexY = compData.scaleYIndex;
+				lastColumnX = compData.x;
+
+				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					compData.x += compData.scaleXStep;
+					maskbit = revBitMask(compData.x & 7);
+					compData.destPtr += compData.scaleXStep;
+				}
+
+				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+
+				dst = compData.destPtr;
+				mask = compData.maskPtr + compData.x / 8;
+			}
+		StartPos:;
+		} while (len > 0);
+	} while (true);
+}
+
+void ByleRLEDecode_Scaled_Mode1(
+	BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const ShadowMode shadowMode,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette) {
+
+	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+
+	const byte *src = _srcPtr;
+	byte *dst = compData.destPtr;
+
+	byte len = compData.repLen;
+	uint16 color = compData.repColor;
+
+	// reset every column
+	int lastColumnX = -1;
+	int y = compData.y;
+	uint16 height = _height;
+	int scaleIndexY = compData.scaleYIndex;
+
+	byte maskbit = revBitMask(compData.x & 7);
+	const byte *mask = compData.maskPtr + compData.x / 8;
+
+	byte batch;
+	if (len) {
+		--len;
+		goto StartPos;
+	}
+
+	do {
+		len = *src++;
+		color = len >> compData.shr;
+		len &= compData.mask;
+		if (!len)
+			len = *src++;
+
+		do {
+			batch = height < len ? (byte)height : len;
+			len -= batch;
+			height -= batch;
+
+			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
+
+			do {
+				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+					if (color) {
+						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+							|| (*mask & maskbit);
+
+						if (!masked) {
+							uint16 pcolor;
+
+							switch(shadowMode) {
+							case ShadowMode::Mode0:
+								*dst = _palette[color];
+								break;
+
+							case ShadowMode::Classic:
+								if (lastColumnX != compData.x)
+									*dst = _shadowTable[*dst];
+								break;
+
+							case ShadowMode::Mode1:
+								pcolor = _palette[color];
+								if (pcolor == 13 && _shadowTable) {
+									if (lastColumnX != compData.x)
+										*dst = _shadowTable[*dst];
+								} else {
+									*dst = pcolor;
+								}
+								break;
+
+							case ShadowMode::Mode3:
+								pcolor = _palette[color];
+								if (pcolor < 8) {
+									if (lastColumnX != compData.x) {
+										pcolor = (pcolor << 8) + *dst;
+										*dst = _shadowTable[pcolor];
+									}
+								} else {
+									*dst = pcolor;
+								}
+								break;
+							}
+						}
+					}
+					dst += pitch;
+					mask += _numStrips;
+					y++;
+				}
+			} while (--batch);
+
+			if (height == 0) {
+				if (--compData.skipWidth == 0)
+					return;
+				height = _height;
+				y = compData.y;
+
+				scaleIndexY = compData.scaleYIndex;
+				lastColumnX = compData.x;
+
+				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					compData.x += compData.scaleXStep;
+					maskbit = revBitMask(compData.x & 7);
+					compData.destPtr += compData.scaleXStep;
+				}
+
+				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+
+				dst = compData.destPtr;
+				mask = compData.maskPtr + compData.x / 8;
+			}
+		StartPos:;
+		} while (len > 0);
+	} while (true);
+}
+
+void ByleRLEDecode_Scaled_Mode1_SMask(
+	BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const ShadowMode shadowMode,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette) {
+
+	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+
+	const byte *src = _srcPtr;
+	byte *dst = compData.destPtr;
+
+	byte len = compData.repLen;
+	uint16 color = compData.repColor;
+
+	// reset every column
+	int lastColumnX = -1;
+	int y = compData.y;
+	uint16 height = _height;
+	int scaleIndexY = compData.scaleYIndex;
+
+	byte maskbit = revBitMask(compData.x & 7);
+	const byte *mask = compData.maskPtr + compData.x / 8;
+
+	byte batch;
+	if (len) {
+		--len;
+		goto StartPos;
+	}
+
+	do {
+		len = *src++;
+		color = len >> compData.shr;
+		len &= compData.mask;
+		if (!len)
+			len = *src++;
+
+		do {
+			batch = height < len ? (byte)height : len;
+			len -= batch;
+			height -= batch;
+
+			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
+
+			do {
+				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+					if (color) {
+						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+							|| (*mask & maskbit);
+
+						if (!masked) {
+							uint16 pcolor;
+
+							switch(shadowMode) {
+							case ShadowMode::Mode0:
+								*dst = _palette[color];
+								break;
+
+							case ShadowMode::Classic:
+								if (lastColumnX != compData.x)
+									*dst = _shadowTable[*dst];
+								break;
+
+							case ShadowMode::Mode1:
+								pcolor = _palette[color];
+								if (pcolor == 13 && _shadowTable) {
+									if (lastColumnX != compData.x)
+										*dst = _shadowTable[*dst];
+								} else {
+									*dst = pcolor;
+								}
+								break;
+
+							case ShadowMode::Mode3:
+								pcolor = _palette[color];
+								if (pcolor < 8) {
+									if (lastColumnX != compData.x) {
+										pcolor = (pcolor << 8) + *dst;
+										*dst = _shadowTable[pcolor];
+									}
+								} else {
+									*dst = pcolor;
+								}
+								break;
+							}
+						}
+					}
+					dst += pitch;
+					mask += _numStrips;
+					y++;
+				}
+			} while (--batch);
+
+			if (height == 0) {
+				if (--compData.skipWidth == 0)
+					return;
+				height = _height;
+				y = compData.y;
+
+				scaleIndexY = compData.scaleYIndex;
+				lastColumnX = compData.x;
+
+				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					compData.x += compData.scaleXStep;
+					maskbit = revBitMask(compData.x & 7);
+					compData.destPtr += compData.scaleXStep;
+				}
+
+				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+
+				dst = compData.destPtr;
+				mask = compData.maskPtr + compData.x / 8;
+			}
+		StartPos:;
+		} while (len > 0);
+	} while (true);
+}
+
+void ByleRLEDecode_Scaled_Mode3(
+	BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const ShadowMode shadowMode,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette) {
+
+	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+
+	const byte *src = _srcPtr;
+	byte *dst = compData.destPtr;
+
+	byte len = compData.repLen;
+	uint16 color = compData.repColor;
+
+	// reset every column
+	int lastColumnX = -1;
+	int y = compData.y;
+	uint16 height = _height;
+	int scaleIndexY = compData.scaleYIndex;
+
+	byte maskbit = revBitMask(compData.x & 7);
+	const byte *mask = compData.maskPtr + compData.x / 8;
+
+	byte batch;
+	if (len) {
+		--len;
+		goto StartPos;
+	}
+
+	do {
+		len = *src++;
+		color = len >> compData.shr;
+		len &= compData.mask;
+		if (!len)
+			len = *src++;
+
+		do {
+			batch = height < len ? (byte)height : len;
+			len -= batch;
+			height -= batch;
+
+			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
+
+			do {
+				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+					if (color) {
+						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+							|| (*mask & maskbit);
+
+						if (!masked) {
+							uint16 pcolor;
+
+							switch(shadowMode) {
+							case ShadowMode::Mode0:
+								*dst = _palette[color];
+								break;
+
+							case ShadowMode::Classic:
+								if (lastColumnX != compData.x)
+									*dst = _shadowTable[*dst];
+								break;
+
+							case ShadowMode::Mode1:
+								pcolor = _palette[color];
+								if (pcolor == 13 && _shadowTable) {
+									if (lastColumnX != compData.x)
+										*dst = _shadowTable[*dst];
+								} else {
+									*dst = pcolor;
+								}
+								break;
+
+							case ShadowMode::Mode3:
+								pcolor = _palette[color];
+								if (pcolor < 8) {
+									if (lastColumnX != compData.x) {
+										pcolor = (pcolor << 8) + *dst;
+										*dst = _shadowTable[pcolor];
+									}
+								} else {
+									*dst = pcolor;
+								}
+								break;
+							}
+						}
+					}
+					dst += pitch;
+					mask += _numStrips;
+					y++;
+				}
+			} while (--batch);
+
+			if (height == 0) {
+				if (--compData.skipWidth == 0)
+					return;
+				height = _height;
+				y = compData.y;
+
+				scaleIndexY = compData.scaleYIndex;
+				lastColumnX = compData.x;
+
+				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					compData.x += compData.scaleXStep;
+					maskbit = revBitMask(compData.x & 7);
+					compData.destPtr += compData.scaleXStep;
+				}
+
+				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+
+				dst = compData.destPtr;
+				mask = compData.maskPtr + compData.x / 8;
+			}
+		StartPos:;
+		} while (len > 0);
+	} while (true);
+}
+
+void ByleRLEDecode_Scaled_Classic_SMask(
+	BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const ShadowMode shadowMode,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette) {
+
+	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+
 	const byte *src = _srcPtr;
 	byte *dst = compData.destPtr;
 
@@ -279,6 +1399,131 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
 
+	byte batch;
+	if (len) {
+		--len;
+		goto StartPos;
+	}
+
+	do {
+		len = *src++;
+		color = len >> compData.shr;
+		len &= compData.mask;
+		if (!len)
+			len = *src++;
+
+		do {
+			batch = height < len ? (byte)height : len;
+			len -= batch;
+			height -= batch;
+
+			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
+
+			do {
+				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+					if (color) {
+						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+							|| (*mask & maskbit);
+
+						if (!masked) {
+							uint16 pcolor;
+
+							switch(shadowMode) {
+							case ShadowMode::Mode0:
+								*dst = _palette[color];
+								break;
+
+							case ShadowMode::Classic:
+								if (lastColumnX != compData.x)
+									*dst = _shadowTable[*dst];
+								break;
+
+							case ShadowMode::Mode1:
+								pcolor = _palette[color];
+								if (pcolor == 13 && _shadowTable) {
+									if (lastColumnX != compData.x)
+										*dst = _shadowTable[*dst];
+								} else {
+									*dst = pcolor;
+								}
+								break;
+
+							case ShadowMode::Mode3:
+								pcolor = _palette[color];
+								if (pcolor < 8) {
+									if (lastColumnX != compData.x) {
+										pcolor = (pcolor << 8) + *dst;
+										*dst = _shadowTable[pcolor];
+									}
+								} else {
+									*dst = pcolor;
+								}
+								break;
+							}
+						}
+					}
+					dst += pitch;
+					mask += _numStrips;
+					y++;
+				}
+			} while (--batch);
+
+			if (height == 0) {
+				if (--compData.skipWidth == 0)
+					return;
+				height = _height;
+				y = compData.y;
+
+				scaleIndexY = compData.scaleYIndex;
+				lastColumnX = compData.x;
+
+				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					compData.x += compData.scaleXStep;
+					maskbit = revBitMask(compData.x & 7);
+					compData.destPtr += compData.scaleXStep;
+				}
+
+				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+
+				dst = compData.destPtr;
+				mask = compData.maskPtr + compData.x / 8;
+			}
+		StartPos:;
+		} while (len > 0);
+	} while (true);
+}
+
+typedef void (*ByleRLEDecodeFunc)(
+	BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const ShadowMode shadowMode,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette);
+
+static const ByleRLEDecodeFunc byleRLEDecodeNoScaleTable[4] = {
+	ByleRLEDecode_Mode0,    // 0: Mode0
+	ByleRLEDecode_Mode1,    // 1: Mode1
+	ByleRLEDecode_Mode3,    // 2: Mode3
+	ByleRLEDecode_Classic,  // 3: Classic
+};
+
+static const ByleRLEDecodeFunc byleRLEDecodeScaledTable[8] = {
+	ByleRLEDecode_Scaled_Mode0,         // 0: Mode0,   no scaleIndexMask
+	ByleRLEDecode_Scaled_Mode0_SMask,   // 1: Mode0,   scaleIndexMask
+	ByleRLEDecode_Scaled_Mode1,         // 2: Mode1,   no scaleIndexMask
+	ByleRLEDecode_Scaled_Mode1_SMask,   // 3: Mode1,   scaleIndexMask
+	ByleRLEDecode_Scaled_Mode3,         // 4: Mode3,   no scaleIndexMask
+	nullptr,                            // 5: Mode3,   scaleIndexMask (COMI's Mode3 always uses bigCostumeScaleTable)
+	nullptr,                            // 6: Classic, no scaleIndexMask (_shadowMode & 0x20 always uses smallCostumeScaleTable)
+	ByleRLEDecode_Scaled_Classic_SMask, // 7: Classic, scaleIndexMask
+};
+
+void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 	ShadowMode shadowMode = ShadowMode::Mode0;
 	if (!_akosRendering) {
 		if (_shadowMode & 0x20)
@@ -292,6 +1537,51 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 			shadowMode = ShadowMode::Mode3;
 	}
 
+	{
+		const int scaled = (_scaleX != 255 || _scaleY != 255);
+		const int useScaleIndexMask = compData.scaleIndexMask != -1;
+		if (!scaled)
+			byleRLEDecodeNoScaleTable[static_cast<int>(shadowMode)](
+				&compData,
+				_scaleX,
+				_scaleY,
+				_height,
+				_out.pitch,
+				_numStrips,
+				shadowMode,
+				_srcPtr,
+				_shadowTable,
+				_palette);
+		else
+			byleRLEDecodeScaledTable[(static_cast<int>(shadowMode) << 1) | useScaleIndexMask](
+				&compData,
+				_scaleX,
+				_scaleY,
+				_height,
+				_out.pitch,
+				_numStrips,
+				shadowMode,
+				_srcPtr,
+				_shadowTable,
+				_palette);
+		return;
+	}
+
+	const byte *src = _srcPtr;
+	byte *dst = compData.destPtr;
+
+	byte len = compData.repLen;
+	uint16 color = compData.repColor;
+
+	// reset every column
+	int lastColumnX = -1;
+	int y = compData.y;
+	uint16 height = _height;
+	int scaleIndexY = compData.scaleYIndex;
+
+	byte maskbit = revBitMask(compData.x & 7);
+	const byte *mask = compData.maskPtr + compData.x / 8;
+
 	byte batch;
 	if (len) {
 		--len;


Commit: 27a71d8195129e7165745aece8f0f2f49593b990
    https://github.com/scummvm/scummvm/commit/27a71d8195129e7165745aece8f0f2f49593b990
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: byleRLEDecodeFast: Remove dead code

Changed paths:
    engines/scumm/base-costume-optimised.cpp


diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index 3317bdea9ce..baa515cb3e9 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -265,14 +265,13 @@ enum class ShadowMode : int {
 
 void ByleRLEDecode_Mode0(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
-	const byte _scaleX,
-	const byte _scaleY,
+	const byte _scaleX, /* unused */
+	const byte _scaleY, /* unused */
 	const int _height,
 	const int pitch,
 	const int _numStrips,
-	const ShadowMode shadowMode,
 	const byte *_srcPtr,
-	const byte *_shadowTable,
+	const byte *_shadowTable, /* unused */
 	const uint16 *_palette) {
 
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
@@ -284,10 +283,8 @@ void ByleRLEDecode_Mode0(
 	uint16 color = compData.repColor;
 
 	// reset every column
-	int lastColumnX = -1;
 	int y = compData.y;
 	uint16 height = _height;
-	int scaleIndexY = compData.scaleYIndex;
 
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
@@ -313,52 +310,17 @@ void ByleRLEDecode_Mode0(
 			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
 
 			do {
-				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
-					if (color) {
-						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-							|| (*mask & maskbit);
-
-						if (!masked) {
-							uint16 pcolor;
-
-							switch(shadowMode) {
-							case ShadowMode::Mode0:
-								*dst = _palette[color];
-								break;
-
-							case ShadowMode::Classic:
-								if (lastColumnX != compData.x)
-									*dst = _shadowTable[*dst];
-								break;
+				if (color) {
+					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+					|| (*mask & maskbit);
 
-							case ShadowMode::Mode1:
-								pcolor = _palette[color];
-								if (pcolor == 13 && _shadowTable) {
-									if (lastColumnX != compData.x)
-										*dst = _shadowTable[*dst];
-								} else {
-									*dst = pcolor;
-								}
-								break;
-
-							case ShadowMode::Mode3:
-								pcolor = _palette[color];
-								if (pcolor < 8) {
-									if (lastColumnX != compData.x) {
-										pcolor = (pcolor << 8) + *dst;
-										*dst = _shadowTable[pcolor];
-									}
-								} else {
-									*dst = pcolor;
-								}
-								break;
-							}
-						}
+					if (!masked) {
+						*dst = _palette[color];
 					}
-					dst += pitch;
-					mask += _numStrips;
-					y++;
 				}
+				dst += pitch;
+				mask += _numStrips;
+				y++;
 			} while (--batch);
 
 			if (height == 0) {
@@ -367,16 +329,9 @@ void ByleRLEDecode_Mode0(
 				height = _height;
 				y = compData.y;
 
-				scaleIndexY = compData.scaleYIndex;
-				lastColumnX = compData.x;
-
-				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
-					compData.x += compData.scaleXStep;
-					maskbit = revBitMask(compData.x & 7);
-					compData.destPtr += compData.scaleXStep;
-				}
-
-				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+				compData.x += compData.scaleXStep;
+				maskbit = revBitMask(compData.x & 7);
+				compData.destPtr += compData.scaleXStep;
 
 				dst = compData.destPtr;
 				mask = compData.maskPtr + compData.x / 8;
@@ -388,12 +343,11 @@ void ByleRLEDecode_Mode0(
 
 void ByleRLEDecode_Mode1(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
-	const byte _scaleX,
-	const byte _scaleY,
+	const byte _scaleX, /* unused */
+	const byte _scaleY, /* unused */
 	const int _height,
 	const int pitch,
 	const int _numStrips,
-	const ShadowMode shadowMode,
 	const byte *_srcPtr,
 	const byte *_shadowTable,
 	const uint16 *_palette) {
@@ -410,7 +364,6 @@ void ByleRLEDecode_Mode1(
 	int lastColumnX = -1;
 	int y = compData.y;
 	uint16 height = _height;
-	int scaleIndexY = compData.scaleYIndex;
 
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
@@ -436,52 +389,25 @@ void ByleRLEDecode_Mode1(
 			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
 
 			do {
-				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
-					if (color) {
-						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-							|| (*mask & maskbit);
-
-						if (!masked) {
-							uint16 pcolor;
-
-							switch(shadowMode) {
-							case ShadowMode::Mode0:
-								*dst = _palette[color];
-								break;
-
-							case ShadowMode::Classic:
-								if (lastColumnX != compData.x)
-									*dst = _shadowTable[*dst];
-								break;
-
-							case ShadowMode::Mode1:
-								pcolor = _palette[color];
-								if (pcolor == 13 && _shadowTable) {
-									if (lastColumnX != compData.x)
-										*dst = _shadowTable[*dst];
-								} else {
-									*dst = pcolor;
-								}
-								break;
-
-							case ShadowMode::Mode3:
-								pcolor = _palette[color];
-								if (pcolor < 8) {
-									if (lastColumnX != compData.x) {
-										pcolor = (pcolor << 8) + *dst;
-										*dst = _shadowTable[pcolor];
-									}
-								} else {
-									*dst = pcolor;
-								}
-								break;
-							}
+				if (color) {
+					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+					|| (*mask & maskbit);
+
+					if (!masked) {
+						uint16 pcolor;
+
+						pcolor = _palette[color];
+						if (pcolor == 13 && _shadowTable) {
+							if (lastColumnX != compData.x)
+								*dst = _shadowTable[*dst];
+						} else {
+							*dst = pcolor;
 						}
 					}
-					dst += pitch;
-					mask += _numStrips;
-					y++;
 				}
+				dst += pitch;
+				mask += _numStrips;
+				y++;
 			} while (--batch);
 
 			if (height == 0) {
@@ -490,16 +416,11 @@ void ByleRLEDecode_Mode1(
 				height = _height;
 				y = compData.y;
 
-				scaleIndexY = compData.scaleYIndex;
 				lastColumnX = compData.x;
 
-				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
-					compData.x += compData.scaleXStep;
-					maskbit = revBitMask(compData.x & 7);
-					compData.destPtr += compData.scaleXStep;
-				}
-
-				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+				compData.x += compData.scaleXStep;
+				maskbit = revBitMask(compData.x & 7);
+				compData.destPtr += compData.scaleXStep;
 
 				dst = compData.destPtr;
 				mask = compData.maskPtr + compData.x / 8;
@@ -511,12 +432,11 @@ void ByleRLEDecode_Mode1(
 
 void ByleRLEDecode_Mode3(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
-	const byte _scaleX,
-	const byte _scaleY,
+	const byte _scaleX, /* unused */
+	const byte _scaleY, /* unused */
 	const int _height,
 	const int pitch,
 	const int _numStrips,
-	const ShadowMode shadowMode,
 	const byte *_srcPtr,
 	const byte *_shadowTable,
 	const uint16 *_palette) {
@@ -533,7 +453,6 @@ void ByleRLEDecode_Mode3(
 	int lastColumnX = -1;
 	int y = compData.y;
 	uint16 height = _height;
-	int scaleIndexY = compData.scaleYIndex;
 
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
@@ -559,52 +478,27 @@ void ByleRLEDecode_Mode3(
 			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
 
 			do {
-				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
-					if (color) {
-						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-							|| (*mask & maskbit);
-
-						if (!masked) {
-							uint16 pcolor;
-
-							switch(shadowMode) {
-							case ShadowMode::Mode0:
-								*dst = _palette[color];
-								break;
-
-							case ShadowMode::Classic:
-								if (lastColumnX != compData.x)
-									*dst = _shadowTable[*dst];
-								break;
-
-							case ShadowMode::Mode1:
-								pcolor = _palette[color];
-								if (pcolor == 13 && _shadowTable) {
-									if (lastColumnX != compData.x)
-										*dst = _shadowTable[*dst];
-								} else {
-									*dst = pcolor;
-								}
-								break;
-
-							case ShadowMode::Mode3:
-								pcolor = _palette[color];
-								if (pcolor < 8) {
-									if (lastColumnX != compData.x) {
-										pcolor = (pcolor << 8) + *dst;
-										*dst = _shadowTable[pcolor];
-									}
-								} else {
-									*dst = pcolor;
-								}
-								break;
+				if (color) {
+					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+					|| (*mask & maskbit);
+
+					if (!masked) {
+						uint16 pcolor;
+
+						pcolor = _palette[color];
+						if (pcolor < 8) {
+							if (lastColumnX != compData.x) {
+								pcolor = (pcolor << 8) + *dst;
+								*dst = _shadowTable[pcolor];
 							}
+						} else {
+							*dst = pcolor;
 						}
 					}
-					dst += pitch;
-					mask += _numStrips;
-					y++;
 				}
+				dst += pitch;
+				mask += _numStrips;
+				y++;
 			} while (--batch);
 
 			if (height == 0) {
@@ -613,16 +507,11 @@ void ByleRLEDecode_Mode3(
 				height = _height;
 				y = compData.y;
 
-				scaleIndexY = compData.scaleYIndex;
 				lastColumnX = compData.x;
 
-				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
-					compData.x += compData.scaleXStep;
-					maskbit = revBitMask(compData.x & 7);
-					compData.destPtr += compData.scaleXStep;
-				}
-
-				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+				compData.x += compData.scaleXStep;
+				maskbit = revBitMask(compData.x & 7);
+				compData.destPtr += compData.scaleXStep;
 
 				dst = compData.destPtr;
 				mask = compData.maskPtr + compData.x / 8;
@@ -634,15 +523,14 @@ void ByleRLEDecode_Mode3(
 
 void ByleRLEDecode_Classic(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
-	const byte _scaleX,
-	const byte _scaleY,
+	const byte _scaleX, /* unused */
+	const byte _scaleY, /* unused */
 	const int _height,
 	const int pitch,
 	const int _numStrips,
-	const ShadowMode shadowMode,
 	const byte *_srcPtr,
 	const byte *_shadowTable,
-	const uint16 *_palette) {
+	const uint16 *_palette /* unused */) {
 
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
 
@@ -656,7 +544,6 @@ void ByleRLEDecode_Classic(
 	int lastColumnX = -1;
 	int y = compData.y;
 	uint16 height = _height;
-	int scaleIndexY = compData.scaleYIndex;
 
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
@@ -682,52 +569,18 @@ void ByleRLEDecode_Classic(
 			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
 
 			do {
-				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
-					if (color) {
-						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-							|| (*mask & maskbit);
-
-						if (!masked) {
-							uint16 pcolor;
-
-							switch(shadowMode) {
-							case ShadowMode::Mode0:
-								*dst = _palette[color];
-								break;
-
-							case ShadowMode::Classic:
-								if (lastColumnX != compData.x)
-									*dst = _shadowTable[*dst];
-								break;
-
-							case ShadowMode::Mode1:
-								pcolor = _palette[color];
-								if (pcolor == 13 && _shadowTable) {
-									if (lastColumnX != compData.x)
-										*dst = _shadowTable[*dst];
-								} else {
-									*dst = pcolor;
-								}
-								break;
+				if (color) {
+					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
+					|| (*mask & maskbit);
 
-							case ShadowMode::Mode3:
-								pcolor = _palette[color];
-								if (pcolor < 8) {
-									if (lastColumnX != compData.x) {
-										pcolor = (pcolor << 8) + *dst;
-										*dst = _shadowTable[pcolor];
-									}
-								} else {
-									*dst = pcolor;
-								}
-								break;
-							}
-						}
+					if (!masked) {
+						if (lastColumnX != compData.x)
+							*dst = _shadowTable[*dst];
 					}
-					dst += pitch;
-					mask += _numStrips;
-					y++;
 				}
+				dst += pitch;
+				mask += _numStrips;
+				y++;
 			} while (--batch);
 
 			if (height == 0) {
@@ -736,16 +589,11 @@ void ByleRLEDecode_Classic(
 				height = _height;
 				y = compData.y;
 
-				scaleIndexY = compData.scaleYIndex;
 				lastColumnX = compData.x;
 
-				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
-					compData.x += compData.scaleXStep;
-					maskbit = revBitMask(compData.x & 7);
-					compData.destPtr += compData.scaleXStep;
-				}
-
-				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+				compData.x += compData.scaleXStep;
+				maskbit = revBitMask(compData.x & 7);
+				compData.destPtr += compData.scaleXStep;
 
 				dst = compData.destPtr;
 				mask = compData.maskPtr + compData.x / 8;
@@ -762,9 +610,8 @@ void ByleRLEDecode_Scaled_Mode0(
 	const int _height,
 	const int pitch,
 	const int _numStrips,
-	const ShadowMode shadowMode,
 	const byte *_srcPtr,
-	const byte *_shadowTable,
+	const byte *_shadowTable, /* unused */
 	const uint16 *_palette) {
 
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
@@ -776,7 +623,6 @@ void ByleRLEDecode_Scaled_Mode0(
 	uint16 color = compData.repColor;
 
 	// reset every column
-	int lastColumnX = -1;
 	int y = compData.y;
 	uint16 height = _height;
 	int scaleIndexY = compData.scaleYIndex;
@@ -805,46 +651,13 @@ void ByleRLEDecode_Scaled_Mode0(
 			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
 
 			do {
-				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+				if (compData.scaleTable[scaleIndexY++] < _scaleY) {
 					if (color) {
 						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-							|| (*mask & maskbit);
+						|| (*mask & maskbit);
 
 						if (!masked) {
-							uint16 pcolor;
-
-							switch(shadowMode) {
-							case ShadowMode::Mode0:
-								*dst = _palette[color];
-								break;
-
-							case ShadowMode::Classic:
-								if (lastColumnX != compData.x)
-									*dst = _shadowTable[*dst];
-								break;
-
-							case ShadowMode::Mode1:
-								pcolor = _palette[color];
-								if (pcolor == 13 && _shadowTable) {
-									if (lastColumnX != compData.x)
-										*dst = _shadowTable[*dst];
-								} else {
-									*dst = pcolor;
-								}
-								break;
-
-							case ShadowMode::Mode3:
-								pcolor = _palette[color];
-								if (pcolor < 8) {
-									if (lastColumnX != compData.x) {
-										pcolor = (pcolor << 8) + *dst;
-										*dst = _shadowTable[pcolor];
-									}
-								} else {
-									*dst = pcolor;
-								}
-								break;
-							}
+							*dst = _palette[color];
 						}
 					}
 					dst += pitch;
@@ -860,15 +673,14 @@ void ByleRLEDecode_Scaled_Mode0(
 				y = compData.y;
 
 				scaleIndexY = compData.scaleYIndex;
-				lastColumnX = compData.x;
 
-				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+				if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
 					compData.x += compData.scaleXStep;
 					maskbit = revBitMask(compData.x & 7);
 					compData.destPtr += compData.scaleXStep;
 				}
 
-				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+				compData.scaleXIndex += compData.scaleXStep;
 
 				dst = compData.destPtr;
 				mask = compData.maskPtr + compData.x / 8;
@@ -885,9 +697,8 @@ void ByleRLEDecode_Scaled_Mode0_SMask(
 	const int _height,
 	const int pitch,
 	const int _numStrips,
-	const ShadowMode shadowMode,
 	const byte *_srcPtr,
-	const byte *_shadowTable,
+	const byte *_shadowTable, /* unused */
 	const uint16 *_palette) {
 
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
@@ -899,7 +710,6 @@ void ByleRLEDecode_Scaled_Mode0_SMask(
 	uint16 color = compData.repColor;
 
 	// reset every column
-	int lastColumnX = -1;
 	int y = compData.y;
 	uint16 height = _height;
 	int scaleIndexY = compData.scaleYIndex;
@@ -928,46 +738,13 @@ void ByleRLEDecode_Scaled_Mode0_SMask(
 			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
 
 			do {
-				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+				if (compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
 					if (color) {
 						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-							|| (*mask & maskbit);
+						|| (*mask & maskbit);
 
 						if (!masked) {
-							uint16 pcolor;
-
-							switch(shadowMode) {
-							case ShadowMode::Mode0:
-								*dst = _palette[color];
-								break;
-
-							case ShadowMode::Classic:
-								if (lastColumnX != compData.x)
-									*dst = _shadowTable[*dst];
-								break;
-
-							case ShadowMode::Mode1:
-								pcolor = _palette[color];
-								if (pcolor == 13 && _shadowTable) {
-									if (lastColumnX != compData.x)
-										*dst = _shadowTable[*dst];
-								} else {
-									*dst = pcolor;
-								}
-								break;
-
-							case ShadowMode::Mode3:
-								pcolor = _palette[color];
-								if (pcolor < 8) {
-									if (lastColumnX != compData.x) {
-										pcolor = (pcolor << 8) + *dst;
-										*dst = _shadowTable[pcolor];
-									}
-								} else {
-									*dst = pcolor;
-								}
-								break;
-							}
+							*dst = _palette[color];
 						}
 					}
 					dst += pitch;
@@ -983,9 +760,8 @@ void ByleRLEDecode_Scaled_Mode0_SMask(
 				y = compData.y;
 
 				scaleIndexY = compData.scaleYIndex;
-				lastColumnX = compData.x;
 
-				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+				if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
 					compData.x += compData.scaleXStep;
 					maskbit = revBitMask(compData.x & 7);
 					compData.destPtr += compData.scaleXStep;
@@ -1008,7 +784,6 @@ void ByleRLEDecode_Scaled_Mode1(
 	const int _height,
 	const int pitch,
 	const int _numStrips,
-	const ShadowMode shadowMode,
 	const byte *_srcPtr,
 	const byte *_shadowTable,
 	const uint16 *_palette) {
@@ -1051,45 +826,20 @@ void ByleRLEDecode_Scaled_Mode1(
 			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
 
 			do {
-				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+				if (compData.scaleTable[scaleIndexY++] < _scaleY) {
 					if (color) {
 						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-							|| (*mask & maskbit);
+						|| (*mask & maskbit);
 
 						if (!masked) {
 							uint16 pcolor;
 
-							switch(shadowMode) {
-							case ShadowMode::Mode0:
-								*dst = _palette[color];
-								break;
-
-							case ShadowMode::Classic:
+							pcolor = _palette[color];
+							if (pcolor == 13 && _shadowTable) {
 								if (lastColumnX != compData.x)
 									*dst = _shadowTable[*dst];
-								break;
-
-							case ShadowMode::Mode1:
-								pcolor = _palette[color];
-								if (pcolor == 13 && _shadowTable) {
-									if (lastColumnX != compData.x)
-										*dst = _shadowTable[*dst];
-								} else {
-									*dst = pcolor;
-								}
-								break;
-
-							case ShadowMode::Mode3:
-								pcolor = _palette[color];
-								if (pcolor < 8) {
-									if (lastColumnX != compData.x) {
-										pcolor = (pcolor << 8) + *dst;
-										*dst = _shadowTable[pcolor];
-									}
-								} else {
-									*dst = pcolor;
-								}
-								break;
+							} else {
+								*dst = pcolor;
 							}
 						}
 					}
@@ -1108,13 +858,13 @@ void ByleRLEDecode_Scaled_Mode1(
 				scaleIndexY = compData.scaleYIndex;
 				lastColumnX = compData.x;
 
-				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+				if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
 					compData.x += compData.scaleXStep;
 					maskbit = revBitMask(compData.x & 7);
 					compData.destPtr += compData.scaleXStep;
 				}
 
-				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+				compData.scaleXIndex += compData.scaleXStep;
 
 				dst = compData.destPtr;
 				mask = compData.maskPtr + compData.x / 8;
@@ -1131,7 +881,6 @@ void ByleRLEDecode_Scaled_Mode1_SMask(
 	const int _height,
 	const int pitch,
 	const int _numStrips,
-	const ShadowMode shadowMode,
 	const byte *_srcPtr,
 	const byte *_shadowTable,
 	const uint16 *_palette) {
@@ -1174,45 +923,20 @@ void ByleRLEDecode_Scaled_Mode1_SMask(
 			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
 
 			do {
-				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+				if (compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
 					if (color) {
 						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-							|| (*mask & maskbit);
+						|| (*mask & maskbit);
 
 						if (!masked) {
 							uint16 pcolor;
 
-							switch(shadowMode) {
-							case ShadowMode::Mode0:
-								*dst = _palette[color];
-								break;
-
-							case ShadowMode::Classic:
+							pcolor = _palette[color];
+							if (pcolor == 13 && _shadowTable) {
 								if (lastColumnX != compData.x)
 									*dst = _shadowTable[*dst];
-								break;
-
-							case ShadowMode::Mode1:
-								pcolor = _palette[color];
-								if (pcolor == 13 && _shadowTable) {
-									if (lastColumnX != compData.x)
-										*dst = _shadowTable[*dst];
-								} else {
-									*dst = pcolor;
-								}
-								break;
-
-							case ShadowMode::Mode3:
-								pcolor = _palette[color];
-								if (pcolor < 8) {
-									if (lastColumnX != compData.x) {
-										pcolor = (pcolor << 8) + *dst;
-										*dst = _shadowTable[pcolor];
-									}
-								} else {
-									*dst = pcolor;
-								}
-								break;
+							} else {
+								*dst = pcolor;
 							}
 						}
 					}
@@ -1231,7 +955,7 @@ void ByleRLEDecode_Scaled_Mode1_SMask(
 				scaleIndexY = compData.scaleYIndex;
 				lastColumnX = compData.x;
 
-				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+				if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
 					compData.x += compData.scaleXStep;
 					maskbit = revBitMask(compData.x & 7);
 					compData.destPtr += compData.scaleXStep;
@@ -1254,7 +978,6 @@ void ByleRLEDecode_Scaled_Mode3(
 	const int _height,
 	const int pitch,
 	const int _numStrips,
-	const ShadowMode shadowMode,
 	const byte *_srcPtr,
 	const byte *_shadowTable,
 	const uint16 *_palette) {
@@ -1297,45 +1020,22 @@ void ByleRLEDecode_Scaled_Mode3(
 			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
 
 			do {
-				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+				if (compData.scaleTable[scaleIndexY++] < _scaleY) {
 					if (color) {
 						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-							|| (*mask & maskbit);
+						|| (*mask & maskbit);
 
 						if (!masked) {
 							uint16 pcolor;
 
-							switch(shadowMode) {
-							case ShadowMode::Mode0:
-								*dst = _palette[color];
-								break;
-
-							case ShadowMode::Classic:
-								if (lastColumnX != compData.x)
-									*dst = _shadowTable[*dst];
-								break;
-
-							case ShadowMode::Mode1:
-								pcolor = _palette[color];
-								if (pcolor == 13 && _shadowTable) {
-									if (lastColumnX != compData.x)
-										*dst = _shadowTable[*dst];
-								} else {
-									*dst = pcolor;
-								}
-								break;
-
-							case ShadowMode::Mode3:
-								pcolor = _palette[color];
-								if (pcolor < 8) {
-									if (lastColumnX != compData.x) {
-										pcolor = (pcolor << 8) + *dst;
-										*dst = _shadowTable[pcolor];
-									}
-								} else {
-									*dst = pcolor;
+							pcolor = _palette[color];
+							if (pcolor < 8) {
+								if (lastColumnX != compData.x) {
+									pcolor = (pcolor << 8) + *dst;
+									*dst = _shadowTable[pcolor];
 								}
-								break;
+							} else {
+								*dst = pcolor;
 							}
 						}
 					}
@@ -1354,13 +1054,13 @@ void ByleRLEDecode_Scaled_Mode3(
 				scaleIndexY = compData.scaleYIndex;
 				lastColumnX = compData.x;
 
-				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+				if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
 					compData.x += compData.scaleXStep;
 					maskbit = revBitMask(compData.x & 7);
 					compData.destPtr += compData.scaleXStep;
 				}
 
-				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+				compData.scaleXIndex += compData.scaleXStep;
 
 				dst = compData.destPtr;
 				mask = compData.maskPtr + compData.x / 8;
@@ -1377,10 +1077,9 @@ void ByleRLEDecode_Scaled_Classic_SMask(
 	const int _height,
 	const int pitch,
 	const int _numStrips,
-	const ShadowMode shadowMode,
 	const byte *_srcPtr,
 	const byte *_shadowTable,
-	const uint16 *_palette) {
+	const uint16 *_palette /* unused */) {
 
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
 
@@ -1420,46 +1119,14 @@ void ByleRLEDecode_Scaled_Classic_SMask(
 			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
 
 			do {
-				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+				if (compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
 					if (color) {
 						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-							|| (*mask & maskbit);
+						|| (*mask & maskbit);
 
 						if (!masked) {
-							uint16 pcolor;
-
-							switch(shadowMode) {
-							case ShadowMode::Mode0:
-								*dst = _palette[color];
-								break;
-
-							case ShadowMode::Classic:
-								if (lastColumnX != compData.x)
-									*dst = _shadowTable[*dst];
-								break;
-
-							case ShadowMode::Mode1:
-								pcolor = _palette[color];
-								if (pcolor == 13 && _shadowTable) {
-									if (lastColumnX != compData.x)
-										*dst = _shadowTable[*dst];
-								} else {
-									*dst = pcolor;
-								}
-								break;
-
-							case ShadowMode::Mode3:
-								pcolor = _palette[color];
-								if (pcolor < 8) {
-									if (lastColumnX != compData.x) {
-										pcolor = (pcolor << 8) + *dst;
-										*dst = _shadowTable[pcolor];
-									}
-								} else {
-									*dst = pcolor;
-								}
-								break;
-							}
+							if (lastColumnX != compData.x)
+								*dst = _shadowTable[*dst];
 						}
 					}
 					dst += pitch;
@@ -1477,7 +1144,7 @@ void ByleRLEDecode_Scaled_Classic_SMask(
 				scaleIndexY = compData.scaleYIndex;
 				lastColumnX = compData.x;
 
-				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+				if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
 					compData.x += compData.scaleXStep;
 					maskbit = revBitMask(compData.x & 7);
 					compData.destPtr += compData.scaleXStep;
@@ -1500,7 +1167,6 @@ typedef void (*ByleRLEDecodeFunc)(
 	const int _height,
 	const int pitch,
 	const int _numStrips,
-	const ShadowMode shadowMode,
 	const byte *_srcPtr,
 	const byte *_shadowTable,
 	const uint16 *_palette);
@@ -1548,7 +1214,6 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 				_height,
 				_out.pitch,
 				_numStrips,
-				shadowMode,
 				_srcPtr,
 				_shadowTable,
 				_palette);
@@ -1560,7 +1225,6 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 				_height,
 				_out.pitch,
 				_numStrips,
-				shadowMode,
 				_srcPtr,
 				_shadowTable,
 				_palette);


Commit: f0011e4729538e131eae6908f1f06e14079d371c
    https://github.com/scummvm/scummvm/commit/f0011e4729538e131eae6908f1f06e14079d371c
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: byleRLEDecodeFast: Optimise non-scaled modes

Changed paths:
    engines/scumm/base-costume-optimised.cpp


diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index baa515cb3e9..02c9a878c59 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -309,19 +309,23 @@ void ByleRLEDecode_Mode0(
 
 			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
 
-			do {
-				if (color) {
+			if (color) {
+				do {
 					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
 					|| (*mask & maskbit);
 
 					if (!masked) {
 						*dst = _palette[color];
 					}
-				}
-				dst += pitch;
-				mask += _numStrips;
-				y++;
-			} while (--batch);
+					dst += pitch;
+					mask += _numStrips;
+					y++;
+				} while (--batch);
+			} else {
+				dst += batch * pitch;
+				mask += batch * _numStrips;
+				y += batch;
+			}
 
 			if (height == 0) {
 				if (--compData.skipWidth == 0)
@@ -388,8 +392,8 @@ void ByleRLEDecode_Mode1(
 
 			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
 
-			do {
-				if (color) {
+			if (color) {
+				do {
 					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
 					|| (*mask & maskbit);
 
@@ -404,11 +408,15 @@ void ByleRLEDecode_Mode1(
 							*dst = pcolor;
 						}
 					}
-				}
-				dst += pitch;
-				mask += _numStrips;
-				y++;
-			} while (--batch);
+					dst += pitch;
+					mask += _numStrips;
+					y++;
+				} while (--batch);
+			} else {
+				dst += batch * pitch;
+				mask += batch * _numStrips;
+				y += batch;
+			}
 
 			if (height == 0) {
 				if (--compData.skipWidth == 0)
@@ -477,8 +485,8 @@ void ByleRLEDecode_Mode3(
 
 			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
 
-			do {
-				if (color) {
+			if (color) {
+				do {
 					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
 					|| (*mask & maskbit);
 
@@ -495,11 +503,15 @@ void ByleRLEDecode_Mode3(
 							*dst = pcolor;
 						}
 					}
-				}
-				dst += pitch;
-				mask += _numStrips;
-				y++;
-			} while (--batch);
+					dst += pitch;
+					mask += _numStrips;
+					y++;
+				} while (--batch);
+			} else {
+				dst += batch * pitch;
+				mask += batch * _numStrips;
+				y += batch;
+			}
 
 			if (height == 0) {
 				if (--compData.skipWidth == 0)
@@ -568,8 +580,8 @@ void ByleRLEDecode_Classic(
 
 			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
 
-			do {
-				if (color) {
+			if (color) {
+				do {
 					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
 					|| (*mask & maskbit);
 
@@ -577,11 +589,15 @@ void ByleRLEDecode_Classic(
 						if (lastColumnX != compData.x)
 							*dst = _shadowTable[*dst];
 					}
-				}
-				dst += pitch;
-				mask += _numStrips;
-				y++;
-			} while (--batch);
+					dst += pitch;
+					mask += _numStrips;
+					y++;
+				} while (--batch);
+			} else {
+				dst += batch * pitch;
+				mask += batch * _numStrips;
+				y += batch;
+			}
 
 			if (height == 0) {
 				if (--compData.skipWidth == 0)


Commit: ad29b12637a8bfa52453c9eac96dee665f1a3a3a
    https://github.com/scummvm/scummvm/commit/ad29b12637a8bfa52453c9eac96dee665f1a3a3a
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: byleRLEDecodeFast: Move Y clipping to caller

Changed paths:
    engines/scumm/base-costume-optimised.cpp


diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index 02c9a878c59..bf63751a23b 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -277,19 +277,17 @@ void ByleRLEDecode_Mode0(
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
 
 	const byte *src = _srcPtr;
-	byte *dst = compData.destPtr;
 
-	byte len = compData.repLen;
+	uint16 len = compData.repLen;
 	uint16 color = compData.repColor;
 
 	// reset every column
-	int y = compData.y;
+	byte *dst = compData.destPtr;
 	uint16 height = _height;
-
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
 
-	byte batch;
+	uint16 batch;
 	if (len) {
 		--len;
 		goto StartPos;
@@ -303,7 +301,7 @@ void ByleRLEDecode_Mode0(
 			len = *src++;
 
 		do {
-			batch = height < len ? (byte)height : len;
+			batch = height < len ? height : len;
 			len -= batch;
 			height -= batch;
 
@@ -311,27 +309,20 @@ void ByleRLEDecode_Mode0(
 
 			if (color) {
 				do {
-					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-					|| (*mask & maskbit);
-
-					if (!masked) {
+					if (!(*mask & maskbit))
 						*dst = _palette[color];
-					}
-					dst += pitch;
+					dst  += pitch;
 					mask += _numStrips;
-					y++;
 				} while (--batch);
 			} else {
-				dst += batch * pitch;
+				dst  += batch * pitch;
 				mask += batch * _numStrips;
-				y += batch;
 			}
 
 			if (height == 0) {
 				if (--compData.skipWidth == 0)
 					return;
 				height = _height;
-				y = compData.y;
 
 				compData.x += compData.scaleXStep;
 				maskbit = revBitMask(compData.x & 7);
@@ -359,20 +350,18 @@ void ByleRLEDecode_Mode1(
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
 
 	const byte *src = _srcPtr;
-	byte *dst = compData.destPtr;
 
-	byte len = compData.repLen;
+	uint16 len = compData.repLen;
 	uint16 color = compData.repColor;
 
 	// reset every column
+	byte *dst = compData.destPtr;
 	int lastColumnX = -1;
-	int y = compData.y;
 	uint16 height = _height;
-
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
 
-	byte batch;
+	uint16 batch;
 	if (len) {
 		--len;
 		goto StartPos;
@@ -386,7 +375,7 @@ void ByleRLEDecode_Mode1(
 			len = *src++;
 
 		do {
-			batch = height < len ? (byte)height : len;
+			batch = height < len ? height : len;
 			len -= batch;
 			height -= batch;
 
@@ -394,13 +383,8 @@ void ByleRLEDecode_Mode1(
 
 			if (color) {
 				do {
-					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-					|| (*mask & maskbit);
-
-					if (!masked) {
-						uint16 pcolor;
-
-						pcolor = _palette[color];
+					if (!(*mask & maskbit)) {
+						uint16 pcolor = _palette[color];
 						if (pcolor == 13 && _shadowTable) {
 							if (lastColumnX != compData.x)
 								*dst = _shadowTable[*dst];
@@ -408,21 +392,18 @@ void ByleRLEDecode_Mode1(
 							*dst = pcolor;
 						}
 					}
-					dst += pitch;
+					dst  += pitch;
 					mask += _numStrips;
-					y++;
 				} while (--batch);
 			} else {
-				dst += batch * pitch;
+				dst  += batch * pitch;
 				mask += batch * _numStrips;
-				y += batch;
 			}
 
 			if (height == 0) {
 				if (--compData.skipWidth == 0)
 					return;
 				height = _height;
-				y = compData.y;
 
 				lastColumnX = compData.x;
 
@@ -452,20 +433,18 @@ void ByleRLEDecode_Mode3(
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
 
 	const byte *src = _srcPtr;
-	byte *dst = compData.destPtr;
 
-	byte len = compData.repLen;
+	uint16 len = compData.repLen;
 	uint16 color = compData.repColor;
 
 	// reset every column
+	byte *dst = compData.destPtr;
 	int lastColumnX = -1;
-	int y = compData.y;
 	uint16 height = _height;
-
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
 
-	byte batch;
+	uint16 batch;
 	if (len) {
 		--len;
 		goto StartPos;
@@ -479,7 +458,7 @@ void ByleRLEDecode_Mode3(
 			len = *src++;
 
 		do {
-			batch = height < len ? (byte)height : len;
+			batch = height < len ? height : len;
 			len -= batch;
 			height -= batch;
 
@@ -487,13 +466,8 @@ void ByleRLEDecode_Mode3(
 
 			if (color) {
 				do {
-					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-					|| (*mask & maskbit);
-
-					if (!masked) {
-						uint16 pcolor;
-
-						pcolor = _palette[color];
+					if (!(*mask & maskbit)) {
+						uint16 pcolor = _palette[color];
 						if (pcolor < 8) {
 							if (lastColumnX != compData.x) {
 								pcolor = (pcolor << 8) + *dst;
@@ -503,21 +477,18 @@ void ByleRLEDecode_Mode3(
 							*dst = pcolor;
 						}
 					}
-					dst += pitch;
+					dst  += pitch;
 					mask += _numStrips;
-					y++;
 				} while (--batch);
 			} else {
-				dst += batch * pitch;
+				dst  += batch * pitch;
 				mask += batch * _numStrips;
-				y += batch;
 			}
 
 			if (height == 0) {
 				if (--compData.skipWidth == 0)
 					return;
 				height = _height;
-				y = compData.y;
 
 				lastColumnX = compData.x;
 
@@ -547,20 +518,18 @@ void ByleRLEDecode_Classic(
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
 
 	const byte *src = _srcPtr;
-	byte *dst = compData.destPtr;
 
-	byte len = compData.repLen;
+	uint16 len = compData.repLen;
 	uint16 color = compData.repColor;
 
 	// reset every column
+	byte *dst = compData.destPtr;
 	int lastColumnX = -1;
-	int y = compData.y;
 	uint16 height = _height;
-
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
 
-	byte batch;
+	uint16 batch;
 	if (len) {
 		--len;
 		goto StartPos;
@@ -574,7 +543,7 @@ void ByleRLEDecode_Classic(
 			len = *src++;
 
 		do {
-			batch = height < len ? (byte)height : len;
+			batch = height < len ? height : len;
 			len -= batch;
 			height -= batch;
 
@@ -582,28 +551,22 @@ void ByleRLEDecode_Classic(
 
 			if (color) {
 				do {
-					const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-					|| (*mask & maskbit);
-
-					if (!masked) {
+					if (!(*mask & maskbit)) {
 						if (lastColumnX != compData.x)
 							*dst = _shadowTable[*dst];
 					}
-					dst += pitch;
+					dst  += pitch;
 					mask += _numStrips;
-					y++;
 				} while (--batch);
 			} else {
-				dst += batch * pitch;
+				dst  += batch * pitch;
 				mask += batch * _numStrips;
-				y += batch;
 			}
 
 			if (height == 0) {
 				if (--compData.skipWidth == 0)
 					return;
 				height = _height;
-				y = compData.y;
 
 				lastColumnX = compData.x;
 
@@ -633,16 +596,14 @@ void ByleRLEDecode_Scaled_Mode0(
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
 
 	const byte *src = _srcPtr;
-	byte *dst = compData.destPtr;
 
 	byte len = compData.repLen;
 	uint16 color = compData.repColor;
 
 	// reset every column
-	int y = compData.y;
+	byte *dst = compData.destPtr;
 	uint16 height = _height;
 	int scaleIndexY = compData.scaleYIndex;
-
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
 
@@ -669,16 +630,11 @@ void ByleRLEDecode_Scaled_Mode0(
 			do {
 				if (compData.scaleTable[scaleIndexY++] < _scaleY) {
 					if (color) {
-						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-						|| (*mask & maskbit);
-
-						if (!masked) {
+						if (!(*mask & maskbit))
 							*dst = _palette[color];
-						}
 					}
 					dst += pitch;
 					mask += _numStrips;
-					y++;
 				}
 			} while (--batch);
 
@@ -686,7 +642,6 @@ void ByleRLEDecode_Scaled_Mode0(
 				if (--compData.skipWidth == 0)
 					return;
 				height = _height;
-				y = compData.y;
 
 				scaleIndexY = compData.scaleYIndex;
 
@@ -720,16 +675,14 @@ void ByleRLEDecode_Scaled_Mode0_SMask(
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
 
 	const byte *src = _srcPtr;
-	byte *dst = compData.destPtr;
 
 	byte len = compData.repLen;
 	uint16 color = compData.repColor;
 
 	// reset every column
-	int y = compData.y;
+	byte *dst = compData.destPtr;
 	uint16 height = _height;
 	int scaleIndexY = compData.scaleYIndex;
-
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
 
@@ -756,16 +709,11 @@ void ByleRLEDecode_Scaled_Mode0_SMask(
 			do {
 				if (compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
 					if (color) {
-						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-						|| (*mask & maskbit);
-
-						if (!masked) {
+						if (!(*mask & maskbit))
 							*dst = _palette[color];
-						}
 					}
 					dst += pitch;
 					mask += _numStrips;
-					y++;
 				}
 			} while (--batch);
 
@@ -773,7 +721,6 @@ void ByleRLEDecode_Scaled_Mode0_SMask(
 				if (--compData.skipWidth == 0)
 					return;
 				height = _height;
-				y = compData.y;
 
 				scaleIndexY = compData.scaleYIndex;
 
@@ -807,17 +754,15 @@ void ByleRLEDecode_Scaled_Mode1(
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
 
 	const byte *src = _srcPtr;
-	byte *dst = compData.destPtr;
 
 	byte len = compData.repLen;
 	uint16 color = compData.repColor;
 
 	// reset every column
+	byte *dst = compData.destPtr;
 	int lastColumnX = -1;
-	int y = compData.y;
 	uint16 height = _height;
 	int scaleIndexY = compData.scaleYIndex;
-
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
 
@@ -844,10 +789,7 @@ void ByleRLEDecode_Scaled_Mode1(
 			do {
 				if (compData.scaleTable[scaleIndexY++] < _scaleY) {
 					if (color) {
-						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-						|| (*mask & maskbit);
-
-						if (!masked) {
+						if (!(*mask & maskbit)) {
 							uint16 pcolor;
 
 							pcolor = _palette[color];
@@ -861,7 +803,6 @@ void ByleRLEDecode_Scaled_Mode1(
 					}
 					dst += pitch;
 					mask += _numStrips;
-					y++;
 				}
 			} while (--batch);
 
@@ -869,7 +810,6 @@ void ByleRLEDecode_Scaled_Mode1(
 				if (--compData.skipWidth == 0)
 					return;
 				height = _height;
-				y = compData.y;
 
 				scaleIndexY = compData.scaleYIndex;
 				lastColumnX = compData.x;
@@ -904,17 +844,15 @@ void ByleRLEDecode_Scaled_Mode1_SMask(
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
 
 	const byte *src = _srcPtr;
-	byte *dst = compData.destPtr;
 
 	byte len = compData.repLen;
 	uint16 color = compData.repColor;
 
 	// reset every column
+	byte *dst = compData.destPtr;
 	int lastColumnX = -1;
-	int y = compData.y;
 	uint16 height = _height;
 	int scaleIndexY = compData.scaleYIndex;
-
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
 
@@ -941,10 +879,7 @@ void ByleRLEDecode_Scaled_Mode1_SMask(
 			do {
 				if (compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
 					if (color) {
-						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-						|| (*mask & maskbit);
-
-						if (!masked) {
+						if (!(*mask & maskbit)) {
 							uint16 pcolor;
 
 							pcolor = _palette[color];
@@ -958,7 +893,6 @@ void ByleRLEDecode_Scaled_Mode1_SMask(
 					}
 					dst += pitch;
 					mask += _numStrips;
-					y++;
 				}
 			} while (--batch);
 
@@ -966,7 +900,6 @@ void ByleRLEDecode_Scaled_Mode1_SMask(
 				if (--compData.skipWidth == 0)
 					return;
 				height = _height;
-				y = compData.y;
 
 				scaleIndexY = compData.scaleYIndex;
 				lastColumnX = compData.x;
@@ -1001,17 +934,15 @@ void ByleRLEDecode_Scaled_Mode3(
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
 
 	const byte *src = _srcPtr;
-	byte *dst = compData.destPtr;
 
 	byte len = compData.repLen;
 	uint16 color = compData.repColor;
 
 	// reset every column
+	byte *dst = compData.destPtr;
 	int lastColumnX = -1;
-	int y = compData.y;
 	uint16 height = _height;
 	int scaleIndexY = compData.scaleYIndex;
-
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
 
@@ -1038,10 +969,7 @@ void ByleRLEDecode_Scaled_Mode3(
 			do {
 				if (compData.scaleTable[scaleIndexY++] < _scaleY) {
 					if (color) {
-						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-						|| (*mask & maskbit);
-
-						if (!masked) {
+						if (!(*mask & maskbit)) {
 							uint16 pcolor;
 
 							pcolor = _palette[color];
@@ -1057,7 +985,6 @@ void ByleRLEDecode_Scaled_Mode3(
 					}
 					dst += pitch;
 					mask += _numStrips;
-					y++;
 				}
 			} while (--batch);
 
@@ -1065,7 +992,6 @@ void ByleRLEDecode_Scaled_Mode3(
 				if (--compData.skipWidth == 0)
 					return;
 				height = _height;
-				y = compData.y;
 
 				scaleIndexY = compData.scaleYIndex;
 				lastColumnX = compData.x;
@@ -1100,17 +1026,15 @@ void ByleRLEDecode_Scaled_Classic_SMask(
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
 
 	const byte *src = _srcPtr;
-	byte *dst = compData.destPtr;
 
 	byte len = compData.repLen;
 	uint16 color = compData.repColor;
 
 	// reset every column
+	byte *dst = compData.destPtr;
 	int lastColumnX = -1;
-	int y = compData.y;
 	uint16 height = _height;
 	int scaleIndexY = compData.scaleYIndex;
-
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
 
@@ -1137,17 +1061,13 @@ void ByleRLEDecode_Scaled_Classic_SMask(
 			do {
 				if (compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
 					if (color) {
-						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-						|| (*mask & maskbit);
-
-						if (!masked) {
+						if (!(*mask & maskbit)) {
 							if (lastColumnX != compData.x)
 								*dst = _shadowTable[*dst];
 						}
 					}
 					dst += pitch;
 					mask += _numStrips;
-					y++;
 				}
 			} while (--batch);
 
@@ -1155,7 +1075,6 @@ void ByleRLEDecode_Scaled_Classic_SMask(
 				if (--compData.skipWidth == 0)
 					return;
 				height = _height;
-				y = compData.y;
 
 				scaleIndexY = compData.scaleYIndex;
 				lastColumnX = compData.x;
@@ -1219,10 +1138,9 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 			shadowMode = ShadowMode::Mode3;
 	}
 
-	{
+	if (compData.y >= compData.boundsRect.top && compData.y + _height <= compData.boundsRect.bottom) {
 		const int scaled = (_scaleX != 255 || _scaleY != 255);
-		const int useScaleIndexMask = compData.scaleIndexMask != -1;
-		if (!scaled)
+		if (!scaled) {
 			byleRLEDecodeNoScaleTable[static_cast<int>(shadowMode)](
 				&compData,
 				_scaleX,
@@ -1233,7 +1151,9 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 				_srcPtr,
 				_shadowTable,
 				_palette);
-		else
+		} else {
+			const int useScaleIndexMask = compData.scaleIndexMask != -1;
+
 			byleRLEDecodeScaledTable[(static_cast<int>(shadowMode) << 1) | useScaleIndexMask](
 				&compData,
 				_scaleX,
@@ -1244,21 +1164,21 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 				_srcPtr,
 				_shadowTable,
 				_palette);
+		}
 		return;
 	}
 
 	const byte *src = _srcPtr;
-	byte *dst = compData.destPtr;
 
 	byte len = compData.repLen;
 	uint16 color = compData.repColor;
 
 	// reset every column
+	byte *dst = compData.destPtr;
 	int lastColumnX = -1;
 	int y = compData.y;
 	uint16 height = _height;
 	int scaleIndexY = compData.scaleYIndex;
-
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
 


Commit: 907150addb3dd917b5341e135f94d69b95fc7f05
    https://github.com/scummvm/scummvm/commit/907150addb3dd917b5341e135f94d69b95fc7f05
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: byleRLEDecodeFast: Optimise non-scaled modes even more

In non-scaled modes the next x always differs from the previous one, so
the lastColumnX checks are redundant. They should have been removed as
part of "SCUMM: byleRLEDecodeFast: Remove dead code".

Changed paths:
    engines/scumm/base-costume-optimised.cpp


diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index bf63751a23b..db99330e2ea 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -356,7 +356,6 @@ void ByleRLEDecode_Mode1(
 
 	// reset every column
 	byte *dst = compData.destPtr;
-	int lastColumnX = -1;
 	uint16 height = _height;
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
@@ -386,8 +385,7 @@ void ByleRLEDecode_Mode1(
 					if (!(*mask & maskbit)) {
 						uint16 pcolor = _palette[color];
 						if (pcolor == 13 && _shadowTable) {
-							if (lastColumnX != compData.x)
-								*dst = _shadowTable[*dst];
+							*dst = _shadowTable[*dst];
 						} else {
 							*dst = pcolor;
 						}
@@ -405,8 +403,6 @@ void ByleRLEDecode_Mode1(
 					return;
 				height = _height;
 
-				lastColumnX = compData.x;
-
 				compData.x += compData.scaleXStep;
 				maskbit = revBitMask(compData.x & 7);
 				compData.destPtr += compData.scaleXStep;
@@ -439,7 +435,6 @@ void ByleRLEDecode_Mode3(
 
 	// reset every column
 	byte *dst = compData.destPtr;
-	int lastColumnX = -1;
 	uint16 height = _height;
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
@@ -469,10 +464,8 @@ void ByleRLEDecode_Mode3(
 					if (!(*mask & maskbit)) {
 						uint16 pcolor = _palette[color];
 						if (pcolor < 8) {
-							if (lastColumnX != compData.x) {
-								pcolor = (pcolor << 8) + *dst;
-								*dst = _shadowTable[pcolor];
-							}
+							pcolor = (pcolor << 8) + *dst;
+							*dst = _shadowTable[pcolor];
 						} else {
 							*dst = pcolor;
 						}
@@ -490,8 +483,6 @@ void ByleRLEDecode_Mode3(
 					return;
 				height = _height;
 
-				lastColumnX = compData.x;
-
 				compData.x += compData.scaleXStep;
 				maskbit = revBitMask(compData.x & 7);
 				compData.destPtr += compData.scaleXStep;
@@ -524,7 +515,6 @@ void ByleRLEDecode_Classic(
 
 	// reset every column
 	byte *dst = compData.destPtr;
-	int lastColumnX = -1;
 	uint16 height = _height;
 	byte maskbit = revBitMask(compData.x & 7);
 	const byte *mask = compData.maskPtr + compData.x / 8;
@@ -551,10 +541,8 @@ void ByleRLEDecode_Classic(
 
 			if (color) {
 				do {
-					if (!(*mask & maskbit)) {
-						if (lastColumnX != compData.x)
-							*dst = _shadowTable[*dst];
-					}
+					if (!(*mask & maskbit))
+						*dst = _shadowTable[*dst];
 					dst  += pitch;
 					mask += _numStrips;
 				} while (--batch);
@@ -568,8 +556,6 @@ void ByleRLEDecode_Classic(
 					return;
 				height = _height;
 
-				lastColumnX = compData.x;
-
 				compData.x += compData.scaleXStep;
 				maskbit = revBitMask(compData.x & 7);
 				compData.destPtr += compData.scaleXStep;


Commit: 783339e94c29a9f48e7df50c7ad02ba11a61d1cf
    https://github.com/scummvm/scummvm/commit/783339e94c29a9f48e7df50c7ad02ba11a61d1cf
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: byleRLEDecodeFast: Proper Y clipping in scaled modes

This reduces 'false positives' where clipping functions would be called
instead of the faster ones.

Changed paths:
    engines/scumm/base-costume-optimised.cpp
    engines/scumm/base-costume.h


diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index db99330e2ea..18eb9c4370e 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -169,6 +169,7 @@ byte BaseCostumeRenderer::paintCelByleRLECommon(
 	compData.scaleYIndex = startScaleIndexY;
 	compData.skipWidth = _width;
 	compData.scaleXStep = _drawActorToRight ? 1 : -1;
+	compData.scaledHeight = rect.bottom - rect.top;
 
 	// All the important 'rect' values. scale sequence = 'compData.scaleTable[i] < _scaleX' result)
 	//
@@ -1124,7 +1125,7 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 			shadowMode = ShadowMode::Mode3;
 	}
 
-	if (compData.y >= compData.boundsRect.top && compData.y + _height <= compData.boundsRect.bottom) {
+	if (compData.y >= compData.boundsRect.top && compData.y + compData.scaledHeight <= compData.boundsRect.bottom) {
 		const int scaled = (_scaleX != 255 || _scaleY != 255);
 		if (!scaled) {
 			byleRLEDecodeNoScaleTable[static_cast<int>(shadowMode)](
diff --git a/engines/scumm/base-costume.h b/engines/scumm/base-costume.h
index 9ffa861c5af..1268a39a677 100644
--- a/engines/scumm/base-costume.h
+++ b/engines/scumm/base-costume.h
@@ -126,6 +126,9 @@ public:
 		Common::Rect boundsRect;
 		int scaleXIndex, scaleYIndex;
 		int scaleIndexMask;
+#ifdef SCUMM_OPTIMISED_CODE
+		int scaledHeight;
+#endif
 	};
 
     BaseCostumeRenderer(ScummEngine *scumm, bool akosRendering = false) {


Commit: 2390a071b95398d3ccd510c1cff18e67e7a593e4
    https://github.com/scummvm/scummvm/commit/2390a071b95398d3ccd510c1cff18e67e7a593e4
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: byleRLEDecodeFast: Fast path for Mode1

Mode1 with _shadowTable == NULL is equal to Mode0. Used in early SCUMM
games like Zak, Maniac or Loom. Reduces one if per pixel.

Changed paths:
    engines/scumm/base-costume-optimised.cpp


diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index 18eb9c4370e..1a8d8ae1b90 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -385,7 +385,7 @@ void ByleRLEDecode_Mode1(
 				do {
 					if (!(*mask & maskbit)) {
 						uint16 pcolor = _palette[color];
-						if (pcolor == 13 && _shadowTable) {
+						if (pcolor == 13) {
 							*dst = _shadowTable[*dst];
 						} else {
 							*dst = pcolor;
@@ -780,7 +780,7 @@ void ByleRLEDecode_Scaled_Mode1(
 							uint16 pcolor;
 
 							pcolor = _palette[color];
-							if (pcolor == 13 && _shadowTable) {
+							if (pcolor == 13) {
 								if (lastColumnX != compData.x)
 									*dst = _shadowTable[*dst];
 							} else {
@@ -870,7 +870,7 @@ void ByleRLEDecode_Scaled_Mode1_SMask(
 							uint16 pcolor;
 
 							pcolor = _palette[color];
-							if (pcolor == 13 && _shadowTable) {
+							if (pcolor == 13) {
 								if (lastColumnX != compData.x)
 									*dst = _shadowTable[*dst];
 							} else {
@@ -1116,7 +1116,7 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 	if (!_akosRendering) {
 		if (_shadowMode & 0x20)
 			shadowMode = ShadowMode::Classic;
-		else
+		else if (_shadowTable)
 			shadowMode = ShadowMode::Mode1;
 	} else {
 		if (_shadowMode == 1)
@@ -1210,7 +1210,7 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 
 							case ShadowMode::Mode1:
 								pcolor = _palette[color];
-								if (pcolor == 13 && _shadowTable) {
+								if (pcolor == 13) {
 									if (lastColumnX != compData.x)
 										*dst = _shadowTable[*dst];
 								} else {


Commit: 4f5361a2f75926c54191d2f3e4c9ad816128e66b
    https://github.com/scummvm/scummvm/commit/4f5361a2f75926c54191d2f3e4c9ad816128e66b
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: byleRLEDecodeFast: m68k assembly functions

Changed paths:
  A engines/scumm/bylerledecodeM68K.h
  A engines/scumm/m68k/bylerledecode.inc
  A engines/scumm/m68k/bylerledecode_classic.S
  A engines/scumm/m68k/bylerledecode_mode0.S
  A engines/scumm/m68k/bylerledecode_mode1.S
  A engines/scumm/m68k/bylerledecode_mode3.S
  A engines/scumm/m68k/bylerledecode_scaled_mode0.S
  A engines/scumm/m68k/bylerledecode_scaled_mode0_smask.S
  A engines/scumm/m68k/bylerledecode_scaled_mode1.S
  A engines/scumm/m68k/bylerledecode_scaled_mode1_smask.S
  A engines/scumm/m68k/bylerledecode_scaled_mode3.S
    configure
    engines/scumm/base-costume-optimised.cpp
    engines/scumm/base-costume.cpp
    engines/scumm/module.mk


diff --git a/configure b/configure
index b454d2dcf41..0739c9d6ae2 100755
--- a/configure
+++ b/configure
@@ -4256,6 +4256,7 @@ case $_backend in
 		;;
 	atari)
 		append_var DEFINES "-DATARI"
+		define_in_config_if_yes yes "USE_M68K_COSTUME_ASM"
 		append_var DEFINES "-DDISABLE_NES_APU"
 		append_var LIBS "-lgem"
 		_ogg=no
diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index 1a8d8ae1b90..b35c4632c06 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -20,12 +20,20 @@
  */
 
 
+#ifdef SCUMM_OPTIMISED_CODE
+
+#define FORCE_TEXT_CONSOLE
+
+#include "common/textconsole.h"
 #include "scumm/base-costume.h"
 #include "scumm/util.h"
 
+#ifdef USE_M68K_COSTUME_ASM
+#include "bylerledecodeM68K.h"
+#endif
+
 namespace Scumm {
 
-#ifdef SCUMM_OPTIMISED_CODE
 byte BaseCostumeRenderer::paintCelByleRLECommon(
 	int xMoveCur,
 	int yMoveCur,
@@ -264,6 +272,7 @@ enum class ShadowMode : int {
 	Classic
 };
 
+#ifndef USE_M68K_COSTUME_ASM
 void ByleRLEDecode_Mode0(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
 	const byte _scaleX, /* unused */
@@ -276,6 +285,7 @@ void ByleRLEDecode_Mode0(
 	const uint16 *_palette) {
 
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	//warning("%s: unexpected call, save your game and report", __FUNCTION__);
 
 	const byte *src = _srcPtr;
 
@@ -336,7 +346,9 @@ void ByleRLEDecode_Mode0(
 		} while (len > 0);
 	} while (true);
 }
+#endif
 
+#ifndef USE_M68K_COSTUME_ASM
 void ByleRLEDecode_Mode1(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
 	const byte _scaleX, /* unused */
@@ -349,6 +361,7 @@ void ByleRLEDecode_Mode1(
 	const uint16 *_palette) {
 
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	//warning("%s: unexpected call, save your game and report", __FUNCTION__);
 
 	const byte *src = _srcPtr;
 
@@ -415,7 +428,9 @@ void ByleRLEDecode_Mode1(
 		} while (len > 0);
 	} while (true);
 }
+#endif
 
+#ifndef USE_M68K_COSTUME_ASM
 void ByleRLEDecode_Mode3(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
 	const byte _scaleX, /* unused */
@@ -428,6 +443,7 @@ void ByleRLEDecode_Mode3(
 	const uint16 *_palette) {
 
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	//warning("%s: unexpected call, save your game and report", __FUNCTION__);
 
 	const byte *src = _srcPtr;
 
@@ -495,7 +511,9 @@ void ByleRLEDecode_Mode3(
 		} while (len > 0);
 	} while (true);
 }
+#endif
 
+#ifndef USE_M68K_COSTUME_ASM
 void ByleRLEDecode_Classic(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
 	const byte _scaleX, /* unused */
@@ -508,6 +526,7 @@ void ByleRLEDecode_Classic(
 	const uint16 *_palette /* unused */) {
 
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	//warning("%s: unexpected call, save your game and report", __FUNCTION__);
 
 	const byte *src = _srcPtr;
 
@@ -568,7 +587,9 @@ void ByleRLEDecode_Classic(
 		} while (len > 0);
 	} while (true);
 }
+#endif
 
+#ifndef USE_M68K_COSTUME_ASM
 void ByleRLEDecode_Scaled_Mode0(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
 	const byte _scaleX,
@@ -581,6 +602,7 @@ void ByleRLEDecode_Scaled_Mode0(
 	const uint16 *_palette) {
 
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	//warning("%s: unexpected call, save your game and report", __FUNCTION__);
 
 	const byte *src = _srcPtr;
 
@@ -647,7 +669,9 @@ void ByleRLEDecode_Scaled_Mode0(
 		} while (len > 0);
 	} while (true);
 }
+#endif
 
+#ifndef USE_M68K_COSTUME_ASM
 void ByleRLEDecode_Scaled_Mode0_SMask(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
 	const byte _scaleX,
@@ -660,6 +684,7 @@ void ByleRLEDecode_Scaled_Mode0_SMask(
 	const uint16 *_palette) {
 
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	//warning("%s: unexpected call, save your game and report", __FUNCTION__);
 
 	const byte *src = _srcPtr;
 
@@ -726,7 +751,9 @@ void ByleRLEDecode_Scaled_Mode0_SMask(
 		} while (len > 0);
 	} while (true);
 }
+#endif
 
+#ifndef USE_M68K_COSTUME_ASM
 void ByleRLEDecode_Scaled_Mode1(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
 	const byte _scaleX,
@@ -739,6 +766,7 @@ void ByleRLEDecode_Scaled_Mode1(
 	const uint16 *_palette) {
 
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	//warning("%s: unexpected call, save your game and report", __FUNCTION__);
 
 	const byte *src = _srcPtr;
 
@@ -816,7 +844,9 @@ void ByleRLEDecode_Scaled_Mode1(
 		} while (len > 0);
 	} while (true);
 }
+#endif
 
+#ifndef USE_M68K_COSTUME_ASM
 void ByleRLEDecode_Scaled_Mode1_SMask(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
 	const byte _scaleX,
@@ -829,6 +859,7 @@ void ByleRLEDecode_Scaled_Mode1_SMask(
 	const uint16 *_palette) {
 
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	//warning("%s: unexpected call, save your game and report", __FUNCTION__);
 
 	const byte *src = _srcPtr;
 
@@ -906,7 +937,9 @@ void ByleRLEDecode_Scaled_Mode1_SMask(
 		} while (len > 0);
 	} while (true);
 }
+#endif
 
+#ifndef USE_M68K_COSTUME_ASM
 void ByleRLEDecode_Scaled_Mode3(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
 	const byte _scaleX,
@@ -919,6 +952,7 @@ void ByleRLEDecode_Scaled_Mode3(
 	const uint16 *_palette) {
 
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	//warning("%s: unexpected call, save your game and report", __FUNCTION__);
 
 	const byte *src = _srcPtr;
 
@@ -998,6 +1032,7 @@ void ByleRLEDecode_Scaled_Mode3(
 		} while (len > 0);
 	} while (true);
 }
+#endif
 
 void ByleRLEDecode_Scaled_Classic_SMask(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
@@ -1011,6 +1046,7 @@ void ByleRLEDecode_Scaled_Classic_SMask(
 	const uint16 *_palette /* unused */) {
 
 	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	warning("%s: unexpected call, save your game and report", __FUNCTION__);
 
 	const byte *src = _srcPtr;
 
@@ -1155,6 +1191,9 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 		return;
 	}
 
+	warning("%s: unexpected call, save your game and report: %d (%d, %d, %d, %d)", __FUNCTION__, (int)shadowMode,
+			compData.y, compData.boundsRect.top, compData.y + compData.scaledHeight, compData.boundsRect.bottom);
+
 	const byte *src = _srcPtr;
 
 	byte len = compData.repLen;
@@ -1262,6 +1301,7 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 		} while (len > 0);
 	} while (true);
 }
-#endif
 
 } // End of namespace Scumm
+
+#endif
diff --git a/engines/scumm/base-costume.cpp b/engines/scumm/base-costume.cpp
index cfe12d5a481..13b32bdf578 100644
--- a/engines/scumm/base-costume.cpp
+++ b/engines/scumm/base-costume.cpp
@@ -20,6 +20,9 @@
  */
 
 
+#define FORCE_TEXT_CONSOLE
+
+#include "common/textconsole.h"
 #include "scumm/base-costume.h"
 #include "scumm/util.h"
 
@@ -294,6 +297,7 @@ void BaseCostumeRenderer::byleRLEDecode(ByleRLEData &compData, int16 actorHitX,
 		byleRLEDecodeFast(compData);
 		return;
 	}
+	warning("%s: unoptimised version is being executed", __FUNCTION__);
 #endif
 	const byte *src = _srcPtr;
 	byte *dst = compData.destPtr;
diff --git a/engines/scumm/bylerledecodeM68K.h b/engines/scumm/bylerledecodeM68K.h
new file mode 100644
index 00000000000..a7e5f254140
--- /dev/null
+++ b/engines/scumm/bylerledecodeM68K.h
@@ -0,0 +1,126 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef SCUMM_BYLE_RLE_DECODE_M68K_H
+#define SCUMM_BYLE_RLE_DECODE_M68K_H
+
+#include "scumm/base-costume.h"
+
+extern "C" void ByleRLEDecode_Mode0(
+	Scumm::BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX, /* unused */
+	const byte _scaleY, /* unused */
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const byte *_srcPtr,
+	const byte *_shadowTable, /* unused */
+	const uint16 *_palette);
+
+extern "C" void ByleRLEDecode_Mode1(
+	Scumm::BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX, /* unused */
+	const byte _scaleY, /* unused */
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette);
+
+extern "C" void ByleRLEDecode_Mode3(
+	Scumm::BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX, /* unused */
+	const byte _scaleY, /* unused */
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette);
+
+extern "C" void ByleRLEDecode_Classic(
+	Scumm::BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX, /* unused */
+	const byte _scaleY, /* unused */
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette /* unused */);
+
+extern "C" void ByleRLEDecode_Scaled_Mode0(
+	Scumm::BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const byte *_srcPtr,
+	const byte *_shadowTable, /* unused */
+	const uint16 *_palette);
+
+extern "C" void ByleRLEDecode_Scaled_Mode0_SMask(
+	Scumm::BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const byte *_srcPtr,
+	const byte *_shadowTable, /* unused */
+	const uint16 *_palette);
+
+extern "C" void ByleRLEDecode_Scaled_Mode1(
+	Scumm::BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette);
+
+extern "C" void ByleRLEDecode_Scaled_Mode1_SMask(
+	Scumm::BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette);
+
+extern "C" void ByleRLEDecode_Scaled_Mode3(
+	Scumm::BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette);
+
+#endif
diff --git a/engines/scumm/m68k/bylerledecode.inc b/engines/scumm/m68k/bylerledecode.inc
new file mode 100644
index 00000000000..c1dda9bc148
--- /dev/null
+++ b/engines/scumm/m68k/bylerledecode.inc
@@ -0,0 +1,100 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+| kate: tab-width 8; replace-tabs off; hl Motorola 68k (VASM/Devpac);
+
+#include "../../../backends/platform/atari/symbols.h"
+
+	.struct	0	// field offsets of ByleRLEData, used as (compData.xxx,a5)
+compData.x:
+	.space	4	// int x;
+compData.y:
+	.space	4	// int y;
+compData.scaleTable:
+	.space	4	// const byte *scaleTable;
+//compData.height:
+//	.space	4	// int height;
+//compData.width:
+//	.space	4	// int width;
+
+	.space	2	// high word of the 32-bit skipWidth, skipped (big-endian)
+compData.skipWidth:
+	.space	2	// int skipWidth; (used as uint16)
+
+compData.destPtr:
+	.space	4	// byte *destPtr;
+compData.maskPtr:
+	.space	4	// const byte *maskPtr;
+compData.scaleXStep:
+	.space	4	// int scaleXStep;
+compData.mask:
+	.space	1	// byte mask;
+compData.shr:
+	.space	1	// byte shr;
+compData.repColor:
+	.space	1	// byte repColor;
+compData.repLen:
+	.space	1	// byte repLen;
+compData.boundsRect:
+	.space	8	// Common::Rect boundsRect; (4 x int16)
+compData.scaleXIndex:
+	.space	4	// int scaleXIndex;
+compData.scaleYIndex:
+	.space	4	// int scaleYIndex;
+compData.scaleIndexMask:
+	.space	4	// int scaleIndexMask;
+
+	.set	sizeof_local_vars,0	// bytes of locals reserved so far, consumed by link
+	.macro	add_var name,size	// reserve size bytes and define name as a negative a6 offset
+	.set	sizeof_local_vars,sizeof_local_vars+\size
+	.set	\name,-sizeof_local_vars
+	.endm
+
+	.struct 0	// stack frame offsets relative to a6 after link
+arg_fp:
+	.space	4	// caller a6 saved by link
+arg_rts:
+	.space	4	// return address
+arg_pcompData:
+	.space	4	// BaseCostumeRenderer::ByleRLEData *pcompData
+
+	.space	3	// padding: the byte is the last one of its 4-byte slot
+arg_scaleX:
+	.space	1	// const byte _scaleX
+
+	.space	3	// padding: the byte is the last one of its 4-byte slot
+arg_scaleY:
+	.space	1	// const byte _scaleY
+
+	.space	2	// high word of the 32-bit _height, skipped (big-endian)
+arg_height:
+	.space	2	// const int _height (used as uint16)
+
+arg_pitch:
+	.space	4	// const int pitch
+arg_numStrips:
+	.space	4	// const int _numStrips
+arg_srcPtr:
+	.space	4	// const byte *_srcPtr
+arg_shadowTable:
+	.space	4	// const byte *_shadowTable
+arg_palette:
+	.space	4	// const uint16 *_palette
diff --git a/engines/scumm/m68k/bylerledecode_classic.S b/engines/scumm/m68k/bylerledecode_classic.S
new file mode 100644
index 00000000000..590062d7d31
--- /dev/null
+++ b/engines/scumm/m68k/bylerledecode_classic.S
@@ -0,0 +1,190 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+| kate: tab-width 8; replace-tabs off; hl Motorola 68k (VASM/Devpac);
+
+#include "bylerledecode.inc"
+
+	.globl	SYM(ByleRLEDecode_Classic)
+
+	add_var	color,1+3	// byte color; (align stack)
+
+
+	.text
+
+| void ByleRLEDecode_Classic(
+|	BaseCostumeRenderer::ByleRLEData *pcompData,
+|	const byte _scaleX, /* unused */
+|	const byte _scaleY, /* unused */
+|	const int _height,
+|	const int pitch,
+|	const int _numStrips,
+|	const byte *_srcPtr,
+|	const byte *_shadowTable,
+|	const uint16 *_palette /* unused */);
+SYM(ByleRLEDecode_Classic):
+// TODO: __FASTCALL__ (every argument is currently fetched from the stack frame)
+	link	a6,#-sizeof_local_vars		| a6: frame pointer
+	movem.l	d2-d7/a2-a5,-(sp)		| restored right before the rts below
+
+	// registers taken so far:
+	// - d7(*dst zero-extended, indexes _shadowTable), d6(len), d5(height), d4(maskbit as btst bit number), d3(pitch), d2(_numStrips)
+	// - a0(src), a1(dst), a2(mask), a3(destPtr), a4(_shadowTable), a5(pcompData), a6(frame pointer)
+
+	// BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	move.l	(arg_pcompData,a6),a5		| a5: pcompData
+
+	// const byte *src = _srcPtr;
+	move.l	(arg_srcPtr,a6),a0		| a0: src
+
+	// uint16 len = compData.repLen;
+	clr.w	d6
+	move.b	(compData.repLen,a5),d6		| d6.w: len
+	// uint16 color = compData.repColor; (stored as a byte, only ever tested for zero)
+	move.b	(compData.repColor,a5),(color,a6)
+
+	// byte *dst = compData.destPtr;
+	move.l	(compData.destPtr,a5),a3	| a3: compData.destPtr
+	move.l	a3,a1				| a1: dst
+	// uint16 height = _height;
+	move.w	(arg_height,a6),d5		| d5.w: height
+	// byte maskbit = revBitMask(compData.x & 7);
+	// #define revBitMask(x) (0x80 >> (x))
+	move.l	(compData.x,a5),d0
+	move.b	d0,d4
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: btst bit number 7 - (x & 7), the bit set in 0x80 >> (x & 7)
+	// const byte *mask = compData.maskPtr + compData.x / 8;
+	asr.l	#3,d0				| d0.l: x >> 3 (equals x / 8 only for x >= 0 - TODO confirm)
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+
+	move.l	(arg_pitch,a6),d3		| d3.w: pitch (used as long, too)
+	move.l	(arg_numStrips,a6),d2		| d2.w: _numStrips (used as long, too)
+	move.l	(arg_shadowTable,a6),a4		| a4: _shadowTable
+	clr.l	d7				| d7.l: upper 24 bits stay zero for shadow table indexing
+
+	// if (len) {
+	// 	--len;
+	// 	goto StartPos;
+	// }
+	// len   --len action
+	// 0     -1    jump into main_loop
+	// 1      0    jump into main_loop = jump into len_loop (0x)
+	// 2      1    jump into len_loop (1x)
+	// 3      2    jump into len_loop (2x)
+	subq.w	#1,d6
+	bgt.b	len_loop
+main_loop:
+	// do {
+	// len = *src++;
+	// color = len >> compData.shr;
+	// len &= compData.mask;
+	// if (!len)
+	// 	len = *src++;
+	clr.w	d6				| d6.w: clear the high byte
+	move.b	(a0)+,d0
+	move.b	d0,d6
+	move.b	(compData.shr,a5),d1
+	lsr.b	d1,d0				| d0.b: color (only care about zero/nonzero)
+	move.b	d0,(color,a6)
+	and.b	(compData.mask,a5),d6		| d6.w: len
+	bne.b	len_loop
+	move.b	(a0)+,d6
+len_loop:
+	// do {
+	// 	batch = height < len ? height : len;
+	move.w	d5,d1				| d1.w: batch
+	cmp.w	d6,d1
+	blt.b	1f
+	move.w	d6,d1
+1:	// 	len -= batch;
+	// 	height -= batch;
+	sub.w	d1,d6
+	sub.w	d1,d5
+	// 	if (color) {
+	tst.b	(color,a6)
+	beq.b	2f
+
+	subq.w	#1,d1				| dbra stops at -1, so it needs batch - 1
+batch_loop_shadow:
+	// 		do {
+	// 			if (!(*mask & maskbit))
+	btst	d4,(a2)				| Z is set when the mask bit is clear (pixel visible)
+	bne.b	1f
+	// 				*dst = _shadowTable[*dst];
+	move.b	(a1),d7
+	move.b	(a4,d7.l),(a1)
+1:	// 			dst  += pitch;
+	// 			mask += _numStrips;
+	add.l	d3,a1
+	add.l	d2,a2
+	// 		} while (--batch);
+	dbra	d1,batch_loop_shadow
+	bra.b	3f
+
+	// 	} else {
+2:	// 		dst  += batch * pitch;
+	move.w	d1,d0
+	mulu.w	d3,d0				| d0.l: batch * pitch (unsigned 16x16, low word of pitch)
+	add.l	d0,a1
+	// 		mask += batch * _numStrips;
+	mulu.w	d2,d1				| d1.l: batch * _numStrips (unsigned 16x16)
+	add.l	d1,a2
+	// 	}
+
+3:	// 	if (height == 0) {
+	tst.w	d5
+	bne.b	5f
+	// 		if (--compData.skipWidth == 0)
+	subq.w	#1,(compData.skipWidth,a5)	| updates the low word of the int in place
+	bne.b	4f
+	// 			return;
+	movem.l	(sp)+,d2-d7/a2-a5
+	unlk	a6
+	rts
+
+4:	// 		height = _height;
+	move.w	(arg_height,a6),d5		| d5.w: height
+	move.l	(compData.scaleXStep,a5),d1	| d1.l: preload scaleXStep
+	// 		compData.x += compData.scaleXStep;
+	move.l	(compData.x,a5),d0
+	add.l	d1,d0
+	move.l	d0,(compData.x,a5)		| d0.l: compData.x
+	move.b	d0,d4
+	// 		maskbit = revBitMask(compData.x & 7);
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: btst bit number 7 - (x & 7)
+	// 		destPtr += compData.scaleXStep;
+	add.l	d1,a3
+	// 		dst = destPtr;
+	move.l	a3,a1				| a1: dst
+	// 		mask = compData.maskPtr + compData.x / 8;
+	asr.l	#3,d0				| d0.l: x >> 3 (equals x / 8 only for x >= 0 - TODO confirm)
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+	// 	}
+5:	// StartPos:
+	// } while (len > 0);
+	tst.w	d6
+	bne.b	len_loop
+	// } while (true);
+	bra.w	main_loop
diff --git a/engines/scumm/m68k/bylerledecode_mode0.S b/engines/scumm/m68k/bylerledecode_mode0.S
new file mode 100644
index 00000000000..4dfb7f0f12b
--- /dev/null
+++ b/engines/scumm/m68k/bylerledecode_mode0.S
@@ -0,0 +1,191 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+| kate: tab-width 8; replace-tabs off; hl Motorola 68k (VASM/Devpac);
+
+#include "bylerledecode.inc"
+
+	.globl	SYM(ByleRLEDecode_Mode0)
+
+	add_var	color,1+3	// byte color; (align stack)
+
+
+	.text
+
+| void ByleRLEDecode_Mode0(
+|	BaseCostumeRenderer::ByleRLEData *pcompData,
+|	const byte _scaleX, /* unused */
+|	const byte _scaleY, /* unused */
+|	const int _height,
+|	const int pitch,
+|	const int _numStrips,
+|	const byte *_srcPtr,
+|	const byte *_shadowTable, /* unused */
+|	const uint16 *_palette);
+SYM(ByleRLEDecode_Mode0):
+// TODO: __FASTCALL__ (every argument is currently fetched from the stack frame)
+	link	a6,#-sizeof_local_vars		| a6: frame pointer
+	movem.l	d2-d7/a2-a5,-(sp)		| restored right before the rts below
+
+	// registers taken so far:
+	// - d7(pcolor), d6(len), d5(height), d4(maskbit as btst bit number), d3(pitch), d2(_numStrips)
+	// - a0(src), a1(dst), a2(mask), a3(_palette), a4(destPtr), a5(pcompData), a6(frame pointer)
+
+	// BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	move.l	(arg_pcompData,a6),a5		| a5: pcompData
+
+	// const byte *src = _srcPtr;
+	move.l	(arg_srcPtr,a6),a0		| a0: src
+
+	// uint16 len = compData.repLen;
+	clr.w	d6
+	move.b	(compData.repLen,a5),d6		| d6.w: len
+
+	// byte *dst = compData.destPtr;
+	move.l	(compData.destPtr,a5),a4	| a4: compData.destPtr
+	move.l	a4,a1				| a1: dst
+	// uint16 height = _height;
+	move.w	(arg_height,a6),d5		| d5.w: height
+	// byte maskbit = revBitMask(compData.x & 7);
+	// #define revBitMask(x) (0x80 >> (x))
+	move.l	(compData.x,a5),d0
+	move.b	d0,d4
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: btst bit number 7 - (x & 7), the bit set in 0x80 >> (x & 7)
+	// const byte *mask = compData.maskPtr + compData.x / 8;
+	asr.l	#3,d0				| d0.l: x >> 3 (equals x / 8 only for x >= 0 - TODO confirm)
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+
+	move.l	(arg_pitch,a6),d3		| d3.w: pitch (used as long, too)
+	move.l	(arg_numStrips,a6),d2		| d2.w: _numStrips (used as long, too)
+	move.l	(arg_palette,a6),a3		| a3: _palette
+
+	// uint16 color = compData.repColor;
+	clr.l	d0				| d0.l: keep the upper 24 bits clean
+	move.b	(compData.repColor,a5),d0
+	move.b	d0,(color,a6)
+	// if (len) {
+	// 	--len;
+	// 	goto StartPos;
+	// }
+	// len   --len action
+	// 0     -1    jump into main_loop
+	// 1      0    jump into main_loop = jump into len_loop (0x)
+	// 2      1    jump into len_loop (1x)
+	// 3      2    jump into len_loop (2x)
+	subq.w	#1,d6
+	bgt.b	pcolor_setup
+main_loop:
+	// do {
+	// len = *src++;
+	// color = len >> compData.shr;
+	// len &= compData.mask;
+	// if (!len)
+	// 	len = *src++;
+	clr.l	d0				| d0.l: keep the upper 24 bits clean
+	move.b	(a0)+,d0
+	move.w	d0,d6				| d6.w: raw byte, high byte is already zero
+	move.b	(compData.shr,a5),d1
+	lsr.b	d1,d0				| d0.b: color (used as long, too)
+	move.b	d0,(color,a6)
+	and.b	(compData.mask,a5),d6		| d6.w: len
+	bne.b	pcolor_setup
+	move.b	(a0)+,d6
+pcolor_setup:
+	move.w	(a3,d0.l*2),d7			| d7.w: _palette[color], fetched once per run (scaled index: 68020+)
+len_loop:
+	// do {
+	// 	batch = height < len ? height : len;
+	move.w	d5,d1				| d1.w: batch
+	cmp.w	d6,d1
+	blt.b	1f
+	move.w	d6,d1
+1:	// 	len -= batch;
+	// 	height -= batch;
+	sub.w	d1,d6
+	sub.w	d1,d5
+	// 	if (color) {
+	tst.b	(color,a6)
+	beq.b	2f
+
+	subq.w	#1,d1				| dbra stops at -1, so it needs batch - 1
+batch_loop:
+	// 		do {
+	// 			if (!(*mask & maskbit))
+	btst	d4,(a2)				| Z is set when the mask bit is clear (pixel visible)
+	bne.b	1f
+	// 				*dst = _palette[color];
+	move.b	d7,(a1)				| only the low byte of the palette entry is written
+1:	// 			dst  += pitch;
+	// 			mask += _numStrips;
+	add.l	d3,a1
+	add.l	d2,a2
+	// 		} while (--batch);
+	dbra	d1,batch_loop
+	bra.b	3f
+	// 	} else {
+2:	// 		dst  += batch * pitch;
+	move.w	d1,d0
+	mulu.w	d3,d0				| d0.l: batch * pitch (unsigned 16x16, low word of pitch)
+	add.l	d0,a1
+	// 		mask += batch * _numStrips;
+	mulu.w	d2,d1				| d1.l: batch * _numStrips (unsigned 16x16)
+	add.l	d1,a2
+	// 	}
+
+3:	// 	if (height == 0) {
+	tst.w	d5
+	bne.b	5f
+	// 		if (--compData.skipWidth == 0)
+	subq.w	#1,(compData.skipWidth,a5)	| updates the low word of the int in place
+	bne.b	4f
+	// 			return;
+	movem.l	(sp)+,d2-d7/a2-a5
+	unlk	a6
+	rts
+
+4:	// 		height = _height;
+	move.w	(arg_height,a6),d5		| d5.w: height
+	move.l	(compData.scaleXStep,a5),d1	| d1.l: preload scaleXStep
+	// 		compData.x += compData.scaleXStep;
+	move.l	(compData.x,a5),d0
+	add.l	d1,d0
+	move.l	d0,(compData.x,a5)		| d0.l: compData.x
+	move.l	d0,d4				| long copy; only bits 0-2 are used after the and.b below
+	// 		maskbit = revBitMask(compData.x & 7);
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: btst bit number 7 - (x & 7)
+	// 		destPtr += compData.scaleXStep;
+	add.l	d1,a4
+	// 		dst = destPtr;
+	move.l	a4,a1				| a1: dst
+	// 		mask = compData.maskPtr + compData.x / 8;
+	asr.l	#3,d0				| d0.l: x >> 3 (equals x / 8 only for x >= 0 - TODO confirm)
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+	// 	}
+5:	// StartPos:
+	// } while (len > 0);
+	tst.w	d6
+	bne.b	len_loop
+	// } while (true);
+	bra.w	main_loop
diff --git a/engines/scumm/m68k/bylerledecode_mode1.S b/engines/scumm/m68k/bylerledecode_mode1.S
new file mode 100644
index 00000000000..6b11f50c2e1
--- /dev/null
+++ b/engines/scumm/m68k/bylerledecode_mode1.S
@@ -0,0 +1,218 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+| kate: tab-width 8; replace-tabs off; hl Motorola 68k (VASM/Devpac);
+
+#include "bylerledecode.inc"
+
+	.globl	SYM(ByleRLEDecode_Mode1)
+
+	add_var	len,2+2	// uint16 len; (align stack)
+
+| Unscaled column decoder: mask-tested RLE runs, palette entry 13 is remapped through _shadowTable.
+	.text
+
+| void ByleRLEDecode_Mode1(
+|	BaseCostumeRenderer::ByleRLEData *pcompData,
+|	const byte _scaleX, /* unused */
+|	const byte _scaleY, /* unused */
+|	const int _height,
+|	const int pitch,
+|	const int _numStrips,
+|	const byte *_srcPtr,
+|	const byte *_shadowTable,
+|	const uint16 *_palette);
+SYM(ByleRLEDecode_Mode1):
+// TODO: __FASTCALL__
+	link	a6,#-sizeof_local_vars		| a6: frame pointer
+	movem.l	d2-d7/a2-a5,-(sp)		| preserve d2-d7/a2-a5 for the caller
+
+	// registers taken so far:
+	// - d7(pcolor), d6(color), d5(height), d4(maskbit), d3(pitch), d2(_numStrips)
+	// - a0(src), a1(dst), a2(mask), a3(_palette), a4(_shadowTable), a5(pcompData), a6(frame pointer)
+
+	// BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	move.l	(arg_pcompData,a6),a5		| a5: pcompData
+
+	// const byte *src = _srcPtr;
+	move.l	(arg_srcPtr,a6),a0		| a0: src
+
+	// uint16 len = compData.repLen;
+	clr.w	(len,a6)
+	move.b	(compData.repLen,a5),(len+1,a6)
+	// uint16 color = compData.repColor;
+	clr.l	d6
+	move.b	(compData.repColor,a5),d6	| d6.b: color (used as long, too)
+	clr.l	d7				| used as long, too
+
+	// byte *dst = compData.destPtr;
+	move.l	(compData.destPtr,a5),a1	| a1: dst
+	// uint16 height = _height;
+	move.w	(arg_height,a6),d5		| d5.w: height
+	// byte maskbit = revBitMask(compData.x & 7);
+	// #define revBitMask(x) (0x80 >> (x))
+	move.l	(compData.x,a5),d0
+	move.b	d0,d4
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: bit to test
+	// const byte *mask = compData.maskPtr + compData.x / 8;
+	asr.l	#3,d0
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+
+	move.l	(arg_pitch,a6),d3		| d3.w: pitch (used as long, too)
+	move.l	(arg_numStrips,a6),d2		| d2.w: _numStrips (used as long, too)
+	move.l	(arg_palette,a6),a3		| a3: _palette
+	move.l	(arg_shadowTable,a6),a4		| a4: _shadowTable
+
+	// if (len) {
+	// 	--len;
+	// 	goto StartPos;
+	// }
+	// len   --len action
+	// 0     -1    jump into main_loop
+	// 1      0    jump into main_loop = jump into len_loop (0x)
+	// 2      1    jump into len_loop (1x)
+	// 3      2    jump into len_loop (2x)
+	subq.w	#1,(len,a6)
+	bgt.b	pcolor_setup			| pending run left: resume it with repColor
+main_loop:
+	// do {
+	// len = *src++;
+	// color = len >> compData.shr;
+	// len &= compData.mask;
+	// if (!len)
+	// 	len = *src++;
+	clr.w	d7
+	move.b	(a0)+,d7
+	move.w	d7,d6				| d6.w: clear the highest bit ("shadow_on" bit cleared)
+	move.b	(compData.shr,a5),d0
+	lsr.b	d0,d6				| d6.b: color (used as long, too)
+	and.b	(compData.mask,a5),d7
+	bne.b	1f
+	move.b	(a0)+,d7
+1:	move.w	d7,(len,a6)
+
+pcolor_setup:
+	tst.b	d6
+	beq.b	len_loop
+	move.w	(a3,d6.l*2),d7			| d7.w: pcolor = _palette[color] (used as long, too)
+	cmp.w	#13,d7				| palette entry 13 marks a shadow run
+	bne.b	len_loop
+	or.w	#0x8000,d6			| d6.w: "shadow_on" bit set
+len_loop:
+	// do {
+	// 	batch = height < len ? height : len;
+	move.w	(len,a6),d0
+	move.w	d5,d1				| d1.w: batch
+	cmp.w	d0,d1
+	blt.b	1f
+	move.w	d0,d1
+1:	// 	len -= batch;
+	// 	height -= batch;
+	sub.w	d1,(len,a6)
+	sub.w	d1,d5
+	// 	if (color) {
+	tst.b	d6
+	beq.b	2f
+
+	subq.w	#1,d1				| dbra
+	tst.w	d6
+	bmi.b	batch_loop_shadow
+batch_loop:
+	// 		do {
+	// 			if (!(*mask & maskbit))
+	btst	d4,(a2)
+	bne.b	1f
+	// 				*dst = _palette[color];
+	move.b	d7,(a1)
+1:	// 			dst  += pitch;
+	// 			mask += _numStrips;
+	add.l	d3,a1
+	add.l	d2,a2
+	// 		} while (--batch);
+	dbra	d1,batch_loop
+	bra.b	3f
+
+batch_loop_shadow:
+	// 		do {
+	// 			if (!(*mask & maskbit))
+	btst	d4,(a2)
+	bne.b	1f
+	// 				*dst = _shadowTable[*dst];
+	move.b	(a1),d7				| d7.l: *dst (upper 24 bits are zero here)
+	move.b	(a4,d7.l),(a1)
+1:	// 			dst  += pitch;
+	// 			mask += _numStrips;
+	add.l	d3,a1
+	add.l	d2,a2
+	// 		} while (--batch);
+	dbra	d1,batch_loop_shadow
+	bra.b	3f
+
+	// 	} else {
+2:	// 		dst  += batch * pitch;
+	move.w	d1,d0
+	mulu.w	d3,d0				| NOTE(review): 16x16 multiply, assumes pitch fits in a word
+	add.l	d0,a1
+	// 		mask += batch * _numStrips;
+	mulu.w	d2,d1				| NOTE(review): 16x16 multiply, assumes _numStrips fits in a word
+	add.l	d1,a2
+	// 	}
+
+3:	// 	if (height == 0) {
+	tst.w	d5
+	bne.b	5f
+	// 		if (--compData.skipWidth == 0)
+	subq.w	#1,(compData.skipWidth,a5)
+	bne.b	4f
+	// 			return;
+	movem.l	(sp)+,d2-d7/a2-a5		| restore the registers saved on entry
+	unlk	a6
+	rts
+
+4:	// 		height = _height;
+	move.w	(arg_height,a6),d5		| d5.w: height
+	move.l	(compData.scaleXStep,a5),d1	| d1.l: preload scaleXStep
+	// 		compData.x += compData.scaleXStep;
+	move.l	(compData.x,a5),d0
+	add.l	d1,d0
+	move.l	d0,(compData.x,a5)		| d0.l: compData.x
+	move.l	d0,d4
+	// 		maskbit = revBitMask(compData.x & 7);
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: bit to test
+	// 		compData.destPtr += compData.scaleXStep;
+	// 		dst = compData.destPtr;
+	move.l	(compData.destPtr,a5),a1	| a1: dst
+	add.l	d1,a1
+	move.l	a1,(compData.destPtr,a5)
+	// 		mask = compData.maskPtr + compData.x / 8;
+	asr.l	#3,d0
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+	// 	}
+5:	// StartPos:
+	// } while (len > 0);
+	tst.w	(len,a6)
+	bne.w	len_loop
+	// } while (true);
+	bra.w	main_loop
diff --git a/engines/scumm/m68k/bylerledecode_mode3.S b/engines/scumm/m68k/bylerledecode_mode3.S
new file mode 100644
index 00000000000..fa921e378b1
--- /dev/null
+++ b/engines/scumm/m68k/bylerledecode_mode3.S
@@ -0,0 +1,222 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+| kate: tab-width 8; replace-tabs off; hl Motorola 68k (VASM/Devpac);
+
+#include "bylerledecode.inc"
+
+	.globl	SYM(ByleRLEDecode_Mode3)
+
+	add_var	len,2+2	// uint16 len; (align stack)
+
+| Unscaled column decoder: palette entries below 8 index _shadowTable as (pcolor << 8) + *dst.
+	.text
+
+| void ByleRLEDecode_Mode3(
+|	BaseCostumeRenderer::ByleRLEData *pcompData,
+|	const byte _scaleX, /* unused */
+|	const byte _scaleY, /* unused */
+|	const int _height,
+|	const int pitch,
+|	const int _numStrips,
+|	const byte *_srcPtr,
+|	const byte *_shadowTable,
+|	const uint16 *_palette);
+SYM(ByleRLEDecode_Mode3):
+// TODO: __FASTCALL__
+	link	a6,#-sizeof_local_vars		| a6: frame pointer
+	movem.l	d2-d7/a2-a5,-(sp)		| preserve d2-d7/a2-a5 for the caller
+
+	// registers taken so far:
+	// - d7(pcolor), d6(color), d5(height), d4(maskbit), d3(pitch), d2(_numStrips)
+	// - a0(src), a1(dst), a2(mask), a3(_palette), a4(_shadowTable), a5(pcompData), a6(frame pointer)
+
+	// BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	move.l	(arg_pcompData,a6),a5		| a5: pcompData
+
+	// const byte *src = _srcPtr;
+	move.l	(arg_srcPtr,a6),a0		| a0: src
+
+	// uint16 len = compData.repLen;
+	clr.w	(len,a6)
+	move.b	(compData.repLen,a5),(len+1,a6)
+	// uint16 color = compData.repColor;
+	clr.l	d6
+	move.b	(compData.repColor,a5),d6	| d6.b: color (used as long, too)
+	clr.l	d7				| used as long, too
+
+	// byte *dst = compData.destPtr;
+	move.l	(compData.destPtr,a5),a1	| a1: dst
+	// uint16 height = _height;
+	move.w	(arg_height,a6),d5		| d5.w: height
+	// byte maskbit = revBitMask(compData.x & 7);
+	// #define revBitMask(x) (0x80 >> (x))
+	move.l	(compData.x,a5),d0
+	move.b	d0,d4
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: bit to test
+	// const byte *mask = compData.maskPtr + compData.x / 8;
+	asr.l	#3,d0
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+
+	move.l	(arg_pitch,a6),d3		| d3.w: pitch (used as long, too)
+	move.l	(arg_numStrips,a6),d2		| d2.w: _numStrips (used as long, too)
+	move.l	(arg_palette,a6),a3		| a3: _palette
+	move.l	(arg_shadowTable,a6),a4		| a4: _shadowTable
+
+	// if (len) {
+	// 	--len;
+	// 	goto StartPos;
+	// }
+	// len   --len action
+	// 0     -1    jump into main_loop
+	// 1      0    jump into main_loop = jump into len_loop (0x)
+	// 2      1    jump into len_loop (1x)
+	// 3      2    jump into len_loop (2x)
+	subq.w	#1,(len,a6)
+	bgt.b	pcolor_setup			| pending run left: resume it with repColor
+main_loop:
+	// do {
+	// len = *src++;
+	// color = len >> compData.shr;
+	// len &= compData.mask;
+	// if (!len)
+	// 	len = *src++;
+	clr.w	d7
+	move.b	(a0)+,d7
+	move.w	d7,d6				| d6.w: clear the highest bit ("shadow_on" bit cleared)
+	move.b	(compData.shr,a5),d0
+	lsr.b	d0,d6				| d6.b: color (used as long, too)
+	and.b	(compData.mask,a5),d7
+	bne.b	1f
+	move.b	(a0)+,d7
+1:	move.w	d7,(len,a6)
+
+pcolor_setup:
+	tst.b	d6
+	beq.b	len_loop
+	move.w	(a3,d6.l*2),d7			| d7.w: pcolor = _palette[color] (used as long, too)
+	cmp.w	#8,d7				| pcolor >= 8: plain pixel, no table lookup
+	bhs.b	len_loop
+	lsl.w	#8,d7				| d7.w: if pcolor < 8: pcolor << 8
+	or.w	#0x8000,d6			| d6.w: "shadow_on" bit set
+len_loop:
+	// do {
+	// 	batch = height < len ? height : len;
+	move.w	(len,a6),d0
+	move.w	d5,d1				| d1.w: batch
+	cmp.w	d0,d1
+	blt.b	1f
+	move.w	d0,d1
+1:	// 	len -= batch;
+	// 	height -= batch;
+	sub.w	d1,(len,a6)
+	sub.w	d1,d5
+	// 	if (color) {
+	tst.b	d6
+	beq.b	2f
+
+	subq.w	#1,d1				| dbra
+	tst.w	d6
+	bmi.b	batch_loop_shadow
+batch_loop:
+	// 		do {
+	// 			if (!(*mask & maskbit))
+	btst	d4,(a2)
+	bne.b	1f
+	// 				*dst = _palette[color];
+	move.b	d7,(a1)
+1:	// 			dst  += pitch;
+	// 			mask += _numStrips;
+	add.l	d3,a1
+	add.l	d2,a2
+	// 		} while (--batch);
+	dbra	d1,batch_loop
+	bra.b	3f
+
+batch_loop_shadow:
+	// 		do {
+	// 			if (!(*mask & maskbit)) {
+	btst	d4,(a2)
+	bne.b	1f
+	// 				uint16 pcolor = _palette[color];
+	// 				pcolor = (pcolor << 8) + *dst;
+	// 				*dst = _shadowTable[pcolor];
+	move.b	(a1),d7				| d7.w: (pcolor << 8) + *dst, high byte set in pcolor_setup
+	move.b	(a4,d7.l),(a1)
+	// 			}
+1:	// 			dst  += pitch;
+	// 			mask += _numStrips;
+	add.l	d3,a1
+	add.l	d2,a2
+	// 		} while (--batch);
+	dbra	d1,batch_loop_shadow
+	bra.b	3f
+
+	// 	} else {
+2:	// 		dst  += batch * pitch;
+	move.w	d1,d0
+	mulu.w	d3,d0				| NOTE(review): 16x16 multiply, assumes pitch fits in a word
+	add.l	d0,a1
+	// 		mask += batch * _numStrips;
+	mulu.w	d2,d1				| NOTE(review): 16x16 multiply, assumes _numStrips fits in a word
+	add.l	d1,a2
+	// 	}
+
+3:	// 	if (height == 0) {
+	tst.w	d5
+	bne.b	5f
+	// 		if (--compData.skipWidth == 0)
+	subq.w	#1,(compData.skipWidth,a5)
+	bne.b	4f
+	// 			return;
+	movem.l	(sp)+,d2-d7/a2-a5		| restore the registers saved on entry
+	unlk	a6
+	rts
+
+4:	// 		height = _height;
+	move.w	(arg_height,a6),d5		| d5.w: height
+	move.l	(compData.scaleXStep,a5),d1	| d1.l: preload scaleXStep
+	// 		compData.x += compData.scaleXStep;
+	move.l	(compData.x,a5),d0
+	add.l	d1,d0
+	move.l	d0,(compData.x,a5)		| d0.l: compData.x
+	move.l	d0,d4
+	// 		maskbit = revBitMask(compData.x & 7);
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: bit to test
+	// 		compData.destPtr += compData.scaleXStep;
+	// 		dst = compData.destPtr;
+	move.l	(compData.destPtr,a5),a1	| a1: dst
+	add.l	d1,a1
+	move.l	a1,(compData.destPtr,a5)
+	// 		mask = compData.maskPtr + compData.x / 8;
+	asr.l	#3,d0
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+	// 	}
+5:	// StartPos:
+	// } while (len > 0);
+	tst.w	(len,a6)
+	bne.w	len_loop
+	// } while (true);
+	bra.w	main_loop
diff --git a/engines/scumm/m68k/bylerledecode_scaled_mode0.S b/engines/scumm/m68k/bylerledecode_scaled_mode0.S
new file mode 100644
index 00000000000..e960b0bf68a
--- /dev/null
+++ b/engines/scumm/m68k/bylerledecode_scaled_mode0.S
@@ -0,0 +1,209 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+| kate: tab-width 8; replace-tabs off; hl Motorola 68k (VASM/Devpac);
+
+#include "bylerledecode.inc"
+
+	.globl	SYM(ByleRLEDecode_Scaled_Mode0)
+
+	add_var	color,1+1	// byte color; (align stack)
+	add_var	height,2	// uint16 height;
+	add_var	scaleTableX,4	// &compData.scaleTable[compData.scaleXIndex]
+	add_var	scaleTableY,4	// &compData.scaleTable[compData.scaleYIndex]
+| Scaled column decoder: a source row/column advances the destination only if its scaleTable entry is below _scaleY/_scaleX.
+	.text
+
+| void ByleRLEDecode_Scaled_Mode0(
+|	BaseCostumeRenderer::ByleRLEData *pcompData,
+|	const byte _scaleX,
+|	const byte _scaleY,
+|	const int _height,
+|	const int pitch,
+|	const int _numStrips,
+|	const byte *_srcPtr,
+|	const byte *_shadowTable, /* unused */
+|	const uint16 *_palette);
+SYM(ByleRLEDecode_Scaled_Mode0):
+// TODO: __FASTCALL__
+	link	a6,#-sizeof_local_vars		| a6: frame pointer
+	movem.l	d2-d7/a2-a5,-(sp)		| preserve d2-d7/a2-a5 for the caller
+
+	// registers taken so far:
+	// - d7(pcolor), d6(len), d5(_scaleY), d4(maskbit), d3(pitch), d2(_numStrips)
+	// - a0(src), a1(dst), a2(mask), a3(scaleTableY), a4(destPtr), a5(pcompData), a6(frame pointer)
+
+	// BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	move.l	(arg_pcompData,a6),a5		| a5: pcompData
+
+	// const byte *src = _srcPtr;
+	move.l	(arg_srcPtr,a6),a0		| a0: src
+
+	// uint16 len = compData.repLen;
+	clr.w	d6
+	move.b	(compData.repLen,a5),d6		| d6.w: len
+
+	// byte *dst = compData.destPtr;
+	move.l	(compData.destPtr,a5),a4	| a4: compData.destPtr
+	move.l	a4,a1				| a1: dst
+	// uint16 height = _height;
+	move.w	(arg_height,a6),(height,a6)
+	// int scaleIndexY = compData.scaleYIndex; (both indices are kept as pointers into scaleTable)
+	move.l	(compData.scaleXIndex,a5),d0
+	lea	([compData.scaleTable,a5],d0.l),a3
+	move.l	a3,(scaleTableX,a6)
+	move.l	(compData.scaleYIndex,a5),d0
+	lea	([compData.scaleTable,a5],d0.l),a3	| a3: scaleTableY
+	move.l	a3,(scaleTableY,a6)
+	// byte maskbit = revBitMask(compData.x & 7);
+	// #define revBitMask(x) (0x80 >> (x))
+	move.l	(compData.x,a5),d0
+	move.b	d0,d4
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: bit to test
+	// const byte *mask = compData.maskPtr + compData.x / 8;
+	asr.l	#3,d0
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+
+	move.b	(arg_scaleY,a6),d5		| d5.b: _scaleY
+	move.l	(arg_pitch,a6),d3		| d3.w: pitch (used as long, too)
+	move.l	(arg_numStrips,a6),d2		| d2.w: _numStrips (used as long, too)
+
+	// uint16 color = compData.repColor;
+	clr.l	d0				| d0.l: keep the upper 24 bits clean
+	move.b	(compData.repColor,a5),d0
+	move.b	d0,(color,a6)
+	// if (len) {
+	// 	--len;
+	// 	goto StartPos;
+	// }
+	// len   --len action
+	// 0     -1    jump into main_loop
+	// 1      0    jump into main_loop = jump into len_loop (0x)
+	// 2      1    jump into len_loop (1x)
+	// 3      2    jump into len_loop (2x)
+	subq.w	#1,d6
+	bgt.b	pcolor_setup			| pending run left: resume it, d0.l still holds repColor
+main_loop:
+	// do {
+	// len = *src++;
+	// color = len >> compData.shr;
+	// len &= compData.mask;
+	// if (!len)
+	// 	len = *src++;
+	clr.l	d0				| d0.l: keep the upper 24 bits clean
+	move.b	(a0)+,d0
+	move.w	d0,d6				| d6.w: clear the high byte
+	move.b	(compData.shr,a5),d1
+	lsr.b	d1,d0				| d0.b: color
+	move.b	d0,(color,a6)
+	and.b	(compData.mask,a5),d6		| d6.w: len
+	bne.b	pcolor_setup
+	move.b	(a0)+,d6
+pcolor_setup:
+	move.w	([arg_palette,a6],d0.l*2),d7	| d7.w: _palette[color]
+len_loop:
+	// do {
+	// 	batch = height < len ? height : len;
+	move.w	(height,a6),d1			| d1.w: batch
+	cmp.w	d6,d1
+	blt.b	1f
+	move.w	d6,d1
+1:	// 	len -= batch;
+	// 	height -= batch;
+	sub.w	d1,d6
+	sub.w	d1,(height,a6)
+
+	subq.w	#1,d1				| dbra
+batch_loop:
+	// 		do {
+	// 			if (compData.scaleTable[scaleIndexY++] < _scaleY)
+	cmp.b	(a3)+,d5
+	bls.b	2f				| _scaleY <= table entry: source row is dropped
+	// 				if (color) {
+	tst.b	(color,a6)
+	beq.b	1f
+	// 					if (!(*mask & maskbit))
+	btst	d4,(a2)
+	bne.b	1f
+	// 						*dst = _palette[color];
+	move.b	d7,(a1)
+	// 				}
+1:	// 				dst  += pitch;
+	// 				mask += _numStrips;
+	add.l	d3,a1
+	add.l	d2,a2
+	// 			}
+2:	// 		} while (--batch);
+	dbra	d1,batch_loop
+
+	// 	if (height == 0) {
+	tst.w	(height,a6)
+	bne.b	6f
+	// 		if (--compData.skipWidth == 0)
+	subq.w	#1,(compData.skipWidth,a5)
+	bne.b	4f
+	// 			return;
+	movem.l	(sp)+,d2-d7/a2-a5		| restore the registers saved on entry
+	unlk	a6
+	rts
+
+4:	// 		height = _height;
+	move.w	(arg_height,a6),(height,a6)
+	// 		scaleIndexY = compData.scaleYIndex;
+	move.l	(scaleTableY,a6),a3
+
+	move.l	(compData.scaleXStep,a5),d1	| d1.l: preload scaleXStep
+
+	// 		dst = compData.destPtr; (moved from the below)
+	move.l	a4,a1				| a1: dst
+	// 		if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+	move.b	(arg_scaleX,a6),d0
+	cmp.b	([scaleTableX,a6]),d0
+	bls.b	5f				| _scaleX <= table entry: next column reuses the same x
+
+	// 			compData.x += compData.scaleXStep;
+	move.l	(compData.x,a5),d4
+	add.l	d1,d4
+	move.l	d4,(compData.x,a5)
+	// 			maskbit = revBitMask(compData.x & 7);
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: bit to test
+	// 			compData.destPtr += compData.scaleXStep;
+	add.l	d1,a4				| a4 only: compData.destPtr is not stored back
+	// 			dst = destPtr;
+	move.l	a4,a1				| a1: dst
+	// 		}
+5:	// 		compData.scaleXIndex += compData.scaleXStep;
+	add.l	d1,(scaleTableX,a6)		| local pointer only: compData.scaleXIndex is not stored back
+	// 		mask = compData.maskPtr + compData.x / 8;
+	move.l	(compData.x,a5),d0
+	asr.l	#3,d0
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+	// 	}
+6:	// StartPos:
+	// } while (len > 0);
+	tst.w	d6
+	bne.b	len_loop
+	// } while (true);
+	bra.w	main_loop
diff --git a/engines/scumm/m68k/bylerledecode_scaled_mode0_smask.S b/engines/scumm/m68k/bylerledecode_scaled_mode0_smask.S
new file mode 100644
index 00000000000..6a5a39838bf
--- /dev/null
+++ b/engines/scumm/m68k/bylerledecode_scaled_mode0_smask.S
@@ -0,0 +1,216 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+| kate: tab-width 8; replace-tabs off; hl Motorola 68k (VASM/Devpac);
+
+#include "bylerledecode.inc"
+
+	.globl	SYM(ByleRLEDecode_Scaled_Mode0_SMask)
+
+	add_var	color,1+1	// byte color; (align stack)
+	add_var	height,2	// uint16 height;
+	add_var	scaleIndexY,1	// byte scaleIndexY;
+	add_var	scaleIndexX,1	// byte scaleIndexX;
+| Scaled Mode0 variant that keeps both scale indices as bytes, so they wrap modulo 256.
+	.text
+
+| void ByleRLEDecode_Scaled_Mode0_SMask(
+|	BaseCostumeRenderer::ByleRLEData *pcompData,
+|	const byte _scaleX,
+|	const byte _scaleY,
+|	const int _height,
+|	const int pitch,
+|	const int _numStrips,
+|	const byte *_srcPtr,
+|	const byte *_shadowTable, /* unused */
+|	const uint16 *_palette);
+SYM(ByleRLEDecode_Scaled_Mode0_SMask):
+// TODO: __FASTCALL__
+	link	a6,#-sizeof_local_vars		| a6: frame pointer
+	movem.l	d2-d7/a2-a5,-(sp)		| preserve d2-d7/a2-a5 for the caller
+
+	// registers taken so far:
+	// - d7(pcolor), d6(len), d5(_scaleY), d4(maskbit), d3(pitch), d2(_numStrips)
+	// - a0(src), a1(dst), a2(mask), a3(scaleTable), a4(destPtr), a5(pcompData), a6(frame pointer)
+
+	// BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	move.l	(arg_pcompData,a6),a5		| a5: pcompData
+
+	// const byte *src = _srcPtr;
+	move.l	(arg_srcPtr,a6),a0		| a0: src
+
+	// uint16 len = compData.repLen;
+	clr.w	d6
+	move.b	(compData.repLen,a5),d6		| d6.w: len
+
+	// byte *dst = compData.destPtr;
+	move.l	(compData.destPtr,a5),a4	| a4: compData.destPtr
+	move.l	a4,a1				| a1: dst
+	// uint16 height = _height;
+	move.w	(arg_height,a6),(height,a6)
+	// byte scaleIndexX = compData.scaleXIndex;
+	// byte scaleIndexY = compData.scaleYIndex;
+	move.l	(compData.scaleTable,a5),a3	| a3: compData.scaleTable
+	move.b	(compData.scaleXIndex+3,a5),(scaleIndexX,a6)	| +3: lowest byte of the big-endian long
+	move.b	(compData.scaleYIndex+3,a5),(scaleIndexY,a6)
+	// byte maskbit = revBitMask(compData.x & 7);
+	// #define revBitMask(x) (0x80 >> (x))
+	move.l	(compData.x,a5),d0
+	move.b	d0,d4
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: bit to test
+	// const byte *mask = compData.maskPtr + compData.x / 8;
+	asr.l	#3,d0
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+
+	move.b	(arg_scaleY,a6),d5		| d5.b: _scaleY
+	move.l	(arg_pitch,a6),d3		| d3.w: pitch (used as long, too)
+	move.l	(arg_numStrips,a6),d2		| d2.w: _numStrips (used as long, too)
+
+	// uint16 color = compData.repColor;
+	clr.l	d0				| d0.l: keep the upper 24 bits clean
+	move.b	(compData.repColor,a5),d0
+	move.b	d0,(color,a6)
+	// if (len) {
+	// 	--len;
+	// 	goto StartPos;
+	// }
+	// len   --len action
+	// 0     -1    jump into main_loop
+	// 1      0    jump into main_loop = jump into len_loop (0x)
+	// 2      1    jump into len_loop (1x)
+	// 3      2    jump into len_loop (2x)
+	subq.w	#1,d6
+	bgt.b	pcolor_setup			| pending run left: resume it, d0.l still holds repColor
+main_loop:
+	// do {
+	// len = *src++;
+	// color = len >> compData.shr;
+	// len &= compData.mask;
+	// if (!len)
+	// 	len = *src++;
+	clr.l	d0				| d0.l: keep the upper 24 bits clean
+	move.b	(a0)+,d0
+	move.w	d0,d6				| d6.w: clear the high byte
+	move.b	(compData.shr,a5),d1
+	lsr.b	d1,d0				| d0.b: color
+	move.b	d0,(color,a6)
+	and.b	(compData.mask,a5),d6		| d6.w: len
+	bne.b	pcolor_setup
+	move.b	(a0)+,d6
+pcolor_setup:
+	move.w	([arg_palette,a6],d0.l*2),d7	| d7.w: _palette[color]
+len_loop:
+	// do {
+	// 	batch = height < len ? height : len;
+	move.w	(height,a6),d1			| d1.w: batch
+	cmp.w	d6,d1
+	blt.b	1f
+	move.w	d6,d1
+1:	// 	len -= batch;
+	// 	height -= batch;
+	sub.w	d1,d6
+	sub.w	d1,(height,a6)
+
+	subq.w	#1,d1				| dbra
+
+	// load scaleIndexY into d0 for batch loop (byte index, d0.l clean)
+	clr.l	d0
+	move.b	(scaleIndexY,a6),d0
+
+batch_loop:
+	// 		do {
+	// 			if (compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY)
+	cmp.b	(a3,d0.l),d5
+	bls.b	2f				| _scaleY <= table entry: source row is dropped
+	// 				if (color) {
+	tst.b	(color,a6)
+	beq.b	1f
+	// 					if (!(*mask & maskbit))
+	btst	d4,(a2)
+	bne.b	1f
+	// 						*dst = _palette[color];
+	move.b	d7,(a1)
+	// 				}
+1:	// 				dst  += pitch;
+	// 				mask += _numStrips;
+	add.l	d3,a1
+	add.l	d2,a2
+	// 			}
+2:	addq.b	#1,d0				| scaleIndexY++ (byte wrap)
+	// 		} while (--batch);
+	dbra	d1,batch_loop
+
+	move.b	d0,(scaleIndexY,a6)		| save the row index for the next batch of this column
+	// 	if (height == 0) {
+	tst.w	(height,a6)
+	bne.b	6f
+	// 		if (--compData.skipWidth == 0)
+	subq.w	#1,(compData.skipWidth,a5)
+	bne.b	4f
+	// 			return;
+	movem.l	(sp)+,d2-d7/a2-a5		| restore the registers saved on entry
+	unlk	a6
+	rts
+
+4:	// 		height = _height;
+	move.w	(arg_height,a6),(height,a6)
+	// 		scaleIndexY = compData.scaleYIndex;
+	move.b	(compData.scaleYIndex+3,a5),(scaleIndexY,a6)
+
+	move.l	(compData.scaleXStep,a5),d1	| d1.l: preload scaleXStep
+
+	// 		dst = compData.destPtr; (moved from the below)
+	move.l	a4,a1				| a1: dst
+	// 		if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+	clr.l	d0
+	move.b	(scaleIndexX,a6),d0
+	move.b	(a3,d0.l),d0
+	cmp.b	(arg_scaleX,a6),d0
+	bhs.b	5f				| table entry >= _scaleX: next column reuses the same x
+
+	// 			compData.x += compData.scaleXStep;
+	move.l	(compData.x,a5),d4
+	add.l	d1,d4
+	move.l	d4,(compData.x,a5)
+	// 			maskbit = revBitMask(compData.x & 7);
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: bit to test
+	// 			compData.destPtr += compData.scaleXStep;
+	add.l	d1,a4				| a4 only: compData.destPtr is not stored back
+	// 			dst = destPtr;
+	move.l	a4,a1				| a1: dst
+	// 		}
+5:	// 		compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & scaleIndexMask;
+	add.b	d1,(scaleIndexX,a6)		| byte wrap (d1.b = LSB of scaleXStep)
+	// 		mask = compData.maskPtr + compData.x / 8;
+	move.l	(compData.x,a5),d0
+	asr.l	#3,d0
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+	// 	}
+6:	// StartPos:
+	// } while (len > 0);
+	tst.w	d6
+	bne.w	len_loop
+	// } while (true);
+	bra.w	main_loop
diff --git a/engines/scumm/m68k/bylerledecode_scaled_mode1.S b/engines/scumm/m68k/bylerledecode_scaled_mode1.S
new file mode 100644
index 00000000000..1c78f1bcc90
--- /dev/null
+++ b/engines/scumm/m68k/bylerledecode_scaled_mode1.S
@@ -0,0 +1,249 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+| kate: tab-width 8; replace-tabs off; hl Motorola 68k (VASM/Devpac);
+
+#include "bylerledecode.inc"
+
+	.globl	SYM(ByleRLEDecode_Scaled_Mode1)
+
+	add_var	len,2		// uint16 len;
+	add_var	height,2	// uint16 height;
+	add_var	scaleTableX,4	// &compData.scaleTable[compData.scaleXIndex]
+	add_var	scaleTableY,4	// &compData.scaleTable[compData.scaleYIndex]
+	add_var	shadowMask,1+3	// 0xFF = draw shadows (new column), 0x00 = skip shadows (duplicate column) (align stack)
+| Scaled column decoder with shadows: palette entry 13 is remapped through _shadowTable, once per destination column.
+	.text
+
+| void ByleRLEDecode_Scaled_Mode1(
+|	BaseCostumeRenderer::ByleRLEData *pcompData,
+|	const byte _scaleX,
+|	const byte _scaleY,
+|	const int _height,
+|	const int pitch,
+|	const int _numStrips,
+|	const byte *_srcPtr,
+|	const byte *_shadowTable,
+|	const uint16 *_palette);
+SYM(ByleRLEDecode_Scaled_Mode1):
+// TODO: __FASTCALL__
+	link	a6,#-sizeof_local_vars		| a6: frame pointer
+	movem.l	d2-d7/a2-a5,-(sp)		| preserve d2-d7/a2-a5 for the caller
+
+	// registers taken so far:
+	// - d7(pcolor), d6(color), d5(_scaleY), d4(maskbit), d3(pitch), d2(_numStrips)
+	// - a0(src), a1(dst), a2(mask), a3(scaleTableY), a4(_shadowTable), a5(pcompData), a6(frame pointer)
+
+	// BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	move.l	(arg_pcompData,a6),a5		| a5: pcompData
+
+	// const byte *src = _srcPtr;
+	move.l	(arg_srcPtr,a6),a0		| a0: src
+
+	// uint16 len = compData.repLen;
+	clr.w	(len,a6)
+	move.b	(compData.repLen,a5),(len+1,a6)
+	// uint16 color = compData.repColor;
+	clr.l	d6
+	move.b	(compData.repColor,a5),d6	| d6.b: color (used as long, too)
+	clr.l	d7				| used as long, too
+
+	// byte *dst = compData.destPtr;
+	move.l	(compData.destPtr,a5),a1	| a1: dst
+	// int lastColumnX = -1;
+	st.b	(shadowMask,a6)
+	// uint16 height = _height;
+	move.w	(arg_height,a6),(height,a6)
+	// int scaleIndexY = compData.scaleYIndex; (both indices are kept as pointers into scaleTable)
+	move.l	(compData.scaleXIndex,a5),d0
+	lea	([compData.scaleTable,a5],d0.l),a3
+	move.l	a3,(scaleTableX,a6)
+	move.l	(compData.scaleYIndex,a5),d0
+	lea	([compData.scaleTable,a5],d0.l),a3	| a3: scaleTableY
+	move.l	a3,(scaleTableY,a6)
+	// byte maskbit = revBitMask(compData.x & 7);
+	// #define revBitMask(x) (0x80 >> (x))
+	move.l	(compData.x,a5),d0
+	move.b	d0,d4
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: bit to test
+	// const byte *mask = compData.maskPtr + compData.x / 8;
+	asr.l	#3,d0
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+
+	move.b	(arg_scaleY,a6),d5		| d5.b: _scaleY
+	move.l	(arg_pitch,a6),d3		| d3.w: pitch (used as long, too)
+	move.l	(arg_numStrips,a6),d2		| d2.w: _numStrips (used as long, too)
+	move.l	(arg_shadowTable,a6),a4		| a4: _shadowTable
+
+	// if (len) {
+	// 	--len;
+	// 	goto StartPos;
+	// }
+	// len   --len action
+	// 0     -1    jump into main_loop
+	// 1      0    jump into main_loop = jump into len_loop (0x)
+	// 2      1    jump into len_loop (1x)
+	// 3      2    jump into len_loop (2x)
+	subq.w	#1,(len,a6)
+	bgt.b	pcolor_setup			| pending run left: resume it with repColor
+main_loop:
+	// do {
+	// len = *src++;
+	// color = len >> compData.shr;
+	// len &= compData.mask;
+	// if (!len)
+	// 	len = *src++;
+	clr.w	d7
+	move.b	(a0)+,d7
+	move.w	d7,d6				| d6.w: clear the highest bit ("shadow_on" bit cleared)
+	move.b	(compData.shr,a5),d0
+	lsr.b	d0,d6				| d6.b: color (used as long, too)
+	and.b	(compData.mask,a5),d7
+	bne.b	1f
+	move.b	(a0)+,d7
+1:	move.w	d7,(len,a6)
+
+pcolor_setup:
+	tst.b	d6
+	beq.b	len_loop
+	move.w	([arg_palette,a6],d6.l*2),d7	| d7.w: pcolor = _palette[color] (used as long, too)
+	cmp.w	#13,d7				| palette entry 13 marks a shadow run
+	bne.b	len_loop
+	or.w	#0x8000,d6			| d6.w: "shadow_on" bit set, d6.b keeps the color for the whole run
+len_loop:
+	// do {
+	// 	batch = height < len ? height : len;
+	move.w	(len,a6),d0
+	move.w	(height,a6),d1			| d1.w: batch
+	cmp.w	d0,d1
+	blt.b	1f
+	move.w	d0,d1
+1:	// 	len -= batch;
+	// 	height -= batch;
+	sub.w	d1,(len,a6)
+	sub.w	d1,(height,a6)
+
+	subq.w	#1,d1				| dbra
+	move.b	(shadowMask,a6),d0		| d0.b: shadowMask of the current column, reloaded per batch as a run may span columns
+	tst.w	d6
+	bmi.b	batch_loop_shadow
+batch_loop:
+	// 		do {
+	// 			if (compData.scaleTable[scaleIndexY++] < _scaleY)
+	cmp.b	(a3)+,d5
+	bls.b	2f
+	// 				if (color) {
+	tst.b	d6
+	beq.b	1f
+	// 					if (!(*mask & maskbit))
+	btst	d4,(a2)
+	bne.b	1f
+	// 						*dst = _palette[color];
+	move.b	d7,(a1)
+	// 				}
+1:	// 				dst  += pitch;
+	// 				mask += _numStrips;
+	add.l	d3,a1
+	add.l	d2,a2
+	// 			}
+2:	// 		} while (--batch);
+	dbra	d1,batch_loop
+	bra.b	3f
+
+batch_loop_shadow:
+	// 		do {
+	// 			if (compData.scaleTable[scaleIndexY++] < _scaleY) {
+	cmp.b	(a3)+,d5
+	bls.b	2f
+	// 				if (shadowMask) { (color is non-zero whenever "shadow_on" is set)
+	tst.b	d0
+	beq.b	1f
+	// 					if (!(*mask & maskbit))
+	btst	d4,(a2)
+	bne.b	1f
+	// 						*dst = _shadowTable[*dst];
+	move.b	(a1),d7				| d7.l: *dst (upper 24 bits are zero here)
+	move.b	(a4,d7.l),(a1)
+	// 				}
+1:	// 				dst  += pitch;
+	// 				mask += _numStrips;
+	add.l	d3,a1
+	add.l	d2,a2
+	// 			}
+2:	// 		} while (--batch);
+	dbra	d1,batch_loop_shadow
+
+3:	// 	if (height == 0) {
+	tst.w	(height,a6)
+	bne.b	6f
+	// 		if (--compData.skipWidth == 0)
+	subq.w	#1,(compData.skipWidth,a5)
+	bne.b	4f
+	// 			return;
+	movem.l	(sp)+,d2-d7/a2-a5		| restore the registers saved on entry
+	unlk	a6
+	rts
+
+4:	// 		height = _height;
+	move.w	(arg_height,a6),(height,a6)
+	// 		scaleIndexY = compData.scaleYIndex;
+	move.l	(scaleTableY,a6),a3
+	// 		lastColumnX = compData.x;
+	sf.b	(shadowMask,a6)			| assume duplicate column (0x00)
+
+	move.l	(compData.scaleXStep,a5),d1	| d1.l: preload scaleXStep
+
+	// 		dst = compData.destPtr; (moved from the below)
+	move.l	(compData.destPtr,a5),a1	| a1: dst
+	// 		if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+	move.b	(arg_scaleX,a6),d0
+	cmp.b	([scaleTableX,a6]),d0
+	bls.b	5f				| _scaleX <= table entry: next column reuses the same x
+
+	st.b	(shadowMask,a6)			| new column (0xFF)
+
+	// 			compData.x += compData.scaleXStep;
+	move.l	(compData.x,a5),d4
+	add.l	d1,d4
+	move.l	d4,(compData.x,a5)
+	// 			maskbit = revBitMask(compData.x & 7);
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: bit to test
+	// 			compData.destPtr += compData.scaleXStep;
+	add.l	d1,a1
+	move.l	a1,(compData.destPtr,a5)
+	// 		}
+5:	// 		compData.scaleXIndex += compData.scaleXStep;
+	add.l	d1,(scaleTableX,a6)		| local pointer only: compData.scaleXIndex is not stored back
+	// 		mask = compData.maskPtr + compData.x / 8;
+	move.l	(compData.x,a5),d0
+	asr.l	#3,d0
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+	// 	}
+6:	// StartPos:
+	// } while (len > 0);
+	tst.w	(len,a6)
+	bne.w	len_loop
+	// } while (true);
+	bra.w	main_loop
diff --git a/engines/scumm/m68k/bylerledecode_scaled_mode1_smask.S b/engines/scumm/m68k/bylerledecode_scaled_mode1_smask.S
new file mode 100644
index 00000000000..506495963ef
--- /dev/null
+++ b/engines/scumm/m68k/bylerledecode_scaled_mode1_smask.S
@@ -0,0 +1,254 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+| kate: tab-width 8; replace-tabs off; hl Motorola 68k (VASM/Devpac);
+
+#include "bylerledecode.inc"
+
+	.globl	SYM(ByleRLEDecode_Scaled_Mode1_SMask)
+
+	add_var	len,2		// uint16 len;
+	add_var	height,2	// uint16 height;
+	add_var	scaleIndexX,1	// byte scaleIndexX;
+	add_var	scaleIndexY,1	// byte scaleIndexY;
+	add_var	shadowMask,1+1	// 0xFF = draw shadows (new column), 0x00 = skip shadows (duplicate column) (align stack)
+
+	.text
+
+| void ByleRLEDecode_Scaled_Mode1_SMask(
+|	BaseCostumeRenderer::ByleRLEData *pcompData,
+|	const byte _scaleX,
+|	const byte _scaleY,
+|	const int _height,
+|	const int pitch,
+|	const int _numStrips,
+|	const byte *_srcPtr,
+|	const byte *_shadowTable,
+|	const uint16 *_palette);
+SYM(ByleRLEDecode_Scaled_Mode1_SMask):
+// Scaled Mode1 decoder; scale-table indices are kept as bytes and wrap as such. TODO: __FASTCALL__
+	link	a6,#-sizeof_local_vars		| a6: frame pointer
+	movem.l	d2-d7/a2-a5,-(sp)
+
+	// registers taken so far:
+	// - d7(pcolor), d6(color), d5(_scaleY), d4(maskbit), d3(pitch), d2(_numStrips)
+	// - a0(src), a1(dst), a2(mask), a3(scaleTable), a4(_shadowTable), a5(pcompData), a6(frame pointer)
+
+	// BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	move.l	(arg_pcompData,a6),a5		| a5: pcompData
+
+	// const byte *src = _srcPtr;
+	move.l	(arg_srcPtr,a6),a0		| a0: src
+
+	// uint16 len = compData.repLen;
+	clr.w	(len,a6)
+	move.b	(compData.repLen,a5),(len+1,a6)
+	// uint16 color = compData.repColor;
+	clr.l	d6
+	move.b	(compData.repColor,a5),d6	| d6.b: color (used as long, too)
+	clr.l	d7				| used as long, too
+
+	// byte *dst = compData.destPtr;
+	move.l	(compData.destPtr,a5),a1	| a1: dst
+	// int lastColumnX = -1; (tracked through shadowMask instead: 0xFF = draw shadows in the first column)
+	st.b	(shadowMask,a6)
+	// uint16 height = _height;
+	move.w	(arg_height,a6),(height,a6)
+	// byte scaleIndexX = compData.scaleXIndex;
+	// byte scaleIndexY = compData.scaleYIndex; (+3 presumably picks the low byte of a 32-bit big-endian field - confirm against bylerledecode.inc)
+	move.l	(compData.scaleTable,a5),a3	| a3: compData.scaleTable
+	move.b	(compData.scaleXIndex+3,a5),(scaleIndexX,a6)
+	move.b	(compData.scaleYIndex+3,a5),(scaleIndexY,a6)
+	// byte maskbit = revBitMask(compData.x & 7); (held as the btst bit number 7 - (x & 7), not as a mask)
+	// #define revBitMask(x) (0x80 >> (x))
+	move.l	(compData.x,a5),d0
+	move.b	d0,d4
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: bit to test
+	// const byte *mask = compData.maskPtr + compData.x / 8;
+	asr.l	#3,d0
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+
+	move.b	(arg_scaleY,a6),d5		| d5.b: _scaleY
+	move.l	(arg_pitch,a6),d3		| d3.w: pitch (used as long, too)
+	move.l	(arg_numStrips,a6),d2		| d2.w: _numStrips (used as long, too)
+	move.l	(arg_shadowTable,a6),a4		| a4: _shadowTable
+
+	// if (len) {
+	// 	--len;
+	// 	goto StartPos;
+	// }
+	// len   --len action
+	// 0     -1    jump into main_loop
+	// 1      0    jump into main_loop = jump into len_loop (0x)
+	// 2      1    jump into len_loop (1x)
+	// 3      2    jump into len_loop (2x)
+	subq.w	#1,(len,a6)
+	bgt.b	pcolor_setup
+main_loop:
+	// do {
+	// len = *src++;
+	// color = len >> compData.shr;
+	// len &= compData.mask;
+	// if (!len)
+	// 	len = *src++;
+	clr.w	d7
+	move.b	(a0)+,d7
+	move.w	d7,d6				| d6.w: clear the highest bit ("shadow_on" bit cleared)
+	move.b	(compData.shr,a5),d0
+	lsr.b	d0,d6				| d6.b: color (used as long, too)
+	and.b	(compData.mask,a5),d7
+	bne.b	1f
+	move.b	(a0)+,d7
+1:	move.w	d7,(len,a6)
+
+pcolor_setup:
+	tst.b	d6
+	beq.b	len_loop
+	move.w	([arg_palette,a6],d6.l*2),d7	| d7.w: pcolor = _palette[color] (used as long, too)
+	cmp.w	#13,d7
+	bne.b	len_loop
+	or.w	#0x8000,d6			| d6.w: "shadow_on" bit set
+	and.b	(shadowMask,a6),d6		| 0xFF: no-op; 0x00: clears color, suppresses writes
+len_loop:
+	// do {
+	// 	batch = height < len ? height : len;
+	move.w	(len,a6),d0
+	move.w	(height,a6),d1			| d1.w: batch
+	cmp.w	d0,d1
+	blt.b	1f
+	move.w	d0,d1
+1:	// 	len -= batch;
+	// 	height -= batch;
+	sub.w	d1,(len,a6)
+	sub.w	d1,(height,a6)
+
+	subq.w	#1,d1				| dbra
+	clr.l	d0
+	move.b	(scaleIndexY,a6),d0
+	tst.w	d6
+	bmi.b	batch_loop_shadow
+batch_loop:
+	// 		do {
+	// 			if (compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+	cmp.b	(a3,d0.l),d5
+	bls.b	2f
+	// 				if (color) {
+	tst.b	d6
+	beq.b	1f
+	// 					if (!(*mask & maskbit))
+	btst	d4,(a2)
+	bne.b	1f
+	// 						*dst = _palette[color];
+	move.b	d7,(a1)
+	// 				}
+1:	// 				dst  += pitch;
+	// 				mask += _numStrips;
+	add.l	d3,a1
+	add.l	d2,a2
+	// 			}
+2:	addq.b	#1,d0				| scaleIndexY++ (byte wrap)
+	// 		} while (--batch);
+	dbra	d1,batch_loop
+	bra.b	3f
+
+batch_loop_shadow:
+	// 		do {
+	// 			if (compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
+	cmp.b	(a3,d0.l),d5
+	bls.b	2f
+	// 				if (color) {
+	tst.b	d6
+	beq.b	1f
+	// 					if (!(*mask & maskbit))
+	btst	d4,(a2)
+	bne.b	1f
+	// 						*dst = _shadowTable[*dst];
+	move.b	(a1),d7
+	move.b	(a4,d7.l),(a1)
+	// 				}
+1:	// 				dst  += pitch;
+	// 				mask += _numStrips;
+	add.l	d3,a1
+	add.l	d2,a2
+	// 			}
+2:	addq.b	#1,d0				| scaleIndexY++ (byte wrap)
+	// 		} while (--batch);
+	dbra	d1,batch_loop_shadow
+
+3:	move.b	d0,(scaleIndexY,a6)
+	// 	if (height == 0) {
+	tst.w	(height,a6)
+	bne.b	6f
+	// 		if (--compData.skipWidth == 0)
+	subq.w	#1,(compData.skipWidth,a5)
+	bne.b	4f
+	// 			return;
+	movem.l	(sp)+,d2-d7/a2-a5
+	unlk	a6
+	rts
+
+4:	// 		height = _height;
+	move.w	(arg_height,a6),(height,a6)
+	// 		scaleIndexY = compData.scaleYIndex;
+	move.b	(compData.scaleYIndex+3,a5),(scaleIndexY,a6)
+	// 		lastColumnX = compData.x;
+	sf.b	(shadowMask,a6)			| assume duplicate column (0x00)
+
+	move.l	(compData.scaleXStep,a5),d1	| d1.l: preload scaleXStep
+
+	// 		dst = compData.destPtr; (moved from the below)
+	move.l	(compData.destPtr,a5),a1	| a1: dst
+	// 		if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+	clr.l	d0
+	move.b	(scaleIndexX,a6),d0
+	move.b	(a3,d0.l),d0
+	cmp.b	(arg_scaleX,a6),d0
+	bhs.b	5f
+
+	st.b	(shadowMask,a6)			| new column (0xFF)
+
+	// 			compData.x += compData.scaleXStep;
+	move.l	(compData.x,a5),d4
+	add.l	d1,d4
+	move.l	d4,(compData.x,a5)
+	// 			maskbit = revBitMask(compData.x & 7);
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: bit to test
+	// 			compData.destPtr += compData.scaleXStep;
+	add.l	d1,a1
+	move.l	a1,(compData.destPtr,a5)
+	// 		}
+5:	// 		compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask; (only the local byte copy is advanced here)
+	add.b	d1,(scaleIndexX,a6)		| byte wrap (d1.b = LSB of scaleXStep)
+	// 		mask = compData.maskPtr + compData.x / 8;
+	move.l	(compData.x,a5),d0
+	asr.l	#3,d0
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+	// 	}
+6:	// StartPos:
+	// } while (len > 0); NOTE(review): this re-enters len_loop, not pcolor_setup, so d6 is not re-masked with the updated shadowMask when a run continues into the next column - confirm this matches the per-pixel lastColumnX check of the C version
+	tst.w	(len,a6)
+	bne.w	len_loop
+	// } while (true);
+	bra.w	main_loop
diff --git a/engines/scumm/m68k/bylerledecode_scaled_mode3.S b/engines/scumm/m68k/bylerledecode_scaled_mode3.S
new file mode 100644
index 00000000000..e30c99011ac
--- /dev/null
+++ b/engines/scumm/m68k/bylerledecode_scaled_mode3.S
@@ -0,0 +1,253 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+| kate: tab-width 8; replace-tabs off; hl Motorola 68k (VASM/Devpac);
+
+#include "bylerledecode.inc"
+
+	.globl	SYM(ByleRLEDecode_Scaled_Mode3)
+
+	add_var	len,2		// uint16 len;
+	add_var	height,2	// uint16 height;
+	add_var	scaleTableX,4	// &compData.scaleTable[compData.scaleXIndex]
+	add_var	scaleTableY,4	// &compData.scaleTable[compData.scaleYIndex]
+	add_var	shadowMask,1+3	// 0xFF = draw shadows (new column), 0x00 = skip shadows (duplicate column) (align stack)
+
+	.text
+
+| void ByleRLEDecode_Scaled_Mode3(
+|	BaseCostumeRenderer::ByleRLEData *pcompData,
+|	const byte _scaleX,
+|	const byte _scaleY,
+|	const int _height,
+|	const int pitch,
+|	const int _numStrips,
+|	const byte *_srcPtr,
+|	const byte *_shadowTable,
+|	const uint16 *_palette);
+SYM(ByleRLEDecode_Scaled_Mode3):
+// Scaled Mode3 decoder; walks the scale table through plain pointers, no index masking. TODO: __FASTCALL__
+	link	a6,#-sizeof_local_vars		| a6: frame pointer
+	movem.l	d2-d7/a2-a5,-(sp)
+
+	// registers taken so far:
+	// - d7(pcolor), d6(color), d5(_scaleY), d4(maskbit), d3(pitch), d2(_numStrips)
+	// - a0(src), a1(dst), a2(mask), a3(scaleTableY), a4(_shadowTable), a5(pcompData), a6(frame pointer)
+
+	// BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	move.l	(arg_pcompData,a6),a5		| a5: pcompData
+
+	// const byte *src = _srcPtr;
+	move.l	(arg_srcPtr,a6),a0		| a0: src
+
+	// uint16 len = compData.repLen;
+	clr.w	(len,a6)
+	move.b	(compData.repLen,a5),(len+1,a6)
+	// uint16 color = compData.repColor;
+	clr.l	d6
+	move.b	(compData.repColor,a5),d6	| d6.b: color (used as long, too)
+	clr.l	d7				| used as long, too
+
+	// byte *dst = compData.destPtr;
+	move.l	(compData.destPtr,a5),a1	| a1: dst
+	// int lastColumnX = -1; (tracked through shadowMask instead: 0xFF = draw shadows in the first column)
+	st.b	(shadowMask,a6)
+	// uint16 height = _height;
+	move.w	(arg_height,a6),(height,a6)
+	// scaleTableX = &compData.scaleTable[compData.scaleXIndex]; scaleTableY = &compData.scaleTable[compData.scaleYIndex];
+	move.l	(compData.scaleXIndex,a5),d0
+	lea	([compData.scaleTable,a5],d0.l),a3
+	move.l	a3,(scaleTableX,a6)
+	move.l	(compData.scaleYIndex,a5),d0
+	lea	([compData.scaleTable,a5],d0.l),a3	| a3: scaleTableY
+	move.l	a3,(scaleTableY,a6)
+	// byte maskbit = revBitMask(compData.x & 7); (held as the btst bit number 7 - (x & 7), not as a mask)
+	// #define revBitMask(x) (0x80 >> (x))
+	move.l	(compData.x,a5),d0
+	move.b	d0,d4
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: bit to test
+	// const byte *mask = compData.maskPtr + compData.x / 8;
+	asr.l	#3,d0
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+
+	move.b	(arg_scaleY,a6),d5		| d5.b: _scaleY
+	move.l	(arg_pitch,a6),d3		| d3.w: pitch (used as long, too)
+	move.l	(arg_numStrips,a6),d2		| d2.w: _numStrips (used as long, too)
+	move.l	(arg_shadowTable,a6),a4		| a4: _shadowTable
+
+	// if (len) {
+	// 	--len;
+	// 	goto StartPos;
+	// }
+	// len   --len action
+	// 0     -1    jump into main_loop
+	// 1      0    jump into main_loop = jump into len_loop (0x)
+	// 2      1    jump into len_loop (1x)
+	// 3      2    jump into len_loop (2x)
+	subq.w	#1,(len,a6)
+	bgt.b	pcolor_setup
+main_loop:
+	// do {
+	// len = *src++;
+	// color = len >> compData.shr;
+	// len &= compData.mask;
+	// if (!len)
+	// 	len = *src++;
+	clr.w	d7
+	move.b	(a0)+,d7
+	move.w	d7,d6				| d6.w: clear the highest bit ("shadow_on" bit cleared)
+	move.b	(compData.shr,a5),d0
+	lsr.b	d0,d6				| d6.b: color (used as long, too)
+	and.b	(compData.mask,a5),d7
+	bne.b	1f
+	move.b	(a0)+,d7
+1:	move.w	d7,(len,a6)
+
+pcolor_setup:
+	tst.b	d6
+	beq.b	len_loop
+	move.w	([arg_palette,a6],d6.l*2),d7	| d7.w: pcolor = _palette[color] (used as long, too)
+	cmp.w	#8,d7
+	bhs.b	len_loop
+	lsl.w	#8,d7				| d7.w: if pcolor < 8: pcolor << 8
+	or.w	#0x8000,d6			| d6.w: "shadow_on" bit set
+	and.b	(shadowMask,a6),d6		| 0xFF: no-op; 0x00: clears color, suppresses writes
+len_loop:
+	// do {
+	// 	batch = height < len ? height : len;
+	move.w	(len,a6),d0
+	move.w	(height,a6),d1			| d1.w: batch
+	cmp.w	d0,d1
+	blt.b	1f
+	move.w	d0,d1
+1:	// 	len -= batch;
+	// 	height -= batch;
+	sub.w	d1,(len,a6)
+	sub.w	d1,(height,a6)
+
+	subq.w	#1,d1				| dbra
+	tst.w	d6
+	bmi.b	batch_loop_shadow
+batch_loop:
+	// 		do {
+	// 			if (compData.scaleTable[scaleIndexY++] < _scaleY) {
+	cmp.b	(a3)+,d5
+	bls.b	2f
+	// 				if (color) {
+	tst.b	d6
+	beq.b	1f
+	// 					if (!(*mask & maskbit))
+	btst	d4,(a2)
+	bne.b	1f
+	// 						*dst = _palette[color];
+	move.b	d7,(a1)
+	// 				}
+1:	// 				dst  += pitch;
+	// 				mask += _numStrips;
+	add.l	d3,a1
+	add.l	d2,a2
+	// 			}
+2:	// 		} while (--batch);
+	dbra	d1,batch_loop
+	bra.b	3f
+
+batch_loop_shadow:
+	// 		do {
+	// 			if (compData.scaleTable[scaleIndexY++] < _scaleY) {
+	cmp.b	(a3)+,d5
+	bls.b	2f
+	// 				if (color) {
+	tst.b	d6
+	beq.b	1f
+	// 					if (!(*mask & maskbit)) {
+	btst	d4,(a2)
+	bne.b	1f
+	// 						uint16 pcolor = _palette[color];
+	// 						pcolor = (pcolor << 8) + *dst; (d7 already holds pcolor << 8, only its low byte is replaced)
+	// 						*dst = _shadowTable[pcolor];
+	move.b	(a1),d7
+	move.b	(a4,d7.l),(a1)
+	// 					}
+	// 				}
+1:	// 				dst  += pitch;
+	// 				mask += _numStrips;
+	add.l	d3,a1
+	add.l	d2,a2
+	// 			}
+2:	// 		} while (--batch);
+	dbra	d1,batch_loop_shadow
+
+3:	// 	if (height == 0) {
+	tst.w	(height,a6)
+	bne.b	6f
+	// 		if (--compData.skipWidth == 0)
+	subq.w	#1,(compData.skipWidth,a5)
+	bne.b	4f
+	// 			return;
+	movem.l	(sp)+,d2-d7/a2-a5
+	unlk	a6
+	rts
+
+4:	// 		height = _height;
+	move.w	(arg_height,a6),(height,a6)
+	// 		scaleIndexY = compData.scaleYIndex; (a3 is rewound to the saved scaleTableY pointer)
+	move.l	(scaleTableY,a6),a3
+	// 		lastColumnX = compData.x;
+	sf.b	(shadowMask,a6)			| assume duplicate column (0x00)
+
+	move.l	(compData.scaleXStep,a5),d1	| d1.l: preload scaleXStep
+
+	// 		dst = compData.destPtr; (moved from the below)
+	move.l	(compData.destPtr,a5),a1	| a1: dst
+	// 		if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+	move.b	(arg_scaleX,a6),d0
+	cmp.b	([scaleTableX,a6]),d0
+	bls.b	5f
+
+	st.b	(shadowMask,a6)			| new column (0xFF)
+
+	// 			compData.x += compData.scaleXStep;
+	move.l	(compData.x,a5),d4
+	add.l	d1,d4
+	move.l	d4,(compData.x,a5)
+	// 			maskbit = revBitMask(compData.x & 7);
+	not.b	d4
+	and.b	#0b00000111,d4			| d4.b: bit to test
+	// 			compData.destPtr += compData.scaleXStep;
+	add.l	d1,a1
+	move.l	a1,(compData.destPtr,a5)
+	// 		}
+5:	// 		compData.scaleXIndex += compData.scaleXStep; (done by advancing the local scaleTableX pointer)
+	add.l	d1,(scaleTableX,a6)
+	// 		mask = compData.maskPtr + compData.x / 8;
+	move.l	(compData.x,a5),d0
+	asr.l	#3,d0
+	move.l	(compData.maskPtr,a5),a2
+	add.l	d0,a2				| a2: mask
+	// 	}
+6:	// StartPos:
+	// } while (len > 0); NOTE(review): this re-enters len_loop, not pcolor_setup, so d6 is not re-masked with the updated shadowMask when a run continues into the next column - confirm this matches the per-pixel lastColumnX check of the C version
+	tst.w	(len,a6)
+	bne.w	len_loop
+	// } while (true);
+	bra.w	main_loop
diff --git a/engines/scumm/module.mk b/engines/scumm/module.mk
index 819efda0e67..0f8996fb794 100644
--- a/engines/scumm/module.mk
+++ b/engines/scumm/module.mk
@@ -106,6 +106,19 @@ MODULE_OBJS += \
 	debugger/resource.o
 endif
 
+ifdef USE_M68K_COSTUME_ASM
+MODULE_OBJS += \
+	m68k/bylerledecode_classic.o \
+	m68k/bylerledecode_mode0.o \
+	m68k/bylerledecode_mode1.o \
+	m68k/bylerledecode_mode3.o \
+	m68k/bylerledecode_scaled_mode0.o \
+	m68k/bylerledecode_scaled_mode0_smask.o \
+	m68k/bylerledecode_scaled_mode1.o \
+	m68k/bylerledecode_scaled_mode1_smask.o \
+	m68k/bylerledecode_scaled_mode3.o
+endif
+
 ifdef ENABLE_SCUMM_7_8
 MODULE_OBJS += \
 	nut_renderer.o \


Commit: 16b6d2a0aead785c32b0a698b75cea54d3c2fdd6
    https://github.com/scummvm/scummvm/commit/16b6d2a0aead785c32b0a698b75cea54d3c2fdd6
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: byleRLEDecodeFast: Implement HE>=90 Mode3

Provide ByleRLEDecode_Mode3_HE90 in m68k assembly, too. I haven't been
able to see ByleRLEDecode_Scaled_Mode3_HE90_SMask in use, so keep it as
a C function for now.

Changed paths:
    engines/scumm/base-costume-optimised.cpp
    engines/scumm/base-costume.cpp
    engines/scumm/base-costume.h
    engines/scumm/bylerledecodeM68K.h
    engines/scumm/m68k/bylerledecode_mode3.S


diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index b35c4632c06..05dad74542a 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -265,13 +265,6 @@ byte BaseCostumeRenderer::paintCelByleRLECommon(
 	return drawFlag;
 }
 
-enum class ShadowMode : int {
-	Mode0,
-	Mode1,
-	Mode3,
-	Classic
-};
-
 #ifndef USE_M68K_COSTUME_ASM
 void ByleRLEDecode_Mode0(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
@@ -513,6 +506,85 @@ void ByleRLEDecode_Mode3(
 }
 #endif
 
+#ifndef USE_M68K_COSTUME_ASM
+void ByleRLEDecode_Mode3_HE90(
+	BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX, /* unused */
+	const byte _scaleY, /* unused */
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette) {
+	// Unscaled HE >= 90 Mode3 decoder: every non-zero-colour, unmasked pixel becomes _shadowTable[(pcolor << 8) + *dst].
+	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	//warning("%s: unexpected call, save your game and report", __FUNCTION__);
+
+	const byte *src = _srcPtr;
+
+	uint16 len = compData.repLen;
+	uint16 color = compData.repColor;
+
+	// per-column state; set up again at the bottom of the loop each time a column has been completed
+	byte *dst = compData.destPtr;
+	uint16 height = _height;
+	byte maskbit = revBitMask(compData.x & 7);
+	const byte *mask = compData.maskPtr + compData.x / 8;
+
+	uint16 batch;
+	if (len) { // repLen != 0: skip the first header fetch and carry on with repColor for (repLen - 1) pixels
+		--len;
+		goto StartPos;
+	}
+
+	do {
+		len = *src++; // run header byte: colour = header >> shr, length = header & mask
+		color = len >> compData.shr;
+		len &= compData.mask;
+		if (!len) // zero length field: the length is taken from the next byte instead
+			len = *src++;
+
+		do {
+			batch = height < len ? height : len; // pixels that can be handled before the run or the column ends
+			len -= batch;
+			height -= batch;
+
+			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
+
+			if (color) {
+				do {
+					if (!(*mask & maskbit)) {
+						uint16 pcolor = _palette[color];
+						pcolor = (pcolor << 8) + *dst; // table index: palette colour shifted up by 8, plus the current destination pixel
+						*dst = _shadowTable[pcolor];
+					}
+					dst  += pitch;
+					mask += _numStrips;
+				} while (--batch);
+			} else { // colour 0 writes nothing: only step over the batch
+				dst  += batch * pitch;
+				mask += batch * _numStrips;
+			}
+
+			if (height == 0) { // column completed
+				if (--compData.skipWidth == 0)
+					return;
+				height = _height;
+
+				compData.x += compData.scaleXStep;
+				maskbit = revBitMask(compData.x & 7);
+				compData.destPtr += compData.scaleXStep;
+
+				dst = compData.destPtr;
+				mask = compData.maskPtr + compData.x / 8;
+			}
+		StartPos:;
+		} while (len > 0);
+	} while (true);
+}
+#endif
+
 #ifndef USE_M68K_COSTUME_ASM
 void ByleRLEDecode_Classic(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
@@ -1034,6 +1106,95 @@ void ByleRLEDecode_Scaled_Mode3(
 }
 #endif
 
+void ByleRLEDecode_Scaled_Mode3_HE90_SMask(
+	Scumm::BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette) {
+	// Scaled HE >= 90 Mode3 decoder using masked scale-table indices; it warns on every call so that real-world use gets reported.
+	Scumm::BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	warning("%s: unexpected call, save your game and report", __FUNCTION__);
+
+	const byte *src = _srcPtr;
+
+	byte len = compData.repLen;
+	uint16 color = compData.repColor;
+
+	// per-column state; set up again at the bottom of the loop each time a column has been completed
+	byte *dst = compData.destPtr;
+	int lastColumnX = -1;
+	uint16 height = _height;
+	int scaleIndexY = compData.scaleYIndex;
+	byte maskbit = revBitMask(compData.x & 7);
+	const byte *mask = compData.maskPtr + compData.x / 8;
+
+	byte batch;
+	if (len) { // repLen != 0: skip the first header fetch and carry on with repColor for (repLen - 1) pixels
+		--len;
+		goto StartPos;
+	}
+
+	do {
+		len = *src++; // run header byte: colour = header >> shr, length = header & mask
+		color = len >> compData.shr;
+		len &= compData.mask;
+		if (!len) // zero length field: the length is taken from the next byte instead
+			len = *src++;
+
+		do {
+			batch = height < len ? (byte)height : len; // the cast is taken only when height < len, so height fits in a byte
+			len -= batch;
+			height -= batch;
+
+			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
+
+			do {
+				if (compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) { // source row is kept; otherwise it is dropped without advancing dst
+					if (color) {
+						if (!(*mask & maskbit)) {
+							uint16 pcolor;
+
+							pcolor = _palette[color];
+							if (lastColumnX != compData.x) { // x did not advance after the previous column: do not blend the same destination column again
+								pcolor = (pcolor << 8) + *dst;
+								*dst = _shadowTable[pcolor];
+							}
+						}
+					}
+					dst += pitch;
+					mask += _numStrips;
+				}
+			} while (--batch);
+
+			if (height == 0) { // column completed
+				if (--compData.skipWidth == 0)
+					return;
+				height = _height;
+
+				scaleIndexY = compData.scaleYIndex;
+				lastColumnX = compData.x;
+
+				if (compData.scaleTable[compData.scaleXIndex] < _scaleX) { // source column is kept: move on to the next destination column
+					compData.x += compData.scaleXStep;
+					maskbit = revBitMask(compData.x & 7);
+					compData.destPtr += compData.scaleXStep;
+				}
+
+				compData.scaleXIndex = (compData.scaleXIndex + compData.scaleXStep) & compData.scaleIndexMask;
+
+				dst = compData.destPtr;
+				mask = compData.maskPtr + compData.x / 8;
+			}
+		StartPos:;
+		} while (len > 0);
+	} while (true);
+}
+
 void ByleRLEDecode_Scaled_Classic_SMask(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
 	const byte _scaleX,
@@ -1118,6 +1279,14 @@ void ByleRLEDecode_Scaled_Classic_SMask(
 	} while (true);
 }
 
+enum class ShadowMode : int {	// enumerator order mirrors the decoder tables (presumably used as their index - confirm)
+	Mode0,	// default when no other mode is selected
+	Mode1,	// palette colour 13 is drawn through _shadowTable
+	Mode3,	// COMI: only palette colours below 8 go through the shadow table
+	Mode3_HE,	// HE >= 90: xmap is passed as the shadow table
+	Classic	// non-AKOS rendering with _shadowMode & 0x20
+};
+
 typedef void (*ByleRLEDecodeFunc)(
 	BaseCostumeRenderer::ByleRLEData *pcompData,
 	const byte _scaleX,
@@ -1129,36 +1298,46 @@ typedef void (*ByleRLEDecodeFunc)(
 	const byte *_shadowTable,
 	const uint16 *_palette);
 
-static const ByleRLEDecodeFunc byleRLEDecodeNoScaleTable[4] = {
-	ByleRLEDecode_Mode0,    // 0: Mode0
-	ByleRLEDecode_Mode1,    // 1: Mode1
-	ByleRLEDecode_Mode3,    // 2: Mode3
-	ByleRLEDecode_Classic,  // 3: Classic
+static const ByleRLEDecodeFunc byleRLEDecodeNoScaleTable[5] = { // unscaled decoders, one per ShadowMode (presumably indexed by it - confirm at the call site)
+	ByleRLEDecode_Mode0,       // 0: Mode0
+	ByleRLEDecode_Mode1,       // 1: Mode1
+	ByleRLEDecode_Mode3,       // 2: Mode3 COMI
+	ByleRLEDecode_Mode3_HE90,  // 3: Mode3 HE >= 90
+	ByleRLEDecode_Classic,     // 4: Classic
 };
 
-static const ByleRLEDecodeFunc byleRLEDecodeScaledTable[8] = {
-	ByleRLEDecode_Scaled_Mode0,         // 0: Mode0,   no scaleIndexMask
-	ByleRLEDecode_Scaled_Mode0_SMask,   // 1: Mode0,   scaleIndexMask
-	ByleRLEDecode_Scaled_Mode1,         // 2: Mode1,   no scaleIndexMask
-	ByleRLEDecode_Scaled_Mode1_SMask,   // 3: Mode1,   scaleIndexMask
-	ByleRLEDecode_Scaled_Mode3,         // 4: Mode3,   no scaleIndexMask
-	nullptr,                            // 5: Mode3,   scaleIndexMask (COMI's Mode3 always uses bigCostumeScaleTable)
-	nullptr,                            // 6: Classic, no scaleIndexMask (_shadowMode & 0x20 always uses smallCostumeScaleTable)
-	ByleRLEDecode_Scaled_Classic_SMask, // 7: Classic, scaleIndexMask
+static const ByleRLEDecodeFunc byleRLEDecodeScaledTable[10] = { // scaled decoders, two per ShadowMode (presumably index = mode * 2 + useScaleIndexMask - confirm); nullptr slots must never be selected
+	ByleRLEDecode_Scaled_Mode0,            // 0: Mode0,   no scaleIndexMask
+	ByleRLEDecode_Scaled_Mode0_SMask,      // 1: Mode0,   scaleIndexMask
+	ByleRLEDecode_Scaled_Mode1,            // 2: Mode1,   no scaleIndexMask
+	ByleRLEDecode_Scaled_Mode1_SMask,      // 3: Mode1,   scaleIndexMask
+	ByleRLEDecode_Scaled_Mode3,            // 4: Mode3,   no scaleIndexMask
+	nullptr,                               // 5: Mode3,   scaleIndexMask (COMI's Mode3 always uses bigCostumeScaleTable)
+	nullptr,                               // 6: HE>=90,  no scaleIndexMask (HE>=90 always uses smallCostumeScaleTable)
+	ByleRLEDecode_Scaled_Mode3_HE90_SMask, // 7: HE>=90,  scaleIndexMask (unconfirmed)
+	nullptr,                               // 8: Classic, no scaleIndexMask (_shadowMode & 0x20 always uses smallCostumeScaleTable)
+	ByleRLEDecode_Scaled_Classic_SMask,    // 9: Classic, scaleIndexMask (unconfirmed)
 };
 
-void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
+void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData, const byte *xmap) {
 	ShadowMode shadowMode = ShadowMode::Mode0;
+	const byte *shadowTable = _shadowTable;
 	if (!_akosRendering) {
 		if (_shadowMode & 0x20)
 			shadowMode = ShadowMode::Classic;
 		else if (_shadowTable)
 			shadowMode = ShadowMode::Mode1;
 	} else {
-		if (_shadowMode == 1)
+		if (_shadowMode == 1) {
 			shadowMode = ShadowMode::Mode1;
-		else if (_shadowMode == 3)
-			shadowMode = ShadowMode::Mode3;
+		} else if (_shadowMode == 3) {
+			if (_vm->_game.heversion >= 90) {
+				shadowMode = ShadowMode::Mode3_HE;
+				shadowTable = xmap;
+			} else {
+				shadowMode = ShadowMode::Mode3;
+			}
+		}
 	}
 
 	if (compData.y >= compData.boundsRect.top && compData.y + compData.scaledHeight <= compData.boundsRect.bottom) {
@@ -1172,7 +1351,7 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 				_out.pitch,
 				_numStrips,
 				_srcPtr,
-				_shadowTable,
+				shadowTable,
 				_palette);
 		} else {
 			const int useScaleIndexMask = compData.scaleIndexMask != -1;
@@ -1185,7 +1364,7 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 				_out.pitch,
 				_numStrips,
 				_srcPtr,
-				_shadowTable,
+				shadowTable,
 				_palette);
 		}
 		return;
@@ -1268,6 +1447,14 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData) {
 									*dst = pcolor;
 								}
 								break;
+
+							case ShadowMode::Mode3_HE:
+								pcolor = _palette[color];
+								if (lastColumnX != compData.x) {
+									pcolor = (pcolor << 8) + *dst;
+									*dst = xmap[pcolor];
+								}
+								break;
 							}
 						}
 					}
diff --git a/engines/scumm/base-costume.cpp b/engines/scumm/base-costume.cpp
index 13b32bdf578..b9e7cfbd2a5 100644
--- a/engines/scumm/base-costume.cpp
+++ b/engines/scumm/base-costume.cpp
@@ -291,10 +291,10 @@ byte BaseCostumeRenderer::paintCelByleRLECommon(
 void BaseCostumeRenderer::byleRLEDecode(ByleRLEData &compData, int16 actorHitX, int16 actorHitY, bool *actorHitResult, const uint8 *xmap) {
 #ifdef SCUMM_OPTIMISED_CODE
 	if ((_vm->_bytesPerPixel == 1) &&
-		(!_akosRendering || _shadowMode != 3 || (!(_vm->_game.features & GF_16BIT_COLOR) && _vm->_game.heversion < 90)) &&
+		(!(_vm->_game.features & GF_16BIT_COLOR)) &&
 		(actorHitResult == NULL) &&
 		(compData.maskPtr != NULL)) {
-		byleRLEDecodeFast(compData);
+		byleRLEDecodeFast(compData, xmap);
 		return;
 	}
 	warning("%s: unoptimised version is being executed", __FUNCTION__);
diff --git a/engines/scumm/base-costume.h b/engines/scumm/base-costume.h
index 1268a39a677..16642542c65 100644
--- a/engines/scumm/base-costume.h
+++ b/engines/scumm/base-costume.h
@@ -177,7 +177,7 @@ protected:
 
 	void byleRLEDecode(ByleRLEData &compData, int16 actorHitX = 0, int16 actorHitY = 0, bool *actorHitResult = nullptr, const uint8 *xmap = nullptr);
 #ifdef SCUMM_OPTIMISED_CODE
-	void byleRLEDecodeFast(ByleRLEData &compData);
+	void byleRLEDecodeFast(ByleRLEData &compData, const byte *xmap);
 #endif
 	void skipCelLines(ByleRLEData &compData, int num);
 
diff --git a/engines/scumm/bylerledecodeM68K.h b/engines/scumm/bylerledecodeM68K.h
index a7e5f254140..6495759a256 100644
--- a/engines/scumm/bylerledecodeM68K.h
+++ b/engines/scumm/bylerledecodeM68K.h
@@ -57,6 +57,17 @@ extern "C" void ByleRLEDecode_Mode3(
 	const byte *_shadowTable,
 	const uint16 *_palette);
 
+extern "C" void ByleRLEDecode_Mode3_HE90(
+	Scumm::BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX,
+	const byte _scaleY,
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette);
+
 extern "C" void ByleRLEDecode_Classic(
 	Scumm::BaseCostumeRenderer::ByleRLEData *pcompData,
 	const byte _scaleX, /* unused */
diff --git a/engines/scumm/m68k/bylerledecode_mode3.S b/engines/scumm/m68k/bylerledecode_mode3.S
index fa921e378b1..9a063ab3e61 100644
--- a/engines/scumm/m68k/bylerledecode_mode3.S
+++ b/engines/scumm/m68k/bylerledecode_mode3.S
@@ -24,8 +24,10 @@
 #include "bylerledecode.inc"
 
 	.globl	SYM(ByleRLEDecode_Mode3)
+	.globl	SYM(ByleRLEDecode_Mode3_HE90)
 
 	add_var	len,2+2	// uint16 len; (align stack)
+	add_var	shadow_threshold,2
 
 
 	.text
@@ -41,9 +43,27 @@
 |	const byte *_shadowTable,
 |	const uint16 *_palette);
 SYM(ByleRLEDecode_Mode3):
+	moveq	#8,d0
+	bra.b	mode3_common
+
+| void ByleRLEDecode_Mode3_HE90(
+|	BaseCostumeRenderer::ByleRLEData *pcompData,
+|	const byte _scaleX, /* unused */
+|	const byte _scaleY, /* unused */
+|	const int _height,
+|	const int pitch,
+|	const int _numStrips,
+|	const byte *_srcPtr,
+|	const byte *_shadowTable,
+|	const uint16 *_palette);
+SYM(ByleRLEDecode_Mode3_HE90):
+	move.w	#256,d0
+
+mode3_common:
 // TODO: __FASTCALL__
 	link	a6,#-sizeof_local_vars		| a6: frame pointer
 	movem.l	d2-d7/a2-a5,-(sp)
+	move.w	d0,(shadow_threshold,a6)
 
 	// registers taken so far:
 	// - d7(pcolor), d6(color), d5(height), d4(maskbit), d3(pitch), d2(_numStrips)
@@ -115,7 +135,7 @@ pcolor_setup:
 	tst.b	d6
 	beq.b	len_loop
 	move.w	(a3,d6.l*2),d7			| d7.w: pcolor = _palette[color] (used as long, too)
-	cmp.w	#8,d7
+	cmp.w	(shadow_threshold,a6),d7
 	bhs.b	len_loop
 	lsl.w	#8,d7				| d7.w: if pcolor < 8: pcolor << 8
 	or.w	#0x8000,d6			| d6.w: "shadow_on" bit set


Commit: ff472311ac8c33c0a450747a1afb9775b88b0f29
    https://github.com/scummvm/scummvm/commit/ff472311ac8c33c0a450747a1afb9775b88b0f29
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: byleRLEDecodeFast: Prefer non-exported labels with .L

The Hatari profiler should now be less confused.

Changed paths:
    engines/scumm/m68k/bylerledecode_classic.S
    engines/scumm/m68k/bylerledecode_mode0.S
    engines/scumm/m68k/bylerledecode_mode1.S
    engines/scumm/m68k/bylerledecode_mode3.S
    engines/scumm/m68k/bylerledecode_scaled_mode0.S
    engines/scumm/m68k/bylerledecode_scaled_mode0_smask.S
    engines/scumm/m68k/bylerledecode_scaled_mode1.S
    engines/scumm/m68k/bylerledecode_scaled_mode1_smask.S
    engines/scumm/m68k/bylerledecode_scaled_mode3.S


diff --git a/engines/scumm/m68k/bylerledecode_classic.S b/engines/scumm/m68k/bylerledecode_classic.S
index 590062d7d31..01dd94dab49 100644
--- a/engines/scumm/m68k/bylerledecode_classic.S
+++ b/engines/scumm/m68k/bylerledecode_classic.S
@@ -92,8 +92,8 @@ SYM(ByleRLEDecode_Classic):
 	// 2      1    jump into len_loop (1x)
 	// 3      2    jump into len_loop (2x)
 	subq.w	#1,d6
-	bgt.b	len_loop
-main_loop:
+	bgt.b	.Llen_loop
+.Lmain_loop:
 	// do {
 	// len = *src++;
 	// color = len >> compData.shr;
@@ -107,9 +107,9 @@ main_loop:
 	lsr.b	d1,d0				| d0.b: color (only care about zero/nonzero)
 	move.b	d0,(color,a6)
 	and.b	(compData.mask,a5),d6		| d6.w: len
-	bne.b	len_loop
+	bne.b	.Llen_loop
 	move.b	(a0)+,d6
-len_loop:
+.Llen_loop:
 	// do {
 	// 	batch = height < len ? height : len;
 	move.w	d5,d1				| d1.w: batch
@@ -125,7 +125,7 @@ len_loop:
 	beq.b	2f
 
 	subq.w	#1,d1				| dbra
-batch_loop_shadow:
+.Lbatch_loop_shadow:
 	// 		do {
 	// 			if (!(*mask & maskbit))
 	btst	d4,(a2)
@@ -138,7 +138,7 @@ batch_loop_shadow:
 	add.l	d3,a1
 	add.l	d2,a2
 	// 		} while (--batch);
-	dbra	d1,batch_loop_shadow
+	dbra	d1,.Lbatch_loop_shadow
 	bra.b	3f
 
 	// 	} else {
@@ -185,6 +185,6 @@ batch_loop_shadow:
 5:	// StartPos:
 	// } while (len > 0);
 	tst.w	d6
-	bne.b	len_loop
+	bne.b	.Llen_loop
 	// } while (true);
-	bra.w	main_loop
+	bra.w	.Lmain_loop
diff --git a/engines/scumm/m68k/bylerledecode_mode0.S b/engines/scumm/m68k/bylerledecode_mode0.S
index 4dfb7f0f12b..e808437c2a2 100644
--- a/engines/scumm/m68k/bylerledecode_mode0.S
+++ b/engines/scumm/m68k/bylerledecode_mode0.S
@@ -93,8 +93,8 @@ SYM(ByleRLEDecode_Mode0):
 	// 2      1    jump into len_loop (1x)
 	// 3      2    jump into len_loop (2x)
 	subq.w	#1,d6
-	bgt.b	pcolor_setup
-main_loop:
+	bgt.b	.Lpcolor_setup
+.Lmain_loop:
 	// do {
 	// len = *src++;
 	// color = len >> compData.shr;
@@ -108,11 +108,11 @@ main_loop:
 	lsr.b	d1,d0				| d0.b: color (used as long, too)
 	move.b	d0,(color,a6)
 	and.b	(compData.mask,a5),d6		| d6.w: len
-	bne.b	pcolor_setup
+	bne.b	.Lpcolor_setup
 	move.b	(a0)+,d6
-pcolor_setup:
+.Lpcolor_setup:
 	move.w	(a3,d0.l*2),d7			| d7.w: _palette[color]
-len_loop:
+.Llen_loop:
 	// do {
 	// 	batch = height < len ? height : len;
 	move.w	d5,d1				| d1.w: batch
@@ -128,7 +128,7 @@ len_loop:
 	beq.b	2f
 
 	subq.w	#1,d1				| dbra
-batch_loop:
+.Lbatch_loop:
 	// 		do {
 	// 			if (!(*mask & maskbit))
 	btst	d4,(a2)
@@ -140,7 +140,7 @@ batch_loop:
 	add.l	d3,a1
 	add.l	d2,a2
 	// 		} while (--batch);
-	dbra	d1,batch_loop
+	dbra	d1,.Lbatch_loop
 	bra.b	3f
 	// 	} else {
 2:	// 		dst  += batch * pitch;
@@ -186,6 +186,6 @@ batch_loop:
 5:	// StartPos:
 	// } while (len > 0);
 	tst.w	d6
-	bne.b	len_loop
+	bne.b	.Llen_loop
 	// } while (true);
-	bra.w	main_loop
+	bra.w	.Lmain_loop
diff --git a/engines/scumm/m68k/bylerledecode_mode1.S b/engines/scumm/m68k/bylerledecode_mode1.S
index 6b11f50c2e1..a189ad706c6 100644
--- a/engines/scumm/m68k/bylerledecode_mode1.S
+++ b/engines/scumm/m68k/bylerledecode_mode1.S
@@ -93,8 +93,8 @@ SYM(ByleRLEDecode_Mode1):
 	// 2      1    jump into len_loop (1x)
 	// 3      2    jump into len_loop (2x)
 	subq.w	#1,(len,a6)
-	bgt.b	pcolor_setup
-main_loop:
+	bgt.b	.Lpcolor_setup
+.Lmain_loop:
 	// do {
 	// len = *src++;
 	// color = len >> compData.shr;
@@ -111,14 +111,14 @@ main_loop:
 	move.b	(a0)+,d7
 1:	move.w	d7,(len,a6)
 
-pcolor_setup:
+.Lpcolor_setup:
 	tst.b	d6
-	beq.b	len_loop
+	beq.b	.Llen_loop
 	move.w	(a3,d6.l*2),d7			| d7.w: pcolor = _palette[color] (used as long, too)
 	cmp.w	#13,d7
-	bne.b	len_loop
+	bne.b	.Llen_loop
 	or.w	#0x8000,d6			| d6.w: "shadow_on" bit set
-len_loop:
+.Llen_loop:
 	// do {
 	// 	batch = height < len ? height : len;
 	move.w	(len,a6),d0
@@ -136,8 +136,8 @@ len_loop:
 
 	subq.w	#1,d1				| dbra
 	tst.w	d6
-	bmi.b	batch_loop_shadow
-batch_loop:
+	bmi.b	.Lbatch_loop_shadow
+.Lbatch_loop:
 	// 		do {
 	// 			if (!(*mask & maskbit))
 	btst	d4,(a2)
@@ -149,10 +149,10 @@ batch_loop:
 	add.l	d3,a1
 	add.l	d2,a2
 	// 		} while (--batch);
-	dbra	d1,batch_loop
+	dbra	d1,.Lbatch_loop
 	bra.b	3f
 
-batch_loop_shadow:
+.Lbatch_loop_shadow:
 	// 		do {
 	// 			if (!(*mask & maskbit))
 	btst	d4,(a2)
@@ -165,7 +165,7 @@ batch_loop_shadow:
 	add.l	d3,a1
 	add.l	d2,a2
 	// 		} while (--batch);
-	dbra	d1,batch_loop_shadow
+	dbra	d1,.Lbatch_loop_shadow
 	bra.b	3f
 
 	// 	} else {
@@ -213,6 +213,6 @@ batch_loop_shadow:
 5:	// StartPos:
 	// } while (len > 0);
 	tst.w	(len,a6)
-	bne.w	len_loop
+	bne.w	.Llen_loop
 	// } while (true);
-	bra.w	main_loop
+	bra.w	.Lmain_loop
diff --git a/engines/scumm/m68k/bylerledecode_mode3.S b/engines/scumm/m68k/bylerledecode_mode3.S
index 9a063ab3e61..d6cf1794ad1 100644
--- a/engines/scumm/m68k/bylerledecode_mode3.S
+++ b/engines/scumm/m68k/bylerledecode_mode3.S
@@ -44,7 +44,7 @@
 |	const uint16 *_palette);
 SYM(ByleRLEDecode_Mode3):
 	moveq	#8,d0
-	bra.b	mode3_common
+	bra.b	.Lmode3_common
 
 | void ByleRLEDecode_Mode3_HE90(
 |	BaseCostumeRenderer::ByleRLEData *pcompData,
@@ -59,7 +59,7 @@ SYM(ByleRLEDecode_Mode3):
 SYM(ByleRLEDecode_Mode3_HE90):
 	move.w	#256,d0
 
-mode3_common:
+.Lmode3_common:
 // TODO: __FASTCALL__
 	link	a6,#-sizeof_local_vars		| a6: frame pointer
 	movem.l	d2-d7/a2-a5,-(sp)
@@ -113,8 +113,8 @@ mode3_common:
 	// 2      1    jump into len_loop (1x)
 	// 3      2    jump into len_loop (2x)
 	subq.w	#1,(len,a6)
-	bgt.b	pcolor_setup
-main_loop:
+	bgt.b	.Lpcolor_setup
+.Lmain_loop:
 	// do {
 	// len = *src++;
 	// color = len >> compData.shr;
@@ -131,15 +131,15 @@ main_loop:
 	move.b	(a0)+,d7
 1:	move.w	d7,(len,a6)
 
-pcolor_setup:
+.Lpcolor_setup:
 	tst.b	d6
-	beq.b	len_loop
+	beq.b	.Llen_loop
 	move.w	(a3,d6.l*2),d7			| d7.w: pcolor = _palette[color] (used as long, too)
 	cmp.w	(shadow_threshold,a6),d7
-	bhs.b	len_loop
+	bhs.b	.Llen_loop
 	lsl.w	#8,d7				| d7.w: if pcolor < 8: pcolor << 8
 	or.w	#0x8000,d6			| d6.w: "shadow_on" bit set
-len_loop:
+.Llen_loop:
 	// do {
 	// 	batch = height < len ? height : len;
 	move.w	(len,a6),d0
@@ -157,8 +157,8 @@ len_loop:
 
 	subq.w	#1,d1				| dbra
 	tst.w	d6
-	bmi.b	batch_loop_shadow
-batch_loop:
+	bmi.b	.Lbatch_loop_shadow
+.Lbatch_loop:
 	// 		do {
 	// 			if (!(*mask & maskbit))
 	btst	d4,(a2)
@@ -170,10 +170,10 @@ batch_loop:
 	add.l	d3,a1
 	add.l	d2,a2
 	// 		} while (--batch);
-	dbra	d1,batch_loop
+	dbra	d1,.Lbatch_loop
 	bra.b	3f
 
-batch_loop_shadow:
+.Lbatch_loop_shadow:
 	// 		do {
 	// 			if (!(*mask & maskbit)) {
 	btst	d4,(a2)
@@ -189,7 +189,7 @@ batch_loop_shadow:
 	add.l	d3,a1
 	add.l	d2,a2
 	// 		} while (--batch);
-	dbra	d1,batch_loop_shadow
+	dbra	d1,.Lbatch_loop_shadow
 	bra.b	3f
 
 	// 	} else {
@@ -237,6 +237,6 @@ batch_loop_shadow:
 5:	// StartPos:
 	// } while (len > 0);
 	tst.w	(len,a6)
-	bne.w	len_loop
+	bne.w	.Llen_loop
 	// } while (true);
-	bra.w	main_loop
+	bra.w	.Lmain_loop
diff --git a/engines/scumm/m68k/bylerledecode_scaled_mode0.S b/engines/scumm/m68k/bylerledecode_scaled_mode0.S
index e960b0bf68a..ea086e252ea 100644
--- a/engines/scumm/m68k/bylerledecode_scaled_mode0.S
+++ b/engines/scumm/m68k/bylerledecode_scaled_mode0.S
@@ -102,8 +102,8 @@ SYM(ByleRLEDecode_Scaled_Mode0):
 	// 2      1    jump into len_loop (1x)
 	// 3      2    jump into len_loop (2x)
 	subq.w	#1,d6
-	bgt.b	pcolor_setup
-main_loop:
+	bgt.b	.Lpcolor_setup
+.Lmain_loop:
 	// do {
 	// len = *src++;
 	// color = len >> compData.shr;
@@ -117,11 +117,11 @@ main_loop:
 	lsr.b	d1,d0				| d0.b: color
 	move.b	d0,(color,a6)
 	and.b	(compData.mask,a5),d6		| d6.w: len
-	bne.b	pcolor_setup
+	bne.b	.Lpcolor_setup
 	move.b	(a0)+,d6
-pcolor_setup:
+.Lpcolor_setup:
 	move.w	([arg_palette,a6],d0.l*2),d7	| d7.w: _palette[color]
-len_loop:
+.Llen_loop:
 	// do {
 	// 	batch = height < len ? height : len;
 	move.w	(height,a6),d1			| d1.w: batch
@@ -134,7 +134,7 @@ len_loop:
 	sub.w	d1,(height,a6)
 
 	subq.w	#1,d1				| dbra
-batch_loop:
+.Lbatch_loop:
 	// 		do {
 	// 			if (compData.scaleTable[scaleIndexY++] < _scaleY)
 	cmp.b	(a3)+,d5
@@ -154,7 +154,7 @@ batch_loop:
 	add.l	d2,a2
 	// 			}
 2:	// 		} while (--batch);
-	dbra	d1,batch_loop
+	dbra	d1,.Lbatch_loop
 
 	// 	if (height == 0) {
 	tst.w	(height,a6)
@@ -204,6 +204,6 @@ batch_loop:
 6:	// StartPos:
 	// } while (len > 0);
 	tst.w	d6
-	bne.b	len_loop
+	bne.b	.Llen_loop
 	// } while (true);
-	bra.w	main_loop
+	bra.w	.Lmain_loop
diff --git a/engines/scumm/m68k/bylerledecode_scaled_mode0_smask.S b/engines/scumm/m68k/bylerledecode_scaled_mode0_smask.S
index 6a5a39838bf..3357f132e84 100644
--- a/engines/scumm/m68k/bylerledecode_scaled_mode0_smask.S
+++ b/engines/scumm/m68k/bylerledecode_scaled_mode0_smask.S
@@ -100,8 +100,8 @@ SYM(ByleRLEDecode_Scaled_Mode0_SMask):
 	// 2      1    jump into len_loop (1x)
 	// 3      2    jump into len_loop (2x)
 	subq.w	#1,d6
-	bgt.b	pcolor_setup
-main_loop:
+	bgt.b	.Lpcolor_setup
+.Lmain_loop:
 	// do {
 	// len = *src++;
 	// color = len >> compData.shr;
@@ -115,11 +115,11 @@ main_loop:
 	lsr.b	d1,d0				| d0.b: color
 	move.b	d0,(color,a6)
 	and.b	(compData.mask,a5),d6		| d6.w: len
-	bne.b	pcolor_setup
+	bne.b	.Lpcolor_setup
 	move.b	(a0)+,d6
-pcolor_setup:
+.Lpcolor_setup:
 	move.w	([arg_palette,a6],d0.l*2),d7	| d7.w: _palette[color]
-len_loop:
+.Llen_loop:
 	// do {
 	// 	batch = height < len ? height : len;
 	move.w	(height,a6),d1			| d1.w: batch
@@ -137,7 +137,7 @@ len_loop:
 	clr.l	d0
 	move.b	(scaleIndexY,a6),d0
 
-batch_loop:
+.Lbatch_loop:
 	// 		do {
 	// 			if (compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY)
 	cmp.b	(a3,d0.l),d5
@@ -158,7 +158,7 @@ batch_loop:
 	// 			}
 2:	addq.b	#1,d0				| scaleIndexY++ (byte wrap)
 	// 		} while (--batch);
-	dbra	d1,batch_loop
+	dbra	d1,.Lbatch_loop
 
 	move.b	d0,(scaleIndexY,a6)
 	// 	if (height == 0) {
@@ -211,6 +211,6 @@ batch_loop:
 6:	// StartPos:
 	// } while (len > 0);
 	tst.w	d6
-	bne.w	len_loop
+	bne.w	.Llen_loop
 	// } while (true);
-	bra.w	main_loop
+	bra.w	.Lmain_loop
diff --git a/engines/scumm/m68k/bylerledecode_scaled_mode1.S b/engines/scumm/m68k/bylerledecode_scaled_mode1.S
index 1c78f1bcc90..866c4c47e7d 100644
--- a/engines/scumm/m68k/bylerledecode_scaled_mode1.S
+++ b/engines/scumm/m68k/bylerledecode_scaled_mode1.S
@@ -105,8 +105,8 @@ SYM(ByleRLEDecode_Scaled_Mode1):
 	// 2      1    jump into len_loop (1x)
 	// 3      2    jump into len_loop (2x)
 	subq.w	#1,(len,a6)
-	bgt.b	pcolor_setup
-main_loop:
+	bgt.b	.Lpcolor_setup
+.Lmain_loop:
 	// do {
 	// len = *src++;
 	// color = len >> compData.shr;
@@ -123,15 +123,15 @@ main_loop:
 	move.b	(a0)+,d7
 1:	move.w	d7,(len,a6)
 
-pcolor_setup:
+.Lpcolor_setup:
 	tst.b	d6
-	beq.b	len_loop
+	beq.b	.Llen_loop
 	move.w	([arg_palette,a6],d6.l*2),d7	| d7.w: pcolor = _palette[color] (used as long, too)
 	cmp.w	#13,d7
-	bne.b	len_loop
+	bne.b	.Llen_loop
 	or.w	#0x8000,d6			| d6.w: "shadow_on" bit set
 	and.b	(shadowMask,a6),d6		| 0xFF: no-op; 0x00: clears color, suppresses writes
-len_loop:
+.Llen_loop:
 	// do {
 	// 	batch = height < len ? height : len;
 	move.w	(len,a6),d0
@@ -146,8 +146,8 @@ len_loop:
 
 	subq.w	#1,d1				| dbra
 	tst.w	d6
-	bmi.b	batch_loop_shadow
-batch_loop:
+	bmi.b	.Lbatch_loop_shadow
+.Lbatch_loop:
 	// 		do {
 	// 			if (compData.scaleTable[scaleIndexY++] < _scaleY)
 	cmp.b	(a3)+,d5
@@ -167,10 +167,10 @@ batch_loop:
 	add.l	d2,a2
 	// 			}
 2:	// 		} while (--batch);
-	dbra	d1,batch_loop
+	dbra	d1,.Lbatch_loop
 	bra.b	3f
 
-batch_loop_shadow:
+.Lbatch_loop_shadow:
 	// 		do {
 	// 			if (compData.scaleTable[scaleIndexY++] < _scaleY) {
 	cmp.b	(a3)+,d5
@@ -191,7 +191,7 @@ batch_loop_shadow:
 	add.l	d2,a2
 	// 			}
 2:	// 		} while (--batch);
-	dbra	d1,batch_loop_shadow
+	dbra	d1,.Lbatch_loop_shadow
 
 3:	// 	if (height == 0) {
 	tst.w	(height,a6)
@@ -244,6 +244,6 @@ batch_loop_shadow:
 6:	// StartPos:
 	// } while (len > 0);
 	tst.w	(len,a6)
-	bne.w	len_loop
+	bne.w	.Llen_loop
 	// } while (true);
-	bra.w	main_loop
+	bra.w	.Lmain_loop
diff --git a/engines/scumm/m68k/bylerledecode_scaled_mode1_smask.S b/engines/scumm/m68k/bylerledecode_scaled_mode1_smask.S
index 506495963ef..5db44efbd44 100644
--- a/engines/scumm/m68k/bylerledecode_scaled_mode1_smask.S
+++ b/engines/scumm/m68k/bylerledecode_scaled_mode1_smask.S
@@ -103,8 +103,8 @@ SYM(ByleRLEDecode_Scaled_Mode1_SMask):
 	// 2      1    jump into len_loop (1x)
 	// 3      2    jump into len_loop (2x)
 	subq.w	#1,(len,a6)
-	bgt.b	pcolor_setup
-main_loop:
+	bgt.b	.Lpcolor_setup
+.Lmain_loop:
 	// do {
 	// len = *src++;
 	// color = len >> compData.shr;
@@ -121,15 +121,15 @@ main_loop:
 	move.b	(a0)+,d7
 1:	move.w	d7,(len,a6)
 
-pcolor_setup:
+.Lpcolor_setup:
 	tst.b	d6
-	beq.b	len_loop
+	beq.b	.Llen_loop
 	move.w	([arg_palette,a6],d6.l*2),d7	| d7.w: pcolor = _palette[color] (used as long, too)
 	cmp.w	#13,d7
-	bne.b	len_loop
+	bne.b	.Llen_loop
 	or.w	#0x8000,d6			| d6.w: "shadow_on" bit set
 	and.b	(shadowMask,a6),d6		| 0xFF: no-op; 0x00: clears color, suppresses writes
-len_loop:
+.Llen_loop:
 	// do {
 	// 	batch = height < len ? height : len;
 	move.w	(len,a6),d0
@@ -146,8 +146,8 @@ len_loop:
 	clr.l	d0
 	move.b	(scaleIndexY,a6),d0
 	tst.w	d6
-	bmi.b	batch_loop_shadow
-batch_loop:
+	bmi.b	.Lbatch_loop_shadow
+.Lbatch_loop:
 	// 		do {
 	// 			if (compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY)
 	cmp.b	(a3,d0.l),d5
@@ -168,10 +168,10 @@ batch_loop:
 	// 			}
 2:	addq.b	#1,d0				| scaleIndexY++ (byte wrap)
 	// 		} while (--batch);
-	dbra	d1,batch_loop
+	dbra	d1,.Lbatch_loop
 	bra.b	3f
 
-batch_loop_shadow:
+.Lbatch_loop_shadow:
 	// 		do {
 	// 			if (compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
 	cmp.b	(a3,d0.l),d5
@@ -193,7 +193,7 @@ batch_loop_shadow:
 	// 			}
 2:	addq.b	#1,d0				| scaleIndexY++ (byte wrap)
 	// 		} while (--batch);
-	dbra	d1,batch_loop_shadow
+	dbra	d1,.Lbatch_loop_shadow
 
 3:	move.b	d0,(scaleIndexY,a6)
 	// 	if (height == 0) {
@@ -249,6 +249,6 @@ batch_loop_shadow:
 6:	// StartPos:
 	// } while (len > 0);
 	tst.w	(len,a6)
-	bne.w	len_loop
+	bne.w	.Llen_loop
 	// } while (true);
-	bra.w	main_loop
+	bra.w	.Lmain_loop
diff --git a/engines/scumm/m68k/bylerledecode_scaled_mode3.S b/engines/scumm/m68k/bylerledecode_scaled_mode3.S
index e30c99011ac..ad51bff2579 100644
--- a/engines/scumm/m68k/bylerledecode_scaled_mode3.S
+++ b/engines/scumm/m68k/bylerledecode_scaled_mode3.S
@@ -105,8 +105,8 @@ SYM(ByleRLEDecode_Scaled_Mode3):
 	// 2      1    jump into len_loop (1x)
 	// 3      2    jump into len_loop (2x)
 	subq.w	#1,(len,a6)
-	bgt.b	pcolor_setup
-main_loop:
+	bgt.b	.Lpcolor_setup
+.Lmain_loop:
 	// do {
 	// len = *src++;
 	// color = len >> compData.shr;
@@ -123,16 +123,16 @@ main_loop:
 	move.b	(a0)+,d7
 1:	move.w	d7,(len,a6)
 
-pcolor_setup:
+.Lpcolor_setup:
 	tst.b	d6
-	beq.b	len_loop
+	beq.b	.Llen_loop
 	move.w	([arg_palette,a6],d6.l*2),d7	| d7.w: pcolor = _palette[color] (used as long, too)
 	cmp.w	#8,d7
-	bhs.b	len_loop
+	bhs.b	.Llen_loop
 	lsl.w	#8,d7				| d7.w: if pcolor < 8: pcolor << 8
 	or.w	#0x8000,d6			| d6.w: "shadow_on" bit set
 	and.b	(shadowMask,a6),d6		| 0xFF: no-op; 0x00: clears color, suppresses writes
-len_loop:
+.Llen_loop:
 	// do {
 	// 	batch = height < len ? height : len;
 	move.w	(len,a6),d0
@@ -147,8 +147,8 @@ len_loop:
 
 	subq.w	#1,d1				| dbra
 	tst.w	d6
-	bmi.b	batch_loop_shadow
-batch_loop:
+	bmi.b	.Lbatch_loop_shadow
+.Lbatch_loop:
 	// 		do {
 	// 			if (compData.scaleTable[scaleIndexY++] < _scaleY)
 	cmp.b	(a3)+,d5
@@ -168,10 +168,10 @@ batch_loop:
 	add.l	d2,a2
 	// 			}
 2:	// 		} while (--batch);
-	dbra	d1,batch_loop
+	dbra	d1,.Lbatch_loop
 	bra.b	3f
 
-batch_loop_shadow:
+.Lbatch_loop_shadow:
 	// 		do {
 	// 			if (compData.scaleTable[scaleIndexY++] < _scaleY) {
 	cmp.b	(a3)+,d5
@@ -195,7 +195,7 @@ batch_loop_shadow:
 	add.l	d2,a2
 	// 			}
 2:	// 		} while (--batch);
-	dbra	d1,batch_loop_shadow
+	dbra	d1,.Lbatch_loop_shadow
 
 3:	// 	if (height == 0) {
 	tst.w	(height,a6)
@@ -248,6 +248,6 @@ batch_loop_shadow:
 6:	// StartPos:
 	// } while (len > 0);
 	tst.w	(len,a6)
-	bne.w	len_loop
+	bne.w	.Llen_loop
 	// } while (true);
-	bra.w	main_loop
+	bra.w	.Lmain_loop


Commit: de32e66a2a1a6d6453387c8eb4d6b2b719947f6c
    https://github.com/scummvm/scummvm/commit/de32e66a2a1a6d6453387c8eb4d6b2b719947f6c
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: byleRLEDecodeFast: Optimise lastColumnX code

Instead of checking the boolean flag for every pixel, pretend that the
colour has been set to transparent, so the pixel is skipped.

Changed paths:
    engines/scumm/base-costume-optimised.cpp


diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index 05dad74542a..d80593aec1f 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -847,7 +847,7 @@ void ByleRLEDecode_Scaled_Mode1(
 
 	// reset every column
 	byte *dst = compData.destPtr;
-	int lastColumnX = -1;
+	bool isNewColumn = true;
 	uint16 height = _height;
 	int scaleIndexY = compData.scaleYIndex;
 	byte maskbit = revBitMask(compData.x & 7);
@@ -866,6 +866,9 @@ void ByleRLEDecode_Scaled_Mode1(
 		if (!len)
 			len = *src++;
 
+		if (color && !isNewColumn && _palette[color] == 13)
+			color = 0;
+
 		do {
 			batch = height < len ? (byte)height : len;
 			len -= batch;
@@ -881,8 +884,7 @@ void ByleRLEDecode_Scaled_Mode1(
 
 							pcolor = _palette[color];
 							if (pcolor == 13) {
-								if (lastColumnX != compData.x)
-									*dst = _shadowTable[*dst];
+								*dst = _shadowTable[*dst];
 							} else {
 								*dst = pcolor;
 							}
@@ -899,9 +901,10 @@ void ByleRLEDecode_Scaled_Mode1(
 				height = _height;
 
 				scaleIndexY = compData.scaleYIndex;
-				lastColumnX = compData.x;
+				isNewColumn = false;
 
 				if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					isNewColumn = true;
 					compData.x += compData.scaleXStep;
 					maskbit = revBitMask(compData.x & 7);
 					compData.destPtr += compData.scaleXStep;
@@ -940,7 +943,7 @@ void ByleRLEDecode_Scaled_Mode1_SMask(
 
 	// reset every column
 	byte *dst = compData.destPtr;
-	int lastColumnX = -1;
+	bool isNewColumn = true;
 	uint16 height = _height;
 	int scaleIndexY = compData.scaleYIndex;
 	byte maskbit = revBitMask(compData.x & 7);
@@ -959,6 +962,9 @@ void ByleRLEDecode_Scaled_Mode1_SMask(
 		if (!len)
 			len = *src++;
 
+		if (color && !isNewColumn && _palette[color] == 13)
+			color = 0;
+
 		do {
 			batch = height < len ? (byte)height : len;
 			len -= batch;
@@ -974,8 +980,7 @@ void ByleRLEDecode_Scaled_Mode1_SMask(
 
 							pcolor = _palette[color];
 							if (pcolor == 13) {
-								if (lastColumnX != compData.x)
-									*dst = _shadowTable[*dst];
+								*dst = _shadowTable[*dst];
 							} else {
 								*dst = pcolor;
 							}
@@ -992,9 +997,10 @@ void ByleRLEDecode_Scaled_Mode1_SMask(
 				height = _height;
 
 				scaleIndexY = compData.scaleYIndex;
-				lastColumnX = compData.x;
+				isNewColumn = false;
 
 				if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					isNewColumn = true;
 					compData.x += compData.scaleXStep;
 					maskbit = revBitMask(compData.x & 7);
 					compData.destPtr += compData.scaleXStep;
@@ -1033,7 +1039,7 @@ void ByleRLEDecode_Scaled_Mode3(
 
 	// reset every column
 	byte *dst = compData.destPtr;
-	int lastColumnX = -1;
+	bool isNewColumn = true;
 	uint16 height = _height;
 	int scaleIndexY = compData.scaleYIndex;
 	byte maskbit = revBitMask(compData.x & 7);
@@ -1052,6 +1058,9 @@ void ByleRLEDecode_Scaled_Mode3(
 		if (!len)
 			len = *src++;
 
+		if (color && !isNewColumn && _palette[color] < 8)
+			color = 0;
+
 		do {
 			batch = height < len ? (byte)height : len;
 			len -= batch;
@@ -1067,10 +1076,8 @@ void ByleRLEDecode_Scaled_Mode3(
 
 							pcolor = _palette[color];
 							if (pcolor < 8) {
-								if (lastColumnX != compData.x) {
-									pcolor = (pcolor << 8) + *dst;
-									*dst = _shadowTable[pcolor];
-								}
+								pcolor = (pcolor << 8) + *dst;
+								*dst = _shadowTable[pcolor];
 							} else {
 								*dst = pcolor;
 							}
@@ -1087,9 +1094,10 @@ void ByleRLEDecode_Scaled_Mode3(
 				height = _height;
 
 				scaleIndexY = compData.scaleYIndex;
-				lastColumnX = compData.x;
+				isNewColumn = false;
 
 				if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					isNewColumn = true;
 					compData.x += compData.scaleXStep;
 					maskbit = revBitMask(compData.x & 7);
 					compData.destPtr += compData.scaleXStep;
@@ -1127,7 +1135,7 @@ void ByleRLEDecode_Scaled_Mode3_HE90_SMask(
 
 	// reset every column
 	byte *dst = compData.destPtr;
-	int lastColumnX = -1;
+	bool isNewColumn = true;
 	uint16 height = _height;
 	int scaleIndexY = compData.scaleYIndex;
 	byte maskbit = revBitMask(compData.x & 7);
@@ -1146,6 +1154,9 @@ void ByleRLEDecode_Scaled_Mode3_HE90_SMask(
 		if (!len)
 			len = *src++;
 
+		if (color && !isNewColumn)
+			color = 0;
+
 		do {
 			batch = height < len ? (byte)height : len;
 			len -= batch;
@@ -1160,10 +1171,8 @@ void ByleRLEDecode_Scaled_Mode3_HE90_SMask(
 							uint16 pcolor;
 
 							pcolor = _palette[color];
-							if (lastColumnX != compData.x) {
-								pcolor = (pcolor << 8) + *dst;
-								*dst = _shadowTable[pcolor];
-							}
+							pcolor = (pcolor << 8) + *dst;
+							*dst = _shadowTable[pcolor];
 						}
 					}
 					dst += pitch;
@@ -1177,9 +1186,10 @@ void ByleRLEDecode_Scaled_Mode3_HE90_SMask(
 				height = _height;
 
 				scaleIndexY = compData.scaleYIndex;
-				lastColumnX = compData.x;
+				isNewColumn = false;
 
 				if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					isNewColumn = true;
 					compData.x += compData.scaleXStep;
 					maskbit = revBitMask(compData.x & 7);
 					compData.destPtr += compData.scaleXStep;
@@ -1216,7 +1226,7 @@ void ByleRLEDecode_Scaled_Classic_SMask(
 
 	// reset every column
 	byte *dst = compData.destPtr;
-	int lastColumnX = -1;
+	bool isNewColumn = true;
 	uint16 height = _height;
 	int scaleIndexY = compData.scaleYIndex;
 	byte maskbit = revBitMask(compData.x & 7);
@@ -1235,6 +1245,9 @@ void ByleRLEDecode_Scaled_Classic_SMask(
 		if (!len)
 			len = *src++;
 
+		if (color && !isNewColumn)
+			color = 0;
+
 		do {
 			batch = height < len ? (byte)height : len;
 			len -= batch;
@@ -1245,10 +1258,8 @@ void ByleRLEDecode_Scaled_Classic_SMask(
 			do {
 				if (compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
 					if (color) {
-						if (!(*mask & maskbit)) {
-							if (lastColumnX != compData.x)
-								*dst = _shadowTable[*dst];
-						}
+						if (!(*mask & maskbit))
+							*dst = _shadowTable[*dst];
 					}
 					dst += pitch;
 					mask += _numStrips;
@@ -1261,9 +1272,10 @@ void ByleRLEDecode_Scaled_Classic_SMask(
 				height = _height;
 
 				scaleIndexY = compData.scaleYIndex;
-				lastColumnX = compData.x;
+				isNewColumn = false;
 
 				if (compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					isNewColumn = true;
 					compData.x += compData.scaleXStep;
 					maskbit = revBitMask(compData.x & 7);
 					compData.destPtr += compData.scaleXStep;
@@ -1380,7 +1392,7 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData, const byte *x
 
 	// reset every column
 	byte *dst = compData.destPtr;
-	int lastColumnX = -1;
+	bool isNewColumn = true;
 	int y = compData.y;
 	uint16 height = _height;
 	int scaleIndexY = compData.scaleYIndex;
@@ -1400,6 +1412,25 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData, const byte *x
 		if (!len)
 			len = *src++;
 
+		if (color && !isNewColumn) {
+			switch(shadowMode) {
+			case ShadowMode::Classic:
+			case ShadowMode::Mode3_HE:
+				color = 0;
+				break;
+			case ShadowMode::Mode1:
+				if (_palette[color] == 13)
+					color = 0;
+				break;
+			case ShadowMode::Mode3:
+				if (_palette[color] < 8)
+					color = 0;
+				break;
+			default:
+				break;
+			}
+		}
+
 		do {
 			batch = height < len ? (byte)height : len;
 			len -= batch;
@@ -1411,7 +1442,7 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData, const byte *x
 				if (_scaleY == 255 || compData.scaleTable[scaleIndexY++ & compData.scaleIndexMask] < _scaleY) {
 					if (color) {
 						const bool masked = (y < compData.boundsRect.top || y >= compData.boundsRect.bottom)
-							|| (*mask & maskbit);
+						|| (*mask & maskbit);
 
 						if (!masked) {
 							uint16 pcolor;
@@ -1422,15 +1453,13 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData, const byte *x
 								break;
 
 							case ShadowMode::Classic:
-								if (lastColumnX != compData.x)
-									*dst = _shadowTable[*dst];
+								*dst = _shadowTable[*dst];
 								break;
 
 							case ShadowMode::Mode1:
 								pcolor = _palette[color];
 								if (pcolor == 13) {
-									if (lastColumnX != compData.x)
-										*dst = _shadowTable[*dst];
+									*dst = _shadowTable[*dst];
 								} else {
 									*dst = pcolor;
 								}
@@ -1439,10 +1468,8 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData, const byte *x
 							case ShadowMode::Mode3:
 								pcolor = _palette[color];
 								if (pcolor < 8) {
-									if (lastColumnX != compData.x) {
-										pcolor = (pcolor << 8) + *dst;
-										*dst = _shadowTable[pcolor];
-									}
+									pcolor = (pcolor << 8) + *dst;
+									*dst = _shadowTable[pcolor];
 								} else {
 									*dst = pcolor;
 								}
@@ -1450,10 +1477,8 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData, const byte *x
 
 							case ShadowMode::Mode3_HE:
 								pcolor = _palette[color];
-								if (lastColumnX != compData.x) {
-									pcolor = (pcolor << 8) + *dst;
-									*dst = xmap[pcolor];
-								}
+								pcolor = (pcolor << 8) + *dst;
+								*dst = xmap[pcolor];
 								break;
 							}
 						}
@@ -1471,9 +1496,10 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData, const byte *x
 				y = compData.y;
 
 				scaleIndexY = compData.scaleYIndex;
-				lastColumnX = compData.x;
+				isNewColumn = false;
 
 				if (_scaleX == 255 || compData.scaleTable[compData.scaleXIndex] < _scaleX) {
+					isNewColumn = true;
 					compData.x += compData.scaleXStep;
 					maskbit = revBitMask(compData.x & 7);
 					compData.destPtr += compData.scaleXStep;


Commit: fded2d42039355045aa7124b2cedb37dd014368a
    https://github.com/scummvm/scummvm/commit/fded2d42039355045aa7124b2cedb37dd014368a
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: byleRLEDecodeFast: Introduce specialised Mode0&1 Y clipping functions

Some games contain scenes with static objects whose coordinates are
outside the top or bottom border. From my tests, so far only Mode0 and
Mode1 seem to be affected.

Changed paths:
    engines/scumm/base-costume-optimised.cpp


diff --git a/engines/scumm/base-costume-optimised.cpp b/engines/scumm/base-costume-optimised.cpp
index d80593aec1f..fbeba7182cd 100644
--- a/engines/scumm/base-costume-optimised.cpp
+++ b/engines/scumm/base-costume-optimised.cpp
@@ -1291,6 +1291,249 @@ void ByleRLEDecode_Scaled_Classic_SMask(
 	} while (true);
 }
 
+// Each column spans _height pixels starting at compData.y:
+//
+//   compData.y  ┌────────────────┐
+//               │   blankTop     │  skipped
+//  boundsRect   ├────────────────┤
+//  .top         │                │
+//               │  activeHeight  │  mask check + pixel write
+//               │                │
+//  boundsRect   ├────────────────┤
+//  .bottom      │  blankBottom   │  skipped
+//               └────────────────┘  (compData.y + _height)
+//
+// Optimisation, instead of this:
+//
+// uint16 blankLinesOnBotRem = blankLinesOnBot;
+// ...
+// below = batch < blankLinesOnBotRem ? batch : blankLinesOnBotRem;
+// batch -= below;
+// assert(batch == 0);
+// blankLinesOnBotRem -= below;
+// dst  += below * pitch;
+// mask += below * _numStrips;
+// ...
+// blankLinesOnBotRem = blankLinesOnBot;
+//
+// we use just:
+//
+// dst  += batch * pitch;
+// mask += batch * _numStrips;
+void ByleRLEDecode_Mode0_YClip(
+	BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX, /* unused */
+	const byte _scaleY, /* unused */
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const byte *_srcPtr,
+	const byte *_shadowTable, /* unused */
+	const uint16 *_palette) {
+
+	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+
+	const byte *src = _srcPtr;
+
+	uint16 len = compData.repLen;
+	uint16 color = compData.repColor;
+
+	const int activeTop = compData.y;
+	const int clipTop   = compData.boundsRect.top;
+	const int clipBot   = compData.boundsRect.bottom;
+
+	const uint16 blankLinesOnTop = (activeTop < clipTop) ? (clipTop - activeTop) : 0;
+	const uint16 blankLinesOnBot = (activeTop + _height > clipBot) ? (activeTop + _height - clipBot) : 0;
+	const uint16 activeLines     = _height - blankLinesOnTop - blankLinesOnBot;
+
+	assert(blankLinesOnTop + activeLines + blankLinesOnBot == _height);
+
+	// reset every column
+	byte *dst = compData.destPtr;
+	uint16 blankLinesOnTopRem = blankLinesOnTop;
+	uint16 activeLinesRem = activeLines;
+	uint16 height = _height;
+	byte maskbit = revBitMask(compData.x & 7);
+	const byte *mask = compData.maskPtr + compData.x / 8;
+
+	uint16 above, active, batch;
+	if (len) {
+		--len;
+		goto StartPos;
+	}
+
+	do {
+		len = *src++;
+		color = len >> compData.shr;
+		len &= compData.mask;
+		if (!len)
+			len = *src++;
+
+		do {
+			batch = height < len ? height : len;
+			len -= batch;
+			height -= batch;
+
+			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
+
+			// blank top: skip
+			above = batch < blankLinesOnTopRem ? batch : blankLinesOnTopRem;
+			batch -= above;
+			blankLinesOnTopRem -= above;
+			dst  += above * pitch;
+			mask += above * _numStrips;
+
+			// active lines: draw or skip transparent
+			active = batch < activeLinesRem ? batch : activeLinesRem;
+			batch -= active;
+			activeLinesRem -= active;
+			if (color) {
+				while (active--) {
+					if (!(*mask & maskbit))
+						*dst = _palette[color];
+					dst  += pitch;
+					mask += _numStrips;
+				}
+			} else {
+				dst  += active * pitch;
+				mask += active * _numStrips;
+			}
+
+			// blank bottom: skip
+			dst  += batch * pitch;
+			mask += batch * _numStrips;
+
+			if (height == 0) {
+				if (--compData.skipWidth == 0)
+					return;
+				height = _height;
+
+				blankLinesOnTopRem = blankLinesOnTop;
+				activeLinesRem     = activeLines;
+
+				compData.x += compData.scaleXStep;
+				maskbit = revBitMask(compData.x & 7);
+				compData.destPtr += compData.scaleXStep;
+
+				dst = compData.destPtr;
+				mask = compData.maskPtr + compData.x / 8;
+			}
+		StartPos:;
+		} while (len > 0);
+	} while (true);
+}
+
+void ByleRLEDecode_Mode1_YClip(
+	BaseCostumeRenderer::ByleRLEData *pcompData,
+	const byte _scaleX, /* unused */
+	const byte _scaleY, /* unused */
+	const int _height,
+	const int pitch,
+	const int _numStrips,
+	const byte *_srcPtr,
+	const byte *_shadowTable,
+	const uint16 *_palette) {
+
+	BaseCostumeRenderer::ByleRLEData &compData = *pcompData;
+	//warning("%s: unexpected call, save your game and report", __FUNCTION__);
+
+	const byte *src = _srcPtr;
+
+	uint16 len = compData.repLen;
+	uint16 color = compData.repColor;
+
+	const int activeTop = compData.y;
+	const int clipTop   = compData.boundsRect.top;
+	const int clipBot   = compData.boundsRect.bottom;
+
+	const uint16 blankLinesOnTop = (activeTop < clipTop) ? (clipTop - activeTop) : 0;
+	const uint16 blankLinesOnBot = (activeTop + _height > clipBot) ? (activeTop + _height - clipBot) : 0;
+	const uint16 activeLines     = _height - blankLinesOnTop - blankLinesOnBot;
+
+	assert(blankLinesOnTop + activeLines + blankLinesOnBot == _height);
+
+	// reset every column
+	byte *dst = compData.destPtr;
+	uint16 blankLinesOnTopRem = blankLinesOnTop;
+	uint16 activeLinesRem = activeLines;
+	uint16 height = _height;
+	byte maskbit = revBitMask(compData.x & 7);
+	const byte *mask = compData.maskPtr + compData.x / 8;
+
+	uint16 above, active, batch;
+	if (len) {
+		--len;
+		goto StartPos;
+	}
+
+	do {
+		len = *src++;
+		color = len >> compData.shr;
+		len &= compData.mask;
+		if (!len)
+			len = *src++;
+
+		do {
+			batch = height < len ? height : len;
+			len -= batch;
+			height -= batch;
+
+			assert(compData.x >= compData.boundsRect.left && compData.x < compData.boundsRect.right);
+
+			// blank top: skip
+			above = batch < blankLinesOnTopRem ? batch : blankLinesOnTopRem;
+			batch -= above;
+			blankLinesOnTopRem -= above;
+			dst  += above * pitch;
+			mask += above * _numStrips;
+
+			// active lines: draw or skip transparent
+			active = batch < activeLinesRem ? batch : activeLinesRem;
+			batch -= active;
+			activeLinesRem -= active;
+
+			if (color) {
+				while (active--) {
+					if (!(*mask & maskbit)) {
+						uint16 pcolor = _palette[color];
+						if (pcolor == 13) {
+							*dst = _shadowTable[*dst];
+						} else {
+							*dst = pcolor;
+						}
+					}
+					dst  += pitch;
+					mask += _numStrips;
+				}
+			} else {
+				dst  += active * pitch;
+				mask += active * _numStrips;
+			}
+
+			// blank bottom: skip
+			dst  += batch * pitch;
+			mask += batch * _numStrips;
+
+			if (height == 0) {
+				if (--compData.skipWidth == 0)
+					return;
+				height = _height;
+
+				blankLinesOnTopRem = blankLinesOnTop;
+				activeLinesRem     = activeLines;
+
+				compData.x += compData.scaleXStep;
+				maskbit = revBitMask(compData.x & 7);
+				compData.destPtr += compData.scaleXStep;
+
+				dst = compData.destPtr;
+				mask = compData.maskPtr + compData.x / 8;
+			}
+		StartPos:;
+		} while (len > 0);
+	} while (true);
+}
+
 enum class ShadowMode : int {
 	Mode0,
 	Mode1,
@@ -1352,8 +1595,8 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData, const byte *x
 		}
 	}
 
+	const int scaled = (_scaleX != 255 || _scaleY != 255);
 	if (compData.y >= compData.boundsRect.top && compData.y + compData.scaledHeight <= compData.boundsRect.bottom) {
-		const int scaled = (_scaleX != 255 || _scaleY != 255);
 		if (!scaled) {
 			byleRLEDecodeNoScaleTable[static_cast<int>(shadowMode)](
 				&compData,
@@ -1380,9 +1623,36 @@ void BaseCostumeRenderer::byleRLEDecodeFast(ByleRLEData &compData, const byte *x
 				_palette);
 		}
 		return;
+	} else if (!scaled) {
+		if (shadowMode == ShadowMode::Mode0) {
+			ByleRLEDecode_Mode0_YClip(
+				&compData,
+				_scaleX,
+				_scaleY,
+				_height,
+				_out.pitch,
+				_numStrips,
+				_srcPtr,
+				shadowTable,
+				_palette);
+			return;
+		} else if (shadowMode == ShadowMode::Mode1) {
+			ByleRLEDecode_Mode1_YClip(
+				&compData,
+				_scaleX,
+				_scaleY,
+				_height,
+				_out.pitch,
+				_numStrips,
+				_srcPtr,
+				shadowTable,
+				_palette);
+			return;
+		}
 	}
 
-	warning("%s: unexpected call, save your game and report: %d (%d, %d, %d, %d)", __FUNCTION__, (int)shadowMode,
+	if (!scaled)
+		warning("%s: unexpected call, save your game and report: %d (%d, %d, %d, %d)", __FUNCTION__, (int)shadowMode,
 			compData.y, compData.boundsRect.top, compData.y + compData.scaledHeight, compData.boundsRect.bottom);
 
 	const byte *src = _srcPtr;


Commit: 7a854a8bd799d024977ed348c5bb0bccb16d7d77
    https://github.com/scummvm/scummvm/commit/7a854a8bd799d024977ed348c5bb0bccb16d7d77
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: Optimise BaseCostumeRenderer::skipCelLines

There's really no reason to loop just to get the correct value; it can be computed directly.

Changed paths:
    engines/scumm/base-costume.cpp


diff --git a/engines/scumm/base-costume.cpp b/engines/scumm/base-costume.cpp
index b9e7cfbd2a5..42d5b59eaee 100644
--- a/engines/scumm/base-costume.cpp
+++ b/engines/scumm/base-costume.cpp
@@ -446,10 +446,10 @@ void BaseCostumeRenderer::skipCelLines(ByleRLEData &compData, int num) {
 		if (!compData.repLen)
 			compData.repLen = *_srcPtr++;
 
-		do {
-			if (!--num)
-				return;
-		} while (--compData.repLen);
+		if ((num -= compData.repLen) <= 0) {
+			compData.repLen = 1 - num;
+			return;
+		}
 	} while (true);
 }
 


Commit: da99ff1efdf80d4a564ea3efca10855f325dccfd
    https://github.com/scummvm/scummvm/commit/da99ff1efdf80d4a564ea3efca10855f325dccfd
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2026-06-14T21:59:20+02:00

Commit Message:
SCUMM: Rewrite asmDrawStripToScreen into m68k assembly

Also, remove the odd code guarded by SCUMM_NEED_ALIGNMENT; it is neither
faster nor safer.

Changed paths:
  A engines/scumm/gfxM68K.S
    configure
    engines/scumm/gfx.cpp
    engines/scumm/module.mk


diff --git a/configure b/configure
index 0739c9d6ae2..498adf8ad26 100755
--- a/configure
+++ b/configure
@@ -4257,6 +4257,7 @@ case $_backend in
 	atari)
 		append_var DEFINES "-DATARI"
 		define_in_config_if_yes yes "USE_M68K_COSTUME_ASM"
+		define_in_config_if_yes yes "USE_M68K_GFX_ASM"
 		append_var DEFINES "-DDISABLE_NES_APU"
 		append_var LIBS "-lgem"
 		_ogg=no
diff --git a/engines/scumm/gfx.cpp b/engines/scumm/gfx.cpp
index aa4eace00cc..bf85f13c3ab 100644
--- a/engines/scumm/gfx.cpp
+++ b/engines/scumm/gfx.cpp
@@ -33,7 +33,7 @@
 #include "scumm/he/wiz_he.h"
 #include "scumm/util.h"
 
-#ifdef USE_ARM_GFX_ASM
+#if defined(USE_ARM_GFX_ASM)
 
 #ifndef IPHONE
 #define asmDrawStripToScreen _asmDrawStripToScreen
@@ -43,6 +43,12 @@
 extern "C" void asmDrawStripToScreen(int height, int width, void const* text, void const* src, byte* dst,
 	int vsPitch, int vmScreenWidth, int textSurfacePitch);
 extern "C" void asmCopy8Col(byte* dst, int dstPitch, const byte* src, int height, uint8 bitDepth);
+
+#elif defined(USE_M68K_GFX_ASM)
+
+extern "C" void asmDrawStripToScreen(int height, int width, const uint32 *src32, uint32 *dst32, int vsPitch,
+		const uint32 *text32, const int textPitch);
+
 #endif /* USE_ARM_GFX_ASM */
 
 namespace Scumm {
@@ -732,6 +738,9 @@ void ScummEngine::drawStripToScreen(VirtScreen *vs, int x, int width, int top, i
 
 			const uint32 *text32 = (const uint32 *)text;
 			const int textPitch = (_textSurface.pitch - width * m) >> 2;
+#ifdef USE_M68K_GFX_ASM
+			asmDrawStripToScreen(height * m, width * m, src32, dst32, vsPitch, text32, textPitch);
+#else
 			for (int h = height * m; h > 0; --h) {
 				for (int w = width * m; w > 0; w -= 4) {
 					uint32 temp = *text32++;
@@ -754,7 +763,8 @@ void ScummEngine::drawStripToScreen(VirtScreen *vs, int x, int width, int top, i
 				src32 += vsPitch;
 				text32 += textPitch;
 			}
-#endif
+#endif // USE_M68K_GFX_ASM
+#endif // USE_ARM_GFX_ASM
 		}
 		src = _compositeBuf;
 		pitch = width * vs->format.bytesPerPixel;
@@ -1390,18 +1400,8 @@ static void fill(byte *dst, int dstPitch, uint16 color, int w, int h, uint8 bitD
 #else
 
 static void copy8Col(byte *dst, int dstPitch, const byte *src, int height, uint8 bitDepth) {
-
 	do {
-#if defined(SCUMM_NEED_ALIGNMENT)
 		memcpy(dst, src, 8 * bitDepth);
-#else
-		((uint32 *)dst)[0] = ((const uint32 *)src)[0];
-		((uint32 *)dst)[1] = ((const uint32 *)src)[1];
-		if (bitDepth == 2) {
-			((uint32 *)dst)[2] = ((const uint32 *)src)[2];
-			((uint32 *)dst)[3] = ((const uint32 *)src)[3];
-		}
-#endif
 		dst += dstPitch;
 		src += dstPitch;
 	} while (--height);
@@ -1411,23 +1411,10 @@ static void copy8Col(byte *dst, int dstPitch, const byte *src, int height, uint8
 
 static void clear8Col(byte *dst, int dstPitch, int height, uint8 bitDepth) {
 	do {
-#if defined(SCUMM_NEED_ALIGNMENT)
 		if (g_scumm->_game.platform == Common::kPlatformNES)
 			memset(dst, 0x1d, 8 * bitDepth);
 		else
 			memset(dst, 0, 8 * bitDepth);
-#else
-		if (g_scumm->_game.platform == Common::kPlatformNES) {
-			memset(dst, 0x1d, 8 * bitDepth);
-		} else {
-			((uint32*)dst)[0] = 0;
-			((uint32*)dst)[1] = 0;
-			if (bitDepth == 2) {
-				((uint32*)dst)[2] = 0;
-				((uint32*)dst)[3] = 0;
-			}
-		}
-#endif
 		dst += dstPitch;
 	} while (--height);
 }
diff --git a/engines/scumm/gfxM68K.S b/engines/scumm/gfxM68K.S
new file mode 100644
index 00000000000..43f1e68318a
--- /dev/null
+++ b/engines/scumm/gfxM68K.S
@@ -0,0 +1,105 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+| kate: tab-width 8; replace-tabs off; hl Motorola 68k (VASM/Devpac);
+
+#include "../../backends/platform/atari/symbols.h"
+
+	.globl	SYM(asmDrawStripToScreen)
+
+
+	.text
+
+| void asmDrawStripToScreen(
+|     const int height,      4+9*4(sp)
+|     const int width,       4+9*4+4(sp)
+|     const uint32 *src32,   4+9*4+8(sp)
+|     uint32 *dst32,         4+9*4+12(sp)
+|     const int vsPitch,     4+9*4+16(sp)
+|     const uint32 *text32,  4+9*4+20(sp)
+|     const int textPitch    4+9*4+24(sp)
+| );
+SYM(asmDrawStripToScreen):
+// TODO: __FASTCALL__
+	movem.l	d2-d7/a2-a4,-(sp)		| 9 × 4 = 36 bytes
+	move.l	(4+9*4+8,sp),a0			| a0: src32
+	move.l	(4+9*4+12,sp),a1		| a1: dst32
+	move.l	(4+9*4+20,sp),a2		| a2: text32
+
+	move.l	(4+9*4+16,sp),d0		| vsPitch
+	lsl.l	#2,d0				|
+	move.l	d0,a3				| a3: vsPitch in bytes
+
+	move.l	(4+9*4+24,sp),d0		| textPitch
+	lsl.l	#2,d0				|
+	move.l	d0,a4				| a4: textPitch in bytes
+
+	move.l	(4+9*4+4,sp),d5			| width
+	lsr.l	#2,d5				|
+	beq.b	3f
+	subq.l	#1,d5				| d5.w: width/4 - 1 for dbra
+
+	// for (int h = height * m; h > 0; --h) {
+	move.l	(4+9*4,sp),d7
+	beq.b	3f
+
+	move.w	d5,-(sp)
+	move.l	#0x7F7F7F7F,d3
+	move.l	#0x80808080,d4
+	move.l	#0xFDFDFDFD,d5
+	subq.l	#1,d7				| d7.w: height - 1 for dbra
+
+1:	//	for (int w = width * m; w > 0; w -= 4) {
+	move.w	(sp),d6
+2:	//		uint32 temp = *text32++;
+	move.l	(a2)+,d0
+	//		uint32 mask = temp ^ CHARSET_MASK_TRANSPARENCY_32;
+	move.l	d0,d1
+	eor.l	d5,d1
+	//		mask = (((mask & 0x7f7f7f7f) + 0x7f7f7f7f) | mask) & 0x80808080;
+	move.l	d1,d2
+	and.l	d3,d2
+	add.l	d3,d2
+	or.l	d1,d2
+	and.l	d4,d2
+	//		mask = ((mask >> 7) + 0x7f7f7f7f) ^ 0x80808080;
+	lsr.l	#7,d2
+	add.l	d3,d2
+	eor.l	d4,d2
+	//		*dst32++ = ((temp ^ *src32++) & mask) ^ temp;
+	move.l	(a0)+,d1
+	eor.l	d0,d1
+	and.l	d2,d1
+	eor.l	d0,d1
+	move.l	d1,(a1)+
+	//	}
+	dbra	d6,2b
+	//	src32 += vsPitch;
+	add.l	a3,a0
+	//	text32 += textPitch;
+	add.l	a4,a2
+	// }
+	dbra	d7,1b
+
+	addq.l	#2,sp				| pop d5.w
+
+3:	movem.l	(sp)+,d2-d7/a2-a4
+	rts
diff --git a/engines/scumm/module.mk b/engines/scumm/module.mk
index 0f8996fb794..2cee05abc31 100644
--- a/engines/scumm/module.mk
+++ b/engines/scumm/module.mk
@@ -189,6 +189,11 @@ MODULE_OBJS += \
 	gfxARM.o
 endif
 
+ifdef USE_M68K_GFX_ASM
+MODULE_OBJS += \
+	gfxM68K.o
+endif
+
 ifdef ENABLE_HE
 MODULE_OBJS += \
 	he/animation_he.o \




More information about the Scummvm-git-logs mailing list