[Scummvm-git-logs] scummvm master -> 06af761337d3908ac15a4db6554264f063992fd9

Mon Mar 20 21:17:29 UTC 2023

This automated email contains information about 1 new commit which has been
pushed to the 'scummvm' repo located at https://github.com/scummvm/scummvm .

Summary:
06af761337 GRAPHICS: ATARI: Align surface on a 16-byte boundary


Commit: 06af761337d3908ac15a4db6554264f063992fd9
    https://github.com/scummvm/scummvm/commit/06af761337d3908ac15a4db6554264f063992fd9
Author: Miro Kropacek (miro.kropacek at gmail.com)
Date: 2023-03-20T22:17:25+01:00

Commit Message:
GRAPHICS: ATARI: Align surface on a 16-byte boundary

Also implement a CPU-based optimization for the 68040 / 68060.

Changed paths:
    backends/graphics/atari/atari-graphics.cpp
    graphics/blit-atari.cpp

diff --git a/backends/graphics/atari/atari-graphics.cpp b/backends/graphics/atari/atari-graphics.cpp
index 462d7385553..f325a6f4fe4 100644
--- a/backends/graphics/atari/atari-graphics.cpp
+++ b/backends/graphics/atari/atari-graphics.cpp
@@ -1001,7 +1001,7 @@ void AtariGraphicsManager::Cursor::setSurface(const void *buf, int w, int h, int
 	if (surface.w != w || surface.h != h || surface.format != format)
 		surface.create(w, h, format);
 
-	surface.copyRectToSurface(buf, surface.pitch, 0, 0, w, h);
+	surface.copyRectToSurface(buf, w * format.bytesPerPixel, 0, 0, w, h);
 
 	hotspotX = _hotspotX;
 	hotspotY = _hotspotY;
diff --git a/graphics/blit-atari.cpp b/graphics/blit-atari.cpp
index aa80e235290..ca54c7306ce 100644
--- a/graphics/blit-atari.cpp
+++ b/graphics/blit-atari.cpp
@@ -22,15 +22,24 @@
 #include "graphics/blit.h"
 #include "graphics/surface.h"
 
-#include <cstdlib>	// calloc
-#include <cstring>	// memcpy
+#include <cstdlib>	// malloc
+#include <cstring>	// memcpy, memset
 #include <mint/cookie.h>
 #include <mint/falcon.h>
 
 #include "backends/graphics/atari/atari-graphics-superblitter.h"
+#include "common/textconsole.h"	// error
+
+static inline bool hasMove16() {
+	long val;
+	static bool hasMove16 = Getcookie(C__CPU, &val) == C_FOUND && val >= 40;
+	return hasMove16;
+}
 
 namespace Graphics {
 
+constexpr size_t ALIGN = 16;	// 16 bytes
+
 // hijack surface overrides here as well as these are tightly related
 // to the blitting routine below
 void Surface::create(int16 width, int16 height, const PixelFormat &f) {
@@ -40,24 +49,41 @@ void Surface::create(int16 width, int16 height, const PixelFormat &f) {
 	w = width;
 	h = height;
 	format = f;
-	pitch = w * format.bytesPerPixel;
+	// align pitch to a 16-byte boundary for a possible C2P conversion
+	pitch = (w * format.bytesPerPixel + ALIGN - 1) & (-ALIGN);
 
 	if (width && height) {
-		if (VgetMonitor() == MON_VGA && Getcookie(C_SupV, NULL) == C_FOUND)
-			pixels = (void*)ct60_vmalloc(width * height * format.bytesPerPixel);
-		else
-			pixels = calloc(width * height, format.bytesPerPixel);
-		assert(pixels);
+		if (VgetMonitor() == MON_VGA && Getcookie(C_SupV, NULL) == C_FOUND) {
+			pixels = (void *)ct60_vmalloc(height * pitch);
+
+			if (!pixels)
+				error("Not enough SVRAM to allocate a surface");
+
+			assert((uintptr)pixels >= 0xA0000000);
+		} else {
+			// align buffer to a 16-byte boundary for move16 or C2P conversion
+			void *pixelsUnaligned = ::malloc(sizeof(uintptr) + (height * pitch) + ALIGN - 1);
+
+			if (!pixelsUnaligned)
+				error("Not enough memory to allocate a surface");
+
+			pixels = (void *)(((uintptr)pixelsUnaligned + sizeof(uintptr) + ALIGN - 1) & (-ALIGN));
+
+			// store the unaligned pointer for later free()
+			*((uintptr *)pixels - 1) = (uintptr)pixelsUnaligned;
+		}
+
+		memset(pixels, 0, height * pitch);
 	}
 }
 
 void Surface::free() {
 	if (((uintptr)pixels & 0xFF000000) >= 0xA0000000)
 		ct60_vmfree(pixels);
-	else
-		::free(pixels);
+	else if (pixels)
+		::free((void *)*((uintptr *)pixels - 1));
 
-	pixels = 0;
+	pixels = nullptr;
 	w = h = pitch = 0;
 	format = PixelFormat();
 }
@@ -87,12 +113,136 @@ void copyBlit(byte *dst, const byte *src,
 		// wait until we finish otherwise we may overwrite pixels written manually afterwards
 		while (*SV_BLITTER_CONTROL & 1);
 	} else if (dstPitch == srcPitch && ((w * bytesPerPixel) == dstPitch)) {
-		memcpy(dst, src, dstPitch * h);
+		if (hasMove16() && ((uintptr)src & (ALIGN - 1)) == 0 && ((uintptr)dst & (ALIGN - 1)) == 0) {
+			__asm__ volatile(
+			"	move.l	%2,d0\n"
+			"	lsr.l	#4,d0\n"
+			"	beq.b	3f\n"
+
+			"	moveq	#0x0f,d1\n"
+			"	and.l	d0,d1\n"
+			"	neg.l	d1\n"
+			"	lsr.l	#4,d0\n"
+			"	jmp	(2f,pc,d1.l*4)\n"
+			"1:\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"2:\n"
+			"	dbra	d0,1b\n"
+			// handle also the unlikely case when 'dstPitch'
+			// is not divisible by 16 but 'src' and 'dst' are
+			"3:\n"
+			"	moveq	#0x0f,d0\n"
+			"	and.l	%2,d0\n"
+			"	neg.l	d0\n"
+			"	jmp	(4f,pc,d0.l*2)\n"
+			// only 15x move.b as 16 would be handled above
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"4:\n"
+				: // outputs
+				: "a"(src), "a"(dst), "g"(dstPitch * h) // inputs
+				: "d0", "d1", "cc" AND_MEMORY
+			);
+		} else {
+			memcpy(dst, src, dstPitch * h);
+		}
 	} else {
-		for (uint i = 0; i < h; ++i) {
-			memcpy(dst, src, w * bytesPerPixel);
-			dst += dstPitch;
-			src += srcPitch;
+		if (hasMove16() && ((uintptr)src & (ALIGN - 1)) == 0 && ((uintptr)dst & (ALIGN - 1)) == 0) {
+			__asm__ volatile(
+			"0:\n"
+			"	move.l	%2,d0\n"
+			"	lsr.l	#4,d0\n"
+			"	beq.b	3f\n"
+
+			"	moveq	#0x0f,d1\n"
+			"	and.l	d0,d1\n"
+			"	neg.l	d1\n"
+			"	lsr.l	#4,d0\n"
+			"	jmp	(2f,pc,d1.l*4)\n"
+			"1:\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"2:\n"
+			"	dbra	d0,1b\n"
+			// handle (w * bytesPerPixel) % 16
+			"3:\n"
+			"	moveq	#0x0f,d0\n"
+			"	and.l	%2,d0\n"
+			"	neg.l	d0\n"
+			"	jmp	(4f,pc,d0.l*2)\n"
+			// only 15x move.b as 16 would be handled above
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"4:\n"
+			"	add.l	%4,%1\n"
+			"	add.l	%5,%0\n"
+			"	dbra	%3,0b\n"
+				: // outputs
+				: "a"(src), "a"(dst), "g"(w * bytesPerPixel), "d"(h - 1),
+				  "g"(dstPitch - w * bytesPerPixel), "g"(srcPitch - w * bytesPerPixel) // inputs
+				: "d0", "d1", "d2", "cc" AND_MEMORY
+			);
+		} else {
+			for (uint i = 0; i < h; ++i) {
+				memcpy(dst, src, w * bytesPerPixel);
+				dst += dstPitch;
+				src += srcPitch;
+			}
 		}
 	}
 }