[Scummvm-cvs-logs] scummvm master -> 4c02e1974298de32b0c6aa70dfe729089241d8ea

tramboi bertrand_augereau at yahoo.fr
Mon Sep 17 22:16:11 CEST 2012


This automated email contains information about 1 new commit which has been
pushed to the 'scummvm' repo located at https://github.com/scummvm/scummvm .

Summary:
4c02e19742 SCALER: Neon code for aspect correction for OpenPandora


Commit: 4c02e1974298de32b0c6aa70dfe729089241d8ea
    https://github.com/scummvm/scummvm/commit/4c02e1974298de32b0c6aa70dfe729089241d8ea
Author: Bertrand Augereau (bertrand_augereau at yahoo.fr)
Date: 2012-09-17T13:13:34-07:00

Commit Message:
SCALER: Neon code for aspect correction for OpenPandora
It gains 35% on the top function in the profile when running Indy IV.
The function is now nearly memory-bound (~10%), so it might not be necessary to schedule the code any better than this.

Changed paths:
    graphics/scaler/aspect.cpp



diff --git a/graphics/scaler/aspect.cpp b/graphics/scaler/aspect.cpp
index 2f06b2e..429640f 100644
--- a/graphics/scaler/aspect.cpp
+++ b/graphics/scaler/aspect.cpp
@@ -23,6 +23,13 @@
 #include "graphics/scaler/intern.h"
 #include "graphics/scaler/aspect.h"
 
+#ifdef OPENPANDORA
+#define NEON_ASPECT_CORRECTOR
+#endif
+
+#ifdef NEON_ASPECT_CORRECTOR
+#include <arm_neon.h>
+#endif
 
 #define	kSuperFastAndUglyAspectMode	0	// No interpolation at all, but super-fast
 #define	kVeryFastAndGoodAspectMode	1	// Good quality with very good speed
@@ -55,13 +62,66 @@ static inline void interpolate5Line(uint16 *dst, const uint16 *srcA, const uint1
 
 #if ASPECT_MODE == kVeryFastAndGoodAspectMode
 
+#ifdef NEON_ASPECT_CORRECTOR
+
+template<typename ColorMask>  // ColorMask supplies kRedBlueMask and kGreenMask for the 16-bit pixel layout
+static void interpolate5LineNeon(uint16 *dst, const uint16 *srcA, const uint16 *srcB, int width, int k1, int k2) {  // dst[i] = (srcB[i]*k1 + srcA[i]*k2) >> 3 per channel, four pixels at a time
+	  uint16x4_t kRedBlueMask_4 = vdup_n_u16(ColorMask::kRedBlueMask);  // channel masks replicated into all four lanes
+	  uint16x4_t kGreenMask_4 = vdup_n_u16(ColorMask::kGreenMask);
+	  uint16x4_t k1_4 = vdup_n_u16(k1);  // weight applied to srcB pixels
+	  uint16x4_t k2_4 = vdup_n_u16(k2);  // weight applied to srcA pixels
+	  while (width >= 4) {  // only whole groups of 4 pixels are handled; any width % 4 remainder is left untouched
+		  uint16x4_t srcA_4 = vld1_u16(srcA);
+		  uint16x4_t srcB_4 = vld1_u16(srcB);
+		  uint16x4_t p1_4 = srcB_4;  // p1 pairs with k1
+		  uint16x4_t p2_4 = srcA_4;  // p2 pairs with k2
+
+		  uint16x4_t p1_rb_4 = vand_u16(p1_4, kRedBlueMask_4);  // split each pixel into red+blue and green parts so they can be scaled separately
+		  uint16x4_t p1_g_4  = vand_u16(p1_4, kGreenMask_4);
+		  uint16x4_t p2_rb_4 = vand_u16(p2_4, kRedBlueMask_4);
+		  uint16x4_t p2_g_4  = vand_u16(p2_4, kGreenMask_4);
+
+		  uint32x4_t tmp_rb_4 = vshrq_n_u32(vmlal_u16(vmull_u16(p2_rb_4, k2_4), p1_rb_4, k1_4), 3);  // widening multiply-accumulate in 32 bits: (p2*k2 + p1*k1) >> 3; the visible callers pass weights summing to 8 (7/1 and 5/3)
+		  uint32x4_t tmp_g_4  = vshrq_n_u32(vmlal_u16(vmull_u16(p2_g_4, k2_4), p1_g_4, k1_4), 3);
+		  uint16x4_t p_rb_4 = vmovn_u32(tmp_rb_4);  // narrow back to 16 bits per lane
+		  p_rb_4 = vand_u16(p_rb_4, kRedBlueMask_4);  // re-apply the mask to clear bits that the shift moved outside the red/blue fields
+		  uint16x4_t p_g_4 = vmovn_u32(tmp_g_4);
+		  p_g_4 = vand_u16(p_g_4, kGreenMask_4);  // same clean-up for the green field
+
+		  uint16x4_t result_4 = p_rb_4 | p_g_4;  // recombine channels; NOTE(review): `|` on NEON vector types presumably relies on a compiler vector extension, vorr_u16 is the intrinsic form - confirm
+		  vst1_u16(dst, result_4);
+
+		  dst += 4;  // advance all three pointers by the four pixels just processed
+		  srcA += 4;
+		  srcB += 4;
+		  width -= 4;
+	  }
+}
+#endif
+
 template<typename ColorMask, int scale>
 static void interpolate5Line(uint16 *dst, const uint16 *srcA, const uint16 *srcB, int width) {
 	if (scale == 1) {
+#ifdef NEON_ASPECT_CORRECTOR
+		int width4 = width & ~3;
+		interpolate5LineNeon<ColorMask>(dst, srcA, srcB, width4, 7, 1);
+		srcA += width4;
+		srcB += width4;
+		dst += width4;
+		width -= width4;
+#endif
 		while (width--) {
 			*dst++ = interpolate16_7_1<ColorMask>(*srcB++, *srcA++);
 		}
 	} else {
+ #ifdef NEON_ASPECT_CORRECTOR
+		int width4 = width & ~3;
+		interpolate5LineNeon<ColorMask>(dst, srcA, srcB, width4, 5, 3);
+		srcA += width4;
+		srcB += width4;
+		dst += width4;
+		width -= width4;
+#endif
 		while (width--) {
 			*dst++ = interpolate16_5_3<ColorMask>(*srcB++, *srcA++);
 		}






More information about the Scummvm-git-logs mailing list