[Scummvm-cvs-logs] CVS: scummvm/common scaler.cpp,1.41,1.42

Max Horn fingolfin at users.sourceforge.net
Sat Sep 27 11:11:02 CEST 2003


Update of /cvsroot/scummvm/scummvm/common
In directory sc8-pr-cvs1:/tmp/cvs-serv22848

Modified Files:
	scaler.cpp 
Log Message:
Added a FIXME (regarding optimization) to INTERPOLATE; added new template function interpolate16 which is used by hq3x. Advantages over the old trick which abused Q_INTERPOLATE: smaller & better optimized code; very easy to adapt the function for other uses, like for hq2x/hq3x. Drawback: scaler.cpp takes even longer to compile now :-)

Index: scaler.cpp
===================================================================
RCS file: /cvsroot/scummvm/scummvm/common/scaler.cpp,v
retrieving revision 1.41
retrieving revision 1.42
diff -u -d -r1.41 -r1.42
--- scaler.cpp	22 Sep 2003 23:29:47 -0000	1.41
+++ scaler.cpp	27 Sep 2003 18:09:50 -0000	1.42
@@ -107,6 +107,14 @@
 }
 
 static inline uint32 INTERPOLATE(uint32 A, uint32 B) {
+	// FIXME: Is an "if" (and thus a conditional, possibly hard to predict, jump)
+	// inside a hot code section really good for a speed improvement?
+	// Maybe it would actually be faster to remove it. This would remove the jump
+	// at the expense of 4 ANDs, 2 shifts and 2 additions. But if that means
+	// avoiding a pipeline flush (and don't forget, those instructions might be
+	// interleaved, thus "free")...
+	// Answering this will require some careful analysis, and could actually vary
+	// when running on different processors.
 	if (A != B) {
 		return (((A & colorMask) >> 1) + ((B & colorMask) >> 1) + (A & B & lowPixelMask));
 	} else
@@ -667,74 +675,72 @@
 
 static int   RGBtoYUV[65536];
 
-#define INTERPOLATE_3_1(x, y)	Q_INTERPOLATE(x, x, x, y)
-
-#define INTERPOLATE_2_1_1(x, y, z)	Q_INTERPOLATE(x, x, y, z)
-
-static inline uint16 INTERPOLATE_7_1(uint16 w1, uint16 w2) {
-	return ((((w1 & redblueMask) * 7 + (w2 & redblueMask)) >> 3)  & redblueMask) |
-	       ((((w1 & greenMask) * 7 + (w2 & greenMask)) >> 3)  & greenMask);
+// Interpolate two 16 bit pixels with the given weights.
+template<int w1, int w2>
+static inline uint16 interpolate16(uint16 p1, uint16 p2) {
+	return ((((p1 & redblueMask) * w1 + (p2 & redblueMask) * w2) / (w1 + w2)) & redblueMask) |
+	       ((((p1 & greenMask) * w1 + (p2 & greenMask) * w2) / (w1 + w2)) & greenMask);
 }
 
-static inline uint16 INTERPOLATE_2_7_7(uint16 w1, uint16 w2, uint16 w3) {
-	return ((((w1 & redblueMask) * 2 + ((w2 & redblueMask) + (w3 & redblueMask)) * 7) >> 4)  & redblueMask) |
-	       ((((w1 & greenMask) * 2 + ((w2 & greenMask) + (w3 & greenMask)) * 7) >> 4)  & greenMask);
+// Interpolate three 16 bit pixels with the given weights.
+template<int w1, int w2, int w3>
+static inline uint16 interpolate16(uint16 p1, uint16 p2, uint16 p3) {
+	return ((((p1 & redblueMask) * w1 + (p2 & redblueMask) * w2 + (p3 & redblueMask) * w3) / (w1 + w2 + w3)) & redblueMask) |
+	       ((((p1 & greenMask) * w1 + (p2 & greenMask) * w2 + (p3 & greenMask) * w3) / (w1 + w2 + w3)) & greenMask);
 }
 
-#define INTERPOLATE_1_1(x, y) INTERPOLATE(x, y)
-
-#define PIXEL00_1M  *(q) = INTERPOLATE_3_1(w[5], w[1]);
-#define PIXEL00_1U  *(q) = INTERPOLATE_3_1(w[5], w[2]);
-#define PIXEL00_1L  *(q) = INTERPOLATE_3_1(w[5], w[4]);
-#define PIXEL00_2   *(q) = INTERPOLATE_2_1_1(w[5], w[4], w[2]);
-#define PIXEL00_4   *(q) = INTERPOLATE_2_7_7(w[5], w[4], w[2]);
-#define PIXEL00_5   *(q) = INTERPOLATE_1_1(w[4], w[2]);
+#define PIXEL00_1M  *(q) = interpolate16<3,1>(w[5], w[1]);
+#define PIXEL00_1U  *(q) = interpolate16<3,1>(w[5], w[2]);
+#define PIXEL00_1L  *(q) = interpolate16<3,1>(w[5], w[4]);
+#define PIXEL00_2   *(q) = interpolate16<2,1,1>(w[5], w[4], w[2]);
+#define PIXEL00_4   *(q) = interpolate16<2,7,7>(w[5], w[4], w[2]);
+#define PIXEL00_5   *(q) = interpolate16<1,1>(w[4], w[2]);
 #define PIXEL00_C   *(q) = w[5];
 
-#define PIXEL01_1   *(q+1) = INTERPOLATE_3_1(w[5], w[2]);
-#define PIXEL01_3   *(q+1) = INTERPOLATE_7_1(w[5], w[2]);
-#define PIXEL01_6   *(q+1) = INTERPOLATE_3_1(w[2], w[5]);
+#define PIXEL01_1   *(q+1) = interpolate16<3,1>(w[5], w[2]);
+#define PIXEL01_3   *(q+1) = interpolate16<7,1>(w[5], w[2]);
+#define PIXEL01_6   *(q+1) = interpolate16<3,1>(w[2], w[5]);
 #define PIXEL01_C   *(q+1) = w[5];
 
-#define PIXEL02_1M  *(q+2) = INTERPOLATE_3_1(w[5], w[3]);
-#define PIXEL02_1U  *(q+2) = INTERPOLATE_3_1(w[5], w[2]);
-#define PIXEL02_1R  *(q+2) = INTERPOLATE_3_1(w[5], w[6]);
-#define PIXEL02_2   *(q+2) = INTERPOLATE_2_1_1(w[5], w[2], w[6]);
-#define PIXEL02_4   *(q+2) = INTERPOLATE_2_7_7(w[5], w[2], w[6]);
-#define PIXEL02_5   *(q+2) = INTERPOLATE_1_1(w[2], w[6]);
+#define PIXEL02_1M  *(q+2) = interpolate16<3,1>(w[5], w[3]);
+#define PIXEL02_1U  *(q+2) = interpolate16<3,1>(w[5], w[2]);
+#define PIXEL02_1R  *(q+2) = interpolate16<3,1>(w[5], w[6]);
+#define PIXEL02_2   *(q+2) = interpolate16<2,1,1>(w[5], w[2], w[6]);
+#define PIXEL02_4   *(q+2) = interpolate16<2,7,7>(w[5], w[2], w[6]);
+#define PIXEL02_5   *(q+2) = interpolate16<1,1>(w[2], w[6]);
 #define PIXEL02_C   *(q+2) = w[5];
 
-#define PIXEL10_1   *(q+nextlineDst) = INTERPOLATE_3_1(w[5], w[4]);
-#define PIXEL10_3   *(q+nextlineDst) = INTERPOLATE_7_1(w[5], w[4]);
-#define PIXEL10_6   *(q+nextlineDst) = INTERPOLATE_3_1(w[4], w[5]);
+#define PIXEL10_1   *(q+nextlineDst) = interpolate16<3,1>(w[5], w[4]);
+#define PIXEL10_3   *(q+nextlineDst) = interpolate16<7,1>(w[5], w[4]);
+#define PIXEL10_6   *(q+nextlineDst) = interpolate16<3,1>(w[4], w[5]);
 #define PIXEL10_C   *(q+nextlineDst) = w[5];
 
 #define PIXEL11     *(q+1+nextlineDst) = w[5];
 
-#define PIXEL12_1   *(q+2+nextlineDst) = INTERPOLATE_3_1(w[5], w[6]);
-#define PIXEL12_3   *(q+2+nextlineDst) = INTERPOLATE_7_1(w[5], w[6]);
-#define PIXEL12_6   *(q+2+nextlineDst) = INTERPOLATE_3_1(w[6], w[5]);
+#define PIXEL12_1   *(q+2+nextlineDst) = interpolate16<3,1>(w[5], w[6]);
+#define PIXEL12_3   *(q+2+nextlineDst) = interpolate16<7,1>(w[5], w[6]);
+#define PIXEL12_6   *(q+2+nextlineDst) = interpolate16<3,1>(w[6], w[5]);
 #define PIXEL12_C   *(q+2+nextlineDst) = w[5];
 
-#define PIXEL20_1M  *(q+nextlineDst2) = INTERPOLATE_3_1(w[5], w[7]);
-#define PIXEL20_1D  *(q+nextlineDst2) = INTERPOLATE_3_1(w[5], w[8]);
-#define PIXEL20_1L  *(q+nextlineDst2) = INTERPOLATE_3_1(w[5], w[4]);
-#define PIXEL20_2   *(q+nextlineDst2) = INTERPOLATE_2_1_1(w[5], w[8], w[4]);
-#define PIXEL20_4   *(q+nextlineDst2) = INTERPOLATE_2_7_7(w[5], w[8], w[4]);
-#define PIXEL20_5   *(q+nextlineDst2) = INTERPOLATE_1_1(w[8], w[4]);
+#define PIXEL20_1M  *(q+nextlineDst2) = interpolate16<3,1>(w[5], w[7]);
+#define PIXEL20_1D  *(q+nextlineDst2) = interpolate16<3,1>(w[5], w[8]);
+#define PIXEL20_1L  *(q+nextlineDst2) = interpolate16<3,1>(w[5], w[4]);
+#define PIXEL20_2   *(q+nextlineDst2) = interpolate16<2,1,1>(w[5], w[8], w[4]);
+#define PIXEL20_4   *(q+nextlineDst2) = interpolate16<2,7,7>(w[5], w[8], w[4]);
+#define PIXEL20_5   *(q+nextlineDst2) = interpolate16<1,1>(w[8], w[4]);
 #define PIXEL20_C   *(q+nextlineDst2) = w[5];
 
-#define PIXEL21_1   *(q+1+nextlineDst2) = INTERPOLATE_3_1(w[5], w[8]);
-#define PIXEL21_3   *(q+1+nextlineDst2) = INTERPOLATE_7_1(w[5], w[8]);
-#define PIXEL21_6   *(q+1+nextlineDst2) = INTERPOLATE_3_1(w[8], w[5]);
+#define PIXEL21_1   *(q+1+nextlineDst2) = interpolate16<3,1>(w[5], w[8]);
+#define PIXEL21_3   *(q+1+nextlineDst2) = interpolate16<7,1>(w[5], w[8]);
+#define PIXEL21_6   *(q+1+nextlineDst2) = interpolate16<3,1>(w[8], w[5]);
 #define PIXEL21_C   *(q+1+nextlineDst2) = w[5];
 
-#define PIXEL22_1M  *(q+2+nextlineDst2) = INTERPOLATE_3_1(w[5], w[9]);
-#define PIXEL22_1D  *(q+2+nextlineDst2) = INTERPOLATE_3_1(w[5], w[8]);
-#define PIXEL22_1R  *(q+2+nextlineDst2) = INTERPOLATE_3_1(w[5], w[6]);
-#define PIXEL22_2   *(q+2+nextlineDst2) = INTERPOLATE_2_1_1(w[5], w[6], w[8]);
-#define PIXEL22_4   *(q+2+nextlineDst2) = INTERPOLATE_2_7_7(w[5], w[6], w[8]);
-#define PIXEL22_5   *(q+2+nextlineDst2) = INTERPOLATE_1_1(w[6], w[8]);
+#define PIXEL22_1M  *(q+2+nextlineDst2) = interpolate16<3,1>(w[5], w[9]);
+#define PIXEL22_1D  *(q+2+nextlineDst2) = interpolate16<3,1>(w[5], w[8]);
+#define PIXEL22_1R  *(q+2+nextlineDst2) = interpolate16<3,1>(w[5], w[6]);
+#define PIXEL22_2   *(q+2+nextlineDst2) = interpolate16<2,1,1>(w[5], w[6], w[8]);
+#define PIXEL22_4   *(q+2+nextlineDst2) = interpolate16<2,7,7>(w[5], w[6], w[8]);
+#define PIXEL22_5   *(q+2+nextlineDst2) = interpolate16<1,1>(w[6], w[8]);
 #define PIXEL22_C   *(q+2+nextlineDst2) = w[5];
 
 static inline bool diffYUV(int yuv1, int yuv2) {





More information about the Scummvm-git-logs mailing list