From c41d4398e9623e35b1506e37221a0e1e13f1f0de Mon Sep 17 00:00:00 2001
From: Nicola Vigano <nicola.vigano@esrf.fr>
Date: Tue, 17 Jul 2012 15:22:40 +0000
Subject: [PATCH] C++/gtImgMeanVal: updated version that outperforms matlab

By a factor of 1.5 on big images, and by a factor of 2 on small images

This mainly served as a test bench to improve my SSE2 and OpenMP skills :) — there was no real need otherwise.

Signed-off-by: Nicola Vigano <nicola.vigano@esrf.fr>

git-svn-id: https://svn.code.sf.net/p/dct/code/trunk@664 4c865b51-4357-4376-afb4-474e03ccb993
---
 zUtil_Cxx/gtImgMeanValue.c   | 91 ----------------------------------
 zUtil_Cxx/gtImgMeanValue.cpp | 94 ++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+), 91 deletions(-)
 delete mode 100755 zUtil_Cxx/gtImgMeanValue.c
 create mode 100755 zUtil_Cxx/gtImgMeanValue.cpp

diff --git a/zUtil_Cxx/gtImgMeanValue.c b/zUtil_Cxx/gtImgMeanValue.c
deleted file mode 100755
index 5518c575..00000000
--- a/zUtil_Cxx/gtImgMeanValue.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/**
- * Does exactly the same as 'mean2' function from Imaging toolbox.
- *
- * The aim was to get rid of license requirements, and test possible
- * optimization techniques.
- *
- * Nicola Vigano', 2012, ID11 @ ESRF vigano@esrf.eu
- */
-
-#include "mex.h"
-
-#define ROUND_DOWN(x, s)  ((x) & ~((s)-1))
-#define ROUND_DOWN_2(x)   ((x) & ~1)
-#define ROUND_DOWN_4(x)   ((x) & ~3)
-#define ROUND_DOWN_8(x)   ((x) & ~7)
-#define ROUND_DOWN_16(x)  ((x) & ~15)
-
-
-#ifdef HAVE_OMP
-# include <omp.h>
-#else
-# ifdef HAVE_SSE2
-typedef double v2df __attribute__ ((vector_size (16)));
-# endif
-#endif
-
-void mexFunction( int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[] )
-{
-  /* Incoming image dimensions */
-  const unsigned int num_cols = mxGetM(prhs[0]), num_rows = mxGetN(prhs[0]);
-  const unsigned int tot_pixels = num_cols * num_rows;
-
-  /* Pointers to incoming matrices: */
-  const double * const image_in = mxGetPr(prhs[0]);
-
-  double temp_av = 0;
-  const double * const endPixels = image_in + tot_pixels;
-  const unsigned int totUnroll4Pixels = ROUND_DOWN_4(tot_pixels);
-
-#ifdef HAVE_OMP
-  unsigned int count = 0;
-# pragma omp parallel for private(count) reduction(+:temp_av)
-    for(count = 0; count < totUnroll4Pixels; count += 4) {
-      temp_av += ( image_in[count+0] + image_in[count+1]
-                 + image_in[count+2] + image_in[count+3]);
-    }
-
-    switch(tot_pixels - totUnroll4Pixels) {
-      case 3:
-        temp_av += image_in[totUnroll4Pixels+2];
-      case 2:
-        temp_av += image_in[totUnroll4Pixels+1];
-      case 1:
-        temp_av += image_in[totUnroll4Pixels];
-      default:
-        break;
-    }
-#else
-  const double * pixel = image_in;
-  const double * const endUnroll4Pixels = image_in + totUnroll4Pixels;
-# ifdef HAVE_SSE2
-  v2df accumul = {0, 0};
-# endif
-
-  for(; pixel < endUnroll4Pixels; pixel += 4) {
-# ifdef HAVE_SSE2
-    accumul = __builtin_ia32_addpd(accumul, __builtin_ia32_loadupd(pixel) );
-    accumul = __builtin_ia32_addpd(accumul, __builtin_ia32_loadupd(pixel+2) );
-# else
-    temp_av += (pixel[0] + pixel[1] + pixel[2] + pixel[3]);
-# endif
-  }
-
-  for(; pixel < endPixels;) {
-    temp_av += *pixel++;
-  }
-
-# ifdef HAVE_SSE2
-  {
-    double res[2] = {0,0};
-    __builtin_ia32_storeupd(res, accumul);
-    temp_av += res[0] + res[1];
-  }
-# endif
-#endif
-
-  temp_av /= tot_pixels;
-
-  /* Create a matrix for the return argument */
-  plhs[0] = mxCreateDoubleScalar(temp_av);
-}
diff --git a/zUtil_Cxx/gtImgMeanValue.cpp b/zUtil_Cxx/gtImgMeanValue.cpp
new file mode 100755
index 00000000..ffedf0e8
--- /dev/null
+++ b/zUtil_Cxx/gtImgMeanValue.cpp
@@ -0,0 +1,94 @@
+/**
+ * Does exactly the same as 'mean2' function from Imaging toolbox.
+ *
+ * The aim was to get rid of license requirements, and test possible
+ * optimization techniques.
+ *
+ * Nicola Vigano', 2012, ID11 @ ESRF vigano@esrf.eu
+ */
+
+#include "mex.h"
+
+#define ROUND_DOWN(x, s)  ((x) & ~((s)-1))
+#define ROUND_DOWN_2(x)   ((x) & ~1)
+#define ROUND_DOWN_4(x)   ((x) & ~3)
+#define ROUND_DOWN_8(x)   ((x) & ~7)
+#define ROUND_DOWN_16(x)  ((x) & ~15)
+#define ROUND_DOWN_32(x)  ((x) & ~31)
+
+#ifdef HAVE_OMP
+# include <omp.h>
+#endif
+typedef double v2df __attribute__ ((vector_size (16)));
+
+void mexFunction( int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[] )
+{
+  /* Incoming image dimensions */
+  const mwSize tot_pixels = mxGetNumberOfElements(prhs[0]);
+  const mwSize totUnroll32Pixels = ROUND_DOWN_32(tot_pixels);
+
+  /* Pointers to incoming matrices: */
+  const double * const __restrict image_in = mxGetPr(prhs[0]);
+
+  double temp_av = 0;
+  v2df accumul = {0, 0};
+
+#pragma omp parallel for reduction(+:accumul)
+  for(mwIndex count = 0; count < totUnroll32Pixels; count += 32) {
+    accumul += ( *((v2df *)&image_in[count+0]) + *((v2df *)&image_in[count+2])
+               + *((v2df *)&image_in[count+4]) + *((v2df *)&image_in[count+6])
+               + *((v2df *)&image_in[count+8]) + *((v2df *)&image_in[count+10])
+               + *((v2df *)&image_in[count+12]) + *((v2df *)&image_in[count+14])
+               + *((v2df *)&image_in[count+16]) + *((v2df *)&image_in[count+18])
+               + *((v2df *)&image_in[count+20]) + *((v2df *)&image_in[count+22])
+               + *((v2df *)&image_in[count+24]) + *((v2df *)&image_in[count+26])
+               + *((v2df *)&image_in[count+28]) + *((v2df *)&image_in[count+30]) );
+  }
+  for(mwIndex count = totUnroll32Pixels; count < tot_pixels; count++) {
+    temp_av += image_in[count];
+  }
+  const double * const __restrict accumuld = (double *)&accumul;
+
+  /* Create a matrix for the return argument */
+  plhs[0] = mxCreateDoubleScalar((temp_av + accumuld[0] + accumuld[1]) / tot_pixels);
+
+//  double temp_av = 0;
+//# ifdef __SSE2__
+//  v2df accumul = {0, 0};
+//# endif
+//
+//#ifdef HAVE_OMP
+//# ifdef __SSE2__
+//#   pragma omp parallel private(accumul)
+//# else
+//#   pragma omp parallel
+//# endif
+//#endif
+//  {
+//#ifdef __SSE2__
+//# pragma omp for
+//#else
+//# pragma omp for reduction(+:temp_av)
+//#endif
+//    for(mwIndex count = 0; count < totUnroll4Pixels; count += 4) {
+//#ifdef __SSE2__
+//      accumul += ( *((v2df *)&image_in[count]) + *((v2df *)&image_in[count+2]) );
+//#else
+//      temp_av += ( image_in[count+0] + image_in[count+1]
+//                 + image_in[count+2] + image_in[count+3]);
+//#endif
+//    }
+//    for(mwIndex count = totUnroll4Pixels; count < tot_pixels; count++) {
+//      temp_av += image_in[count];
+//    }
+//#ifdef __SSE2__
+//  const double * const accumuld = (double *)&accumul;
+//# pragma omp atomic
+//  temp_av += accumuld[0] + accumuld[1];
+//#endif
+//  }
+//  temp_av /= tot_pixels;
+//
+//  /* Create a matrix for the return argument */
+//  plhs[0] = mxCreateDoubleScalar(temp_av);
+}
-- 
GitLab