C functions / Mean value: added single precision support

Signed-off-by: Nicola Vigano <nicola.vigano@esrf.fr> git-svn-id: https://svn.code.sf.net/p/dct/code/trunk@841 4c865b51-4357-4376-afb4-474e03ccb993

C functions / Mean value: added single precision support
8e0c31d5 · Nicola Vigano · 9e78cdd3 · 8e0c31d5 · 8e0c31d5
Commit 8e0c31d5 authored 12 years ago by Nicola Vigano
--- a/zUtil_Cxx/gtImgMeanValue.cpp
+++ b/zUtil_Cxx/gtImgMeanValue.cpp
@@ -7,88 +7,50 @@
 * Nicola Vigano', 2012, ID11 @ ESRF vigano@esrf.eu
 */

+#ifndef __SSE2__
+# error "This function requires SSE2 SIMD extensions to be enabled!"
+#endif
+
 #include "mex.h"

-#define ROUND_DOWN(x, s)  ((x) & ~((s)-1))
-#define ROUND_DOWN_2(x)   ((x) & ~1)
-#define ROUND_DOWN_4(x)   ((x) & ~3)
-#define ROUND_DOWN_8(x)   ((x) & ~7)
-#define ROUND_DOWN_16(x)  ((x) & ~15)
-#define ROUND_DOWN_32(x)  ((x) & ~31)
-
-#ifdef HAVE_OMP
-# include <omp.h>
-#endif
-typedef double v2df __attribute__ ((vector_size (16)));
+#include "internal_array_defs.h"

 void mexFunction( int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[] )
 {
-  /* Incoming image dimensions */
-  const mwSize tot_pixels = mxGetNumberOfElements(prhs[0]);
-  const mwSize totUnroll32Pixels = ROUND_DOWN_32(tot_pixels);
-
-  /* Pointers to incoming matrices: */
-  const double * const __restrict image_in = mxGetPr(prhs[0]);
-
-  double temp_av = 0;
-  v2df accumul = {0, 0};
-
-#pragma omp parallel for reduction(+:accumul)
-  for(mwIndex count = 0; count < totUnroll32Pixels; count += 32) {
-    accumul += ( *((v2df *)&image_in[count+0]) + *((v2df *)&image_in[count+2])
-               + *((v2df *)&image_in[count+4]) + *((v2df *)&image_in[count+6])
-               + *((v2df *)&image_in[count+8]) + *((v2df *)&image_in[count+10])
-               + *((v2df *)&image_in[count+12]) + *((v2df *)&image_in[count+14])
-               + *((v2df *)&image_in[count+16]) + *((v2df *)&image_in[count+18])
-               + *((v2df *)&image_in[count+20]) + *((v2df *)&image_in[count+22])
-               + *((v2df *)&image_in[count+24]) + *((v2df *)&image_in[count+26])
-               + *((v2df *)&image_in[count+28]) + *((v2df *)&image_in[count+30]) );
+  if (nrhs != 1) {
+    mexErrMsgIdAndTxt("C_FUN:gtImgMeanValue:wrong_argument",
+        "Not enough arguments! you need to provide a array");
  }
-  for(mwIndex count = totUnroll32Pixels; count < tot_pixels; count++) {
-    temp_av += image_in[count];
+  switch (mxGetClassID(prhs[0])) {
+    case mxDOUBLE_CLASS: {
+      /* Pointer to incoming matrix: */
+      const double * const __restrict array_in = mxGetPr(prhs[0]);
+
+      /* Incoming image dimensions */
+      const mwSize numElems = mxGetNumberOfElements(prhs[0]);
+
+      /* Create a matrix for the return argument */
+      plhs[0] = mxCreateDoubleScalar(computeAverage(array_in, numElems));
+
+      break;
+    }
+    case mxSINGLE_CLASS: {
+      /* Pointer to incoming matrix: */
+      const float * const __restrict array_in = (const float *)mxGetData(prhs[0]);
+
+      /* Incoming image dimensions */
+      const mwSize numElems = mxGetNumberOfElements(prhs[0]);
+
+      /* Create a matrix for the return argument */
+      plhs[0] = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL);
+      *(float *)mxGetData(plhs[0]) = computeAverage(array_in, numElems);
+
+      break;
+    }
+    default: {
+      mexErrMsgIdAndTxt("C_FUN:gtImgMeanValue:wrong_argument",
+          "Unsupported type for array! Please use floating point types");
+    }
  }
-  const double * const __restrict accumuld = (double *)&accumul;
-
-  /* Create a matrix for the return argument */
-  plhs[0] = mxCreateDoubleScalar((temp_av + accumuld[0] + accumuld[1]) / tot_pixels);
-
-//  double temp_av = 0;
-//# ifdef __SSE2__
-//  v2df accumul = {0, 0};
-//# endif
-//
-//#ifdef HAVE_OMP
-//# ifdef __SSE2__
-//#   pragma omp parallel private(accumul)
-//# else
-//#   pragma omp parallel
-//# endif
-//#endif
-//  {
-//#ifdef __SSE2__
-//# pragma omp for
-//#else
-//# pragma omp for reduction(+:temp_av)
-//#endif
-//    for(mwIndex count = 0; count < totUnroll4Pixels; count += 4) {
-//#ifdef __SSE2__
-//      accumul += ( *((v2df *)&image_in[count]) + *((v2df *)&image_in[count+2]) );
-//#else
-//      temp_av += ( image_in[count+0] + image_in[count+1]
-//                 + image_in[count+2] + image_in[count+3]);
-//#endif
-//    }
-//    for(mwIndex count = totUnroll4Pixels; count < tot_pixels; count++) {
-//      temp_av += image_in[count];
-//    }
-//#ifdef __SSE2__
-//  const double * const accumuld = (double *)&accumul;
-//# pragma omp atomic
-//  temp_av += accumuld[0] + accumuld[1];
-//#endif
-//  }
-//  temp_av /= tot_pixels;
-//
-//  /* Create a matrix for the return argument */
-//  plhs[0] = mxCreateDoubleScalar(temp_av);
 }
+
--- a/zUtil_Cxx/include/internal_array_defs.h
+++ b/zUtil_Cxx/include/internal_array_defs.h
@@ -402,4 +402,62 @@ soft_threshold<SizeType, Type>::operator()(Type * const __restrict outArray,
  }
 }

+////////////////////////////////////////////////////////////////////////////////
+/// computeAverage
+
+template<typename Type, typename SizeType>
+Type
+computeAverage(const Type * const inArray, const SizeType & numElems)
+{
+  typedef Type vVvf __attribute__((vector_size(16))) __attribute__((aligned(16)));
+  const SizeType unrolling = 16;
+  const SizeType shift = 16 / sizeof(Type);
+  const SizeType block = shift * unrolling;
+
+  Type temp_av = 0;
+  vVvf accumul;
+  switch (shift) {
+    case 2: {
+      accumul = (vVvf)_mm_set1_pd(0.0);
+      break;
+    }
+    case 4: {
+      accumul = (vVvf)_mm_set1_ps(0.0);
+      break;
+    }
+  }
+
+  const SizeType numUnrolledElems = ROUND_DOWN(numElems, block);
+#pragma omp parallel
+  {
+#pragma omp for reduction(+:accumul) nowait
+    for(SizeType count = 0; count < numUnrolledElems; count += block) {
+      accumul += ( *((vVvf *)&inArray[count +  0 * shift]) + *((vVvf *)&inArray[count +  1 * shift])
+                 + *((vVvf *)&inArray[count +  2 * shift]) + *((vVvf *)&inArray[count +  3 * shift])
+                 + *((vVvf *)&inArray[count +  4 * shift]) + *((vVvf *)&inArray[count +  5 * shift])
+                 + *((vVvf *)&inArray[count +  6 * shift]) + *((vVvf *)&inArray[count +  7 * shift])
+                 + *((vVvf *)&inArray[count +  8 * shift]) + *((vVvf *)&inArray[count +  9 * shift])
+                 + *((vVvf *)&inArray[count + 10 * shift]) + *((vVvf *)&inArray[count + 11 * shift])
+                 + *((vVvf *)&inArray[count + 12 * shift]) + *((vVvf *)&inArray[count + 13 * shift])
+                 + *((vVvf *)&inArray[count + 14 * shift]) + *((vVvf *)&inArray[count + 15 * shift]) );
+    }
+#pragma omp for reduction(+:temp_av) nowait
+    for(mwIndex count = numUnrolledElems; count < numElems; count++) {
+      temp_av += inArray[count];
+    }
+  }
+
+  const Type * const __restrict accumuld = (Type *)&accumul;
+  switch (shift) {
+    case 2: {
+      return (temp_av + accumuld[0] + accumuld[1]) / numElems;
+    }
+    case 4: {
+      return (temp_av + accumuld[0] + accumuld[1] + accumuld[2] + accumuld[3]
+          ) / numElems;
+    }
+  }
+}
+
+
 #endif /* INTERNAL_ARRAY_DEFS_H_ */