From c41d4398e9623e35b1506e37221a0e1e13f1f0de Mon Sep 17 00:00:00 2001
From: Nicola Vigano <nicola.vigano@esrf.fr>
Date: Tue, 17 Jul 2012 15:22:40 +0000
Subject: [PATCH] C++/gtImgMeanVal: updated version that outperforms matlab

By a factor 1.5 on big images, and by a factor of 2 on small images

This mainly served as a test bench to improve my SSE2 and OpenMP skills :)
no real need otherwise.

Signed-off-by: Nicola Vigano <nicola.vigano@esrf.fr>

git-svn-id: https://svn.code.sf.net/p/dct/code/trunk@664 4c865b51-4357-4376-afb4-474e03ccb993
---
Reviewer notes (not part of the commit message):
 * Line breaks and indentation were reconstructed from a whitespace-collapsed
   copy of this patch, so the removed file may only match with
   `git apply --ignore-whitespace`.
 * The added file was revised during review (scalar OpenMP reduction,
   alignment-independent loads, argument checks, dead code removed); the
   timings quoted above were measured on the originally submitted code, and
   the blob id on the new file's 'index' line still refers to it.

 zUtil_Cxx/gtImgMeanValue.c   | 91 ----------------------------------
 zUtil_Cxx/gtImgMeanValue.cpp | 99 +++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+), 91 deletions(-)
 delete mode 100755 zUtil_Cxx/gtImgMeanValue.c
 create mode 100755 zUtil_Cxx/gtImgMeanValue.cpp

diff --git a/zUtil_Cxx/gtImgMeanValue.c b/zUtil_Cxx/gtImgMeanValue.c
deleted file mode 100755
index 5518c575..00000000
--- a/zUtil_Cxx/gtImgMeanValue.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/**
- * Does exactly the same as 'mean2' function from Imaging toolbox.
- *
- * The aim was to get rid of license requirements, and test possible
- * optimization techniques.
- *
- * Nicola Vigano', 2012, ID11 @ ESRF vigano@esrf.eu
- */
-
-#include "mex.h"
-
-#define ROUND_DOWN(x, s) ((x) & ~((s)-1))
-#define ROUND_DOWN_2(x) ((x) & ~1)
-#define ROUND_DOWN_4(x) ((x) & ~3)
-#define ROUND_DOWN_8(x) ((x) & ~7)
-#define ROUND_DOWN_16(x) ((x) & ~15)
-
-
-#ifdef HAVE_OMP
-# include <omp.h>
-#else
-# ifdef HAVE_SSE2
-typedef double v2df __attribute__ ((vector_size (16)));
-# endif
-#endif
-
-void mexFunction( int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[] )
-{
-    /* Incoming image dimensions */
-    const unsigned int num_cols = mxGetM(prhs[0]), num_rows = mxGetN(prhs[0]);
-    const unsigned int tot_pixels = num_cols * num_rows;
-
-    /* Pointers to incoming matrices: */
-    const double * const image_in = mxGetPr(prhs[0]);
-
-    double temp_av = 0;
-    const double * const endPixels = image_in + tot_pixels;
-    const unsigned int totUnroll4Pixels = ROUND_DOWN_4(tot_pixels);
-
-#ifdef HAVE_OMP
-    unsigned int count = 0;
-# pragma omp parallel for private(count) reduction(+:temp_av)
-    for(count = 0; count < totUnroll4Pixels; count += 4) {
-        temp_av += ( image_in[count+0] + image_in[count+1]
-                    + image_in[count+2] + image_in[count+3]);
-    }
-
-    switch(tot_pixels - totUnroll4Pixels) {
-        case 3:
-            temp_av += image_in[totUnroll4Pixels+2];
-        case 2:
-            temp_av += image_in[totUnroll4Pixels+1];
-        case 1:
-            temp_av += image_in[totUnroll4Pixels];
-        default:
-            break;
-    }
-#else
-    const double * pixel = image_in;
-    const double * const endUnroll4Pixels = image_in + totUnroll4Pixels;
-# ifdef HAVE_SSE2
-    v2df accumul = {0, 0};
-# endif
-
-    for(; pixel < endUnroll4Pixels; pixel += 4) {
-# ifdef HAVE_SSE2
-        accumul = __builtin_ia32_addpd(accumul, __builtin_ia32_loadupd(pixel) );
-        accumul = __builtin_ia32_addpd(accumul, __builtin_ia32_loadupd(pixel+2) );
-# else
-        temp_av += (pixel[0] + pixel[1] + pixel[2] + pixel[3]);
-# endif
-    }
-
-    for(; pixel < endPixels;) {
-        temp_av += *pixel++;
-    }
-
-# ifdef HAVE_SSE2
-    {
-        double res[2] = {0,0};
-        __builtin_ia32_storeupd(res, accumul);
-        temp_av += res[0] + res[1];
-    }
-# endif
-#endif
-
-    temp_av /= tot_pixels;
-
-    /* Create a matrix for the return argument */
-    plhs[0] = mxCreateDoubleScalar(temp_av);
-}
diff --git a/zUtil_Cxx/gtImgMeanValue.cpp b/zUtil_Cxx/gtImgMeanValue.cpp
new file mode 100755
index 00000000..ffedf0e8
--- /dev/null
+++ b/zUtil_Cxx/gtImgMeanValue.cpp
@@ -0,0 +1,99 @@
+/**
+ * Does exactly the same as 'mean2' function from Imaging toolbox, for real,
+ * full, double precision input (anything else is rejected with an error).
+ *
+ * The aim was to get rid of license requirements, and test possible
+ * optimization techniques.
+ *
+ * Nicola Vigano', 2012, ID11 @ ESRF vigano@esrf.eu
+ */
+
+#include <string.h>
+
+#include "mex.h"
+
+#ifdef HAVE_OMP
+# include <omp.h>
+#endif
+
+/* Two packed doubles (GCC vector extension) */
+typedef double v2df __attribute__ ((vector_size (16)));
+
+/* Number of pixels consumed by one iteration of the vectorised loop */
+enum { PIXELS_PER_BLOCK = 32 };
+
+/**
+ * Loads two consecutive doubles into a vector.
+ *
+ * The copy goes through memcpy rather than through a casted pointer, so the
+ * load does not depend on 'src' being 16-byte aligned.
+ */
+static inline v2df load_pair(const double * src)
+{
+    v2df pair;
+    memcpy(&pair, src, sizeof(pair));
+    return pair;
+}
+
+/**
+ * MEX gateway: computes the mean of all the elements of prhs[0].
+ *
+ * @param nlhs number of requested outputs (at most one)
+ * @param plhs plhs[0] receives the mean as a double scalar
+ * @param nrhs number of inputs (exactly one)
+ * @param prhs prhs[0] is the image: a real, full, double array
+ */
+void mexFunction( int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[] )
+{
+    if (nrhs != 1) {
+        mexErrMsgIdAndTxt("gtImgMeanValue:wrong_argument",
+                "Exactly one input argument is required.");
+    }
+    if (nlhs > 1) {
+        mexErrMsgIdAndTxt("gtImgMeanValue:wrong_argument",
+                "At most one output argument is produced.");
+    }
+    if (!mxIsDouble(prhs[0]) || mxIsComplex(prhs[0]) || mxIsSparse(prhs[0])) {
+        mexErrMsgIdAndTxt("gtImgMeanValue:wrong_argument",
+                "The input must be a real, full, double array.");
+    }
+
+    /* Incoming image dimensions */
+    const mwSize tot_pixels = mxGetNumberOfElements(prhs[0]);
+    const mwSize tot_block_pixels = tot_pixels - (tot_pixels % PIXELS_PER_BLOCK);
+
+    /* Pointer to the incoming matrix */
+    const double * const image_in = mxGetPr(prhs[0]);
+
+    double blocks_sum = 0;
+
+    /* Every thread adds whole blocks into its own vector, and the per-thread
+     * totals are merged through a scalar reduction: the built-in OpenMP '+'
+     * reduction is only defined for arithmetic types, not for v2df. */
+#pragma omp parallel reduction(+:blocks_sum)
+    {
+        v2df accumul = {0, 0};
+
+#pragma omp for nowait
+        for(mwIndex count = 0; count < tot_block_pixels; count += PIXELS_PER_BLOCK) {
+            v2df block = load_pair(&image_in[count]);
+            for(int offset = 2; offset < PIXELS_PER_BLOCK; offset += 2) {
+                block += load_pair(&image_in[count + offset]);
+            }
+            accumul += block;
+        }
+
+        double lanes[2];
+        memcpy(lanes, &accumul, sizeof(lanes));
+        blocks_sum += lanes[0] + lanes[1];
+    }
+
+    /* Pixels left over after the last whole block */
+    double tail_sum = 0;
+    for(mwIndex count = tot_block_pixels; count < tot_pixels; count++) {
+        tail_sum += image_in[count];
+    }
+
+    /* Create a matrix for the return argument; an empty input divides 0 by 0 */
+    plhs[0] = mxCreateDoubleScalar((tail_sum + blocks_sum) / tot_pixels);
+}
-- 
GitLab