Skip to content
Snippets Groups Projects
Commit 8e0c31d5 authored by Nicola Vigano's avatar Nicola Vigano
Browse files

C functions / Mean value: added single precision support


Signed-off-by: default avatarNicola Vigano <nicola.vigano@esrf.fr>

git-svn-id: https://svn.code.sf.net/p/dct/code/trunk@841 4c865b51-4357-4376-afb4-474e03ccb993
parent 9e78cdd3
No related branches found
No related tags found
No related merge requests found
......@@ -7,88 +7,50 @@
* Nicola Vigano', 2012, ID11 @ ESRF vigano@esrf.eu
*/
#ifndef __SSE2__
# error "This function requires SSE2 SIMD extensions to be enabled!"
#endif
#include "mex.h"
#define ROUND_DOWN(x, s) ((x) & ~((s)-1))
#define ROUND_DOWN_2(x) ((x) & ~1)
#define ROUND_DOWN_4(x) ((x) & ~3)
#define ROUND_DOWN_8(x) ((x) & ~7)
#define ROUND_DOWN_16(x) ((x) & ~15)
#define ROUND_DOWN_32(x) ((x) & ~31)
#ifdef HAVE_OMP
# include <omp.h>
#endif
typedef double v2df __attribute__ ((vector_size (16)));
#include "internal_array_defs.h"
void mexFunction( int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[] )
{
/* Incoming image dimensions */
const mwSize tot_pixels = mxGetNumberOfElements(prhs[0]);
const mwSize totUnroll32Pixels = ROUND_DOWN_32(tot_pixels);
/* Pointers to incoming matrices: */
const double * const __restrict image_in = mxGetPr(prhs[0]);
double temp_av = 0;
v2df accumul = {0, 0};
#pragma omp parallel for reduction(+:accumul)
for(mwIndex count = 0; count < totUnroll32Pixels; count += 32) {
accumul += ( *((v2df *)&image_in[count+0]) + *((v2df *)&image_in[count+2])
+ *((v2df *)&image_in[count+4]) + *((v2df *)&image_in[count+6])
+ *((v2df *)&image_in[count+8]) + *((v2df *)&image_in[count+10])
+ *((v2df *)&image_in[count+12]) + *((v2df *)&image_in[count+14])
+ *((v2df *)&image_in[count+16]) + *((v2df *)&image_in[count+18])
+ *((v2df *)&image_in[count+20]) + *((v2df *)&image_in[count+22])
+ *((v2df *)&image_in[count+24]) + *((v2df *)&image_in[count+26])
+ *((v2df *)&image_in[count+28]) + *((v2df *)&image_in[count+30]) );
if (nrhs != 1) {
mexErrMsgIdAndTxt("C_FUN:gtImgMeanValue:wrong_argument",
"Not enough arguments! you need to provide a array");
}
for(mwIndex count = totUnroll32Pixels; count < tot_pixels; count++) {
temp_av += image_in[count];
switch (mxGetClassID(prhs[0])) {
case mxDOUBLE_CLASS: {
/* Pointer to incoming matrix: */
const double * const __restrict array_in = mxGetPr(prhs[0]);
/* Incoming image dimensions */
const mwSize numElems = mxGetNumberOfElements(prhs[0]);
/* Create a matrix for the return argument */
plhs[0] = mxCreateDoubleScalar(computeAverage(array_in, numElems));
break;
}
case mxSINGLE_CLASS: {
/* Pointer to incoming matrix: */
const float * const __restrict array_in = (const float *)mxGetData(prhs[0]);
/* Incoming image dimensions */
const mwSize numElems = mxGetNumberOfElements(prhs[0]);
/* Create a matrix for the return argument */
plhs[0] = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL);
*(float *)mxGetData(plhs[0]) = computeAverage(array_in, numElems);
break;
}
default: {
mexErrMsgIdAndTxt("C_FUN:gtImgMeanValue:wrong_argument",
"Unsupported type for array! Please use floating point types");
}
}
const double * const __restrict accumuld = (double *)&accumul;
/* Create a matrix for the return argument */
plhs[0] = mxCreateDoubleScalar((temp_av + accumuld[0] + accumuld[1]) / tot_pixels);
// double temp_av = 0;
//# ifdef __SSE2__
// v2df accumul = {0, 0};
//# endif
//
//#ifdef HAVE_OMP
//# ifdef __SSE2__
//# pragma omp parallel private(accumul)
//# else
//# pragma omp parallel
//# endif
//#endif
// {
//#ifdef __SSE2__
//# pragma omp for
//#else
//# pragma omp for reduction(+:temp_av)
//#endif
// for(mwIndex count = 0; count < totUnroll4Pixels; count += 4) {
//#ifdef __SSE2__
// accumul += ( *((v2df *)&image_in[count]) + *((v2df *)&image_in[count+2]) );
//#else
// temp_av += ( image_in[count+0] + image_in[count+1]
// + image_in[count+2] + image_in[count+3]);
//#endif
// }
// for(mwIndex count = totUnroll4Pixels; count < tot_pixels; count++) {
// temp_av += image_in[count];
// }
//#ifdef __SSE2__
// const double * const accumuld = (double *)&accumul;
//# pragma omp atomic
// temp_av += accumuld[0] + accumuld[1];
//#endif
// }
// temp_av /= tot_pixels;
//
// /* Create a matrix for the return argument */
// plhs[0] = mxCreateDoubleScalar(temp_av);
}
......@@ -402,4 +402,62 @@ soft_threshold<SizeType, Type>::operator()(Type * const __restrict outArray,
}
}
////////////////////////////////////////////////////////////////////////////////
/// computeAverage
template<typename Type, typename SizeType>
Type
computeAverage(const Type * const inArray, const SizeType & numElems)
{
typedef Type vVvf __attribute__((vector_size(16))) __attribute__((aligned(16)));
const SizeType unrolling = 16;
const SizeType shift = 16 / sizeof(Type);
const SizeType block = shift * unrolling;
Type temp_av = 0;
vVvf accumul;
switch (shift) {
case 2: {
accumul = (vVvf)_mm_set1_pd(0.0);
break;
}
case 4: {
accumul = (vVvf)_mm_set1_ps(0.0);
break;
}
}
const SizeType numUnrolledElems = ROUND_DOWN(numElems, block);
#pragma omp parallel
{
#pragma omp for reduction(+:accumul) nowait
for(SizeType count = 0; count < numUnrolledElems; count += block) {
accumul += ( *((vVvf *)&inArray[count + 0 * shift]) + *((vVvf *)&inArray[count + 1 * shift])
+ *((vVvf *)&inArray[count + 2 * shift]) + *((vVvf *)&inArray[count + 3 * shift])
+ *((vVvf *)&inArray[count + 4 * shift]) + *((vVvf *)&inArray[count + 5 * shift])
+ *((vVvf *)&inArray[count + 6 * shift]) + *((vVvf *)&inArray[count + 7 * shift])
+ *((vVvf *)&inArray[count + 8 * shift]) + *((vVvf *)&inArray[count + 9 * shift])
+ *((vVvf *)&inArray[count + 10 * shift]) + *((vVvf *)&inArray[count + 11 * shift])
+ *((vVvf *)&inArray[count + 12 * shift]) + *((vVvf *)&inArray[count + 13 * shift])
+ *((vVvf *)&inArray[count + 14 * shift]) + *((vVvf *)&inArray[count + 15 * shift]) );
}
#pragma omp for reduction(+:temp_av) nowait
for(mwIndex count = numUnrolledElems; count < numElems; count++) {
temp_av += inArray[count];
}
}
const Type * const __restrict accumuld = (Type *)&accumul;
switch (shift) {
case 2: {
return (temp_av + accumuld[0] + accumuld[1]) / numElems;
}
case 4: {
return (temp_av + accumuld[0] + accumuld[1] + accumuld[2] + accumuld[3]
) / numElems;
}
}
}
#endif /* INTERNAL_ARRAY_DEFS_H_ */
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment