diff --git a/zUtil_Cxx/include/internal_cell_defs.h b/zUtil_Cxx/include/internal_cell_defs.h index daa7fc6ad5dff23cc85d4b82e49277dc3529a9a7..e52d42add95978d7d81f02d57cfa111cff8575aa 100755 --- a/zUtil_Cxx/include/internal_cell_defs.h +++ b/zUtil_Cxx/include/internal_cell_defs.h @@ -12,126 +12,139 @@ #include <omp.h> #include "mex.h" -#if defined(__SSE2__) || defined(__SSE2_MATH__) # define ROUND_DOWN(x, s) ((x) & ~((s)-1)) +#if defined(__SSE2__) || defined(__SSE2_MATH__) typedef double v2df __attribute__ ((vector_size (16))); #endif -template<typename TypeIn, typename TypeOut> +template<typename Type1, typename Type2> class inner_assign { public: - const TypeOut - operator()(TypeIn & inData, TypeOut & outData) const throw() + const Type2 + operator()(Type1 & inData1, Type2 & inData2) const throw() { - return (inData); + return (inData2); } }; -template<typename TypeIn, typename TypeOut> +template<typename Type1, typename Type2> class inner_pow { const double exponent; public: inner_pow(const double & _exp) : exponent(_exp) { } - const TypeOut - operator()(TypeIn & inData, TypeOut & outData) const throw() + const Type2 + operator()(Type1 & inData1, Type2 & inData2) const throw() { - return pow(inData, exponent); + return pow(inData2, exponent); } }; -template<typename TypeIn, typename TypeOut> +template<typename Type1, typename Type2> class inner_sum { public: - const TypeOut - operator()(TypeIn & inData, TypeOut & outData) const throw() + const Type2 + operator()(Type1 & inData1, Type2 & inData2) const throw() { - return (outData + inData); + return (inData1 + inData2); } }; -template<typename TypeIn, typename TypeOut> +template<typename Type1, typename Type2> class inner_prod { public: - const TypeOut - operator()(TypeIn & inData, TypeOut & outData) const throw() + const Type2 + operator()(Type1 & inData1, Type2 & inData2) const throw() { - return (outData * inData); + return (inData1 * inData2); } }; -template<typename TypeIn, typename TypeOut> +template<typename Type1, typename Type2> class inner_sub { public: - const TypeOut - operator()(TypeIn & inData, TypeOut & outData) const throw() + const Type2 + operator()(Type1 & inData1, Type2 & inData2) const throw() { - return (outData - inData); + return (inData1 - inData2); } }; -template<typename TypeIn, typename TypeOut> +template<typename Type1, typename Type2> class inner_div { public: - const TypeOut - operator()(TypeIn & inData, TypeOut & outData) const throw() + const Type2 + operator()(Type1 & inData1, Type2 & inData2) const throw() { - return (outData / inData); + return (inData1 / inData2); } }; -template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn> +template<typename Type1, typename Type2, typename TypeScalar = Type1> class inner_sum_scale { const TypeScalar scale; public: inner_sum_scale(const TypeScalar & _scale) : scale(_scale) { } - const TypeOut - operator()(TypeIn & inData, TypeOut & outData) const throw() + const Type2 + operator()(Type1 & inData1, Type2 & inData2) const throw() + { + return (inData1 + scale * inData2); + } +}; + +template<typename Type1, typename Type2, typename TypeScalar = Type1> +class inner_sum_FISTA_scale { + const TypeScalar scale; +public: + inner_sum_FISTA_scale(const TypeScalar & _scale) : scale(_scale) { } + + const Type2 + operator()(Type1 & inData1, Type2 & inData2) const throw() { - return (outData + scale * inData); + return (inData1 + scale * (inData1 - inData2)); } }; -template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn> +template<typename Type1, typename Type2, typename TypeScalar = Type1> class inner_sub_scale { const TypeScalar scale; public: inner_sub_scale(const TypeScalar & _scale) : scale(_scale) { } - const TypeOut - operator()(TypeIn & inData, TypeOut & outData) const throw() + const Type2 + operator()(Type1 & inData1, Type2 & inData2) const throw() { - return (inData * scale - outData); + return (inData1 - inData2 * scale); } }; -template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn> +template<typename Type1, typename Type2, typename TypeScalar = Type1> class inner_sum_scalar { const TypeScalar scalar; public: inner_sum_scalar(const TypeScalar & _scalar) : scalar(_scalar) { } - const TypeIn - operator()(TypeIn & inData, TypeOut & outData) const throw() + const Type1 + operator()(Type1 & inData1, Type2 & inData2) const throw() { - return (inData + scalar); + return (inData2 + scalar); } }; -template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn> +template<typename Type1, typename Type2, typename TypeScalar = Type1> class inner_prod_scalar { const TypeScalar scalar; public: inner_prod_scalar(const TypeScalar & _scalar) : scalar(_scalar) { } - const TypeIn - operator()(TypeIn & inData, TypeOut & outData) const throw() + const Type1 + operator()(Type1 & inData1, Type2 & inData2) const throw() { - return (inData * scalar); + return (inData2 * scalar); } }; @@ -159,21 +172,23 @@ inline void cell_iteration_sse(mxArray * & outArray, const mxArray * const inArray, const Function & func, const Function2 & func2) { - const mwSize numCells = mxGetNumberOfElements(inArray); if (allocate) { cell_allocate<mxDOUBLE_CLASS>(outArray, inArray); } -#pragma omp parallel for - for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) { - const mxArray * const inCell = mxGetCell(inArray, cellIdx); - mxArray * const outCell = mxGetCell(outArray, cellIdx); + const mwSize numCells = mxGetNumberOfElements(inArray); +#pragma omp parallel + { + for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) { + const mxArray * const inCell = mxGetCell(inArray, cellIdx); + mxArray * const outCell = mxGetCell(outArray, cellIdx); - const mwSize numElems = mxGetNumberOfElements(inCell); - const double * const inData = mxGetPr(inCell); - double * const outData = mxGetPr(outCell); + const mwSize numElems = mxGetNumberOfElements(inCell); + const double * const inData = mxGetPr(inCell); + double * const outData = mxGetPr(outCell); - cell_inner_cycle_sse(outData, inData, outData, numElems, func, func2); + cell_inner_cycle_sse(outData, outData, inData, numElems, func, func2); + } } } @@ -183,21 +198,23 @@ cell_iteration_copy_sse(mxArray * & outArray, const mxArray * const inArray1, const mxArray * const inArray2, const Function & func, const Function2 & func2) { - const mwSize numCells = mxGetNumberOfElements(inArray1); cell_allocate<mxDOUBLE_CLASS>(outArray, inArray1); -#pragma omp parallel for - for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) { - const mxArray * const inCell1 = mxGetCell(inArray1, cellIdx); - const mxArray * const inCell2 = mxGetCell(inArray2, cellIdx); - mxArray * const outCell = mxGetCell(outArray, cellIdx); + const mwSize numCells = mxGetNumberOfElements(inArray1); +#pragma omp parallel + { + for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) { + const mxArray * const inCell1 = mxGetCell(inArray1, cellIdx); + const mxArray * const inCell2 = mxGetCell(inArray2, cellIdx); + mxArray * const outCell = mxGetCell(outArray, cellIdx); - const mwSize numElems = mxGetNumberOfElements(inCell1); - const double * const inData1 = mxGetPr(inCell1); - const double * const inData2 = mxGetPr(inCell2); - double * const outData = mxGetPr(outCell); + const mwSize numElems = mxGetNumberOfElements(inCell1); + const double * const inData1 = mxGetPr(inCell1); + const double * const inData2 = mxGetPr(inCell2); + double * const outData = mxGetPr(outCell); - cell_inner_cycle_sse(outData, inData1, inData2, numElems, func, func2); + cell_inner_cycle_sse(outData, inData1, inData2, numElems, func, func2); + } } } @@ -208,17 +225,26 @@ cell_inner_cycle_sse(Type * const __restrict outData, const Type * const __restrict inData2, const mwSize & numElems, FunctionVect & func, FunctionNonVect & func2) { - const mwSize num4Elems = ROUND_DOWN(numElems, 4); - for(mwIndex elemIdx = 0; elemIdx < num4Elems; elemIdx += 4) { - const v2df inV11 = *((v2df *)&inData1[elemIdx]); - const v2df inV12 = *((v2df *)&inData1[elemIdx+2]); - const v2df inV21 = *((v2df *)&inData2[elemIdx]); + const mwSize num8Elems = ROUND_DOWN(numElems, 8); +#pragma omp for nowait + for(mwIndex elemIdx = 0; elemIdx < num8Elems; elemIdx += 8) { + const v2df inV11 = *((v2df *)&inData1[elemIdx+0]); + const v2df inV12 = *((v2df *)&inData1[elemIdx+2]); + const v2df inV13 = *((v2df *)&inData1[elemIdx+4]); + const v2df inV14 = *((v2df *)&inData1[elemIdx+6]); + + const v2df inV21 = *((v2df *)&inData2[elemIdx+0]); const v2df inV22 = *((v2df *)&inData2[elemIdx+2]); + const v2df inV23 = *((v2df *)&inData2[elemIdx+4]); + const v2df inV24 = *((v2df *)&inData2[elemIdx+6]); - *((v2df *)&outData[elemIdx]) = func(inV11, inV21); + *((v2df *)&outData[elemIdx+0]) = func(inV11, inV21); *((v2df *)&outData[elemIdx+2]) = func(inV12, inV22); + *((v2df *)&outData[elemIdx+4]) = func(inV13, inV23); + *((v2df *)&outData[elemIdx+6]) = func(inV14, inV24); } - for(mwIndex elemIdx = num4Elems; elemIdx < numElems; elemIdx++) { +#pragma omp for nowait + for(mwIndex elemIdx = num8Elems; elemIdx < numElems; elemIdx++) { outData[elemIdx] = func2(inData1[elemIdx], inData2[elemIdx]); } } @@ -244,7 +270,7 @@ cell_iteration(mxArray * & outArray, const mxArray * const inArray, double * const outData = mxGetPr(outCell); for(mwIndex elemIdx = 0; elemIdx < numElems; elemIdx++) { - outData[elemIdx] = func(inData[elemIdx], outData[elemIdx]); + outData[elemIdx] = func(outData[elemIdx], inData[elemIdx]); } } } diff --git a/zUtil_DataStructures/GtCellBenchmarks.m b/zUtil_DataStructures/GtCellBenchmarks.m index ab5296568a8de45399ddde44d174ab8bfe561762..38e3961b6533376ebba627501ffc724b0384893f 100644 --- a/zUtil_DataStructures/GtCellBenchmarks.m +++ b/zUtil_DataStructures/GtCellBenchmarks.m @@ -34,7 +34,10 @@ classdef GtCellBenchmarks GtCellBenchmarks.printHeader('internal_cell_copy', bytes); tic(); newVols = internal_cell_copy(vols); - GtCellBenchmarks.printResult('cellfun: %f', toc(), bytes); + GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes); + for ii = 1:length(newVols) + newVols{ii} = permute(newVols{ii}, [2 1 3]); + end end function benchmarkDiv(vols, newVols) @@ -49,7 +52,7 @@ classdef GtCellBenchmarks tic(); dummy3 = internal_cell_div_copy(dummy3, newVols); GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes); - + dummy = vols; tic(); dummy = cellfun(@(x, y){x ./ y}, dummy, newVols); @@ -78,7 +81,7 @@ classdef GtCellBenchmarks tic(); dummy3 = internal_cell_prod_copy(dummy3, newVols); GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes); - + dummy = vols; tic(); dummy = cellfun(@(x, y){x .* y}, dummy, newVols); @@ -107,7 +110,7 @@ classdef GtCellBenchmarks tic(); dummy3 = internal_cell_prod_scalar_copy(dummy3, beta); GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes); - + dummy = vols; tic(); dummy = cellfun(@(x){x * beta}, dummy); @@ -124,6 +127,35 @@ classdef GtCellBenchmarks GtCellBenchmarks.verifyError(dummy, dummy3); end + function benchmarkSumScalar(vols, beta) + bytes = GtCellBenchmarks.getSizeVariable(vols); + % internal_cell_sum_scalar_assign.cpp + GtCellBenchmarks.printHeader('internal_cell_sum_scalar_[assign, copy]', bytes); + dummy2 = internal_cell_copy(vols); + tic(); + dummy2 = internal_cell_sum_scalar_assign(dummy2, beta); + GtCellBenchmarks.printResult('C (ass): %f', toc(), bytes); + dummy3 = internal_cell_copy(vols); + tic(); + dummy3 = internal_cell_sum_scalar_copy(dummy3, beta); + GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes); + + dummy = vols; + tic(); + dummy = cellfun(@(x){x + beta}, dummy); + GtCellBenchmarks.printResult('cellfun: %f', toc(), bytes); + + dummy = vols; + tic(); + for ii = 1:length(dummy) + dummy{ii} = dummy{ii} + beta; + end + GtCellBenchmarks.printResult('for loop: %f', toc(), bytes); + + GtCellBenchmarks.verifyError(dummy, dummy2); + GtCellBenchmarks.verifyError(dummy, dummy3); + end + function benchmarkSum(vols, newVols) bytes = 2 * GtCellBenchmarks.getSizeVariable(vols); % internal_cell_sum_assign.cpp @@ -136,7 +168,7 @@ classdef GtCellBenchmarks tic(); dummy3 = internal_cell_sum_copy(dummy3, newVols); GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes); - + dummy = vols; tic(); dummy = cellfun(@(x, y){x + y}, dummy, newVols); @@ -153,8 +185,37 @@ classdef GtCellBenchmarks GtCellBenchmarks.verifyError(dummy, dummy3); end + function benchmarkSub(vols, newVols) + bytes = 2 * GtCellBenchmarks.getSizeVariable(vols); + % internal_cell_sum_assign.cpp + GtCellBenchmarks.printHeader('internal_cell_sub_[assign, copy]', bytes); + dummy2 = internal_cell_copy(vols); + tic(); + dummy2 = internal_cell_sub_assign(dummy2, newVols); + GtCellBenchmarks.printResult('C (ass): %f', toc(), bytes); + dummy3 = internal_cell_copy(vols); + tic(); + dummy3 = internal_cell_sub_copy(dummy3, newVols); + GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes); + + dummy = vols; + tic(); + dummy = cellfun(@(x, y){x - y}, dummy, newVols); + GtCellBenchmarks.printResult('cellfun: %f', toc(), bytes); + + dummy = vols; + tic(); + for ii = 1:length(dummy) + dummy{ii} = dummy{ii} - newVols{ii}; + end + GtCellBenchmarks.printResult('for loop: %f', toc(), bytes); + + GtCellBenchmarks.verifyError(dummy, dummy2); + GtCellBenchmarks.verifyError(dummy, dummy3); + end + function benchmarkScaledSum(vols, newVols, beta) - bytes = GtCellBenchmarks.getSizeVariable(vols); + bytes = 2 * GtCellBenchmarks.getSizeVariable(vols); % internal_cell_sum_scaled_assign.cpp GtCellBenchmarks.printHeader('internal_cell_sum_scaled_[assign, copy]', bytes); dummy2 = internal_cell_copy(vols); @@ -165,7 +226,7 @@ classdef GtCellBenchmarks tic(); dummy3 = internal_cell_sum_scaled_copy(dummy3, newVols, beta); GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes); - + dummy = vols; tic(); dummy = cellfun(@(x, y){x + y * beta}, dummy, newVols); @@ -182,6 +243,64 @@ classdef GtCellBenchmarks GtCellBenchmarks.verifyError(dummy, dummy3); end + function benchmarkScaledSub(vols, newVols, beta) + bytes = 2 * GtCellBenchmarks.getSizeVariable(vols); + % internal_cell_sub_scaled_assign.cpp + GtCellBenchmarks.printHeader('internal_cell_sub_scaled_[assign, copy]', bytes); + dummy2 = internal_cell_copy(vols); + tic(); + dummy2 = internal_cell_sub_scaled_assign(dummy2, newVols, beta); + GtCellBenchmarks.printResult('C (ass): %f', toc(), bytes); + dummy3 = internal_cell_copy(vols); + tic(); + dummy3 = internal_cell_sub_scaled_copy(dummy3, newVols, beta); + GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes); + + dummy = vols; + tic(); + dummy = cellfun(@(x, y){x - y * beta}, dummy, newVols); + GtCellBenchmarks.printResult('cellfun: %f', toc(), bytes); + + dummy = vols; + tic(); + for ii = 1:length(dummy) + dummy{ii} = dummy{ii} - newVols{ii} * beta; + end + GtCellBenchmarks.printResult('for loop: %f', toc(), bytes); + + GtCellBenchmarks.verifyError(dummy, dummy2); + GtCellBenchmarks.verifyError(dummy, dummy3); + end + + function benchmarkScaledFISTASum(vols, newVols, beta) + bytes = 2 * GtCellBenchmarks.getSizeVariable(vols); + % internal_cell_sum_scaled_assign.cpp + GtCellBenchmarks.printHeader('internal_cell_sum_FISTA_scaled_[assign, copy]', bytes); + dummy2 = internal_cell_copy(vols); + tic(); + dummy2 = internal_cell_sum_FISTA_scaled_assign(dummy2, newVols, beta); + GtCellBenchmarks.printResult('C (ass): %f', toc(), bytes); +% dummy3 = internal_cell_copy(vols); +% tic(); +% dummy3 = internal_cell_sum_scaled_copy(dummy3, newVols, beta); +% GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes); + + dummy = vols; + tic(); + dummy = cellfun(@(x, y){x + (x - y) * beta}, dummy, newVols); + GtCellBenchmarks.printResult('cellfun: %f', toc(), bytes); + + dummy = vols; + tic(); + for ii = 1:length(dummy) + dummy{ii} = dummy{ii} + (dummy{ii} - newVols{ii}) * beta; + end + GtCellBenchmarks.printResult('for loop: %f', toc(), bytes); + + GtCellBenchmarks.verifyError(dummy, dummy2); +% GtCellBenchmarks.verifyError(dummy, dummy3); + end + function allBenchmarks(vols) beta = 2.0; @@ -191,12 +310,20 @@ classdef GtCellBenchmarks GtCellBenchmarks.benchmarkProd(vols, newVols); + GtCellBenchmarks.benchmarkProdScalar(vols, beta); + GtCellBenchmarks.benchmarkSum(vols, newVols); - GtCellBenchmarks.benchmarkProdScalar(vols, beta); + GtCellBenchmarks.benchmarkSumScalar(vols, beta); + + GtCellBenchmarks.benchmarkSub(vols, newVols); GtCellBenchmarks.benchmarkScaledSum(vols, newVols, beta); + GtCellBenchmarks.benchmarkScaledSub(vols, newVols, beta); + + GtCellBenchmarks.benchmarkScaledFISTASum(vols, newVols, beta); + % % internal_cell_exponent.cpp % fprintf('\ninternal_cell_exponent\n'); % tic();