Skip to content
Snippets Groups Projects
Commit 2af16018 authored by Nicola Vigano's avatar Nicola Vigano
Browse files

Cell Operations: fixed copies and updated benchmarks


Signed-off-by: default avatarNicola Vigano <nicola.vigano@esrf.fr>

git-svn-id: https://svn.code.sf.net/p/dct/code/trunk@662 4c865b51-4357-4376-afb4-474e03ccb993
parent 9d8f8e49
No related branches found
No related tags found
No related merge requests found
......@@ -12,126 +12,139 @@
#include <omp.h>
#include "mex.h"
#if defined(__SSE2__) || defined(__SSE2_MATH__)
# define ROUND_DOWN(x, s) ((x) & ~((s)-1))
#if defined(__SSE2__) || defined(__SSE2_MATH__)
typedef double v2df __attribute__ ((vector_size (16)));
#endif
template<typename TypeIn, typename TypeOut>
template<typename Type1, typename Type2>
class inner_assign {
public:
const TypeOut
operator()(TypeIn & inData, TypeOut & outData) const throw()
const Type2
operator()(Type1 & inData1, Type2 & inData2) const throw()
{
return (inData);
return (inData2);
}
};
template<typename TypeIn, typename TypeOut>
template<typename Type1, typename Type2>
class inner_pow {
const double exponent;
public:
inner_pow(const double & _exp) : exponent(_exp) { }
const TypeOut
operator()(TypeIn & inData, TypeOut & outData) const throw()
const Type2
operator()(Type1 & inData1, Type2 & inData2) const throw()
{
return pow(inData, exponent);
return pow(inData2, exponent);
}
};
template<typename TypeIn, typename TypeOut>
template<typename Type1, typename Type2>
class inner_sum {
public:
const TypeOut
operator()(TypeIn & inData, TypeOut & outData) const throw()
const Type2
operator()(Type1 & inData1, Type2 & inData2) const throw()
{
return (outData + inData);
return (inData1 + inData2);
}
};
template<typename TypeIn, typename TypeOut>
template<typename Type1, typename Type2>
class inner_prod {
public:
const TypeOut
operator()(TypeIn & inData, TypeOut & outData) const throw()
const Type2
operator()(Type1 & inData1, Type2 & inData2) const throw()
{
return (outData * inData);
return (inData1 * inData2);
}
};
template<typename TypeIn, typename TypeOut>
template<typename Type1, typename Type2>
class inner_sub {
public:
const TypeOut
operator()(TypeIn & inData, TypeOut & outData) const throw()
const Type2
operator()(Type1 & inData1, Type2 & inData2) const throw()
{
return (outData - inData);
return (inData1 - inData2);
}
};
template<typename TypeIn, typename TypeOut>
template<typename Type1, typename Type2>
class inner_div {
public:
const TypeOut
operator()(TypeIn & inData, TypeOut & outData) const throw()
const Type2
operator()(Type1 & inData1, Type2 & inData2) const throw()
{
return (outData / inData);
return (inData1 / inData2);
}
};
template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn>
template<typename Type1, typename Type2, typename TypeScalar = Type1>
class inner_sum_scale {
const TypeScalar scale;
public:
inner_sum_scale(const TypeScalar & _scale) : scale(_scale) { }
const TypeOut
operator()(TypeIn & inData, TypeOut & outData) const throw()
const Type2
operator()(Type1 & inData1, Type2 & inData2) const throw()
{
return (inData1 + scale * inData2);
}
};
template<typename Type1, typename Type2, typename TypeScalar = Type1>
class inner_sum_FISTA_scale {
const TypeScalar scale;
public:
inner_sum_FISTA_scale(const TypeScalar & _scale) : scale(_scale) { }
const Type2
operator()(Type1 & inData1, Type2 & inData2) const throw()
{
return (outData + scale * inData);
return (inData1 + scale * (inData1 - inData2));
}
};
template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn>
template<typename Type1, typename Type2, typename TypeScalar = Type1>
class inner_sub_scale {
const TypeScalar scale;
public:
inner_sub_scale(const TypeScalar & _scale) : scale(_scale) { }
const TypeOut
operator()(TypeIn & inData, TypeOut & outData) const throw()
const Type2
operator()(Type1 & inData1, Type2 & inData2) const throw()
{
return (inData * scale - outData);
return (inData1 - inData2 * scale);
}
};
template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn>
template<typename Type1, typename Type2, typename TypeScalar = Type1>
class inner_sum_scalar {
const TypeScalar scalar;
public:
inner_sum_scalar(const TypeScalar & _scalar) : scalar(_scalar) { }
const TypeIn
operator()(TypeIn & inData, TypeOut & outData) const throw()
const Type1
operator()(Type1 & inData1, Type2 & inData2) const throw()
{
return (inData + scalar);
return (inData2 + scalar);
}
};
template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn>
template<typename Type1, typename Type2, typename TypeScalar = Type1>
class inner_prod_scalar {
const TypeScalar scalar;
public:
inner_prod_scalar(const TypeScalar & _scalar) : scalar(_scalar) { }
const TypeIn
operator()(TypeIn & inData, TypeOut & outData) const throw()
const Type1
operator()(Type1 & inData1, Type2 & inData2) const throw()
{
return (inData * scalar);
return (inData2 * scalar);
}
};
......@@ -159,21 +172,23 @@ inline void
cell_iteration_sse(mxArray * & outArray, const mxArray * const inArray,
const Function & func, const Function2 & func2)
{
const mwSize numCells = mxGetNumberOfElements(inArray);
if (allocate) {
cell_allocate<mxDOUBLE_CLASS>(outArray, inArray);
}
#pragma omp parallel for
for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) {
const mxArray * const inCell = mxGetCell(inArray, cellIdx);
mxArray * const outCell = mxGetCell(outArray, cellIdx);
const mwSize numCells = mxGetNumberOfElements(inArray);
#pragma omp parallel
{
for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) {
const mxArray * const inCell = mxGetCell(inArray, cellIdx);
mxArray * const outCell = mxGetCell(outArray, cellIdx);
const mwSize numElems = mxGetNumberOfElements(inCell);
const double * const inData = mxGetPr(inCell);
double * const outData = mxGetPr(outCell);
const mwSize numElems = mxGetNumberOfElements(inCell);
const double * const inData = mxGetPr(inCell);
double * const outData = mxGetPr(outCell);
cell_inner_cycle_sse(outData, inData, outData, numElems, func, func2);
cell_inner_cycle_sse(outData, outData, inData, numElems, func, func2);
}
}
}
......@@ -183,21 +198,23 @@ cell_iteration_copy_sse(mxArray * & outArray, const mxArray * const inArray1,
const mxArray * const inArray2,
const Function & func, const Function2 & func2)
{
const mwSize numCells = mxGetNumberOfElements(inArray1);
cell_allocate<mxDOUBLE_CLASS>(outArray, inArray1);
#pragma omp parallel for
for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) {
const mxArray * const inCell1 = mxGetCell(inArray1, cellIdx);
const mxArray * const inCell2 = mxGetCell(inArray2, cellIdx);
mxArray * const outCell = mxGetCell(outArray, cellIdx);
const mwSize numCells = mxGetNumberOfElements(inArray1);
#pragma omp parallel
{
for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) {
const mxArray * const inCell1 = mxGetCell(inArray1, cellIdx);
const mxArray * const inCell2 = mxGetCell(inArray2, cellIdx);
mxArray * const outCell = mxGetCell(outArray, cellIdx);
const mwSize numElems = mxGetNumberOfElements(inCell1);
const double * const inData1 = mxGetPr(inCell1);
const double * const inData2 = mxGetPr(inCell2);
double * const outData = mxGetPr(outCell);
const mwSize numElems = mxGetNumberOfElements(inCell1);
const double * const inData1 = mxGetPr(inCell1);
const double * const inData2 = mxGetPr(inCell2);
double * const outData = mxGetPr(outCell);
cell_inner_cycle_sse(outData, inData1, inData2, numElems, func, func2);
cell_inner_cycle_sse(outData, inData1, inData2, numElems, func, func2);
}
}
}
......@@ -208,17 +225,26 @@ cell_inner_cycle_sse(Type * const __restrict outData,
const Type * const __restrict inData2,
const mwSize & numElems, FunctionVect & func, FunctionNonVect & func2)
{
const mwSize num4Elems = ROUND_DOWN(numElems, 4);
for(mwIndex elemIdx = 0; elemIdx < num4Elems; elemIdx += 4) {
const v2df inV11 = *((v2df *)&inData1[elemIdx]);
const v2df inV12 = *((v2df *)&inData1[elemIdx+2]);
const v2df inV21 = *((v2df *)&inData2[elemIdx]);
const mwSize num8Elems = ROUND_DOWN(numElems, 8);
#pragma omp for nowait
for(mwIndex elemIdx = 0; elemIdx < num8Elems; elemIdx += 8) {
const v2df inV11 = *((v2df *)&inData1[elemIdx+0]);
const v2df inV12 = *((v2df *)&inData1[elemIdx+2]);
const v2df inV13 = *((v2df *)&inData1[elemIdx+4]);
const v2df inV14 = *((v2df *)&inData1[elemIdx+6]);
const v2df inV21 = *((v2df *)&inData2[elemIdx+0]);
const v2df inV22 = *((v2df *)&inData2[elemIdx+2]);
const v2df inV23 = *((v2df *)&inData2[elemIdx+4]);
const v2df inV24 = *((v2df *)&inData2[elemIdx+6]);
*((v2df *)&outData[elemIdx]) = func(inV11, inV21);
*((v2df *)&outData[elemIdx+0]) = func(inV11, inV21);
*((v2df *)&outData[elemIdx+2]) = func(inV12, inV22);
*((v2df *)&outData[elemIdx+4]) = func(inV13, inV23);
*((v2df *)&outData[elemIdx+6]) = func(inV14, inV24);
}
for(mwIndex elemIdx = num4Elems; elemIdx < numElems; elemIdx++) {
#pragma omp for nowait
for(mwIndex elemIdx = num8Elems; elemIdx < numElems; elemIdx++) {
outData[elemIdx] = func2(inData1[elemIdx], inData2[elemIdx]);
}
}
......@@ -244,7 +270,7 @@ cell_iteration(mxArray * & outArray, const mxArray * const inArray,
double * const outData = mxGetPr(outCell);
for(mwIndex elemIdx = 0; elemIdx < numElems; elemIdx++) {
outData[elemIdx] = func(inData[elemIdx], outData[elemIdx]);
outData[elemIdx] = func(outData[elemIdx], inData[elemIdx]);
}
}
}
......
......@@ -34,7 +34,10 @@ classdef GtCellBenchmarks
GtCellBenchmarks.printHeader('internal_cell_copy', bytes);
tic();
newVols = internal_cell_copy(vols);
GtCellBenchmarks.printResult('cellfun: %f', toc(), bytes);
GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
for ii = 1:length(newVols)
newVols{ii} = permute(newVols{ii}, [2 1 3]);
end
end
function benchmarkDiv(vols, newVols)
......@@ -49,7 +52,7 @@ classdef GtCellBenchmarks
tic();
dummy3 = internal_cell_div_copy(dummy3, newVols);
GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
dummy = vols;
tic();
dummy = cellfun(@(x, y){x ./ y}, dummy, newVols);
......@@ -78,7 +81,7 @@ classdef GtCellBenchmarks
tic();
dummy3 = internal_cell_prod_copy(dummy3, newVols);
GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
dummy = vols;
tic();
dummy = cellfun(@(x, y){x .* y}, dummy, newVols);
......@@ -107,7 +110,7 @@ classdef GtCellBenchmarks
tic();
dummy3 = internal_cell_prod_scalar_copy(dummy3, beta);
GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
dummy = vols;
tic();
dummy = cellfun(@(x){x * beta}, dummy);
......@@ -124,6 +127,35 @@ classdef GtCellBenchmarks
GtCellBenchmarks.verifyError(dummy, dummy3);
end
function benchmarkSumScalar(vols, beta)
bytes = GtCellBenchmarks.getSizeVariable(vols);
% internal_cell_sum_scalar_assign.cpp
GtCellBenchmarks.printHeader('internal_cell_sum_scalar_[assign, copy]', bytes);
dummy2 = internal_cell_copy(vols);
tic();
dummy2 = internal_cell_sum_scalar_assign(dummy2, beta);
GtCellBenchmarks.printResult('C (ass): %f', toc(), bytes);
dummy3 = internal_cell_copy(vols);
tic();
dummy3 = internal_cell_sum_scalar_copy(dummy3, beta);
GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
dummy = vols;
tic();
dummy = cellfun(@(x){x + beta}, dummy);
GtCellBenchmarks.printResult('cellfun: %f', toc(), bytes);
dummy = vols;
tic();
for ii = 1:length(dummy)
dummy{ii} = dummy{ii} + beta;
end
GtCellBenchmarks.printResult('for loop: %f', toc(), bytes);
GtCellBenchmarks.verifyError(dummy, dummy2);
GtCellBenchmarks.verifyError(dummy, dummy3);
end
function benchmarkSum(vols, newVols)
bytes = 2 * GtCellBenchmarks.getSizeVariable(vols);
% internal_cell_sum_assign.cpp
......@@ -136,7 +168,7 @@ classdef GtCellBenchmarks
tic();
dummy3 = internal_cell_sum_copy(dummy3, newVols);
GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
dummy = vols;
tic();
dummy = cellfun(@(x, y){x + y}, dummy, newVols);
......@@ -153,8 +185,37 @@ classdef GtCellBenchmarks
GtCellBenchmarks.verifyError(dummy, dummy3);
end
function benchmarkSub(vols, newVols)
bytes = 2 * GtCellBenchmarks.getSizeVariable(vols);
% internal_cell_sum_assign.cpp
GtCellBenchmarks.printHeader('internal_cell_sub_[assign, copy]', bytes);
dummy2 = internal_cell_copy(vols);
tic();
dummy2 = internal_cell_sub_assign(dummy2, newVols);
GtCellBenchmarks.printResult('C (ass): %f', toc(), bytes);
dummy3 = internal_cell_copy(vols);
tic();
dummy3 = internal_cell_sub_copy(dummy3, newVols);
GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
dummy = vols;
tic();
dummy = cellfun(@(x, y){x - y}, dummy, newVols);
GtCellBenchmarks.printResult('cellfun: %f', toc(), bytes);
dummy = vols;
tic();
for ii = 1:length(dummy)
dummy{ii} = dummy{ii} - newVols{ii};
end
GtCellBenchmarks.printResult('for loop: %f', toc(), bytes);
GtCellBenchmarks.verifyError(dummy, dummy2);
GtCellBenchmarks.verifyError(dummy, dummy3);
end
function benchmarkScaledSum(vols, newVols, beta)
bytes = GtCellBenchmarks.getSizeVariable(vols);
bytes = 2 * GtCellBenchmarks.getSizeVariable(vols);
% internal_cell_sum_scaled_assign.cpp
GtCellBenchmarks.printHeader('internal_cell_sum_scaled_[assign, copy]', bytes);
dummy2 = internal_cell_copy(vols);
......@@ -165,7 +226,7 @@ classdef GtCellBenchmarks
tic();
dummy3 = internal_cell_sum_scaled_copy(dummy3, newVols, beta);
GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
dummy = vols;
tic();
dummy = cellfun(@(x, y){x + y * beta}, dummy, newVols);
......@@ -182,6 +243,64 @@ classdef GtCellBenchmarks
GtCellBenchmarks.verifyError(dummy, dummy3);
end
function benchmarkScaledSub(vols, newVols, beta)
bytes = 2 * GtCellBenchmarks.getSizeVariable(vols);
% internal_cell_sub_scaled_assign.cpp
GtCellBenchmarks.printHeader('internal_cell_sub_scaled_[assign, copy]', bytes);
dummy2 = internal_cell_copy(vols);
tic();
dummy2 = internal_cell_sub_scaled_assign(dummy2, newVols, beta);
GtCellBenchmarks.printResult('C (ass): %f', toc(), bytes);
dummy3 = internal_cell_copy(vols);
tic();
dummy3 = internal_cell_sub_scaled_copy(dummy3, newVols, beta);
GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
dummy = vols;
tic();
dummy = cellfun(@(x, y){x - y * beta}, dummy, newVols);
GtCellBenchmarks.printResult('cellfun: %f', toc(), bytes);
dummy = vols;
tic();
for ii = 1:length(dummy)
dummy{ii} = dummy{ii} - newVols{ii} * beta;
end
GtCellBenchmarks.printResult('for loop: %f', toc(), bytes);
GtCellBenchmarks.verifyError(dummy, dummy2);
GtCellBenchmarks.verifyError(dummy, dummy3);
end
function benchmarkScaledFISTASum(vols, newVols, beta)
bytes = 2 * GtCellBenchmarks.getSizeVariable(vols);
% internal_cell_sum_scaled_assign.cpp
GtCellBenchmarks.printHeader('internal_cell_sum_FISTA_scaled_[assign, copy]', bytes);
dummy2 = internal_cell_copy(vols);
tic();
dummy2 = internal_cell_sum_FISTA_scaled_assign(dummy2, newVols, beta);
GtCellBenchmarks.printResult('C (ass): %f', toc(), bytes);
% dummy3 = internal_cell_copy(vols);
% tic();
% dummy3 = internal_cell_sum_scaled_copy(dummy3, newVols, beta);
% GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
dummy = vols;
tic();
dummy = cellfun(@(x, y){x + (x - y) * beta}, dummy, newVols);
GtCellBenchmarks.printResult('cellfun: %f', toc(), bytes);
dummy = vols;
tic();
for ii = 1:length(dummy)
dummy{ii} = dummy{ii} + (dummy{ii} - newVols{ii}) * beta;
end
GtCellBenchmarks.printResult('for loop: %f', toc(), bytes);
GtCellBenchmarks.verifyError(dummy, dummy2);
% GtCellBenchmarks.verifyError(dummy, dummy3);
end
function allBenchmarks(vols)
beta = 2.0;
......@@ -191,12 +310,20 @@ classdef GtCellBenchmarks
GtCellBenchmarks.benchmarkProd(vols, newVols);
GtCellBenchmarks.benchmarkProdScalar(vols, beta);
GtCellBenchmarks.benchmarkSum(vols, newVols);
GtCellBenchmarks.benchmarkProdScalar(vols, beta);
GtCellBenchmarks.benchmarkSumScalar(vols, beta);
GtCellBenchmarks.benchmarkSub(vols, newVols);
GtCellBenchmarks.benchmarkScaledSum(vols, newVols, beta);
GtCellBenchmarks.benchmarkScaledSub(vols, newVols, beta);
GtCellBenchmarks.benchmarkScaledFISTASum(vols, newVols, beta);
% % internal_cell_exponent.cpp
% fprintf('\ninternal_cell_exponent\n');
% tic();
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment