Cell Operations: fixed copies and updated benchmarks

Signed-off-by: Nicola Vigano <nicola.vigano@esrf.fr> git-svn-id: https://svn.code.sf.net/p/dct/code/trunk@662 4c865b51-4357-4376-afb4-474e03ccb993

Cell Operations: fixed copies and updated benchmarks
2af16018 · Nicola Vigano · 9d8f8e49 · 2af16018 · 2af16018
Commit 2af16018 authored 12 years ago by Nicola Vigano
--- a/zUtil_Cxx/include/internal_cell_defs.h
+++ b/zUtil_Cxx/include/internal_cell_defs.h
@@ -12,126 +12,139 @@
 #include <omp.h>
 #include "mex.h"

-#if defined(__SSE2__) || defined(__SSE2_MATH__)
 # define ROUND_DOWN(x, s) ((x) & ~((s)-1))

+#if defined(__SSE2__) || defined(__SSE2_MATH__)
  typedef double v2df __attribute__ ((vector_size (16)));
 #endif

-template<typename TypeIn, typename TypeOut>
+template<typename Type1, typename Type2>
 class inner_assign {
 public:
-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
  {
-    return (inData);
+    return (inData2);
  }
 };

-template<typename TypeIn, typename TypeOut>
+template<typename Type1, typename Type2>
 class inner_pow {
  const double exponent;
 public:
  inner_pow(const double & _exp) : exponent(_exp) { }

-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
  {
-    return pow(inData, exponent);
+    return pow(inData2, exponent);
  }
 };

-template<typename TypeIn, typename TypeOut>
+template<typename Type1, typename Type2>
 class inner_sum {
 public:
-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
  {
-    return (outData + inData);
+    return (inData1 + inData2);
  }
 };

-template<typename TypeIn, typename TypeOut>
+template<typename Type1, typename Type2>
 class inner_prod {
 public:
-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
  {
-    return (outData * inData);
+    return (inData1 * inData2);
  }
 };

-template<typename TypeIn, typename TypeOut>
+template<typename Type1, typename Type2>
 class inner_sub {
 public:
-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
  {
-    return (outData - inData);
+    return (inData1 - inData2);
  }
 };

-template<typename TypeIn, typename TypeOut>
+template<typename Type1, typename Type2>
 class inner_div {
 public:
-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
  {
-    return (outData / inData);
+    return (inData1 / inData2);
  }
 };


-template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn>
+template<typename Type1, typename Type2, typename TypeScalar = Type1>
 class inner_sum_scale {
  const TypeScalar scale;
 public:
  inner_sum_scale(const TypeScalar & _scale) : scale(_scale) { }

-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
+  {
+    return (inData1 + scale * inData2);
+  }
+};
+
+template<typename Type1, typename Type2, typename TypeScalar = Type1>
+class inner_sum_FISTA_scale {
+  const TypeScalar scale;
+public:
+  inner_sum_FISTA_scale(const TypeScalar & _scale) : scale(_scale) { }
+
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
  {
-    return (outData + scale * inData);
+    return (inData1 + scale * (inData1 - inData2));
  }
 };

-template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn>
+template<typename Type1, typename Type2, typename TypeScalar = Type1>
 class inner_sub_scale {
  const TypeScalar scale;
 public:
  inner_sub_scale(const TypeScalar & _scale) : scale(_scale) { }

-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
  {
-    return (inData * scale - outData);
+    return (inData1 - inData2 * scale);
  }
 };


-template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn>
+template<typename Type1, typename Type2, typename TypeScalar = Type1>
 class inner_sum_scalar {
  const TypeScalar scalar;
 public:
  inner_sum_scalar(const TypeScalar & _scalar) : scalar(_scalar) { }

-  const TypeIn
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type1
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
  {
-    return (inData + scalar);
+    return (inData2 + scalar);
  }
 };

-template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn>
+template<typename Type1, typename Type2, typename TypeScalar = Type1>
 class inner_prod_scalar {
  const TypeScalar scalar;
 public:
  inner_prod_scalar(const TypeScalar & _scalar) : scalar(_scalar) { }

-  const TypeIn
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type1
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
  {
-    return (inData * scalar);
+    return (inData2 * scalar);
  }
 };

@@ -159,21 +172,23 @@ inline void
 cell_iteration_sse(mxArray * & outArray, const mxArray * const inArray,
    const Function & func, const Function2 & func2)
 {
-  const mwSize numCells = mxGetNumberOfElements(inArray);
  if (allocate) {
    cell_allocate<mxDOUBLE_CLASS>(outArray, inArray);
  }

-#pragma omp parallel for
-  for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) {
-    const mxArray * const inCell = mxGetCell(inArray, cellIdx);
-    mxArray * const outCell = mxGetCell(outArray, cellIdx);
+  const mwSize numCells = mxGetNumberOfElements(inArray);
+#pragma omp parallel
+  {
+    for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) {
+      const mxArray * const inCell = mxGetCell(inArray, cellIdx);
+      mxArray * const outCell = mxGetCell(outArray, cellIdx);

-    const mwSize numElems = mxGetNumberOfElements(inCell);
-    const double * const inData = mxGetPr(inCell);
-    double * const outData = mxGetPr(outCell);
+      const mwSize numElems = mxGetNumberOfElements(inCell);
+      const double * const inData = mxGetPr(inCell);
+      double * const outData = mxGetPr(outCell);

-    cell_inner_cycle_sse(outData, inData, outData, numElems, func, func2);
+      cell_inner_cycle_sse(outData, outData, inData, numElems, func, func2);
+    }
  }
 }

@@ -183,21 +198,23 @@ cell_iteration_copy_sse(mxArray * & outArray, const mxArray * const inArray1,
    const mxArray * const inArray2,
    const Function & func, const Function2 & func2)
 {
-  const mwSize numCells = mxGetNumberOfElements(inArray1);
  cell_allocate<mxDOUBLE_CLASS>(outArray, inArray1);

-#pragma omp parallel for
-  for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) {
-    const mxArray * const inCell1 = mxGetCell(inArray1, cellIdx);
-    const mxArray * const inCell2 = mxGetCell(inArray2, cellIdx);
-    mxArray * const outCell = mxGetCell(outArray, cellIdx);
+  const mwSize numCells = mxGetNumberOfElements(inArray1);
+#pragma omp parallel
+  {
+    for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) {
+      const mxArray * const inCell1 = mxGetCell(inArray1, cellIdx);
+      const mxArray * const inCell2 = mxGetCell(inArray2, cellIdx);
+      mxArray * const outCell = mxGetCell(outArray, cellIdx);

-    const mwSize numElems = mxGetNumberOfElements(inCell1);
-    const double * const inData1 = mxGetPr(inCell1);
-    const double * const inData2 = mxGetPr(inCell2);
-    double * const outData = mxGetPr(outCell);
+      const mwSize numElems = mxGetNumberOfElements(inCell1);
+      const double * const inData1 = mxGetPr(inCell1);
+      const double * const inData2 = mxGetPr(inCell2);
+      double * const outData = mxGetPr(outCell);

-    cell_inner_cycle_sse(outData, inData1, inData2, numElems, func, func2);
+      cell_inner_cycle_sse(outData, inData1, inData2, numElems, func, func2);
+    }
  }
 }

@@ -208,17 +225,26 @@ cell_inner_cycle_sse(Type * const __restrict outData,
    const Type * const __restrict inData2,
    const mwSize & numElems, FunctionVect & func, FunctionNonVect & func2)
 {
-  const mwSize num4Elems = ROUND_DOWN(numElems, 4);
-  for(mwIndex elemIdx = 0; elemIdx < num4Elems; elemIdx += 4) {
-    const v2df inV11  = *((v2df *)&inData1[elemIdx]);
-    const v2df inV12  = *((v2df *)&inData1[elemIdx+2]);
-    const v2df inV21 = *((v2df *)&inData2[elemIdx]);
+  const mwSize num8Elems = ROUND_DOWN(numElems, 8);
+#pragma omp for nowait
+  for(mwIndex elemIdx = 0; elemIdx < num8Elems; elemIdx += 8) {
+    const v2df inV11 = *((v2df *)&inData1[elemIdx+0]);
+    const v2df inV12 = *((v2df *)&inData1[elemIdx+2]);
+    const v2df inV13 = *((v2df *)&inData1[elemIdx+4]);
+    const v2df inV14 = *((v2df *)&inData1[elemIdx+6]);
+
+    const v2df inV21 = *((v2df *)&inData2[elemIdx+0]);
    const v2df inV22 = *((v2df *)&inData2[elemIdx+2]);
+    const v2df inV23 = *((v2df *)&inData2[elemIdx+4]);
+    const v2df inV24 = *((v2df *)&inData2[elemIdx+6]);

-    *((v2df *)&outData[elemIdx]) = func(inV11, inV21);
+    *((v2df *)&outData[elemIdx+0]) = func(inV11, inV21);
    *((v2df *)&outData[elemIdx+2]) = func(inV12, inV22);
+    *((v2df *)&outData[elemIdx+4]) = func(inV13, inV23);
+    *((v2df *)&outData[elemIdx+6]) = func(inV14, inV24);
  }
-  for(mwIndex elemIdx = num4Elems; elemIdx < numElems; elemIdx++) {
+#pragma omp for nowait
+  for(mwIndex elemIdx = num8Elems; elemIdx < numElems; elemIdx++) {
    outData[elemIdx] = func2(inData1[elemIdx], inData2[elemIdx]);
  }
 }
@@ -244,7 +270,7 @@ cell_iteration(mxArray * & outArray, const mxArray * const inArray,
    double * const outData = mxGetPr(outCell);

    for(mwIndex elemIdx = 0; elemIdx < numElems; elemIdx++) {
-      outData[elemIdx] = func(inData[elemIdx], outData[elemIdx]);
+      outData[elemIdx] = func(outData[elemIdx], inData[elemIdx]);
    }
  }
 }

--- a/zUtil_DataStructures/GtCellBenchmarks.m
+++ b/zUtil_DataStructures/GtCellBenchmarks.m
@@ -34,7 +34,10 @@ classdef GtCellBenchmarks
            GtCellBenchmarks.printHeader('internal_cell_copy', bytes);
            tic();
            newVols = internal_cell_copy(vols);
-            GtCellBenchmarks.printResult('cellfun:  %f', toc(), bytes);
+            GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
+            for ii = 1:length(newVols)
+                newVols{ii} = permute(newVols{ii}, [2 1 3]);
+            end
        end

        function benchmarkDiv(vols, newVols)
@@ -49,7 +52,7 @@ classdef GtCellBenchmarks
            tic();
            dummy3 = internal_cell_div_copy(dummy3, newVols);
            GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
-            
+
            dummy = vols;
            tic();
            dummy = cellfun(@(x, y){x ./ y}, dummy, newVols);
@@ -78,7 +81,7 @@ classdef GtCellBenchmarks
            tic();
            dummy3 = internal_cell_prod_copy(dummy3, newVols);
            GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
-            
+
            dummy = vols;
            tic();
            dummy = cellfun(@(x, y){x .* y}, dummy, newVols);
@@ -107,7 +110,7 @@ classdef GtCellBenchmarks
            tic();
            dummy3 = internal_cell_prod_scalar_copy(dummy3, beta);
            GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
-            
+
            dummy = vols;
            tic();
            dummy = cellfun(@(x){x * beta}, dummy);
@@ -124,6 +127,35 @@ classdef GtCellBenchmarks
            GtCellBenchmarks.verifyError(dummy, dummy3);
        end

+        function benchmarkSumScalar(vols, beta)
+            bytes = GtCellBenchmarks.getSizeVariable(vols);
+            % internal_cell_sum_scalar_assign.cpp
+            GtCellBenchmarks.printHeader('internal_cell_sum_scalar_[assign, copy]', bytes);
+            dummy2 = internal_cell_copy(vols);
+            tic();
+            dummy2 = internal_cell_sum_scalar_assign(dummy2, beta);
+            GtCellBenchmarks.printResult('C (ass):  %f', toc(), bytes);
+            dummy3 = internal_cell_copy(vols);
+            tic();
+            dummy3 = internal_cell_sum_scalar_copy(dummy3, beta);
+            GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            dummy = cellfun(@(x){x + beta}, dummy);
+            GtCellBenchmarks.printResult('cellfun:  %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            for ii = 1:length(dummy)
+                dummy{ii} = dummy{ii} + beta;
+            end
+            GtCellBenchmarks.printResult('for loop: %f', toc(), bytes);
+
+            GtCellBenchmarks.verifyError(dummy, dummy2);
+            GtCellBenchmarks.verifyError(dummy, dummy3);
+        end
+
        function benchmarkSum(vols, newVols)
            bytes = 2 * GtCellBenchmarks.getSizeVariable(vols);
            % internal_cell_sum_assign.cpp
@@ -136,7 +168,7 @@ classdef GtCellBenchmarks
            tic();
            dummy3 = internal_cell_sum_copy(dummy3, newVols);
            GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
-            
+
            dummy = vols;
            tic();
            dummy = cellfun(@(x, y){x + y}, dummy, newVols);
@@ -153,8 +185,37 @@ classdef GtCellBenchmarks
            GtCellBenchmarks.verifyError(dummy, dummy3);
        end

+        function benchmarkSub(vols, newVols)
+            bytes = 2 * GtCellBenchmarks.getSizeVariable(vols);
+            % internal_cell_sum_assign.cpp
+            GtCellBenchmarks.printHeader('internal_cell_sub_[assign, copy]', bytes);
+            dummy2 = internal_cell_copy(vols);
+            tic();
+            dummy2 = internal_cell_sub_assign(dummy2, newVols);
+            GtCellBenchmarks.printResult('C (ass):  %f', toc(), bytes);
+            dummy3 = internal_cell_copy(vols);
+            tic();
+            dummy3 = internal_cell_sub_copy(dummy3, newVols);
+            GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            dummy = cellfun(@(x, y){x - y}, dummy, newVols);
+            GtCellBenchmarks.printResult('cellfun:  %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            for ii = 1:length(dummy)
+                dummy{ii} = dummy{ii} - newVols{ii};
+            end
+            GtCellBenchmarks.printResult('for loop: %f', toc(), bytes);
+
+            GtCellBenchmarks.verifyError(dummy, dummy2);
+            GtCellBenchmarks.verifyError(dummy, dummy3);
+        end
+
        function benchmarkScaledSum(vols, newVols, beta)
-            bytes = GtCellBenchmarks.getSizeVariable(vols);
+            bytes = 2 * GtCellBenchmarks.getSizeVariable(vols);
            % internal_cell_sum_scaled_assign.cpp
            GtCellBenchmarks.printHeader('internal_cell_sum_scaled_[assign, copy]', bytes);
            dummy2 = internal_cell_copy(vols);
@@ -165,7 +226,7 @@ classdef GtCellBenchmarks
            tic();
            dummy3 = internal_cell_sum_scaled_copy(dummy3, newVols, beta);
            GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
-            
+
            dummy = vols;
            tic();
            dummy = cellfun(@(x, y){x + y * beta}, dummy, newVols);
@@ -182,6 +243,64 @@ classdef GtCellBenchmarks
            GtCellBenchmarks.verifyError(dummy, dummy3);
        end

+        function benchmarkScaledSub(vols, newVols, beta)
+            bytes = 2 * GtCellBenchmarks.getSizeVariable(vols);
+            % internal_cell_sub_scaled_assign.cpp
+            GtCellBenchmarks.printHeader('internal_cell_sub_scaled_[assign, copy]', bytes);
+            dummy2 = internal_cell_copy(vols);
+            tic();
+            dummy2 = internal_cell_sub_scaled_assign(dummy2, newVols, beta);
+            GtCellBenchmarks.printResult('C (ass):  %f', toc(), bytes);
+            dummy3 = internal_cell_copy(vols);
+            tic();
+            dummy3 = internal_cell_sub_scaled_copy(dummy3, newVols, beta);
+            GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            dummy = cellfun(@(x, y){x - y * beta}, dummy, newVols);
+            GtCellBenchmarks.printResult('cellfun:  %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            for ii = 1:length(dummy)
+                dummy{ii} = dummy{ii} - newVols{ii} * beta;
+            end
+            GtCellBenchmarks.printResult('for loop: %f', toc(), bytes);
+
+            GtCellBenchmarks.verifyError(dummy, dummy2);
+            GtCellBenchmarks.verifyError(dummy, dummy3);
+        end
+
+        function benchmarkScaledFISTASum(vols, newVols, beta)
+            bytes = 2 * GtCellBenchmarks.getSizeVariable(vols);
+            % internal_cell_sum_scaled_assign.cpp
+            GtCellBenchmarks.printHeader('internal_cell_sum_FISTA_scaled_[assign, copy]', bytes);
+            dummy2 = internal_cell_copy(vols);
+            tic();
+            dummy2 = internal_cell_sum_FISTA_scaled_assign(dummy2, newVols, beta);
+            GtCellBenchmarks.printResult('C (ass):  %f', toc(), bytes);
+%             dummy3 = internal_cell_copy(vols);
+%             tic();
+%             dummy3 = internal_cell_sum_scaled_copy(dummy3, newVols, beta);
+%             GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            dummy = cellfun(@(x, y){x + (x - y) * beta}, dummy, newVols);
+            GtCellBenchmarks.printResult('cellfun:  %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            for ii = 1:length(dummy)
+                dummy{ii} = dummy{ii} + (dummy{ii} - newVols{ii}) * beta;
+            end
+            GtCellBenchmarks.printResult('for loop: %f', toc(), bytes);
+
+            GtCellBenchmarks.verifyError(dummy, dummy2);
+%             GtCellBenchmarks.verifyError(dummy, dummy3);
+        end
+
        function allBenchmarks(vols)
            beta = 2.0;

@@ -191,12 +310,20 @@ classdef GtCellBenchmarks

            GtCellBenchmarks.benchmarkProd(vols, newVols);

+            GtCellBenchmarks.benchmarkProdScalar(vols, beta);
+
            GtCellBenchmarks.benchmarkSum(vols, newVols);

-            GtCellBenchmarks.benchmarkProdScalar(vols, beta);
+            GtCellBenchmarks.benchmarkSumScalar(vols, beta);
+
+            GtCellBenchmarks.benchmarkSub(vols, newVols);

            GtCellBenchmarks.benchmarkScaledSum(vols, newVols, beta);

+            GtCellBenchmarks.benchmarkScaledSub(vols, newVols, beta);
+
+            GtCellBenchmarks.benchmarkScaledFISTASum(vols, newVols, beta);
+
 %             % internal_cell_exponent.cpp
 %             fprintf('\ninternal_cell_exponent\n');
 %             tic();