diff --git a/zUtil_Cxx/include/internal_cell_defs.h b/zUtil_Cxx/include/internal_cell_defs.h
index daa7fc6ad5dff23cc85d4b82e49277dc3529a9a7..e52d42add95978d7d81f02d57cfa111cff8575aa 100755
--- a/zUtil_Cxx/include/internal_cell_defs.h
+++ b/zUtil_Cxx/include/internal_cell_defs.h
@@ -12,126 +12,139 @@
 #include <omp.h>
 #include "mex.h"
 
-#if defined(__SSE2__) || defined(__SSE2_MATH__)
 # define ROUND_DOWN(x, s) ((x) & ~((s)-1))
 
+#if defined(__SSE2__) || defined(__SSE2_MATH__)
   typedef double v2df __attribute__ ((vector_size (16)));
 #endif
 
-template<typename TypeIn, typename TypeOut>
+template<typename Type1, typename Type2>
 class inner_assign {
 public:
-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
   {
-    return (inData);
+    return (inData2);
   }
 };
 
-template<typename TypeIn, typename TypeOut>
+template<typename Type1, typename Type2>
 class inner_pow {
   const double exponent;
 public:
   inner_pow(const double & _exp) : exponent(_exp) { }
 
-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
   {
-    return pow(inData, exponent);
+    return pow(inData2, exponent);
   }
 };
 
-template<typename TypeIn, typename TypeOut>
+template<typename Type1, typename Type2>
 class inner_sum {
 public:
-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
   {
-    return (outData + inData);
+    return (inData1 + inData2);
   }
 };
 
-template<typename TypeIn, typename TypeOut>
+template<typename Type1, typename Type2>
 class inner_prod {
 public:
-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
   {
-    return (outData * inData);
+    return (inData1 * inData2);
   }
 };
 
-template<typename TypeIn, typename TypeOut>
+template<typename Type1, typename Type2>
 class inner_sub {
 public:
-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
   {
-    return (outData - inData);
+    return (inData1 - inData2);
   }
 };
 
-template<typename TypeIn, typename TypeOut>
+template<typename Type1, typename Type2>
 class inner_div {
 public:
-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
   {
-    return (outData / inData);
+    return (inData1 / inData2);
   }
 };
 
 
-template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn>
+template<typename Type1, typename Type2, typename TypeScalar = Type1>
 class inner_sum_scale {
   const TypeScalar scale;
 public:
   inner_sum_scale(const TypeScalar & _scale) : scale(_scale) { }
 
-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
+  {
+    return (inData1 + scale * inData2);
+  }
+};
+
+template<typename Type1, typename Type2, typename TypeScalar = Type1>
+class inner_sum_FISTA_scale {
+  const TypeScalar scale;
+public:
+  inner_sum_FISTA_scale(const TypeScalar & _scale) : scale(_scale) { }
+
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
   {
-    return (outData + scale * inData);
+    return (inData1 + scale * (inData1 - inData2));
   }
 };
 
-template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn>
+template<typename Type1, typename Type2, typename TypeScalar = Type1>
 class inner_sub_scale {
   const TypeScalar scale;
 public:
   inner_sub_scale(const TypeScalar & _scale) : scale(_scale) { }
 
-  const TypeOut
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type2
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
   {
-    return (inData * scale - outData);
+    return (inData1 - inData2 * scale);
   }
 };
 
 
-template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn>
+template<typename Type1, typename Type2, typename TypeScalar = Type1>
 class inner_sum_scalar {
   const TypeScalar scalar;
 public:
   inner_sum_scalar(const TypeScalar & _scalar) : scalar(_scalar) { }
 
-  const TypeIn
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type1
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
   {
-    return (inData + scalar);
+    return (inData2 + scalar);
   }
 };
 
-template<typename TypeIn, typename TypeOut, typename TypeScalar = TypeIn>
+template<typename Type1, typename Type2, typename TypeScalar = Type1>
 class inner_prod_scalar {
   const TypeScalar scalar;
 public:
   inner_prod_scalar(const TypeScalar & _scalar) : scalar(_scalar) { }
 
-  const TypeIn
-  operator()(TypeIn & inData, TypeOut & outData) const throw()
+  const Type1
+  operator()(Type1 & inData1, Type2 & inData2) const throw()
   {
-    return (inData * scalar);
+    return (inData2 * scalar);
   }
 };
 
@@ -159,21 +172,23 @@ inline void
 cell_iteration_sse(mxArray * & outArray, const mxArray * const inArray,
     const Function & func, const Function2 & func2)
 {
-  const mwSize numCells = mxGetNumberOfElements(inArray);
   if (allocate) {
     cell_allocate<mxDOUBLE_CLASS>(outArray, inArray);
   }
 
-#pragma omp parallel for
-  for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) {
-    const mxArray * const inCell = mxGetCell(inArray, cellIdx);
-    mxArray * const outCell = mxGetCell(outArray, cellIdx);
+  const mwSize numCells = mxGetNumberOfElements(inArray);
+#pragma omp parallel
+  {
+    for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) {
+      const mxArray * const inCell = mxGetCell(inArray, cellIdx);
+      mxArray * const outCell = mxGetCell(outArray, cellIdx);
 
-    const mwSize numElems = mxGetNumberOfElements(inCell);
-    const double * const inData = mxGetPr(inCell);
-    double * const outData = mxGetPr(outCell);
+      const mwSize numElems = mxGetNumberOfElements(inCell);
+      const double * const inData = mxGetPr(inCell);
+      double * const outData = mxGetPr(outCell);
 
-    cell_inner_cycle_sse(outData, inData, outData, numElems, func, func2);
+      cell_inner_cycle_sse(outData, outData, inData, numElems, func, func2);
+    }
   }
 }
 
@@ -183,21 +198,23 @@ cell_iteration_copy_sse(mxArray * & outArray, const mxArray * const inArray1,
     const mxArray * const inArray2,
     const Function & func, const Function2 & func2)
 {
-  const mwSize numCells = mxGetNumberOfElements(inArray1);
   cell_allocate<mxDOUBLE_CLASS>(outArray, inArray1);
 
-#pragma omp parallel for
-  for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) {
-    const mxArray * const inCell1 = mxGetCell(inArray1, cellIdx);
-    const mxArray * const inCell2 = mxGetCell(inArray2, cellIdx);
-    mxArray * const outCell = mxGetCell(outArray, cellIdx);
+  const mwSize numCells = mxGetNumberOfElements(inArray1);
+#pragma omp parallel
+  {
+    for(mwIndex cellIdx = 0; cellIdx < numCells; cellIdx++) {
+      const mxArray * const inCell1 = mxGetCell(inArray1, cellIdx);
+      const mxArray * const inCell2 = mxGetCell(inArray2, cellIdx);
+      mxArray * const outCell = mxGetCell(outArray, cellIdx);
 
-    const mwSize numElems = mxGetNumberOfElements(inCell1);
-    const double * const inData1 = mxGetPr(inCell1);
-    const double * const inData2 = mxGetPr(inCell2);
-    double * const outData = mxGetPr(outCell);
+      const mwSize numElems = mxGetNumberOfElements(inCell1);
+      const double * const inData1 = mxGetPr(inCell1);
+      const double * const inData2 = mxGetPr(inCell2);
+      double * const outData = mxGetPr(outCell);
 
-    cell_inner_cycle_sse(outData, inData1, inData2, numElems, func, func2);
+      cell_inner_cycle_sse(outData, inData1, inData2, numElems, func, func2);
+    }
   }
 }
 
@@ -208,17 +225,26 @@ cell_inner_cycle_sse(Type * const __restrict outData,
     const Type * const __restrict inData2,
     const mwSize & numElems, FunctionVect & func, FunctionNonVect & func2)
 {
-  const mwSize num4Elems = ROUND_DOWN(numElems, 4);
-  for(mwIndex elemIdx = 0; elemIdx < num4Elems; elemIdx += 4) {
-    const v2df inV11  = *((v2df *)&inData1[elemIdx]);
-    const v2df inV12  = *((v2df *)&inData1[elemIdx+2]);
-    const v2df inV21 = *((v2df *)&inData2[elemIdx]);
+  const mwSize num8Elems = ROUND_DOWN(numElems, 8);
+#pragma omp for nowait
+  for(mwIndex elemIdx = 0; elemIdx < num8Elems; elemIdx += 8) {
+    const v2df inV11 = *((v2df *)&inData1[elemIdx+0]);
+    const v2df inV12 = *((v2df *)&inData1[elemIdx+2]);
+    const v2df inV13 = *((v2df *)&inData1[elemIdx+4]);
+    const v2df inV14 = *((v2df *)&inData1[elemIdx+6]);
+
+    const v2df inV21 = *((v2df *)&inData2[elemIdx+0]);
     const v2df inV22 = *((v2df *)&inData2[elemIdx+2]);
+    const v2df inV23 = *((v2df *)&inData2[elemIdx+4]);
+    const v2df inV24 = *((v2df *)&inData2[elemIdx+6]);
 
-    *((v2df *)&outData[elemIdx]) = func(inV11, inV21);
+    *((v2df *)&outData[elemIdx+0]) = func(inV11, inV21);
     *((v2df *)&outData[elemIdx+2]) = func(inV12, inV22);
+    *((v2df *)&outData[elemIdx+4]) = func(inV13, inV23);
+    *((v2df *)&outData[elemIdx+6]) = func(inV14, inV24);
   }
-  for(mwIndex elemIdx = num4Elems; elemIdx < numElems; elemIdx++) {
+#pragma omp for nowait
+  for(mwIndex elemIdx = num8Elems; elemIdx < numElems; elemIdx++) {
     outData[elemIdx] = func2(inData1[elemIdx], inData2[elemIdx]);
   }
 }
@@ -244,7 +270,7 @@ cell_iteration(mxArray * & outArray, const mxArray * const inArray,
     double * const outData = mxGetPr(outCell);
 
     for(mwIndex elemIdx = 0; elemIdx < numElems; elemIdx++) {
-      outData[elemIdx] = func(inData[elemIdx], outData[elemIdx]);
+      outData[elemIdx] = func(outData[elemIdx], inData[elemIdx]);
     }
   }
 }
diff --git a/zUtil_DataStructures/GtCellBenchmarks.m b/zUtil_DataStructures/GtCellBenchmarks.m
index ab5296568a8de45399ddde44d174ab8bfe561762..38e3961b6533376ebba627501ffc724b0384893f 100644
--- a/zUtil_DataStructures/GtCellBenchmarks.m
+++ b/zUtil_DataStructures/GtCellBenchmarks.m
@@ -34,7 +34,10 @@ classdef GtCellBenchmarks
             GtCellBenchmarks.printHeader('internal_cell_copy', bytes);
             tic();
             newVols = internal_cell_copy(vols);
-            GtCellBenchmarks.printResult('cellfun:  %f', toc(), bytes);
+            GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
+            for ii = 1:length(newVols)
+                newVols{ii} = permute(newVols{ii}, [2 1 3]);
+            end
         end
 
         function benchmarkDiv(vols, newVols)
@@ -49,7 +52,7 @@ classdef GtCellBenchmarks
             tic();
             dummy3 = internal_cell_div_copy(dummy3, newVols);
             GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
-            
+
             dummy = vols;
             tic();
             dummy = cellfun(@(x, y){x ./ y}, dummy, newVols);
@@ -78,7 +81,7 @@ classdef GtCellBenchmarks
             tic();
             dummy3 = internal_cell_prod_copy(dummy3, newVols);
             GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
-            
+
             dummy = vols;
             tic();
             dummy = cellfun(@(x, y){x .* y}, dummy, newVols);
@@ -107,7 +110,7 @@ classdef GtCellBenchmarks
             tic();
             dummy3 = internal_cell_prod_scalar_copy(dummy3, beta);
             GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
-            
+
             dummy = vols;
             tic();
             dummy = cellfun(@(x){x * beta}, dummy);
@@ -124,6 +127,35 @@ classdef GtCellBenchmarks
             GtCellBenchmarks.verifyError(dummy, dummy3);
         end
 
+        function benchmarkSumScalar(vols, beta)
+            bytes = GtCellBenchmarks.getSizeVariable(vols);
+            % internal_cell_sum_scalar_assign.cpp
+            GtCellBenchmarks.printHeader('internal_cell_sum_scalar_[assign, copy]', bytes);
+            dummy2 = internal_cell_copy(vols);
+            tic();
+            dummy2 = internal_cell_sum_scalar_assign(dummy2, beta);
+            GtCellBenchmarks.printResult('C (ass):  %f', toc(), bytes);
+            dummy3 = internal_cell_copy(vols);
+            tic();
+            dummy3 = internal_cell_sum_scalar_copy(dummy3, beta);
+            GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            dummy = cellfun(@(x){x + beta}, dummy);
+            GtCellBenchmarks.printResult('cellfun:  %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            for ii = 1:length(dummy)
+                dummy{ii} = dummy{ii} + beta;
+            end
+            GtCellBenchmarks.printResult('for loop: %f', toc(), bytes);
+
+            GtCellBenchmarks.verifyError(dummy, dummy2);
+            GtCellBenchmarks.verifyError(dummy, dummy3);
+        end
+
         function benchmarkSum(vols, newVols)
             bytes = 2 * GtCellBenchmarks.getSizeVariable(vols);
             % internal_cell_sum_assign.cpp
@@ -136,7 +168,7 @@ classdef GtCellBenchmarks
             tic();
             dummy3 = internal_cell_sum_copy(dummy3, newVols);
             GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
-            
+
             dummy = vols;
             tic();
             dummy = cellfun(@(x, y){x + y}, dummy, newVols);
@@ -153,8 +185,37 @@ classdef GtCellBenchmarks
             GtCellBenchmarks.verifyError(dummy, dummy3);
         end
 
+        function benchmarkSub(vols, newVols)
+            bytes = 2 * GtCellBenchmarks.getSizeVariable(vols);
+            % internal_cell_sum_assign.cpp
+            GtCellBenchmarks.printHeader('internal_cell_sub_[assign, copy]', bytes);
+            dummy2 = internal_cell_copy(vols);
+            tic();
+            dummy2 = internal_cell_sub_assign(dummy2, newVols);
+            GtCellBenchmarks.printResult('C (ass):  %f', toc(), bytes);
+            dummy3 = internal_cell_copy(vols);
+            tic();
+            dummy3 = internal_cell_sub_copy(dummy3, newVols);
+            GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            dummy = cellfun(@(x, y){x - y}, dummy, newVols);
+            GtCellBenchmarks.printResult('cellfun:  %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            for ii = 1:length(dummy)
+                dummy{ii} = dummy{ii} - newVols{ii};
+            end
+            GtCellBenchmarks.printResult('for loop: %f', toc(), bytes);
+
+            GtCellBenchmarks.verifyError(dummy, dummy2);
+            GtCellBenchmarks.verifyError(dummy, dummy3);
+        end
+
         function benchmarkScaledSum(vols, newVols, beta)
-            bytes = GtCellBenchmarks.getSizeVariable(vols);
+            bytes = 2 * GtCellBenchmarks.getSizeVariable(vols);
             % internal_cell_sum_scaled_assign.cpp
             GtCellBenchmarks.printHeader('internal_cell_sum_scaled_[assign, copy]', bytes);
             dummy2 = internal_cell_copy(vols);
@@ -165,7 +226,7 @@ classdef GtCellBenchmarks
             tic();
             dummy3 = internal_cell_sum_scaled_copy(dummy3, newVols, beta);
             GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
-            
+
             dummy = vols;
             tic();
             dummy = cellfun(@(x, y){x + y * beta}, dummy, newVols);
@@ -182,6 +243,64 @@ classdef GtCellBenchmarks
             GtCellBenchmarks.verifyError(dummy, dummy3);
         end
 
+        function benchmarkScaledSub(vols, newVols, beta)
+            bytes = 2 * GtCellBenchmarks.getSizeVariable(vols);
+            % internal_cell_sub_scaled_assign.cpp
+            GtCellBenchmarks.printHeader('internal_cell_sub_scaled_[assign, copy]', bytes);
+            dummy2 = internal_cell_copy(vols);
+            tic();
+            dummy2 = internal_cell_sub_scaled_assign(dummy2, newVols, beta);
+            GtCellBenchmarks.printResult('C (ass):  %f', toc(), bytes);
+            dummy3 = internal_cell_copy(vols);
+            tic();
+            dummy3 = internal_cell_sub_scaled_copy(dummy3, newVols, beta);
+            GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            dummy = cellfun(@(x, y){x - y * beta}, dummy, newVols);
+            GtCellBenchmarks.printResult('cellfun:  %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            for ii = 1:length(dummy)
+                dummy{ii} = dummy{ii} - newVols{ii} * beta;
+            end
+            GtCellBenchmarks.printResult('for loop: %f', toc(), bytes);
+
+            GtCellBenchmarks.verifyError(dummy, dummy2);
+            GtCellBenchmarks.verifyError(dummy, dummy3);
+        end
+
+        function benchmarkScaledFISTASum(vols, newVols, beta)
+            bytes = 2 * GtCellBenchmarks.getSizeVariable(vols);
+            % internal_cell_sum_scaled_assign.cpp
+            GtCellBenchmarks.printHeader('internal_cell_sum_FISTA_scaled_[assign, copy]', bytes);
+            dummy2 = internal_cell_copy(vols);
+            tic();
+            dummy2 = internal_cell_sum_FISTA_scaled_assign(dummy2, newVols, beta);
+            GtCellBenchmarks.printResult('C (ass):  %f', toc(), bytes);
+%             dummy3 = internal_cell_copy(vols);
+%             tic();
+%             dummy3 = internal_cell_sum_scaled_copy(dummy3, newVols, beta);
+%             GtCellBenchmarks.printResult('C (copy): %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            dummy = cellfun(@(x, y){x + (x - y) * beta}, dummy, newVols);
+            GtCellBenchmarks.printResult('cellfun:  %f', toc(), bytes);
+
+            dummy = vols;
+            tic();
+            for ii = 1:length(dummy)
+                dummy{ii} = dummy{ii} + (dummy{ii} - newVols{ii}) * beta;
+            end
+            GtCellBenchmarks.printResult('for loop: %f', toc(), bytes);
+
+            GtCellBenchmarks.verifyError(dummy, dummy2);
+%             GtCellBenchmarks.verifyError(dummy, dummy3);
+        end
+
         function allBenchmarks(vols)
             beta = 2.0;
 
@@ -191,12 +310,20 @@ classdef GtCellBenchmarks
 
             GtCellBenchmarks.benchmarkProd(vols, newVols);
 
+            GtCellBenchmarks.benchmarkProdScalar(vols, beta);
+
             GtCellBenchmarks.benchmarkSum(vols, newVols);
 
-            GtCellBenchmarks.benchmarkProdScalar(vols, beta);
+            GtCellBenchmarks.benchmarkSumScalar(vols, beta);
+
+            GtCellBenchmarks.benchmarkSub(vols, newVols);
 
             GtCellBenchmarks.benchmarkScaledSum(vols, newVols, beta);
 
+            GtCellBenchmarks.benchmarkScaledSub(vols, newVols, beta);
+
+            GtCellBenchmarks.benchmarkScaledFISTASum(vols, newVols, beta);
+
 %             % internal_cell_exponent.cpp
 %             fprintf('\ninternal_cell_exponent\n');
 %             tic();