From 8eaf1c41349ad264420bf63373d6f989be04df05 Mon Sep 17 00:00:00 2001
From: Nicola Vigano <nicola.vigano@esrf.fr>
Date: Mon, 13 Apr 2015 15:59:52 +0200
Subject: [PATCH] 6D-C++-vectorization: round of simplifications, inlining and
 small tweaks

Signed-off-by: Nicola Vigano <nicola.vigano@esrf.fr>
---
 zUtil_Cxx/include/gt6DUpdateDualDetectorOps.h | 39 +++++----
 zUtil_Cxx/include/gt6DUpdateDualL1Ops.h       | 12 +--
 zUtil_Cxx/include/gt6DUpdateDualTVOps.h       | 12 +--
 zUtil_Cxx/include/gt6DUpdatePrimalOps.h       | 38 ++++----
 zUtil_Cxx/include/internal_cell_defs.h        | 87 ++++++++++---------
 5 files changed, 102 insertions(+), 86 deletions(-)

diff --git a/zUtil_Cxx/include/gt6DUpdateDualDetectorOps.h b/zUtil_Cxx/include/gt6DUpdateDualDetectorOps.h
index c40e0ac3..19795d28 100644
--- a/zUtil_Cxx/include/gt6DUpdateDualDetectorOps.h
+++ b/zUtil_Cxx/include/gt6DUpdateDualDetectorOps.h
@@ -16,12 +16,12 @@ namespace GT6D {
 
 #define APPLY_FUNC_5FOLD_DUAL_DETECTOR(shift_val) \
 { \
-  const vVvf inV11 = access.load(&in_data1[elemIdx + shift_val * simd_unroll.shift]);\
-  const vVvf inV21 = access.load(&in_data2[elemIdx + shift_val * simd_unroll.shift]);\
-  const vVvf inV31 = access.load(&in_data3[elemIdx + shift_val * simd_unroll.shift]);\
-  const vVvf inV41 = access.load(&in_data4[elemIdx + shift_val * simd_unroll.shift]);\
-  const vVvf inV51 = access.load(&in_data5[elemIdx + shift_val * simd_unroll.shift]);\
-  access.store(&out_data[elemIdx + shift_val * simd_unroll.shift], func(inV11, inV21, inV31, inV41, inV51));\
+  const vVvf inV11 = access.load(in1 + shift_val * simd_8.shift);\
+  const vVvf inV21 = access.load(in2 + shift_val * simd_8.shift);\
+  const vVvf inV31 = access.load(in3 + shift_val * simd_8.shift);\
+  const vVvf inV41 = access.load(in4 + shift_val * simd_8.shift);\
+  const vVvf inV51 = access.load(in5 + shift_val * simd_8.shift);\
+  access.store(out + shift_val * simd_8.shift, func(inV11, inV21, inV31, inV41, inV51));\
 }
 
   const char * dual_detector_error_id = "C_FUN:gt6DUpdateDualDetector:wrong_argument";
@@ -156,20 +156,28 @@ namespace GT6D {
       const Type * const __restrict in_data3,
       const Type * const __restrict in_data4,
       const Type * const __restrict in_data5,
-      const mwSize & numElems, Function & func)
+      const mwSize & num_elems, Function & func)
   {
     typedef typename SIMDUnrolling<Type>::vVvf vVvf;
 
-    const mwSize unrolling = 8;
-    const SIMDUnrolling<Type> simd_unroll(unrolling);
-    const SIMDUnrolling<Type> simd(1);
+    const SIMDUnrolling<Type> simd_8(8);
+    const SIMDUnrolling<Type> simd_1(1);
 
-    AccessType access;
+    const mwSize num_elems_unroll_8 = simd_8.get_unroll(num_elems);
+    const mwSize num_elems_unroll_1 = simd_1.get_unroll(num_elems);
+
+    const AccessType access;
 
   #pragma omp for nowait
-    for(mwIndex elemIdx = 0; elemIdx < simd_unroll.get_unroll(numElems);
-        elemIdx += simd_unroll.block)
+    for(mwIndex elemIdx = 0; elemIdx < num_elems_unroll_8; elemIdx += simd_8.block)
     {
+      const Type * const in1 = in_data1 + elemIdx;
+      const Type * const in2 = in_data2 + elemIdx;
+      const Type * const in3 = in_data3 + elemIdx;
+      const Type * const in4 = in_data4 + elemIdx;
+      const Type * const in5 = in_data5 + elemIdx;
+      Type * const out = out_data + elemIdx;
+
       APPLY_FUNC_5FOLD_DUAL_DETECTOR(0);
       APPLY_FUNC_5FOLD_DUAL_DETECTOR(1);
       APPLY_FUNC_5FOLD_DUAL_DETECTOR(2);
@@ -180,8 +188,7 @@ namespace GT6D {
       APPLY_FUNC_5FOLD_DUAL_DETECTOR(7);
     }
   #pragma omp for nowait
-    for(mwIndex elemIdx = simd_unroll.get_unroll(numElems);
-        elemIdx < simd.get_unroll(numElems); elemIdx += simd.block)
+    for(mwIndex elemIdx = num_elems_unroll_8; elemIdx < num_elems_unroll_1; elemIdx += simd_1.block)
     {
       const vVvf inV11 = access.load(&in_data1[elemIdx]);
       const vVvf inV21 = access.load(&in_data2[elemIdx]);
@@ -192,7 +199,7 @@ namespace GT6D {
       access.store(&out_data[elemIdx], func(inV11, inV21, inV31, inV41, inV51));
     }
   #pragma omp for nowait
-    for(mwIndex elemIdx = simd.get_unroll(numElems); elemIdx < numElems; elemIdx++)
+    for(mwIndex elemIdx = num_elems_unroll_1; elemIdx < num_elems; elemIdx++)
     {
       out_data[elemIdx] = func(in_data1[elemIdx], in_data2[elemIdx], in_data3[elemIdx], in_data4[elemIdx], in_data5[elemIdx]);
     }
diff --git a/zUtil_Cxx/include/gt6DUpdateDualL1Ops.h b/zUtil_Cxx/include/gt6DUpdateDualL1Ops.h
index 6d45db93..c53310ac 100644
--- a/zUtil_Cxx/include/gt6DUpdateDualL1Ops.h
+++ b/zUtil_Cxx/include/gt6DUpdateDualL1Ops.h
@@ -68,7 +68,7 @@ namespace GT6D {
   { }
 
   template<>
-  const update_dual_l1<float>::vVvf
+  inline const update_dual_l1<float>::vVvf
   update_dual_l1<float>::abs(
       const update_dual_l1<float>::vVvf & val)
   const throw()
@@ -78,7 +78,7 @@ namespace GT6D {
   }
 
   template<>
-  const update_dual_l1<double>::vVvf
+  inline const update_dual_l1<double>::vVvf
   update_dual_l1<double>::abs(
       const update_dual_l1<double>::vVvf & val)
   const throw()
@@ -88,7 +88,7 @@ namespace GT6D {
   }
 
   template<>
-  const float
+  inline const float
   update_dual_l1<float>::abs(const float & val)
   const throw()
   {
@@ -96,7 +96,7 @@ namespace GT6D {
   }
 
   template<>
-  const double
+  inline const double
   update_dual_l1<double>::abs(const double & val)
   const throw()
   {
@@ -104,7 +104,7 @@ namespace GT6D {
   }
 
   template<>
-  const update_dual_l1<float>::vVvf
+  inline const update_dual_l1<float>::vVvf
   update_dual_l1<float>::max(
       const update_dual_l1<float>::vVvf & val1,
       const update_dual_l1<float>::vVvf & val2)
@@ -115,7 +115,7 @@ namespace GT6D {
   }
 
   template<>
-  const update_dual_l1<double>::vVvf
+  inline const update_dual_l1<double>::vVvf
   update_dual_l1<double>::max(
       const update_dual_l1<double>::vVvf & val1,
       const update_dual_l1<double>::vVvf & val2)
diff --git a/zUtil_Cxx/include/gt6DUpdateDualTVOps.h b/zUtil_Cxx/include/gt6DUpdateDualTVOps.h
index 9e9807ef..b405834b 100644
--- a/zUtil_Cxx/include/gt6DUpdateDualTVOps.h
+++ b/zUtil_Cxx/include/gt6DUpdateDualTVOps.h
@@ -63,7 +63,7 @@ namespace GT6D {
   { }
 
   template<>
-  const update_dual_tv<float>::vVvf
+  inline const update_dual_tv<float>::vVvf
   update_dual_tv<float>::abs(
       const update_dual_tv<float>::vVvf & val)
   const throw()
@@ -73,7 +73,7 @@ namespace GT6D {
   }
 
   template<>
-  const update_dual_tv<double>::vVvf
+  inline const update_dual_tv<double>::vVvf
   update_dual_tv<double>::abs(
       const update_dual_tv<double>::vVvf & val)
   const throw()
@@ -83,7 +83,7 @@ namespace GT6D {
   }
 
   template<>
-  const float
+  inline const float
   update_dual_tv<float>::abs(const float & val)
   const throw()
   {
@@ -91,7 +91,7 @@ namespace GT6D {
   }
 
   template<>
-  const double
+  inline const double
   update_dual_tv<double>::abs(const double & val)
   const throw()
   {
@@ -99,7 +99,7 @@ namespace GT6D {
   }
 
   template<>
-  const update_dual_tv<float>::vVvf
+  inline const update_dual_tv<float>::vVvf
   update_dual_tv<float>::max(
       const update_dual_tv<float>::vVvf & val1,
       const update_dual_tv<float>::vVvf & val2)
@@ -110,7 +110,7 @@ namespace GT6D {
   }
 
   template<>
-  const update_dual_tv<double>::vVvf
+  inline const update_dual_tv<double>::vVvf
   update_dual_tv<double>::max(
       const update_dual_tv<double>::vVvf & val1,
       const update_dual_tv<double>::vVvf & val2)
diff --git a/zUtil_Cxx/include/gt6DUpdatePrimalOps.h b/zUtil_Cxx/include/gt6DUpdatePrimalOps.h
index b6a9e8a4..6231930c 100644
--- a/zUtil_Cxx/include/gt6DUpdatePrimalOps.h
+++ b/zUtil_Cxx/include/gt6DUpdatePrimalOps.h
@@ -16,14 +16,14 @@ namespace GT6D {
 
 #define APPLY_FUNC_5FOLD_PRIMAL(shift_val) \
 { \
-  vVvf inV11 = access.load(&out_data1[elemIdx + shift_val * simd_unroll.shift]);\
-  vVvf inV21 = access.load(&out_data2[elemIdx + shift_val * simd_unroll.shift]);\
-  const vVvf inV31 = access.load(&in_data3[elemIdx + shift_val * simd_unroll.shift]);\
-  const vVvf inV41 = access.load(&in_data4[elemIdx + shift_val * simd_unroll.shift]);\
-  const vVvf inV51 = access.load(&in_data5[elemIdx + shift_val * simd_unroll.shift]);\
+  vVvf inV11 = access.load(out1 + shift_val * simd_8.shift);\
+  vVvf inV21 = access.load(out2 + shift_val * simd_8.shift);\
+  const vVvf inV31 = access.load(in3 + shift_val * simd_8.shift);\
+  const vVvf inV41 = access.load(in4 + shift_val * simd_8.shift);\
+  const vVvf inV51 = access.load(in5 + shift_val * simd_8.shift);\
   func(inV11, inV21, inV31, inV41, inV51);\
-  access.store(&out_data1[elemIdx + shift_val * simd_unroll.shift], inV11);\
-  access.store(&out_data2[elemIdx + shift_val * simd_unroll.shift], inV21);\
+  access.store(out1 + shift_val * simd_8.shift, inV11);\
+  access.store(out2 + shift_val * simd_8.shift, inV21);\
 }
 
   const char * primal_error_id = "C_FUN:gt6DUpdatePrimal:wrong_argument";
@@ -104,20 +104,27 @@ namespace GT6D {
       const Type * const __restrict in_data3,
       const Type * const __restrict in_data4,
       const Type * const __restrict in_data5,
-      const mwSize & numElems, Function & func)
+      const mwSize & num_elems, Function & func)
   {
     typedef typename SIMDUnrolling<Type>::vVvf vVvf;
 
-    const mwSize unrolling = 8;
-    const SIMDUnrolling<Type> simd_unroll(unrolling);
-    const SIMDUnrolling<Type> simd(1);
+    const SIMDUnrolling<Type> simd_8(8);
+    const SIMDUnrolling<Type> simd_1(1);
+
+    const mwSize num_elems_unroll_8 = simd_8.get_unroll(num_elems);
+    const mwSize num_elems_unroll_1 = simd_1.get_unroll(num_elems);
 
     AccessType access;
 
   #pragma omp for nowait
-    for(mwIndex elemIdx = 0; elemIdx < simd_unroll.get_unroll(numElems);
-        elemIdx += simd_unroll.block)
+    for(mwIndex elemIdx = 0; elemIdx < num_elems_unroll_8; elemIdx += simd_8.block)
     {
+      Type * const out1 = out_data1 + elemIdx;
+      Type * const out2 = out_data2 + elemIdx;
+      const Type * const in3 = in_data3 + elemIdx;
+      const Type * const in4 = in_data4 + elemIdx;
+      const Type * const in5 = in_data5 + elemIdx;
+
       APPLY_FUNC_5FOLD_PRIMAL(0);
       APPLY_FUNC_5FOLD_PRIMAL(1);
       APPLY_FUNC_5FOLD_PRIMAL(2);
@@ -128,8 +135,7 @@ namespace GT6D {
       APPLY_FUNC_5FOLD_PRIMAL(7);
     }
   #pragma omp for nowait
-    for(mwIndex elemIdx = simd_unroll.get_unroll(numElems);
-        elemIdx < simd.get_unroll(numElems); elemIdx += simd.block)
+    for(mwIndex elemIdx = num_elems_unroll_8; elemIdx < num_elems_unroll_1; elemIdx += simd_1.block)
     {
       vVvf inV11 = access.load(&out_data1[elemIdx]);
       vVvf inV21 = access.load(&out_data2[elemIdx]);
@@ -144,7 +150,7 @@ namespace GT6D {
       access.store(&out_data2[elemIdx], inV21);
     }
   #pragma omp for nowait
-    for(mwIndex elemIdx = simd.get_unroll(numElems); elemIdx < numElems; elemIdx++)
+    for(mwIndex elemIdx = num_elems_unroll_1; elemIdx < num_elems; elemIdx++)
     {
        func(out_data1[elemIdx], out_data2[elemIdx], in_data3[elemIdx], in_data4[elemIdx], in_data5[elemIdx]);
     }
diff --git a/zUtil_Cxx/include/internal_cell_defs.h b/zUtil_Cxx/include/internal_cell_defs.h
index 72db4d96..8b076165 100644
--- a/zUtil_Cxx/include/internal_cell_defs.h
+++ b/zUtil_Cxx/include/internal_cell_defs.h
@@ -477,7 +477,7 @@ protected:
 };
 
 template<>
-const inner_non_neg<float>::vVvf
+inline const inner_non_neg<float>::vVvf
 inner_non_neg<float>::operator ()(
     const inner_non_neg<float>::vVvf & inData1,
     const inner_non_neg<float>::vVvf & inData2)
@@ -492,7 +492,7 @@ const throw()
 }
 
 template<>
-const inner_non_neg<double>::vVvf
+inline const inner_non_neg<double>::vVvf
 inner_non_neg<double>::operator ()(
     const inner_non_neg<double>::vVvf & inData1,
     const inner_non_neg<double>::vVvf & inData2)
@@ -530,7 +530,7 @@ protected:
 };
 
 template<>
-const inner_sum_FISTA_scale_non_neg<float>::vVvf
+inline const inner_sum_FISTA_scale_non_neg<float>::vVvf
 inner_sum_FISTA_scale_non_neg<float>::operator ()(
     const inner_sum_FISTA_scale_non_neg<float>::vVvf & inData1,
     const inner_sum_FISTA_scale_non_neg<float>::vVvf & inData2)
@@ -546,7 +546,7 @@ const throw()
 }
 
 template<>
-const inner_sum_FISTA_scale_non_neg<double>::vVvf
+inline const inner_sum_FISTA_scale_non_neg<double>::vVvf
 inner_sum_FISTA_scale_non_neg<double>::operator ()(
     const inner_sum_FISTA_scale_non_neg<double>::vVvf & inData1,
     const inner_sum_FISTA_scale_non_neg<double>::vVvf & inData2)
@@ -724,49 +724,52 @@ cell_inner_cycle_sse(Type * const __restrict outData,
 {
   typedef typename SIMDUnrolling<Type>::vVvf vVvf;
 
-  const mwSize unrolling = 8;
-  const SIMDUnrolling<Type> simd_unroll(unrolling);
-  const SIMDUnrolling<Type> simd(1);
+  const SIMDUnrolling<Type> simd_8(8);
+  const SIMDUnrolling<Type> simd_1(1);
 
   AccessType access;
 
 #pragma omp for nowait
-  for(mwIndex elemIdx = 0; elemIdx < simd_unroll.get_unroll(numElems);
-      elemIdx += simd_unroll.block)
-  {
-    const vVvf inV11 = access.load(&inData1[elemIdx + 0 * simd_unroll.shift]);
-    const vVvf inV12 = access.load(&inData1[elemIdx + 1 * simd_unroll.shift]);
-    const vVvf inV13 = access.load(&inData1[elemIdx + 2 * simd_unroll.shift]);
-    const vVvf inV14 = access.load(&inData1[elemIdx + 3 * simd_unroll.shift]);
-
-    const vVvf inV21 = access.load(&inData2[elemIdx + 0 * simd_unroll.shift]);
-    const vVvf inV22 = access.load(&inData2[elemIdx + 1 * simd_unroll.shift]);
-    const vVvf inV23 = access.load(&inData2[elemIdx + 2 * simd_unroll.shift]);
-    const vVvf inV24 = access.load(&inData2[elemIdx + 3 * simd_unroll.shift]);
-
-    access.store(&outData[elemIdx + 0 * simd_unroll.shift], func(inV11, inV21));
-    access.store(&outData[elemIdx + 1 * simd_unroll.shift], func(inV12, inV22));
-    access.store(&outData[elemIdx + 2 * simd_unroll.shift], func(inV13, inV23));
-    access.store(&outData[elemIdx + 3 * simd_unroll.shift], func(inV14, inV24));
-
-    const vVvf inV31 = access.load(&inData1[elemIdx + 4 * simd_unroll.shift]);
-    const vVvf inV32 = access.load(&inData1[elemIdx + 5 * simd_unroll.shift]);
-    const vVvf inV33 = access.load(&inData1[elemIdx + 6 * simd_unroll.shift]);
-    const vVvf inV34 = access.load(&inData1[elemIdx + 7 * simd_unroll.shift]);
-
-    const vVvf inV41 = access.load(&inData2[elemIdx + 4 * simd_unroll.shift]);
-    const vVvf inV42 = access.load(&inData2[elemIdx + 5 * simd_unroll.shift]);
-    const vVvf inV43 = access.load(&inData2[elemIdx + 6 * simd_unroll.shift]);
-    const vVvf inV44 = access.load(&inData2[elemIdx + 7 * simd_unroll.shift]);
-
-    access.store(&outData[elemIdx + 4 * simd_unroll.shift], func(inV31, inV41));
-    access.store(&outData[elemIdx + 5 * simd_unroll.shift], func(inV32, inV42));
-    access.store(&outData[elemIdx + 6 * simd_unroll.shift], func(inV33, inV43));
-    access.store(&outData[elemIdx + 7 * simd_unroll.shift], func(inV34, inV44));
+  for(mwIndex elemIdx = 0; elemIdx < simd_8.get_unroll(numElems);
+      elemIdx += simd_8.block)
+  {
+    const Type * const in1 = inData1 + elemIdx;
+    const Type * const in2 = inData2 + elemIdx;
+    Type * const out = outData + elemIdx;
+
+    const vVvf inV11 = access.load(in1 + 0 * simd_8.shift);
+    const vVvf inV12 = access.load(in1 + 1 * simd_8.shift);
+    const vVvf inV13 = access.load(in1 + 2 * simd_8.shift);
+    const vVvf inV14 = access.load(in1 + 3 * simd_8.shift);
+
+    const vVvf inV21 = access.load(in2 + 0 * simd_8.shift);
+    const vVvf inV22 = access.load(in2 + 1 * simd_8.shift);
+    const vVvf inV23 = access.load(in2 + 2 * simd_8.shift);
+    const vVvf inV24 = access.load(in2 + 3 * simd_8.shift);
+
+    access.store(out + 0 * simd_8.shift, func(inV11, inV21));
+    access.store(out + 1 * simd_8.shift, func(inV12, inV22));
+    access.store(out + 2 * simd_8.shift, func(inV13, inV23));
+    access.store(out + 3 * simd_8.shift, func(inV14, inV24));
+
+    const vVvf inV31 = access.load(in1 + 4 * simd_8.shift);
+    const vVvf inV32 = access.load(in1 + 5 * simd_8.shift);
+    const vVvf inV33 = access.load(in1 + 6 * simd_8.shift);
+    const vVvf inV34 = access.load(in1 + 7 * simd_8.shift);
+
+    const vVvf inV41 = access.load(in2 + 4 * simd_8.shift);
+    const vVvf inV42 = access.load(in2 + 5 * simd_8.shift);
+    const vVvf inV43 = access.load(in2 + 6 * simd_8.shift);
+    const vVvf inV44 = access.load(in2 + 7 * simd_8.shift);
+
+    access.store(out + 4 * simd_8.shift, func(inV31, inV41));
+    access.store(out + 5 * simd_8.shift, func(inV32, inV42));
+    access.store(out + 6 * simd_8.shift, func(inV33, inV43));
+    access.store(out + 7 * simd_8.shift, func(inV34, inV44));
   }
 #pragma omp for nowait
-  for(mwIndex elemIdx = simd_unroll.get_unroll(numElems);
-      elemIdx < simd.get_unroll(numElems); elemIdx += simd.block)
+  for(mwIndex elemIdx = simd_8.get_unroll(numElems);
+      elemIdx < simd_1.get_unroll(numElems); elemIdx += simd_1.block)
   {
     const vVvf inV11 = access.load(&inData1[elemIdx]);
     const vVvf inV21 = access.load(&inData2[elemIdx]);
@@ -774,7 +777,7 @@ cell_inner_cycle_sse(Type * const __restrict outData,
     access.store(&outData[elemIdx], func(inV11, inV21));
   }
 #pragma omp for nowait
-  for(mwIndex elemIdx = simd.get_unroll(numElems); elemIdx < numElems; elemIdx++)
+  for(mwIndex elemIdx = simd_1.get_unroll(numElems); elemIdx < numElems; elemIdx++)
   {
     outData[elemIdx] = func(inData1[elemIdx], inData2[elemIdx]);
   }
-- 
GitLab