From 8eaf1c41349ad264420bf63373d6f989be04df05 Mon Sep 17 00:00:00 2001 From: Nicola Vigano <nicola.vigano@esrf.fr> Date: Mon, 13 Apr 2015 15:59:52 +0200 Subject: [PATCH] 6D-C++-vectorization: round of simplifications/inlining/small tweaking Signed-off-by: Nicola Vigano <nicola.vigano@esrf.fr> --- zUtil_Cxx/include/gt6DUpdateDualDetectorOps.h | 39 +++++---- zUtil_Cxx/include/gt6DUpdateDualL1Ops.h | 12 +-- zUtil_Cxx/include/gt6DUpdateDualTVOps.h | 12 +-- zUtil_Cxx/include/gt6DUpdatePrimalOps.h | 38 ++++---- zUtil_Cxx/include/internal_cell_defs.h | 87 ++++++++++--------- 5 files changed, 102 insertions(+), 86 deletions(-) diff --git a/zUtil_Cxx/include/gt6DUpdateDualDetectorOps.h b/zUtil_Cxx/include/gt6DUpdateDualDetectorOps.h index c40e0ac3..19795d28 100644 --- a/zUtil_Cxx/include/gt6DUpdateDualDetectorOps.h +++ b/zUtil_Cxx/include/gt6DUpdateDualDetectorOps.h @@ -16,12 +16,12 @@ namespace GT6D { #define APPLY_FUNC_5FOLD_DUAL_DETECTOR(shift_val) \ { \ - const vVvf inV11 = access.load(&in_data1[elemIdx + shift_val * simd_unroll.shift]);\ - const vVvf inV21 = access.load(&in_data2[elemIdx + shift_val * simd_unroll.shift]);\ - const vVvf inV31 = access.load(&in_data3[elemIdx + shift_val * simd_unroll.shift]);\ - const vVvf inV41 = access.load(&in_data4[elemIdx + shift_val * simd_unroll.shift]);\ - const vVvf inV51 = access.load(&in_data5[elemIdx + shift_val * simd_unroll.shift]);\ - access.store(&out_data[elemIdx + shift_val * simd_unroll.shift], func(inV11, inV21, inV31, inV41, inV51));\ + const vVvf inV11 = access.load(in1 + shift_val * simd_8.shift);\ + const vVvf inV21 = access.load(in2 + shift_val * simd_8.shift);\ + const vVvf inV31 = access.load(in3 + shift_val * simd_8.shift);\ + const vVvf inV41 = access.load(in4 + shift_val * simd_8.shift);\ + const vVvf inV51 = access.load(in5 + shift_val * simd_8.shift);\ + access.store(out + shift_val * simd_8.shift, func(inV11, inV21, inV31, inV41, inV51));\ } const char * dual_detector_error_id = "C_FUN:gt6DUpdateDualDetector:wrong_argument"; @@ -156,20 +156,28 @@ namespace GT6D { const Type * const __restrict in_data3, const Type * const __restrict in_data4, const Type * const __restrict in_data5, - const mwSize & numElems, Function & func) + const mwSize & num_elems, Function & func) { typedef typename SIMDUnrolling<Type>::vVvf vVvf; - const mwSize unrolling = 8; - const SIMDUnrolling<Type> simd_unroll(unrolling); - const SIMDUnrolling<Type> simd(1); + const SIMDUnrolling<Type> simd_8(8); + const SIMDUnrolling<Type> simd_1(1); - AccessType access; + const mwSize num_elems_unroll_8 = simd_8.get_unroll(num_elems); + const mwSize num_elems_unroll_1 = simd_1.get_unroll(num_elems); + + const AccessType access; #pragma omp for nowait - for(mwIndex elemIdx = 0; elemIdx < simd_unroll.get_unroll(numElems); - elemIdx += simd_unroll.block) + for(mwIndex elemIdx = 0; elemIdx < num_elems_unroll_8; elemIdx += simd_8.block) { + const Type * const in1 = in_data1 + elemIdx; + const Type * const in2 = in_data2 + elemIdx; + const Type * const in3 = in_data3 + elemIdx; + const Type * const in4 = in_data4 + elemIdx; + const Type * const in5 = in_data5 + elemIdx; + Type * const out = out_data + elemIdx; + APPLY_FUNC_5FOLD_DUAL_DETECTOR(0); APPLY_FUNC_5FOLD_DUAL_DETECTOR(1); APPLY_FUNC_5FOLD_DUAL_DETECTOR(2); @@ -180,8 +188,7 @@ namespace GT6D { APPLY_FUNC_5FOLD_DUAL_DETECTOR(7); } #pragma omp for nowait - for(mwIndex elemIdx = simd_unroll.get_unroll(numElems); - elemIdx < simd.get_unroll(numElems); elemIdx += simd.block) + for(mwIndex elemIdx = num_elems_unroll_8; elemIdx < num_elems_unroll_1; elemIdx += simd_1.block) { const vVvf inV11 = access.load(&in_data1[elemIdx]); const vVvf inV21 = access.load(&in_data2[elemIdx]); @@ -192,7 +199,7 @@ namespace GT6D { access.store(&out_data[elemIdx], func(inV11, inV21, inV31, inV41, inV51)); } #pragma omp for nowait - for(mwIndex elemIdx = simd.get_unroll(numElems); elemIdx < numElems; elemIdx++) + for(mwIndex elemIdx = num_elems_unroll_1; elemIdx < num_elems; elemIdx++) { out_data[elemIdx] = func(in_data1[elemIdx], in_data2[elemIdx], in_data3[elemIdx], in_data4[elemIdx], in_data5[elemIdx]); } diff --git a/zUtil_Cxx/include/gt6DUpdateDualL1Ops.h b/zUtil_Cxx/include/gt6DUpdateDualL1Ops.h index 6d45db93..c53310ac 100644 --- a/zUtil_Cxx/include/gt6DUpdateDualL1Ops.h +++ b/zUtil_Cxx/include/gt6DUpdateDualL1Ops.h @@ -68,7 +68,7 @@ namespace GT6D { { } template<> - const update_dual_l1<float>::vVvf + inline const update_dual_l1<float>::vVvf update_dual_l1<float>::abs( const update_dual_l1<float>::vVvf & val) const throw() @@ -78,7 +78,7 @@ namespace GT6D { } template<> - const update_dual_l1<double>::vVvf + inline const update_dual_l1<double>::vVvf update_dual_l1<double>::abs( const update_dual_l1<double>::vVvf & val) const throw() @@ -88,7 +88,7 @@ namespace GT6D { } template<> - const float + inline const float update_dual_l1<float>::abs(const float & val) const throw() { @@ -96,7 +96,7 @@ namespace GT6D { } template<> - const double + inline const double update_dual_l1<double>::abs(const double & val) const throw() { @@ -104,7 +104,7 @@ namespace GT6D { } template<> - const update_dual_l1<float>::vVvf + inline const update_dual_l1<float>::vVvf update_dual_l1<float>::max( const update_dual_l1<float>::vVvf & val1, const update_dual_l1<float>::vVvf & val2) @@ -115,7 +115,7 @@ namespace GT6D { } template<> - const update_dual_l1<double>::vVvf + inline const update_dual_l1<double>::vVvf update_dual_l1<double>::max( const update_dual_l1<double>::vVvf & val1, const update_dual_l1<double>::vVvf & val2) diff --git a/zUtil_Cxx/include/gt6DUpdateDualTVOps.h b/zUtil_Cxx/include/gt6DUpdateDualTVOps.h index 9e9807ef..b405834b 100644 --- a/zUtil_Cxx/include/gt6DUpdateDualTVOps.h +++ b/zUtil_Cxx/include/gt6DUpdateDualTVOps.h @@ -63,7 +63,7 @@ namespace GT6D { { } template<> - const update_dual_tv<float>::vVvf + inline const update_dual_tv<float>::vVvf update_dual_tv<float>::abs( const update_dual_tv<float>::vVvf & val) const throw() @@ -73,7 +73,7 @@ namespace GT6D { } template<> - const update_dual_tv<double>::vVvf + inline const update_dual_tv<double>::vVvf update_dual_tv<double>::abs( const update_dual_tv<double>::vVvf & val) const throw() @@ -83,7 +83,7 @@ namespace GT6D { } template<> - const float + inline const float update_dual_tv<float>::abs(const float & val) const throw() { @@ -91,7 +91,7 @@ namespace GT6D { } template<> - const double + inline const double update_dual_tv<double>::abs(const double & val) const throw() { @@ -99,7 +99,7 @@ namespace GT6D { } template<> - const update_dual_tv<float>::vVvf + inline const update_dual_tv<float>::vVvf update_dual_tv<float>::max( const update_dual_tv<float>::vVvf & val1, const update_dual_tv<float>::vVvf & val2) @@ -110,7 +110,7 @@ namespace GT6D { } template<> - const update_dual_tv<double>::vVvf + inline const update_dual_tv<double>::vVvf update_dual_tv<double>::max( const update_dual_tv<double>::vVvf & val1, const update_dual_tv<double>::vVvf & val2) diff --git a/zUtil_Cxx/include/gt6DUpdatePrimalOps.h b/zUtil_Cxx/include/gt6DUpdatePrimalOps.h index b6a9e8a4..6231930c 100644 --- a/zUtil_Cxx/include/gt6DUpdatePrimalOps.h +++ b/zUtil_Cxx/include/gt6DUpdatePrimalOps.h @@ -16,14 +16,14 @@ namespace GT6D { #define APPLY_FUNC_5FOLD_PRIMAL(shift_val) \ { \ - vVvf inV11 = access.load(&out_data1[elemIdx + shift_val * simd_unroll.shift]);\ - vVvf inV21 = access.load(&out_data2[elemIdx + shift_val * simd_unroll.shift]);\ - const vVvf inV31 = access.load(&in_data3[elemIdx + shift_val * simd_unroll.shift]);\ - const vVvf inV41 = access.load(&in_data4[elemIdx + shift_val * simd_unroll.shift]);\ - const vVvf inV51 = access.load(&in_data5[elemIdx + shift_val * simd_unroll.shift]);\ + vVvf inV11 = access.load(out1 + shift_val * simd_8.shift);\ + vVvf inV21 = access.load(out2 + shift_val * simd_8.shift);\ + const vVvf inV31 = access.load(in3 + shift_val * simd_8.shift);\ + const vVvf inV41 = access.load(in4 + shift_val * simd_8.shift);\ + const vVvf inV51 = access.load(in5 + shift_val * simd_8.shift);\ func(inV11, inV21, inV31, inV41, inV51);\ - access.store(&out_data1[elemIdx + shift_val * simd_unroll.shift], inV11);\ - access.store(&out_data2[elemIdx + shift_val * simd_unroll.shift], inV21);\ + access.store(out1 + shift_val * simd_8.shift, inV11);\ + access.store(out2 + shift_val * simd_8.shift, inV21);\ } const char * primal_error_id = "C_FUN:gt6DUpdatePrimal:wrong_argument"; @@ -104,20 +104,27 @@ namespace GT6D { const Type * const __restrict in_data3, const Type * const __restrict in_data4, const Type * const __restrict in_data5, - const mwSize & numElems, Function & func) + const mwSize & num_elems, Function & func) { typedef typename SIMDUnrolling<Type>::vVvf vVvf; - const mwSize unrolling = 8; - const SIMDUnrolling<Type> simd_unroll(unrolling); - const SIMDUnrolling<Type> simd(1); + const SIMDUnrolling<Type> simd_8(8); + const SIMDUnrolling<Type> simd_1(1); + + const mwSize num_elems_unroll_8 = simd_8.get_unroll(num_elems); + const mwSize num_elems_unroll_1 = simd_1.get_unroll(num_elems); AccessType access; #pragma omp for nowait - for(mwIndex elemIdx = 0; elemIdx < simd_unroll.get_unroll(numElems); - elemIdx += simd_unroll.block) + for(mwIndex elemIdx = 0; elemIdx < num_elems_unroll_8; elemIdx += simd_8.block) { + Type * const out1 = out_data1 + elemIdx; + Type * const out2 = out_data2 + elemIdx; + const Type * const in3 = in_data3 + elemIdx; + const Type * const in4 = in_data4 + elemIdx; + const Type * const in5 = in_data5 + elemIdx; + APPLY_FUNC_5FOLD_PRIMAL(0); APPLY_FUNC_5FOLD_PRIMAL(1); APPLY_FUNC_5FOLD_PRIMAL(2); @@ -128,8 +135,7 @@ namespace GT6D { APPLY_FUNC_5FOLD_PRIMAL(7); } #pragma omp for nowait - for(mwIndex elemIdx = simd_unroll.get_unroll(numElems); - elemIdx < simd.get_unroll(numElems); elemIdx += simd.block) + for(mwIndex elemIdx = num_elems_unroll_8; elemIdx < num_elems_unroll_1; elemIdx += simd_1.block) { vVvf inV11 = access.load(&out_data1[elemIdx]); vVvf inV21 = access.load(&out_data2[elemIdx]); @@ -144,7 +150,7 @@ namespace GT6D { access.store(&out_data2[elemIdx], inV21); } #pragma omp for nowait - for(mwIndex elemIdx = simd.get_unroll(numElems); elemIdx < numElems; elemIdx++) + for(mwIndex elemIdx = num_elems_unroll_1; elemIdx < num_elems; elemIdx++) { func(out_data1[elemIdx], out_data2[elemIdx], in_data3[elemIdx], in_data4[elemIdx], in_data5[elemIdx]); } diff --git a/zUtil_Cxx/include/internal_cell_defs.h b/zUtil_Cxx/include/internal_cell_defs.h index 72db4d96..8b076165 100644 --- a/zUtil_Cxx/include/internal_cell_defs.h +++ b/zUtil_Cxx/include/internal_cell_defs.h @@ -477,7 +477,7 @@ protected: }; template<> -const inner_non_neg<float>::vVvf +inline const inner_non_neg<float>::vVvf inner_non_neg<float>::operator ()( const inner_non_neg<float>::vVvf & inData1, const inner_non_neg<float>::vVvf & inData2) @@ -492,7 +492,7 @@ const throw() } template<> -const inner_non_neg<double>::vVvf +inline const inner_non_neg<double>::vVvf inner_non_neg<double>::operator ()( const inner_non_neg<double>::vVvf & inData1, const inner_non_neg<double>::vVvf & inData2) @@ -530,7 +530,7 @@ protected: }; template<> -const inner_sum_FISTA_scale_non_neg<float>::vVvf +inline const inner_sum_FISTA_scale_non_neg<float>::vVvf inner_sum_FISTA_scale_non_neg<float>::operator ()( const inner_sum_FISTA_scale_non_neg<float>::vVvf & inData1, const inner_sum_FISTA_scale_non_neg<float>::vVvf & inData2) @@ -546,7 +546,7 @@ const throw() } template<> -const inner_sum_FISTA_scale_non_neg<double>::vVvf +inline const inner_sum_FISTA_scale_non_neg<double>::vVvf inner_sum_FISTA_scale_non_neg<double>::operator ()( const inner_sum_FISTA_scale_non_neg<double>::vVvf & inData1, const inner_sum_FISTA_scale_non_neg<double>::vVvf & inData2) @@ -724,49 +724,52 @@ cell_inner_cycle_sse(Type * const __restrict outData, { typedef typename SIMDUnrolling<Type>::vVvf vVvf; - const mwSize unrolling = 8; - const SIMDUnrolling<Type> simd_unroll(unrolling); - const SIMDUnrolling<Type> simd(1); + const SIMDUnrolling<Type> simd_8(8); + const SIMDUnrolling<Type> simd_1(1); AccessType access; #pragma omp for nowait - for(mwIndex elemIdx = 0; elemIdx < simd_unroll.get_unroll(numElems); - elemIdx += simd_unroll.block) - { - const vVvf inV11 = access.load(&inData1[elemIdx + 0 * simd_unroll.shift]); - const vVvf inV12 = access.load(&inData1[elemIdx + 1 * simd_unroll.shift]); - const vVvf inV13 = access.load(&inData1[elemIdx + 2 * simd_unroll.shift]); - const vVvf inV14 = access.load(&inData1[elemIdx + 3 * simd_unroll.shift]); - - const vVvf inV21 = access.load(&inData2[elemIdx + 0 * simd_unroll.shift]); - const vVvf inV22 = access.load(&inData2[elemIdx + 1 * simd_unroll.shift]); - const vVvf inV23 = access.load(&inData2[elemIdx + 2 * simd_unroll.shift]); - const vVvf inV24 = access.load(&inData2[elemIdx + 3 * simd_unroll.shift]); - - access.store(&outData[elemIdx + 0 * simd_unroll.shift], func(inV11, inV21)); - access.store(&outData[elemIdx + 1 * simd_unroll.shift], func(inV12, inV22)); - access.store(&outData[elemIdx + 2 * simd_unroll.shift], func(inV13, inV23)); - access.store(&outData[elemIdx + 3 * simd_unroll.shift], func(inV14, inV24)); - - const vVvf inV31 = access.load(&inData1[elemIdx + 4 * simd_unroll.shift]); - const vVvf inV32 = access.load(&inData1[elemIdx + 5 * simd_unroll.shift]); - const vVvf inV33 = access.load(&inData1[elemIdx + 6 * simd_unroll.shift]); - const vVvf inV34 = access.load(&inData1[elemIdx + 7 * simd_unroll.shift]); - - const vVvf inV41 = access.load(&inData2[elemIdx + 4 * simd_unroll.shift]); - const vVvf inV42 = access.load(&inData2[elemIdx + 5 * simd_unroll.shift]); - const vVvf inV43 = access.load(&inData2[elemIdx + 6 * simd_unroll.shift]); - const vVvf inV44 = access.load(&inData2[elemIdx + 7 * simd_unroll.shift]); - - access.store(&outData[elemIdx + 4 * simd_unroll.shift], func(inV31, inV41)); - access.store(&outData[elemIdx + 5 * simd_unroll.shift], func(inV32, inV42)); - access.store(&outData[elemIdx + 6 * simd_unroll.shift], func(inV33, inV43)); - access.store(&outData[elemIdx + 7 * simd_unroll.shift], func(inV34, inV44)); + for(mwIndex elemIdx = 0; elemIdx < simd_8.get_unroll(numElems); + elemIdx += simd_8.block) + { + const Type * const in1 = inData1 + elemIdx; + const Type * const in2 = inData2 + elemIdx; + Type * const out = outData + elemIdx; + + const vVvf inV11 = access.load(in1 + 0 * simd_8.shift); + const vVvf inV12 = access.load(in1 + 1 * simd_8.shift); + const vVvf inV13 = access.load(in1 + 2 * simd_8.shift); + const vVvf inV14 = access.load(in1 + 3 * simd_8.shift); + + const vVvf inV21 = access.load(in2 + 0 * simd_8.shift); + const vVvf inV22 = access.load(in2 + 1 * simd_8.shift); + const vVvf inV23 = access.load(in2 + 2 * simd_8.shift); + const vVvf inV24 = access.load(in2 + 3 * simd_8.shift); + + access.store(out + 0 * simd_8.shift, func(inV11, inV21)); + access.store(out + 1 * simd_8.shift, func(inV12, inV22)); + access.store(out + 2 * simd_8.shift, func(inV13, inV23)); + access.store(out + 3 * simd_8.shift, func(inV14, inV24)); + + const vVvf inV31 = access.load(in1 + 4 * simd_8.shift); + const vVvf inV32 = access.load(in1 + 5 * simd_8.shift); + const vVvf inV33 = access.load(in1 + 6 * simd_8.shift); + const vVvf inV34 = access.load(in1 + 7 * simd_8.shift); + + const vVvf inV41 = access.load(in2 + 4 * simd_8.shift); + const vVvf inV42 = access.load(in2 + 5 * simd_8.shift); + const vVvf inV43 = access.load(in2 + 6 * simd_8.shift); + const vVvf inV44 = access.load(in2 + 7 * simd_8.shift); + + access.store(out + 4 * simd_8.shift, func(inV31, inV41)); + access.store(out + 5 * simd_8.shift, func(inV32, inV42)); + access.store(out + 6 * simd_8.shift, func(inV33, inV43)); + access.store(out + 7 * simd_8.shift, func(inV34, inV44)); } #pragma omp for nowait - for(mwIndex elemIdx = simd_unroll.get_unroll(numElems); - elemIdx < simd.get_unroll(numElems); elemIdx += simd.block) + for(mwIndex elemIdx = simd_8.get_unroll(numElems); + elemIdx < simd_1.get_unroll(numElems); elemIdx += simd_1.block) { const vVvf inV11 = access.load(&inData1[elemIdx]); const vVvf inV21 = access.load(&inData2[elemIdx]); @@ -774,7 +777,7 @@ cell_inner_cycle_sse(Type * const __restrict outData, access.store(&outData[elemIdx], func(inV11, inV21)); } #pragma omp for nowait - for(mwIndex elemIdx = simd.get_unroll(numElems); elemIdx < numElems; elemIdx++) + for(mwIndex elemIdx = simd_1.get_unroll(numElems); elemIdx < numElems; elemIdx++) { outData[elemIdx] = func(inData1[elemIdx], inData2[elemIdx]); } -- GitLab