Skip to content
Snippets Groups Projects
Commit 8eaf1c41 authored by Nicola Vigano
Browse files

6D-C++-vectorization: round of simplifications/inlining/small tweaking

parent 8c7dae43
No related branches found
No related tags found
No related merge requests found
......@@ -16,12 +16,12 @@ namespace GT6D {
#define APPLY_FUNC_5FOLD_DUAL_DETECTOR(shift_val) \
{ \
const vVvf inV11 = access.load(&in_data1[elemIdx + shift_val * simd_unroll.shift]);\
const vVvf inV21 = access.load(&in_data2[elemIdx + shift_val * simd_unroll.shift]);\
const vVvf inV31 = access.load(&in_data3[elemIdx + shift_val * simd_unroll.shift]);\
const vVvf inV41 = access.load(&in_data4[elemIdx + shift_val * simd_unroll.shift]);\
const vVvf inV51 = access.load(&in_data5[elemIdx + shift_val * simd_unroll.shift]);\
access.store(&out_data[elemIdx + shift_val * simd_unroll.shift], func(inV11, inV21, inV31, inV41, inV51));\
const vVvf inV11 = access.load(in1 + shift_val * simd_8.shift);\
const vVvf inV21 = access.load(in2 + shift_val * simd_8.shift);\
const vVvf inV31 = access.load(in3 + shift_val * simd_8.shift);\
const vVvf inV41 = access.load(in4 + shift_val * simd_8.shift);\
const vVvf inV51 = access.load(in5 + shift_val * simd_8.shift);\
access.store(out + shift_val * simd_8.shift, func(inV11, inV21, inV31, inV41, inV51));\
}
const char * dual_detector_error_id = "C_FUN:gt6DUpdateDualDetector:wrong_argument";
......@@ -156,20 +156,28 @@ namespace GT6D {
const Type * const __restrict in_data3,
const Type * const __restrict in_data4,
const Type * const __restrict in_data5,
const mwSize & numElems, Function & func)
const mwSize & num_elems, Function & func)
{
typedef typename SIMDUnrolling<Type>::vVvf vVvf;
const mwSize unrolling = 8;
const SIMDUnrolling<Type> simd_unroll(unrolling);
const SIMDUnrolling<Type> simd(1);
const SIMDUnrolling<Type> simd_8(8);
const SIMDUnrolling<Type> simd_1(1);
AccessType access;
const mwSize num_elems_unroll_8 = simd_8.get_unroll(num_elems);
const mwSize num_elems_unroll_1 = simd_1.get_unroll(num_elems);
const AccessType access;
#pragma omp for nowait
for(mwIndex elemIdx = 0; elemIdx < simd_unroll.get_unroll(numElems);
elemIdx += simd_unroll.block)
for(mwIndex elemIdx = 0; elemIdx < num_elems_unroll_8; elemIdx += simd_8.block)
{
const Type * const in1 = in_data1 + elemIdx;
const Type * const in2 = in_data2 + elemIdx;
const Type * const in3 = in_data3 + elemIdx;
const Type * const in4 = in_data4 + elemIdx;
const Type * const in5 = in_data5 + elemIdx;
Type * const out = out_data + elemIdx;
APPLY_FUNC_5FOLD_DUAL_DETECTOR(0);
APPLY_FUNC_5FOLD_DUAL_DETECTOR(1);
APPLY_FUNC_5FOLD_DUAL_DETECTOR(2);
......@@ -180,8 +188,7 @@ namespace GT6D {
APPLY_FUNC_5FOLD_DUAL_DETECTOR(7);
}
#pragma omp for nowait
for(mwIndex elemIdx = simd_unroll.get_unroll(numElems);
elemIdx < simd.get_unroll(numElems); elemIdx += simd.block)
for(mwIndex elemIdx = num_elems_unroll_8; elemIdx < num_elems_unroll_1; elemIdx += simd_1.block)
{
const vVvf inV11 = access.load(&in_data1[elemIdx]);
const vVvf inV21 = access.load(&in_data2[elemIdx]);
......@@ -192,7 +199,7 @@ namespace GT6D {
access.store(&out_data[elemIdx], func(inV11, inV21, inV31, inV41, inV51));
}
#pragma omp for nowait
for(mwIndex elemIdx = simd.get_unroll(numElems); elemIdx < numElems; elemIdx++)
for(mwIndex elemIdx = num_elems_unroll_1; elemIdx < num_elems; elemIdx++)
{
out_data[elemIdx] = func(in_data1[elemIdx], in_data2[elemIdx], in_data3[elemIdx], in_data4[elemIdx], in_data5[elemIdx]);
}
......
......@@ -68,7 +68,7 @@ namespace GT6D {
{ }
template<>
const update_dual_l1<float>::vVvf
inline const update_dual_l1<float>::vVvf
update_dual_l1<float>::abs(
const update_dual_l1<float>::vVvf & val)
const throw()
......@@ -78,7 +78,7 @@ namespace GT6D {
}
template<>
const update_dual_l1<double>::vVvf
inline const update_dual_l1<double>::vVvf
update_dual_l1<double>::abs(
const update_dual_l1<double>::vVvf & val)
const throw()
......@@ -88,7 +88,7 @@ namespace GT6D {
}
template<>
const float
inline const float
update_dual_l1<float>::abs(const float & val)
const throw()
{
......@@ -96,7 +96,7 @@ namespace GT6D {
}
template<>
const double
inline const double
update_dual_l1<double>::abs(const double & val)
const throw()
{
......@@ -104,7 +104,7 @@ namespace GT6D {
}
template<>
const update_dual_l1<float>::vVvf
inline const update_dual_l1<float>::vVvf
update_dual_l1<float>::max(
const update_dual_l1<float>::vVvf & val1,
const update_dual_l1<float>::vVvf & val2)
......@@ -115,7 +115,7 @@ namespace GT6D {
}
template<>
const update_dual_l1<double>::vVvf
inline const update_dual_l1<double>::vVvf
update_dual_l1<double>::max(
const update_dual_l1<double>::vVvf & val1,
const update_dual_l1<double>::vVvf & val2)
......
......@@ -63,7 +63,7 @@ namespace GT6D {
{ }
template<>
const update_dual_tv<float>::vVvf
inline const update_dual_tv<float>::vVvf
update_dual_tv<float>::abs(
const update_dual_tv<float>::vVvf & val)
const throw()
......@@ -73,7 +73,7 @@ namespace GT6D {
}
template<>
const update_dual_tv<double>::vVvf
inline const update_dual_tv<double>::vVvf
update_dual_tv<double>::abs(
const update_dual_tv<double>::vVvf & val)
const throw()
......@@ -83,7 +83,7 @@ namespace GT6D {
}
template<>
const float
inline const float
update_dual_tv<float>::abs(const float & val)
const throw()
{
......@@ -91,7 +91,7 @@ namespace GT6D {
}
template<>
const double
inline const double
update_dual_tv<double>::abs(const double & val)
const throw()
{
......@@ -99,7 +99,7 @@ namespace GT6D {
}
template<>
const update_dual_tv<float>::vVvf
inline const update_dual_tv<float>::vVvf
update_dual_tv<float>::max(
const update_dual_tv<float>::vVvf & val1,
const update_dual_tv<float>::vVvf & val2)
......@@ -110,7 +110,7 @@ namespace GT6D {
}
template<>
const update_dual_tv<double>::vVvf
inline const update_dual_tv<double>::vVvf
update_dual_tv<double>::max(
const update_dual_tv<double>::vVvf & val1,
const update_dual_tv<double>::vVvf & val2)
......
......@@ -16,14 +16,14 @@ namespace GT6D {
#define APPLY_FUNC_5FOLD_PRIMAL(shift_val) \
{ \
vVvf inV11 = access.load(&out_data1[elemIdx + shift_val * simd_unroll.shift]);\
vVvf inV21 = access.load(&out_data2[elemIdx + shift_val * simd_unroll.shift]);\
const vVvf inV31 = access.load(&in_data3[elemIdx + shift_val * simd_unroll.shift]);\
const vVvf inV41 = access.load(&in_data4[elemIdx + shift_val * simd_unroll.shift]);\
const vVvf inV51 = access.load(&in_data5[elemIdx + shift_val * simd_unroll.shift]);\
vVvf inV11 = access.load(out1 + shift_val * simd_8.shift);\
vVvf inV21 = access.load(out2 + shift_val * simd_8.shift);\
const vVvf inV31 = access.load(in3 + shift_val * simd_8.shift);\
const vVvf inV41 = access.load(in4 + shift_val * simd_8.shift);\
const vVvf inV51 = access.load(in5 + shift_val * simd_8.shift);\
func(inV11, inV21, inV31, inV41, inV51);\
access.store(&out_data1[elemIdx + shift_val * simd_unroll.shift], inV11);\
access.store(&out_data2[elemIdx + shift_val * simd_unroll.shift], inV21);\
access.store(out1 + shift_val * simd_8.shift, inV11);\
access.store(out2 + shift_val * simd_8.shift, inV21);\
}
const char * primal_error_id = "C_FUN:gt6DUpdatePrimal:wrong_argument";
......@@ -104,20 +104,27 @@ namespace GT6D {
const Type * const __restrict in_data3,
const Type * const __restrict in_data4,
const Type * const __restrict in_data5,
const mwSize & numElems, Function & func)
const mwSize & num_elems, Function & func)
{
typedef typename SIMDUnrolling<Type>::vVvf vVvf;
const mwSize unrolling = 8;
const SIMDUnrolling<Type> simd_unroll(unrolling);
const SIMDUnrolling<Type> simd(1);
const SIMDUnrolling<Type> simd_8(8);
const SIMDUnrolling<Type> simd_1(1);
const mwSize num_elems_unroll_8 = simd_8.get_unroll(num_elems);
const mwSize num_elems_unroll_1 = simd_1.get_unroll(num_elems);
AccessType access;
#pragma omp for nowait
for(mwIndex elemIdx = 0; elemIdx < simd_unroll.get_unroll(numElems);
elemIdx += simd_unroll.block)
for(mwIndex elemIdx = 0; elemIdx < num_elems_unroll_8; elemIdx += simd_8.block)
{
Type * const out1 = out_data1 + elemIdx;
Type * const out2 = out_data2 + elemIdx;
const Type * const in3 = in_data3 + elemIdx;
const Type * const in4 = in_data4 + elemIdx;
const Type * const in5 = in_data5 + elemIdx;
APPLY_FUNC_5FOLD_PRIMAL(0);
APPLY_FUNC_5FOLD_PRIMAL(1);
APPLY_FUNC_5FOLD_PRIMAL(2);
......@@ -128,8 +135,7 @@ namespace GT6D {
APPLY_FUNC_5FOLD_PRIMAL(7);
}
#pragma omp for nowait
for(mwIndex elemIdx = simd_unroll.get_unroll(numElems);
elemIdx < simd.get_unroll(numElems); elemIdx += simd.block)
for(mwIndex elemIdx = num_elems_unroll_8; elemIdx < num_elems_unroll_1; elemIdx += simd_1.block)
{
vVvf inV11 = access.load(&out_data1[elemIdx]);
vVvf inV21 = access.load(&out_data2[elemIdx]);
......@@ -144,7 +150,7 @@ namespace GT6D {
access.store(&out_data2[elemIdx], inV21);
}
#pragma omp for nowait
for(mwIndex elemIdx = simd.get_unroll(numElems); elemIdx < numElems; elemIdx++)
for(mwIndex elemIdx = num_elems_unroll_1; elemIdx < num_elems; elemIdx++)
{
func(out_data1[elemIdx], out_data2[elemIdx], in_data3[elemIdx], in_data4[elemIdx], in_data5[elemIdx]);
}
......
......@@ -477,7 +477,7 @@ protected:
};
template<>
const inner_non_neg<float>::vVvf
inline const inner_non_neg<float>::vVvf
inner_non_neg<float>::operator ()(
const inner_non_neg<float>::vVvf & inData1,
const inner_non_neg<float>::vVvf & inData2)
......@@ -492,7 +492,7 @@ const throw()
}
template<>
const inner_non_neg<double>::vVvf
inline const inner_non_neg<double>::vVvf
inner_non_neg<double>::operator ()(
const inner_non_neg<double>::vVvf & inData1,
const inner_non_neg<double>::vVvf & inData2)
......@@ -530,7 +530,7 @@ protected:
};
template<>
const inner_sum_FISTA_scale_non_neg<float>::vVvf
inline const inner_sum_FISTA_scale_non_neg<float>::vVvf
inner_sum_FISTA_scale_non_neg<float>::operator ()(
const inner_sum_FISTA_scale_non_neg<float>::vVvf & inData1,
const inner_sum_FISTA_scale_non_neg<float>::vVvf & inData2)
......@@ -546,7 +546,7 @@ const throw()
}
template<>
const inner_sum_FISTA_scale_non_neg<double>::vVvf
inline const inner_sum_FISTA_scale_non_neg<double>::vVvf
inner_sum_FISTA_scale_non_neg<double>::operator ()(
const inner_sum_FISTA_scale_non_neg<double>::vVvf & inData1,
const inner_sum_FISTA_scale_non_neg<double>::vVvf & inData2)
......@@ -724,49 +724,52 @@ cell_inner_cycle_sse(Type * const __restrict outData,
{
typedef typename SIMDUnrolling<Type>::vVvf vVvf;
const mwSize unrolling = 8;
const SIMDUnrolling<Type> simd_unroll(unrolling);
const SIMDUnrolling<Type> simd(1);
const SIMDUnrolling<Type> simd_8(8);
const SIMDUnrolling<Type> simd_1(1);
AccessType access;
#pragma omp for nowait
for(mwIndex elemIdx = 0; elemIdx < simd_unroll.get_unroll(numElems);
elemIdx += simd_unroll.block)
{
const vVvf inV11 = access.load(&inData1[elemIdx + 0 * simd_unroll.shift]);
const vVvf inV12 = access.load(&inData1[elemIdx + 1 * simd_unroll.shift]);
const vVvf inV13 = access.load(&inData1[elemIdx + 2 * simd_unroll.shift]);
const vVvf inV14 = access.load(&inData1[elemIdx + 3 * simd_unroll.shift]);
const vVvf inV21 = access.load(&inData2[elemIdx + 0 * simd_unroll.shift]);
const vVvf inV22 = access.load(&inData2[elemIdx + 1 * simd_unroll.shift]);
const vVvf inV23 = access.load(&inData2[elemIdx + 2 * simd_unroll.shift]);
const vVvf inV24 = access.load(&inData2[elemIdx + 3 * simd_unroll.shift]);
access.store(&outData[elemIdx + 0 * simd_unroll.shift], func(inV11, inV21));
access.store(&outData[elemIdx + 1 * simd_unroll.shift], func(inV12, inV22));
access.store(&outData[elemIdx + 2 * simd_unroll.shift], func(inV13, inV23));
access.store(&outData[elemIdx + 3 * simd_unroll.shift], func(inV14, inV24));
const vVvf inV31 = access.load(&inData1[elemIdx + 4 * simd_unroll.shift]);
const vVvf inV32 = access.load(&inData1[elemIdx + 5 * simd_unroll.shift]);
const vVvf inV33 = access.load(&inData1[elemIdx + 6 * simd_unroll.shift]);
const vVvf inV34 = access.load(&inData1[elemIdx + 7 * simd_unroll.shift]);
const vVvf inV41 = access.load(&inData2[elemIdx + 4 * simd_unroll.shift]);
const vVvf inV42 = access.load(&inData2[elemIdx + 5 * simd_unroll.shift]);
const vVvf inV43 = access.load(&inData2[elemIdx + 6 * simd_unroll.shift]);
const vVvf inV44 = access.load(&inData2[elemIdx + 7 * simd_unroll.shift]);
access.store(&outData[elemIdx + 4 * simd_unroll.shift], func(inV31, inV41));
access.store(&outData[elemIdx + 5 * simd_unroll.shift], func(inV32, inV42));
access.store(&outData[elemIdx + 6 * simd_unroll.shift], func(inV33, inV43));
access.store(&outData[elemIdx + 7 * simd_unroll.shift], func(inV34, inV44));
for(mwIndex elemIdx = 0; elemIdx < simd_8.get_unroll(numElems);
elemIdx += simd_8.block)
{
const Type * const in1 = inData1 + elemIdx;
const Type * const in2 = inData2 + elemIdx;
Type * const out = outData + elemIdx;
const vVvf inV11 = access.load(in1 + 0 * simd_8.shift);
const vVvf inV12 = access.load(in1 + 1 * simd_8.shift);
const vVvf inV13 = access.load(in1 + 2 * simd_8.shift);
const vVvf inV14 = access.load(in1 + 3 * simd_8.shift);
const vVvf inV21 = access.load(in2 + 0 * simd_8.shift);
const vVvf inV22 = access.load(in2 + 1 * simd_8.shift);
const vVvf inV23 = access.load(in2 + 2 * simd_8.shift);
const vVvf inV24 = access.load(in2 + 3 * simd_8.shift);
access.store(out + 0 * simd_8.shift, func(inV11, inV21));
access.store(out + 1 * simd_8.shift, func(inV12, inV22));
access.store(out + 2 * simd_8.shift, func(inV13, inV23));
access.store(out + 3 * simd_8.shift, func(inV14, inV24));
const vVvf inV31 = access.load(in1 + 4 * simd_8.shift);
const vVvf inV32 = access.load(in1 + 5 * simd_8.shift);
const vVvf inV33 = access.load(in1 + 6 * simd_8.shift);
const vVvf inV34 = access.load(in1 + 7 * simd_8.shift);
const vVvf inV41 = access.load(in2 + 4 * simd_8.shift);
const vVvf inV42 = access.load(in2 + 5 * simd_8.shift);
const vVvf inV43 = access.load(in2 + 6 * simd_8.shift);
const vVvf inV44 = access.load(in2 + 7 * simd_8.shift);
access.store(out + 4 * simd_8.shift, func(inV31, inV41));
access.store(out + 5 * simd_8.shift, func(inV32, inV42));
access.store(out + 6 * simd_8.shift, func(inV33, inV43));
access.store(out + 7 * simd_8.shift, func(inV34, inV44));
}
#pragma omp for nowait
for(mwIndex elemIdx = simd_unroll.get_unroll(numElems);
elemIdx < simd.get_unroll(numElems); elemIdx += simd.block)
for(mwIndex elemIdx = simd_8.get_unroll(numElems);
elemIdx < simd_1.get_unroll(numElems); elemIdx += simd_1.block)
{
const vVvf inV11 = access.load(&inData1[elemIdx]);
const vVvf inV21 = access.load(&inData2[elemIdx]);
......@@ -774,7 +777,7 @@ cell_inner_cycle_sse(Type * const __restrict outData,
access.store(&outData[elemIdx], func(inV11, inV21));
}
#pragma omp for nowait
for(mwIndex elemIdx = simd.get_unroll(numElems); elemIdx < numElems; elemIdx++)
for(mwIndex elemIdx = simd_1.get_unroll(numElems); elemIdx < numElems; elemIdx++)
{
outData[elemIdx] = func(inData1[elemIdx], inData2[elemIdx]);
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment