// Review notes. This is leading text placed before the first "diff --git" header; it is
// not part of any hunk.
//
// What the patch does
//  * DctDataAlgorithms.h
//      - Renames the first parameter of initialize_line from `data` to `out`
//        (declaration and definition).
//      - Restructures initialize_line and the four line-transform bodies that follow it
//        (in-place unary on `inout`; unary `in` -> `out`; binary `out = op(out, in)`;
//        binary `out = op(in1, in2)`) to the same three-stage shape:
//          1. shift_to_align = simd1.get_shift_to_align(<destination pointer>)
//          2. only if shift_to_align < line_length:
//               - a scalar loop over the first shift_to_align elements,
//               - the 4x-unrolled vector loop, then the 1x vector loop; their end
//                 pointers are computed with get_unroll(line_length - shift_to_align)
//                 starting from the shifted pointer,
//          3. the pre-existing scalar loop, which finishes whatever is left (the whole
//             line when the guard in step 2 is false).
//      - Loop conditions now compare the output pointer against end pointers derived
//        from `out`, where they previously compared the input pointer against end
//        pointers derived from `in` / `in1`.
//      - The vector loops still load and store through this->access_u, exactly as before.
//  * vectorization.h
//      - Adds get_shift_to_align(const Type *), called above on SIMDUnrolling objects.
//        It returns (vector_size - addr % vector_size) % vector_size, where addr is the
//        pointer reinterpreted as size_t.
//
// NOTE(review): get_shift_to_align() works on the raw address value, but every caller
//   adds the result to a Type* and subtracts it from line_length, i.e. treats it as a
//   number of elements. If vector_size is a byte count, the result would need to be
//   divided by sizeof(Type) for the shifted pointer to land on a vector_size boundary.
//   TODO confirm the unit of vector_size; its definition is not visible in this patch.
// NOTE(review): only the destination pointer is inspected; the input pointers are
//   advanced in lock-step and their alignment is never checked.
// NOTE(review): the remaining-length local is called line_length_a in four bodies and
//   aligned_line_length in the last one.
// NOTE(review): the `const` in the by-value return type `const size_t` has no effect
//   for callers.
diff --git a/zUtil_Cxx/include/DctDataAlgorithms.h b/zUtil_Cxx/include/DctDataAlgorithms.h index 1d2d702355ede46fae9449c39dd3881d8ab104ab..f9206855c43f4674b5b5c4cb55749e629cef7889 100644 --- a/zUtil_Cxx/include/DctDataAlgorithms.h +++ b/zUtil_Cxx/include/DctDataAlgorithms.h @@ -32,7 +32,7 @@ namespace dct { ThreadSafeQueue< std::function<void()> > work_queue; - void initialize_line(Type * __restrict const data, const size_t & line_length, const Type & val = 0.0); + void initialize_line(Type * __restrict const out, const size_t & line_length, const Type & val = 0.0); template<class Op> void transform_line_unary(Type * __restrict const outin, const size_t & line_length, Op op); template<class Op> @@ -232,33 +232,44 @@ namespace dct { template<typename Type, class Alloc, const size_t vector_size> inline void - DctDataProcess<Type, Alloc, vector_size>::initialize_line(Type * const __restrict data, const size_t & line_length, const Type & val) + DctDataProcess<Type, Alloc, vector_size>::initialize_line(Type * const __restrict out, const size_t & line_length, const Type & val) { const SIMDUnrolling<Type, vector_size> simd4(4); const SIMDUnrolling<Type, vector_size> simd1(1); const SIMDRegister<Type, vector_size> vec_val = Coeff<Type, vector_size>::get(val); - Type * __restrict const end_line_unroll4 = data + simd4.get_unroll(line_length); - Type * __restrict const end_line_unroll1 = data + simd1.get_unroll(line_length); - Type * __restrict const end_line = data + line_length; - - Type * __restrict temp_data = data; + Type * __restrict temp_out = out; + Type * __restrict const end_line_out = out + line_length; - for (; temp_data < end_line_unroll4; temp_data += simd4.block) + const size_t shift_to_align = simd1.get_shift_to_align(out); + if (shift_to_align < line_length) { - this->access_u.store(temp_data + 0 * simd4.shift, vec_val); - this->access_u.store(temp_data + 1 * simd4.shift, vec_val); - this->access_u.store(temp_data + 2 * simd4.shift, vec_val); - 
this->access_u.store(temp_data + 3 * simd4.shift, vec_val); + Type * __restrict const out_a = out + shift_to_align; + const size_t line_length_a = line_length - shift_to_align; + + Type * __restrict const end_line_u4 = out_a + simd4.get_unroll(line_length_a); + Type * __restrict const end_line_u1 = out_a + simd1.get_unroll(line_length_a); + + for (; temp_out < out_a; temp_out++) + { + *temp_out = val; + } + for (; temp_out < end_line_u4; temp_out += simd4.block) + { + this->access_u.store(temp_out + 0 * simd4.shift, vec_val); + this->access_u.store(temp_out + 1 * simd4.shift, vec_val); + this->access_u.store(temp_out + 2 * simd4.shift, vec_val); + this->access_u.store(temp_out + 3 * simd4.shift, vec_val); + } + for (; temp_out < end_line_u1; temp_out += simd1.block) + { + this->access_u.store(temp_out, vec_val); + } } - for (; temp_data < end_line_unroll1; temp_data += simd1.block) + for (; temp_out < end_line_out; temp_out++) { - this->access_u.store(temp_data, vec_val); - } - for (; temp_data < end_line; temp_data++) - { - *temp_data = val; + *temp_out = val; } } @@ -270,34 +281,45 @@ namespace dct { const SIMDUnrolling<Type, vector_size> simd4(4); const SIMDUnrolling<Type, vector_size> simd1(1); - const Type * __restrict const end_line_inout_unroll4 = inout + simd4.get_unroll(line_length); - const Type * __restrict const end_line_inout_unroll1 = inout + simd1.get_unroll(line_length); - const Type * __restrict const end_line_inout = inout + line_length; - Type * __restrict temp_inout = inout; + const Type * __restrict const end_line_inout = inout + line_length; - for (; temp_inout < end_line_inout_unroll4; temp_inout += simd4.block) + const size_t shift_to_align = simd1.get_shift_to_align(inout); + if (shift_to_align < line_length) { - const vVvf & a0 = this->access_u.load(temp_inout + 0 * simd4.shift); - const vVvf & a1 = this->access_u.load(temp_inout + 1 * simd4.shift); - const vVvf & a2 = this->access_u.load(temp_inout + 2 * simd4.shift); - const vVvf & a3 = 
this->access_u.load(temp_inout + 3 * simd4.shift); - - const vVvf c0 = op(a0); - const vVvf c1 = op(a1); - const vVvf c2 = op(a2); - const vVvf c3 = op(a3); - - this->access_u.store(temp_inout + 0 * simd4.shift, c0); - this->access_u.store(temp_inout + 1 * simd4.shift, c1); - this->access_u.store(temp_inout + 2 * simd4.shift, c2); - this->access_u.store(temp_inout + 3 * simd4.shift, c3); - } - for (; temp_inout < end_line_inout_unroll1; temp_inout += simd1.block) - { - const vVvf & a0 = this->access_u.load(temp_inout); - const vVvf c0 = op(a0); - this->access_u.store(temp_inout, c0); + Type * __restrict const out_a = inout + shift_to_align; + const size_t line_length_a = line_length - shift_to_align; + + const Type * __restrict const end_line_inout_u4 = out_a + simd4.get_unroll(line_length_a); + const Type * __restrict const end_line_inout_u1 = out_a + simd1.get_unroll(line_length_a); + + for (; temp_inout < out_a; temp_inout++) + { + *temp_inout = op(*temp_inout); + } + for (; temp_inout < end_line_inout_u4; temp_inout += simd4.block) + { + const vVvf & a0 = this->access_u.load(temp_inout + 0 * simd4.shift); + const vVvf & a1 = this->access_u.load(temp_inout + 1 * simd4.shift); + const vVvf & a2 = this->access_u.load(temp_inout + 2 * simd4.shift); + const vVvf & a3 = this->access_u.load(temp_inout + 3 * simd4.shift); + + const vVvf c0 = op(a0); + const vVvf c1 = op(a1); + const vVvf c2 = op(a2); + const vVvf c3 = op(a3); + + this->access_u.store(temp_inout + 0 * simd4.shift, c0); + this->access_u.store(temp_inout + 1 * simd4.shift, c1); + this->access_u.store(temp_inout + 2 * simd4.shift, c2); + this->access_u.store(temp_inout + 3 * simd4.shift, c3); + } + for (; temp_inout < end_line_inout_u1; temp_inout += simd1.block) + { + const vVvf & a0 = this->access_u.load(temp_inout); + const vVvf c0 = op(a0); + this->access_u.store(temp_inout, c0); + } } for (; temp_inout < end_line_inout; temp_inout++) { @@ -313,38 +335,48 @@ namespace dct { const SIMDUnrolling<Type, 
vector_size> simd4(4); const SIMDUnrolling<Type, vector_size> simd1(1); - const Type * __restrict const end_line_in_unroll4 = in + simd4.get_unroll(line_length); - const Type * __restrict const end_line_in_unroll1 = in + simd1.get_unroll(line_length); - const Type * __restrict const end_line_in = in + line_length; - const Type * __restrict temp_in = in; - Type * __restrict temp_out = out; + const Type * __restrict const end_line_out = out + line_length; - for (; temp_in < end_line_in_unroll4; temp_out += simd4.block, temp_in += simd4.block) + const size_t shift_to_align = simd1.get_shift_to_align(out); + if (shift_to_align < line_length) { - const vVvf & a0 = this->access_u.load(temp_in + 0 * simd4.shift); - const vVvf & a1 = this->access_u.load(temp_in + 1 * simd4.shift); - const vVvf & a2 = this->access_u.load(temp_in + 2 * simd4.shift); - const vVvf & a3 = this->access_u.load(temp_in + 3 * simd4.shift); - - const vVvf c0 = op(a0); - const vVvf c1 = op(a1); - const vVvf c2 = op(a2); - const vVvf c3 = op(a3); - - this->access_u.store(temp_out + 0 * simd4.shift, c0); - this->access_u.store(temp_out + 1 * simd4.shift, c1); - this->access_u.store(temp_out + 2 * simd4.shift, c2); - this->access_u.store(temp_out + 3 * simd4.shift, c3); - } - for (; temp_in < end_line_in_unroll1; temp_out += simd1.block, temp_in += simd1.block) - { - const vVvf & a0 = this->access_u.load(temp_in); - const vVvf c0 = op(a0); - this->access_u.store(temp_out, c0); + Type * __restrict const out_a = out + shift_to_align; + const size_t line_length_a = line_length - shift_to_align; + + const Type * __restrict const end_line_out_u4 = out_a + simd4.get_unroll(line_length_a); + const Type * __restrict const end_line_out_u1 = out_a + simd1.get_unroll(line_length_a); + + for (; temp_out < out_a; temp_out++, temp_in++) + { + *temp_out = op(*temp_in); + } + for (; temp_out < end_line_out_u4; temp_out += simd4.block, temp_in += simd4.block) + { + const vVvf & a0 = this->access_u.load(temp_in + 0 * 
simd4.shift); + const vVvf & a1 = this->access_u.load(temp_in + 1 * simd4.shift); + const vVvf & a2 = this->access_u.load(temp_in + 2 * simd4.shift); + const vVvf & a3 = this->access_u.load(temp_in + 3 * simd4.shift); + + const vVvf c0 = op(a0); + const vVvf c1 = op(a1); + const vVvf c2 = op(a2); + const vVvf c3 = op(a3); + + this->access_u.store(temp_out + 0 * simd4.shift, c0); + this->access_u.store(temp_out + 1 * simd4.shift, c1); + this->access_u.store(temp_out + 2 * simd4.shift, c2); + this->access_u.store(temp_out + 3 * simd4.shift, c3); + } + for (; temp_out < end_line_out_u1; temp_out += simd1.block, temp_in += simd1.block) + { + const vVvf & a0 = this->access_u.load(temp_in); + const vVvf c0 = op(a0); + this->access_u.store(temp_out, c0); + } } - for (; temp_in < end_line_in; temp_out++, temp_in++) + for (; temp_out < end_line_out; temp_out++, temp_in++) { *temp_out = op(*temp_in); } @@ -358,44 +390,54 @@ namespace dct { const SIMDUnrolling<Type, vector_size> simd4(4); const SIMDUnrolling<Type, vector_size> simd1(1); - const Type * __restrict const end_line_in_unroll4 = in + simd4.get_unroll(line_length); - const Type * __restrict const end_line_in_unroll1 = in + simd1.get_unroll(line_length); - const Type * __restrict const end_line_in = in + line_length; - const Type * __restrict temp_in = in; - Type * __restrict temp_out = out; + const Type * __restrict const end_line_out = out + line_length; - for (; temp_in < end_line_in_unroll4; temp_out += simd4.block, temp_in += simd4.block) - { - const vVvf & a0 = this->access_u.load(temp_out + 0 * simd4.shift); - const vVvf & a1 = this->access_u.load(temp_out + 1 * simd4.shift); - const vVvf & a2 = this->access_u.load(temp_out + 2 * simd4.shift); - const vVvf & a3 = this->access_u.load(temp_out + 3 * simd4.shift); - - const vVvf & b0 = this->access_u.load(temp_in + 0 * simd4.shift); - const vVvf & b1 = this->access_u.load(temp_in + 1 * simd4.shift); - const vVvf & b2 = this->access_u.load(temp_in + 2 * 
simd4.shift); - const vVvf & b3 = this->access_u.load(temp_in + 3 * simd4.shift); - - const vVvf c0 = op(a0, b0); - const vVvf c1 = op(a1, b1); - const vVvf c2 = op(a2, b2); - const vVvf c3 = op(a3, b3); - - this->access_u.store(temp_out + 0 * simd4.shift, c0); - this->access_u.store(temp_out + 1 * simd4.shift, c1); - this->access_u.store(temp_out + 2 * simd4.shift, c2); - this->access_u.store(temp_out + 3 * simd4.shift, c3); - } - for (; temp_in < end_line_in_unroll1; temp_out += simd1.block, temp_in += simd1.block) + const size_t shift_to_align = simd1.get_shift_to_align(out); + if (shift_to_align < line_length) { - const vVvf & a0 = this->access_u.load(temp_out); - const vVvf & b0 = this->access_u.load(temp_in); - const vVvf c0 = op(a0, b0); - this->access_u.store(temp_out, c0); + Type * __restrict const out_a = out + shift_to_align; + const size_t line_length_a = line_length - shift_to_align; + + const Type * __restrict const end_line_out_u4 = out_a + simd4.get_unroll(line_length_a); + const Type * __restrict const end_line_out_u1 = out_a + simd1.get_unroll(line_length_a); + + for (; temp_out < out_a; temp_out++, temp_in++) + { + *temp_out = op(*temp_out, *temp_in); + } + for (; temp_out < end_line_out_u4; temp_out += simd4.block, temp_in += simd4.block) + { + const vVvf & a0 = this->access_u.load(temp_out + 0 * simd4.shift); + const vVvf & a1 = this->access_u.load(temp_out + 1 * simd4.shift); + const vVvf & a2 = this->access_u.load(temp_out + 2 * simd4.shift); + const vVvf & a3 = this->access_u.load(temp_out + 3 * simd4.shift); + + const vVvf & b0 = this->access_u.load(temp_in + 0 * simd4.shift); + const vVvf & b1 = this->access_u.load(temp_in + 1 * simd4.shift); + const vVvf & b2 = this->access_u.load(temp_in + 2 * simd4.shift); + const vVvf & b3 = this->access_u.load(temp_in + 3 * simd4.shift); + + const vVvf c0 = op(a0, b0); + const vVvf c1 = op(a1, b1); + const vVvf c2 = op(a2, b2); + const vVvf c3 = op(a3, b3); + + this->access_u.store(temp_out + 0 * 
simd4.shift, c0); + this->access_u.store(temp_out + 1 * simd4.shift, c1); + this->access_u.store(temp_out + 2 * simd4.shift, c2); + this->access_u.store(temp_out + 3 * simd4.shift, c3); + } + for (; temp_out < end_line_out_u1; temp_out += simd1.block, temp_in += simd1.block) + { + const vVvf & a0 = this->access_u.load(temp_out); + const vVvf & b0 = this->access_u.load(temp_in); + const vVvf c0 = op(a0, b0); + this->access_u.store(temp_out, c0); + } } - for (; temp_in < end_line_in; temp_out++, temp_in++) + for (; temp_out < end_line_out; temp_out++, temp_in++) { *temp_out = op(*temp_out, *temp_in); } @@ -409,47 +451,57 @@ namespace dct { const SIMDUnrolling<Type, vector_size> simd4(4); const SIMDUnrolling<Type, vector_size> simd1(1); - const Type * __restrict const end_line_in1_unroll4 = in1 + simd4.get_unroll(line_length); - const Type * __restrict const end_line_in1_unroll1 = in1 + simd1.get_unroll(line_length); - const Type * __restrict const end_line_in1 = in1 + line_length; - const Type * __restrict temp_in1 = in1; const Type * __restrict temp_in2 = in2; - Type * __restrict temp_out = out; + const Type * __restrict const end_line_out = out + line_length; - for (; temp_in1 < end_line_in1_unroll4; - temp_out += simd4.block, temp_in1 += simd4.block, temp_in2 += simd4.block) - { - const vVvf & a0 = this->access_u.load(temp_in1 + 0 * simd4.shift); - const vVvf & a1 = this->access_u.load(temp_in1 + 1 * simd4.shift); - const vVvf & a2 = this->access_u.load(temp_in1 + 2 * simd4.shift); - const vVvf & a3 = this->access_u.load(temp_in1 + 3 * simd4.shift); - - const vVvf & b0 = this->access_u.load(temp_in2 + 0 * simd4.shift); - const vVvf & b1 = this->access_u.load(temp_in2 + 1 * simd4.shift); - const vVvf & b2 = this->access_u.load(temp_in2 + 2 * simd4.shift); - const vVvf & b3 = this->access_u.load(temp_in2 + 3 * simd4.shift); - - const vVvf c0 = op(a0, b0); - const vVvf c1 = op(a1, b1); - const vVvf c2 = op(a2, b2); - const vVvf c3 = op(a3, b3); - - 
this->access_u.store(temp_out + 0 * simd4.shift, c0); - this->access_u.store(temp_out + 1 * simd4.shift, c1); - this->access_u.store(temp_out + 2 * simd4.shift, c2); - this->access_u.store(temp_out + 3 * simd4.shift, c3); - } - for (; temp_in1 < end_line_in1_unroll1; - temp_out += simd1.block, temp_in1 += simd1.block, temp_in2 += simd1.block) + const size_t shift_to_align = simd1.get_shift_to_align(out); + if (shift_to_align < line_length) { - const vVvf & a0 = this->access_u.load(temp_in1); - const vVvf & b0 = this->access_u.load(temp_in2); - const vVvf c0 = op(a0, b0); - this->access_u.store(temp_out, c0); + Type * __restrict const out_a = out + shift_to_align; + const size_t aligned_line_length = line_length - shift_to_align; + + const Type * __restrict const end_line_out_u4 = out_a + simd4.get_unroll(aligned_line_length); + const Type * __restrict const end_line_out_u1 = out_a + simd1.get_unroll(aligned_line_length); + + for (; temp_out < out_a; temp_out++, temp_in1++, temp_in2++) + { + *temp_out = op(*temp_in1, *temp_in2); + } + for (; temp_out < end_line_out_u4; + temp_out += simd4.block, temp_in1 += simd4.block, temp_in2 += simd4.block) + { + const vVvf & a0 = this->access_u.load(temp_in1 + 0 * simd4.shift); + const vVvf & a1 = this->access_u.load(temp_in1 + 1 * simd4.shift); + const vVvf & a2 = this->access_u.load(temp_in1 + 2 * simd4.shift); + const vVvf & a3 = this->access_u.load(temp_in1 + 3 * simd4.shift); + + const vVvf & b0 = this->access_u.load(temp_in2 + 0 * simd4.shift); + const vVvf & b1 = this->access_u.load(temp_in2 + 1 * simd4.shift); + const vVvf & b2 = this->access_u.load(temp_in2 + 2 * simd4.shift); + const vVvf & b3 = this->access_u.load(temp_in2 + 3 * simd4.shift); + + const vVvf c0 = op(a0, b0); + const vVvf c1 = op(a1, b1); + const vVvf c2 = op(a2, b2); + const vVvf c3 = op(a3, b3); + + this->access_u.store(temp_out + 0 * simd4.shift, c0); + this->access_u.store(temp_out + 1 * simd4.shift, c1); + this->access_u.store(temp_out + 2 * 
simd4.shift, c2); + this->access_u.store(temp_out + 3 * simd4.shift, c3); + } + for (; temp_out < end_line_out_u1; + temp_out += simd1.block, temp_in1 += simd1.block, temp_in2 += simd1.block) + { + const vVvf & a0 = this->access_u.load(temp_in1); + const vVvf & b0 = this->access_u.load(temp_in2); + const vVvf c0 = op(a0, b0); + this->access_u.store(temp_out, c0); + } } - for (; temp_in1 < end_line_in1; temp_out++, temp_in1++, temp_in2++) + for (; temp_out < end_line_out; temp_out++, temp_in1++, temp_in2++) { *temp_out = op(*temp_in1, *temp_in2); } diff --git a/zUtil_Cxx/include/vectorization.h b/zUtil_Cxx/include/vectorization.h index 4bb2b354442585c2d68541cb902f5e49832e1dd9..d5b6b8de4e937cd3e2c186d89d3666332572d302 100644 --- a/zUtil_Cxx/include/vectorization.h +++ b/zUtil_Cxx/include/vectorization.h @@ -77,6 +77,11 @@ public: { return ROUND_DOWN(tot_size, block); } + const size_t + get_shift_to_align(const Type * const start) const + { + return ((vector_size - (reinterpret_cast<size_t>(start) % vector_size)) % vector_size); + } };