diff --git a/zUtil_Cxx/include/DctDataAlgorithms.h b/zUtil_Cxx/include/DctDataAlgorithms.h
index 1d2d702355ede46fae9449c39dd3881d8ab104ab..f9206855c43f4674b5b5c4cb55749e629cef7889 100644
--- a/zUtil_Cxx/include/DctDataAlgorithms.h
+++ b/zUtil_Cxx/include/DctDataAlgorithms.h
@@ -32,7 +32,7 @@ namespace dct {
 
     ThreadSafeQueue< std::function<void()> > work_queue;
 
-    void initialize_line(Type * __restrict const data, const size_t & line_length, const Type & val = 0.0);
+    void initialize_line(Type * __restrict const out, const size_t & line_length, const Type & val = 0.0);
     template<class Op>
     void transform_line_unary(Type * __restrict const outin, const size_t & line_length, Op op);
     template<class Op>
@@ -232,33 +232,44 @@ namespace dct {
 
   template<typename Type, class Alloc, const size_t vector_size>
   inline void
-  DctDataProcess<Type, Alloc, vector_size>::initialize_line(Type * const __restrict data, const size_t & line_length, const Type & val)
+  DctDataProcess<Type, Alloc, vector_size>::initialize_line(Type * const __restrict out, const size_t & line_length, const Type & val)  // writes val, or vec_val built from it, over out[0, line_length)
   {
     const SIMDUnrolling<Type, vector_size> simd4(4);
     const SIMDUnrolling<Type, vector_size> simd1(1);
 
     const SIMDRegister<Type, vector_size> vec_val = Coeff<Type, vector_size>::get(val);
 
-    Type * __restrict const end_line_unroll4 = data + simd4.get_unroll(line_length);
-    Type * __restrict const end_line_unroll1 = data + simd1.get_unroll(line_length);
-    Type * __restrict const end_line = data + line_length;
-
-    Type * __restrict temp_data = data;
+    Type * __restrict temp_out = out;  // write cursor: each loop below resumes where the previous one stopped
+    Type * __restrict const end_line_out = out + line_length;  // one past the last element to set
 
-    for (; temp_data < end_line_unroll4; temp_data += simd4.block)
+    const size_t shift_to_align = simd1.get_shift_to_align(out);  // NOTE(review): used as an element count below; confirm the helper does not return bytes
+    if (shift_to_align < line_length)  // otherwise the vector loops are skipped and the scalar loop at the end sets the whole line
     {
-      this->access_u.store(temp_data + 0 * simd4.shift, vec_val);
-      this->access_u.store(temp_data + 1 * simd4.shift, vec_val);
-      this->access_u.store(temp_data + 2 * simd4.shift, vec_val);
-      this->access_u.store(temp_data + 3 * simd4.shift, vec_val);
+      Type * __restrict const out_a = out + shift_to_align;  // where the vector stores start
+      const size_t line_length_a = line_length - shift_to_align;  // elements from out_a to the end of the line
+
+      Type * __restrict const end_line_u4 = out_a + simd4.get_unroll(line_length_a);  // end of the part made of whole simd4 blocks
+      Type * __restrict const end_line_u1 = out_a + simd1.get_unroll(line_length_a);  // end of the part made of whole simd1 blocks
+
+      for (; temp_out < out_a; temp_out++)  // scalar stores for the elements before out_a
+      {
+        *temp_out = val;
+      }
+      for (; temp_out < end_line_u4; temp_out += simd4.block)  // four vector stores per iteration
+      {
+        this->access_u.store(temp_out + 0 * simd4.shift, vec_val);
+        this->access_u.store(temp_out + 1 * simd4.shift, vec_val);
+        this->access_u.store(temp_out + 2 * simd4.shift, vec_val);
+        this->access_u.store(temp_out + 3 * simd4.shift, vec_val);
+      }
+      for (; temp_out < end_line_u1; temp_out += simd1.block)  // one vector store per iteration
+      {
+        this->access_u.store(temp_out, vec_val);
+      }
     }
-    for (; temp_data < end_line_unroll1; temp_data += simd1.block)
+    for (; temp_out < end_line_out; temp_out++)  // scalar stores for whatever is left
     {
-      this->access_u.store(temp_data, vec_val);
-    }
-    for (; temp_data < end_line; temp_data++)
-    {
-      *temp_data = val;
+      *temp_out = val;
     }
   }
 
@@ -270,34 +281,45 @@ namespace dct {
     const SIMDUnrolling<Type, vector_size> simd4(4);
     const SIMDUnrolling<Type, vector_size> simd1(1);
 
-    const Type * __restrict const end_line_inout_unroll4 = inout + simd4.get_unroll(line_length);
-    const Type * __restrict const end_line_inout_unroll1 = inout + simd1.get_unroll(line_length);
-    const Type * __restrict const end_line_inout = inout + line_length;
-
     Type * __restrict temp_inout = inout;
+    const Type * __restrict const end_line_inout = inout + line_length;
 
-    for (; temp_inout < end_line_inout_unroll4; temp_inout += simd4.block)
+    const size_t shift_to_align = simd1.get_shift_to_align(inout);
+    if (shift_to_align < line_length)
     {
-      const vVvf & a0 = this->access_u.load(temp_inout + 0 * simd4.shift);
-      const vVvf & a1 = this->access_u.load(temp_inout + 1 * simd4.shift);
-      const vVvf & a2 = this->access_u.load(temp_inout + 2 * simd4.shift);
-      const vVvf & a3 = this->access_u.load(temp_inout + 3 * simd4.shift);
-
-      const vVvf c0 = op(a0);
-      const vVvf c1 = op(a1);
-      const vVvf c2 = op(a2);
-      const vVvf c3 = op(a3);
-
-      this->access_u.store(temp_inout + 0 * simd4.shift, c0);
-      this->access_u.store(temp_inout + 1 * simd4.shift, c1);
-      this->access_u.store(temp_inout + 2 * simd4.shift, c2);
-      this->access_u.store(temp_inout + 3 * simd4.shift, c3);
-    }
-    for (; temp_inout < end_line_inout_unroll1; temp_inout += simd1.block)
-    {
-      const vVvf & a0 = this->access_u.load(temp_inout);
-      const vVvf c0 = op(a0);
-      this->access_u.store(temp_inout, c0);
+      Type * __restrict const out_a = inout + shift_to_align;
+      const size_t line_length_a = line_length - shift_to_align;
+
+      const Type * __restrict const end_line_inout_u4 = out_a + simd4.get_unroll(line_length_a);
+      const Type * __restrict const end_line_inout_u1 = out_a + simd1.get_unroll(line_length_a);
+
+      for (; temp_inout < out_a; temp_inout++)
+      {
+        *temp_inout = op(*temp_inout);
+      }
+      for (; temp_inout < end_line_inout_u4; temp_inout += simd4.block)
+      {
+        const vVvf & a0 = this->access_u.load(temp_inout + 0 * simd4.shift);
+        const vVvf & a1 = this->access_u.load(temp_inout + 1 * simd4.shift);
+        const vVvf & a2 = this->access_u.load(temp_inout + 2 * simd4.shift);
+        const vVvf & a3 = this->access_u.load(temp_inout + 3 * simd4.shift);
+
+        const vVvf c0 = op(a0);
+        const vVvf c1 = op(a1);
+        const vVvf c2 = op(a2);
+        const vVvf c3 = op(a3);
+
+        this->access_u.store(temp_inout + 0 * simd4.shift, c0);
+        this->access_u.store(temp_inout + 1 * simd4.shift, c1);
+        this->access_u.store(temp_inout + 2 * simd4.shift, c2);
+        this->access_u.store(temp_inout + 3 * simd4.shift, c3);
+      }
+      for (; temp_inout < end_line_inout_u1; temp_inout += simd1.block)
+      {
+        const vVvf & a0 = this->access_u.load(temp_inout);
+        const vVvf c0 = op(a0);
+        this->access_u.store(temp_inout, c0);
+      }
     }
     for (; temp_inout < end_line_inout; temp_inout++)
     {
@@ -313,38 +335,48 @@ namespace dct {
     const SIMDUnrolling<Type, vector_size> simd4(4);
     const SIMDUnrolling<Type, vector_size> simd1(1);
 
-    const Type * __restrict const end_line_in_unroll4 = in + simd4.get_unroll(line_length);
-    const Type * __restrict const end_line_in_unroll1 = in + simd1.get_unroll(line_length);
-    const Type * __restrict const end_line_in = in + line_length;
-
     const Type * __restrict temp_in = in;
-
     Type * __restrict temp_out = out;
+    const Type * __restrict const end_line_out = out + line_length;
 
-    for (; temp_in < end_line_in_unroll4; temp_out += simd4.block, temp_in += simd4.block)
+    const size_t shift_to_align = simd1.get_shift_to_align(out);
+    if (shift_to_align < line_length)
     {
-      const vVvf & a0 = this->access_u.load(temp_in + 0 * simd4.shift);
-      const vVvf & a1 = this->access_u.load(temp_in + 1 * simd4.shift);
-      const vVvf & a2 = this->access_u.load(temp_in + 2 * simd4.shift);
-      const vVvf & a3 = this->access_u.load(temp_in + 3 * simd4.shift);
-
-      const vVvf c0 = op(a0);
-      const vVvf c1 = op(a1);
-      const vVvf c2 = op(a2);
-      const vVvf c3 = op(a3);
-
-      this->access_u.store(temp_out + 0 * simd4.shift, c0);
-      this->access_u.store(temp_out + 1 * simd4.shift, c1);
-      this->access_u.store(temp_out + 2 * simd4.shift, c2);
-      this->access_u.store(temp_out + 3 * simd4.shift, c3);
-    }
-    for (; temp_in < end_line_in_unroll1; temp_out += simd1.block, temp_in += simd1.block)
-    {
-      const vVvf & a0 = this->access_u.load(temp_in);
-      const vVvf c0 = op(a0);
-      this->access_u.store(temp_out, c0);
+      Type * __restrict const out_a = out + shift_to_align;
+      const size_t line_length_a = line_length - shift_to_align;
+
+      const Type * __restrict const end_line_out_u4 = out_a + simd4.get_unroll(line_length_a);
+      const Type * __restrict const end_line_out_u1 = out_a + simd1.get_unroll(line_length_a);
+
+      for (; temp_out < out_a; temp_out++, temp_in++)
+      {
+        *temp_out = op(*temp_in);
+      }
+      for (; temp_out < end_line_out_u4; temp_out += simd4.block, temp_in += simd4.block)
+      {
+        const vVvf & a0 = this->access_u.load(temp_in + 0 * simd4.shift);
+        const vVvf & a1 = this->access_u.load(temp_in + 1 * simd4.shift);
+        const vVvf & a2 = this->access_u.load(temp_in + 2 * simd4.shift);
+        const vVvf & a3 = this->access_u.load(temp_in + 3 * simd4.shift);
+
+        const vVvf c0 = op(a0);
+        const vVvf c1 = op(a1);
+        const vVvf c2 = op(a2);
+        const vVvf c3 = op(a3);
+
+        this->access_u.store(temp_out + 0 * simd4.shift, c0);
+        this->access_u.store(temp_out + 1 * simd4.shift, c1);
+        this->access_u.store(temp_out + 2 * simd4.shift, c2);
+        this->access_u.store(temp_out + 3 * simd4.shift, c3);
+      }
+      for (; temp_out < end_line_out_u1; temp_out += simd1.block, temp_in += simd1.block)
+      {
+        const vVvf & a0 = this->access_u.load(temp_in);
+        const vVvf c0 = op(a0);
+        this->access_u.store(temp_out, c0);
+      }
     }
-    for (; temp_in < end_line_in; temp_out++, temp_in++)
+    for (; temp_out < end_line_out; temp_out++, temp_in++)
     {
       *temp_out = op(*temp_in);
     }
@@ -358,44 +390,54 @@ namespace dct {
     const SIMDUnrolling<Type, vector_size> simd4(4);
     const SIMDUnrolling<Type, vector_size> simd1(1);
 
-    const Type * __restrict const end_line_in_unroll4 = in + simd4.get_unroll(line_length);
-    const Type * __restrict const end_line_in_unroll1 = in + simd1.get_unroll(line_length);
-    const Type * __restrict const end_line_in = in + line_length;
-
     const Type * __restrict temp_in = in;
-
     Type * __restrict temp_out = out;
+    const Type * __restrict const end_line_out = out + line_length;
 
-    for (; temp_in < end_line_in_unroll4; temp_out += simd4.block, temp_in += simd4.block)
-    {
-      const vVvf & a0 = this->access_u.load(temp_out + 0 * simd4.shift);
-      const vVvf & a1 = this->access_u.load(temp_out + 1 * simd4.shift);
-      const vVvf & a2 = this->access_u.load(temp_out + 2 * simd4.shift);
-      const vVvf & a3 = this->access_u.load(temp_out + 3 * simd4.shift);
-
-      const vVvf & b0 = this->access_u.load(temp_in + 0 * simd4.shift);
-      const vVvf & b1 = this->access_u.load(temp_in + 1 * simd4.shift);
-      const vVvf & b2 = this->access_u.load(temp_in + 2 * simd4.shift);
-      const vVvf & b3 = this->access_u.load(temp_in + 3 * simd4.shift);
-
-      const vVvf c0 = op(a0, b0);
-      const vVvf c1 = op(a1, b1);
-      const vVvf c2 = op(a2, b2);
-      const vVvf c3 = op(a3, b3);
-
-      this->access_u.store(temp_out + 0 * simd4.shift, c0);
-      this->access_u.store(temp_out + 1 * simd4.shift, c1);
-      this->access_u.store(temp_out + 2 * simd4.shift, c2);
-      this->access_u.store(temp_out + 3 * simd4.shift, c3);
-    }
-    for (; temp_in < end_line_in_unroll1; temp_out += simd1.block, temp_in += simd1.block)
+    const size_t shift_to_align = simd1.get_shift_to_align(out);
+    if (shift_to_align < line_length)
     {
-      const vVvf & a0 = this->access_u.load(temp_out);
-      const vVvf & b0 = this->access_u.load(temp_in);
-      const vVvf c0 = op(a0, b0);
-      this->access_u.store(temp_out, c0);
+      Type * __restrict const out_a = out + shift_to_align;
+      const size_t line_length_a = line_length - shift_to_align;
+
+      const Type * __restrict const end_line_out_u4 = out_a + simd4.get_unroll(line_length_a);
+      const Type * __restrict const end_line_out_u1 = out_a + simd1.get_unroll(line_length_a);
+
+      for (; temp_out < out_a; temp_out++, temp_in++)
+      {
+        *temp_out = op(*temp_out, *temp_in);
+      }
+      for (; temp_out < end_line_out_u4; temp_out += simd4.block, temp_in += simd4.block)
+      {
+        const vVvf & a0 = this->access_u.load(temp_out + 0 * simd4.shift);
+        const vVvf & a1 = this->access_u.load(temp_out + 1 * simd4.shift);
+        const vVvf & a2 = this->access_u.load(temp_out + 2 * simd4.shift);
+        const vVvf & a3 = this->access_u.load(temp_out + 3 * simd4.shift);
+
+        const vVvf & b0 = this->access_u.load(temp_in + 0 * simd4.shift);
+        const vVvf & b1 = this->access_u.load(temp_in + 1 * simd4.shift);
+        const vVvf & b2 = this->access_u.load(temp_in + 2 * simd4.shift);
+        const vVvf & b3 = this->access_u.load(temp_in + 3 * simd4.shift);
+
+        const vVvf c0 = op(a0, b0);
+        const vVvf c1 = op(a1, b1);
+        const vVvf c2 = op(a2, b2);
+        const vVvf c3 = op(a3, b3);
+
+        this->access_u.store(temp_out + 0 * simd4.shift, c0);
+        this->access_u.store(temp_out + 1 * simd4.shift, c1);
+        this->access_u.store(temp_out + 2 * simd4.shift, c2);
+        this->access_u.store(temp_out + 3 * simd4.shift, c3);
+      }
+      for (; temp_out < end_line_out_u1; temp_out += simd1.block, temp_in += simd1.block)
+      {
+        const vVvf & a0 = this->access_u.load(temp_out);
+        const vVvf & b0 = this->access_u.load(temp_in);
+        const vVvf c0 = op(a0, b0);
+        this->access_u.store(temp_out, c0);
+      }
     }
-    for (; temp_in < end_line_in; temp_out++, temp_in++)
+    for (; temp_out < end_line_out; temp_out++, temp_in++)
     {
       *temp_out = op(*temp_out, *temp_in);
     }
@@ -409,47 +451,57 @@ namespace dct {
     const SIMDUnrolling<Type, vector_size> simd4(4);
     const SIMDUnrolling<Type, vector_size> simd1(1);
 
-    const Type * __restrict const end_line_in1_unroll4 = in1 + simd4.get_unroll(line_length);
-    const Type * __restrict const end_line_in1_unroll1 = in1 + simd1.get_unroll(line_length);
-    const Type * __restrict const end_line_in1 = in1 + line_length;
-
     const Type * __restrict temp_in1 = in1;
     const Type * __restrict temp_in2 = in2;
-
     Type * __restrict temp_out = out;
+    const Type * __restrict const end_line_out = out + line_length;
 
-    for (; temp_in1 < end_line_in1_unroll4;
-        temp_out += simd4.block, temp_in1 += simd4.block, temp_in2 += simd4.block)
-    {
-      const vVvf & a0 = this->access_u.load(temp_in1 + 0 * simd4.shift);
-      const vVvf & a1 = this->access_u.load(temp_in1 + 1 * simd4.shift);
-      const vVvf & a2 = this->access_u.load(temp_in1 + 2 * simd4.shift);
-      const vVvf & a3 = this->access_u.load(temp_in1 + 3 * simd4.shift);
-
-      const vVvf & b0 = this->access_u.load(temp_in2 + 0 * simd4.shift);
-      const vVvf & b1 = this->access_u.load(temp_in2 + 1 * simd4.shift);
-      const vVvf & b2 = this->access_u.load(temp_in2 + 2 * simd4.shift);
-      const vVvf & b3 = this->access_u.load(temp_in2 + 3 * simd4.shift);
-
-      const vVvf c0 = op(a0, b0);
-      const vVvf c1 = op(a1, b1);
-      const vVvf c2 = op(a2, b2);
-      const vVvf c3 = op(a3, b3);
-
-      this->access_u.store(temp_out + 0 * simd4.shift, c0);
-      this->access_u.store(temp_out + 1 * simd4.shift, c1);
-      this->access_u.store(temp_out + 2 * simd4.shift, c2);
-      this->access_u.store(temp_out + 3 * simd4.shift, c3);
-    }
-    for (; temp_in1 < end_line_in1_unroll1;
-        temp_out += simd1.block, temp_in1 += simd1.block, temp_in2 += simd1.block)
+    const size_t shift_to_align = simd1.get_shift_to_align(out);
+    if (shift_to_align < line_length)
     {
-      const vVvf & a0 = this->access_u.load(temp_in1);
-      const vVvf & b0 = this->access_u.load(temp_in2);
-      const vVvf c0 = op(a0, b0);
-      this->access_u.store(temp_out, c0);
+      Type * __restrict const out_a = out + shift_to_align;
+      const size_t aligned_line_length = line_length - shift_to_align;
+
+      const Type * __restrict const end_line_out_u4 = out_a + simd4.get_unroll(aligned_line_length);
+      const Type * __restrict const end_line_out_u1 = out_a + simd1.get_unroll(aligned_line_length);
+
+      for (; temp_out < out_a; temp_out++, temp_in1++, temp_in2++)
+      {
+        *temp_out = op(*temp_in1, *temp_in2);
+      }
+      for (; temp_out < end_line_out_u4;
+          temp_out += simd4.block, temp_in1 += simd4.block, temp_in2 += simd4.block)
+      {
+        const vVvf & a0 = this->access_u.load(temp_in1 + 0 * simd4.shift);
+        const vVvf & a1 = this->access_u.load(temp_in1 + 1 * simd4.shift);
+        const vVvf & a2 = this->access_u.load(temp_in1 + 2 * simd4.shift);
+        const vVvf & a3 = this->access_u.load(temp_in1 + 3 * simd4.shift);
+
+        const vVvf & b0 = this->access_u.load(temp_in2 + 0 * simd4.shift);
+        const vVvf & b1 = this->access_u.load(temp_in2 + 1 * simd4.shift);
+        const vVvf & b2 = this->access_u.load(temp_in2 + 2 * simd4.shift);
+        const vVvf & b3 = this->access_u.load(temp_in2 + 3 * simd4.shift);
+
+        const vVvf c0 = op(a0, b0);
+        const vVvf c1 = op(a1, b1);
+        const vVvf c2 = op(a2, b2);
+        const vVvf c3 = op(a3, b3);
+
+        this->access_u.store(temp_out + 0 * simd4.shift, c0);
+        this->access_u.store(temp_out + 1 * simd4.shift, c1);
+        this->access_u.store(temp_out + 2 * simd4.shift, c2);
+        this->access_u.store(temp_out + 3 * simd4.shift, c3);
+      }
+      for (; temp_out < end_line_out_u1;
+          temp_out += simd1.block, temp_in1 += simd1.block, temp_in2 += simd1.block)
+      {
+        const vVvf & a0 = this->access_u.load(temp_in1);
+        const vVvf & b0 = this->access_u.load(temp_in2);
+        const vVvf c0 = op(a0, b0);
+        this->access_u.store(temp_out, c0);
+      }
     }
-    for (; temp_in1 < end_line_in1; temp_out++, temp_in1++, temp_in2++)
+    for (; temp_out < end_line_out; temp_out++, temp_in1++, temp_in2++)
     {
       *temp_out = op(*temp_in1, *temp_in2);
     }
diff --git a/zUtil_Cxx/include/vectorization.h b/zUtil_Cxx/include/vectorization.h
index 4bb2b354442585c2d68541cb902f5e49832e1dd9..d5b6b8de4e937cd3e2c186d89d3666332572d302 100644
--- a/zUtil_Cxx/include/vectorization.h
+++ b/zUtil_Cxx/include/vectorization.h
@@ -77,6 +77,20 @@ public:
   {
     return ROUND_DOWN(tot_size, block);
   }
+  // Returns how many leading elements lie between `start` and the first
+  // address at or after it that is a multiple of vector_size bytes
+  // (0 when `start` is already on such a boundary).
+  //
+  // The misalignment of the address is measured in bytes, so it is divided by
+  // sizeof(Type) to obtain a count of elements: the result is meant to be
+  // added to a `Type *` and compared with lengths counted in elements.
+  // If `start` is not itself a multiple of sizeof(Type), the division rounds down.
+  const size_t
+  get_shift_to_align(const Type * const start) const
+  {
+    const size_t misaligned_bytes = reinterpret_cast<size_t>(start) % vector_size;
+    return ((vector_size - misaligned_bytes) % vector_size) / sizeof(Type);
+  }
 };