Skip to content
Snippets Groups Projects
Commit 8c7dae43 authored by Nicola Vigano's avatar Nicola Vigano
Browse files

6D-C++: Sinos<->Blobs added SSE/SSE2 vectorized operations.

parent 0321f34c
No related branches found
No related tags found
No related merge requests found
...@@ -364,7 +364,6 @@ namespace GT6D { ...@@ -364,7 +364,6 @@ namespace GT6D {
const mwSize * const dims_sino = mxGetDimensions(sino); const mwSize * const dims_sino = mxGetDimensions(sino);
const mwSize & line_length = dims_sino[0]; const mwSize & line_length = dims_sino[0];
const mwSize & lines_num = dims_sino[2]; const mwSize & lines_num = dims_sino[2];
const mwSize line_length_unroll = ROUND_DOWN(line_length, 4);
Type * const sino_data = ((Type *) mxGetData(sino)); Type * const sino_data = ((Type *) mxGetData(sino));
const mwSize sino_skip = dims_sino[0] * dims_sino[1]; const mwSize sino_skip = dims_sino[0] * dims_sino[1];
...@@ -395,29 +394,65 @@ namespace GT6D { ...@@ -395,29 +394,65 @@ namespace GT6D {
blob_skips[m] = dims_blob[0] * dims_blob[1]; blob_skips[m] = dims_blob[0] * dims_blob[1];
} }
#pragma omp parallel for #pragma omp parallel
for (mwIndex line_num = 0; line_num < lines_num; line_num++)
{ {
const mwSize line_sino_skip = line_num * sino_skip; const AccessUnaligned<Type> access;
const SIMDUnrolling<Type> simd4(4);
const SIMDUnrolling<Type> simd(1);
const mwSize line_length_unroll4 = simd4.get_unroll(line_length);
const mwSize line_length_unroll = simd.get_unroll(line_length);
for (mwIndex m = 0; m < num_projs; m++) typedef typename SIMDUnrolling<Type>::vVvf vVvf;
{
const Type & c = coeffs_t[m];
inner_sum_scale<Type> op(c);
const Type * const blob_data_line = blob_data_bases[m] + line_num * blob_skips[m]; #pragma omp for
Type * const sino_data_line = sino_data_bases[m] + line_sino_skip; for (mwIndex line_num = 0; line_num < lines_num; line_num++)
{
const mwSize line_sino_skip = line_num * sino_skip;
for (mwIndex elem = 0; elem < line_length_unroll; elem += 4) for (mwIndex m = 0; m < num_projs; m++)
{ {
sino_data_line[elem+0] = op(sino_data_line[elem+0], blob_data_line[elem+0]); const Type & c = coeffs_t[m];
sino_data_line[elem+1] = op(sino_data_line[elem+1], blob_data_line[elem+1]); inner_sum_scale<Type> op(c);
sino_data_line[elem+2] = op(sino_data_line[elem+2], blob_data_line[elem+2]);
sino_data_line[elem+3] = op(sino_data_line[elem+3], blob_data_line[elem+3]); const Type * const blob_data_line = blob_data_bases[m] + line_num * blob_skips[m];
} Type * const sino_data_line = sino_data_bases[m] + line_sino_skip;
for (mwIndex elem = line_length_unroll; elem < line_length; elem++)
{ for (mwIndex elem = 0; elem < line_length_unroll4; elem += simd4.block)
sino_data_line[elem] = op(sino_data_line[elem], blob_data_line[elem]); {
Type * const sino_line = sino_data_line + elem;
const Type * const blob_line = blob_data_line + elem;
const vVvf & a0 = access.load(sino_line + 0 * simd4.shift);
const vVvf & a1 = access.load(sino_line + 1 * simd4.shift);
const vVvf & a2 = access.load(sino_line + 2 * simd4.shift);
const vVvf & a3 = access.load(sino_line + 3 * simd4.shift);
const vVvf & b0 = access.load(blob_line + 0 * simd4.shift);
const vVvf & b1 = access.load(blob_line + 1 * simd4.shift);
const vVvf & b2 = access.load(blob_line + 2 * simd4.shift);
const vVvf & b3 = access.load(blob_line + 3 * simd4.shift);
const vVvf c0 = op(a0, b0);
const vVvf c1 = op(a1, b1);
const vVvf c2 = op(a2, b2);
const vVvf c3 = op(a3, b3);
access.store(sino_line + 0 * simd4.shift, c0);
access.store(sino_line + 1 * simd4.shift, c1);
access.store(sino_line + 2 * simd4.shift, c2);
access.store(sino_line + 3 * simd4.shift, c3);
}
for (mwIndex elem = line_length_unroll4; elem < line_length_unroll; elem += simd.block)
{
const vVvf & a = access.load(sino_data_line + elem);
const vVvf & b = access.load(blob_data_line + elem);
const vVvf c = op(a, b);
access.store(sino_data_line + elem, c);
}
for (mwIndex elem = line_length_unroll; elem < line_length; elem++)
{
sino_data_line[elem] = op(sino_data_line[elem], blob_data_line[elem]);
}
} }
} }
} }
...@@ -441,7 +476,6 @@ namespace GT6D { ...@@ -441,7 +476,6 @@ namespace GT6D {
const mwSize * const dims_sino = mxGetDimensions(sino); const mwSize * const dims_sino = mxGetDimensions(sino);
const mwSize & line_length = dims_sino[0]; const mwSize & line_length = dims_sino[0];
const mwSize & lines_num = dims_sino[2]; const mwSize & lines_num = dims_sino[2];
const mwSize line_length_unroll = ROUND_DOWN(line_length, 4);
const Type * const sino_data = ((const Type *) mxGetData(sino)); const Type * const sino_data = ((const Type *) mxGetData(sino));
const mwSize sino_skip = dims_sino[0] * dims_sino[1]; const mwSize sino_skip = dims_sino[0] * dims_sino[1];
...@@ -472,32 +506,195 @@ namespace GT6D { ...@@ -472,32 +506,195 @@ namespace GT6D {
blob_skips[m] = dims_blob[0] * dims_blob[1]; blob_skips[m] = dims_blob[0] * dims_blob[1];
} }
#pragma omp parallel for #pragma omp parallel
{
const AccessUnaligned<Type> access;
const SIMDUnrolling<Type> simd4(4);
const SIMDUnrolling<Type> simd(1);
const mwSize line_length_unroll4 = simd4.get_unroll(line_length);
const mwSize line_length_unroll = simd.get_unroll(line_length);
typedef typename SIMDUnrolling<Type>::vVvf vVvf;
#pragma omp for
for (mwIndex line_num = 0; line_num < lines_num; line_num++)
{
const mwSize line_sino_skip = line_num * sino_skip;
for (mwIndex m = 0; m < num_projs; m++)
{
const Type & c = coeffs_t[m];
inner_sum_scale<Type> op(c);
Type * const blob_data_line = blob_data_bases[m] + line_num * blob_skips[m];
const Type * const sino_data_line = sino_data_bases[m] + line_sino_skip;
for (mwIndex elem = 0; elem < line_length_unroll4; elem += simd4.block)
{
const Type * const sino_line = sino_data_line + elem;
Type * const blob_line = blob_data_line + elem;
const vVvf & a0 = access.load(blob_line + 0 * simd4.shift);
const vVvf & a1 = access.load(blob_line + 1 * simd4.shift);
const vVvf & a2 = access.load(blob_line + 2 * simd4.shift);
const vVvf & a3 = access.load(blob_line + 3 * simd4.shift);
const vVvf & b0 = access.load(sino_line + 0 * simd4.shift);
const vVvf & b1 = access.load(sino_line + 1 * simd4.shift);
const vVvf & b2 = access.load(sino_line + 2 * simd4.shift);
const vVvf & b3 = access.load(sino_line + 3 * simd4.shift);
const vVvf c0 = op(a0, b0);
const vVvf c1 = op(a1, b1);
const vVvf c2 = op(a2, b2);
const vVvf c3 = op(a3, b3);
access.store(blob_line + 0 * simd4.shift, c0);
access.store(blob_line + 1 * simd4.shift, c1);
access.store(blob_line + 2 * simd4.shift, c2);
access.store(blob_line + 3 * simd4.shift, c3);
}
for (mwIndex elem = line_length_unroll4; elem < line_length_unroll; elem += simd.block)
{
const vVvf & a = access.load(blob_data_line + elem);
const vVvf & b = access.load(sino_data_line + elem);
const vVvf c = op(a, b);
access.store(blob_data_line + elem, c);
}
for (mwIndex elem = line_length_unroll; elem < line_length; elem++)
{
blob_data_line[elem] = op(blob_data_line[elem], sino_data_line[elem]);
}
}
}
}
}
////////////////////////////////////////////////////////////////////////////////
template<typename Type>
struct ProjCoeffs {
Type * sino_data_base;
const Type * blob_data_base;
mwSize blob_skip;
std::vector<mwSize> offsets;
std::vector<Type> coeffs;
};
template<typename Type>
inline void
do_blobs_to_single_sinogram_struct(mxArray * sino, const mxArray * const blobs,
const mxArray * const coeffs)
{
const mxArray * const proj_struct = mxGetField(coeffs, 0, "proj");
const mwSize num_slices = mxGetNumberOfElements(proj_struct);
const mwSize * const dims_sino = mxGetDimensions(sino);
const mwSize & line_length = dims_sino[0];
const mwSize & lines_num = dims_sino[2];
const mwSize line_length_unroll = ROUND_DOWN(line_length, 4);
const mwSize sino_skip = dims_sino[0] * dims_sino[1];
Type * const sino_data_base = ((Type *) mxGetData(sino));
std::vector<ProjCoeffs<Type> > coeffs_str(num_slices);
for (mwIndex m = 0; m < num_slices; m++)
{
const mxArray * const sino_offset_a = mxGetField(proj_struct, m, "sino_offset");
const mxArray * const blob_offset_a = mxGetField(proj_struct, m, "blob_offset");
const mxArray * const proj_offsets_a = mxGetField(proj_struct, m, "proj_offsets");
const mxArray * const proj_coeffs_a = mxGetField(proj_struct, m, "proj_coeffs");
const double * const proj_offsets = (const double *) mxGetData(proj_offsets_a);
const double * const proj_coeffs = (const double *) mxGetData(proj_coeffs_a);
const mwIndex s = mxGetScalar(sino_offset_a) - 1;
coeffs_str[m].sino_data_base = s * line_length + sino_data_base;
const mwIndex b = mxGetScalar(blob_offset_a) - 1;
const mxArray * const cell_blobs = mxGetCell(blobs, b);
const mwSize * const dims_blob = mxGetDimensions(cell_blobs);
coeffs_str[m].blob_skip = dims_blob[0] * dims_blob[1];
coeffs_str[m].blob_data_base = ((const Type *) mxGetData(cell_blobs));
const mwSize num_projs = mxGetNumberOfElements(proj_offsets_a);
coeffs_str[m].offsets.resize(num_projs);
coeffs_str[m].coeffs.resize(num_projs);
for (mwIndex i = 0; i < num_projs; i++)
{
coeffs_str[m].offsets[i] = (const mwIndex) proj_offsets[i]-1;
coeffs_str[m].coeffs[i] = (const Type) proj_coeffs[i];
}
}
#pragma omp parallel
{
register Type buff[line_length] __attribute__((aligned(MEMORY_ALIGN)));
#pragma omp for
for (mwIndex line_num = 0; line_num < lines_num; line_num++) for (mwIndex line_num = 0; line_num < lines_num; line_num++)
{ {
const mwSize line_sino_skip = line_num * sino_skip; const mwSize line_sino_skip = line_num * sino_skip;
for (mwIndex m = 0; m < num_projs; m++) for (mwIndex m = 0; m < num_slices; m++)
{ {
const Type & c = coeffs_t[m]; const Type * const blob_data_base = coeffs_str[m].blob_data_base + line_num * coeffs_str[m].blob_skip;
inner_sum_scale<Type> op(c);
for (mwIndex elem = 0; elem < line_length_unroll; elem += 4)
{
buff[elem+0] = 0;
buff[elem+1] = 0;
buff[elem+2] = 0;
buff[elem+3] = 0;
}
for (mwIndex elem = line_length_unroll; elem < line_length; elem++)
{
buff[elem] = 0;
}
Type * const blob_data_line = blob_data_bases[m] + line_num * blob_skips[m]; const mwSize & num_projs = coeffs_str[m].offsets.size();
const Type * const sino_data_line = sino_data_bases[m] + line_sino_skip; for (mwIndex i = 0; i < num_projs; i++)
{
const mwIndex & p = coeffs_str[m].offsets[i];
const Type & c = coeffs_str[m].coeffs[i];
inner_sum_scale<Type> op(c);
const mwSize blob_initial_offset = line_length * p;
const Type * const blob_data_line = blob_data_base + blob_initial_offset;
for (mwIndex elem = 0; elem < line_length_unroll; elem += 4)
{
buff[elem+0] = op(buff[elem+0], blob_data_line[elem+0]);
buff[elem+1] = op(buff[elem+1], blob_data_line[elem+1]);
buff[elem+2] = op(buff[elem+2], blob_data_line[elem+2]);
buff[elem+3] = op(buff[elem+3], blob_data_line[elem+3]);
}
for (mwIndex elem = line_length_unroll; elem < line_length; elem++)
{
buff[elem] = op(buff[elem], blob_data_line[elem]);
}
}
Type * const sino_data_line = coeffs_str[m].sino_data_base + line_sino_skip;
for (mwIndex elem = 0; elem < line_length_unroll; elem += 4) for (mwIndex elem = 0; elem < line_length_unroll; elem += 4)
{ {
blob_data_line[elem+0] = op(blob_data_line[elem+0], sino_data_line[elem+0]); sino_data_line[elem+0] = buff[elem+0];
blob_data_line[elem+1] = op(blob_data_line[elem+1], sino_data_line[elem+1]); sino_data_line[elem+1] = buff[elem+1];
blob_data_line[elem+2] = op(blob_data_line[elem+2], sino_data_line[elem+2]); sino_data_line[elem+2] = buff[elem+2];
blob_data_line[elem+3] = op(blob_data_line[elem+3], sino_data_line[elem+3]); sino_data_line[elem+3] = buff[elem+3];
} }
for (mwIndex elem = line_length_unroll; elem < line_length; elem++) for (mwIndex elem = line_length_unroll; elem < line_length; elem++)
{ {
blob_data_line[elem] = op(blob_data_line[elem], sino_data_line[elem]); sino_data_line[elem] = buff[elem];
} }
} }
} }
}
} }
}; };
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment