Skip to content
Snippets Groups Projects
Commit 7097d4ff authored by Nicola Viganò's avatar Nicola Viganò
Browse files

Mex: added cell volumes accumulation function

parent 9a0cfb0a
No related branches found
No related tags found
No related merge requests found
/*
* gtCxxMathsSumCellVolumes.cpp
*
* Created on: 04 mag 2016
* Author: nicola
*/
#include "gtCxxMathsSumCellVolumes.h"
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[])
{
if (nrhs < 1) {
mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
"No volumes passed!");
return;
}
const mxArray * const volumes = prhs[0];
if (!mxIsCell(volumes)) {
mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
"The argument should be a cell of volumes!");
return;
}
mwSize num_volumes = mxGetNumberOfElements(volumes);
if (!num_volumes) {
mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
"No volumes passed!");
return;
}
// Getting information about
const mxArray * first_volume = mxGetCell(volumes, 0);
const mwSize num_dims = mxGetNumberOfDimensions(first_volume);
const mwSize * dims = mxGetDimensions(first_volume);
const mxClassID type = mxGetClassID(first_volume);
if (!(type == mxDOUBLE_CLASS || type == mxSINGLE_CLASS)) {
mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
"Volumes should be in either double or single floating point types!");
return;
}
for (mwIndex vol_id = 1; vol_id < num_volumes; vol_id++) {
const mxArray * curr_volume = mxGetCell(volumes, vol_id);
const mwSize * curr_vol_dims = mxGetDimensions(curr_volume);
if (mxGetNumberOfDimensions(curr_volume) != num_dims) {
mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
"All volumes should have the same dimensions!");
return;
}
for (mwIndex dim_id = 0; dim_id < num_dims; dim_id++) {
if (dims[dim_id] != curr_vol_dims[dim_id]) {
mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
"All volumes should have the same dimensions!");
return;
}
}
if (mxGetClassID(curr_volume) != type) {
mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
"All volumes should be of the same floating point type!");
return;
}
}
if (nrhs >= 2)
{
const mxArray * const num_threads = prhs[1];
initialize_multithreading(mxGetScalar(num_threads));
}
else
{
initialize_multithreading();
}
plhs[0] = mxCreateUninitNumericArray(num_dims, (mwSize *)dims, type, mxREAL);
switch (type)
{
case mxDOUBLE_CLASS:
{
{
GT_MATHS::reduce_volumes<double>(plhs[0], volumes);
}
break;
}
case mxSINGLE_CLASS:
{
{
GT_MATHS::reduce_volumes<float>(plhs[0], volumes);
}
break;
}
default:
{
mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
"The argument needs to be a Cell array of floating point numbers");
return;
}
}
}
/*
* gtCxxMathsSumCellVolumes.h
*
* Created on: 05 mag 2016
* Author: nicola
*/
#ifndef ZUTIL_CXX_INCLUDE_GTCXXMATHSSUMCELLVOLUMES_H_
#define ZUTIL_CXX_INCLUDE_GTCXXMATHSSUMCELLVOLUMES_H_
#include "internal_cell_defs.h"
namespace GT_MATHS {
const char * reduce_volumes_error_id = "gtCxxMathsSumCellVolumes:wrong_argument";
template<typename Type>
void
reduce_volumes(mxArray * const mxOutput, const mxArray * const mxInputCell)
{
const mwSize num_elements = mxGetNumberOfElements(mxOutput);
// Deciding the cache size: we want to use maximum 8KB per thread
const mwSize buffer_size = 8 * 1024 / sizeof(Type);
const SIMDUnrolling<Type> simd_4(4);
typedef typename SIMDUnrolling<Type>::vVvf vVvf;
// Output pointer
Type * output = (Type *)mxGetData(mxOutput);
// Taking input pointers
const mwSize num_cells = mxGetNumberOfElements(mxInputCell);
std::vector<const Type *> inputs(num_cells);
for (mwIndex icell = 0; icell < num_cells; icell++)
{
const mxArray * mxCell = mxGetCell(mxInputCell, icell);
inputs[icell] = (const Type *)mxGetData(mxCell);
}
const AccessAligned<Type> access;
const inner_sum<Type> add_op;
#pragma omp parallel for
for (mwIndex ibase = 0; ibase < num_elements; ibase += buffer_size)
{
const mwSize end_of_buffer = std::min(buffer_size, num_elements - ibase);
const mwSize end_of_unrolled_buffer = simd_4.get_unroll(end_of_buffer);
register Type buffer[end_of_buffer];
const Type * const first_cell_data = inputs[0] + ibase;
// First we initialize with the first volume
for (mwIndex icache = 0; icache < end_of_unrolled_buffer; icache += simd_4.block)
{
const Type * const base_cache_cell = first_cell_data + icache;
const vVvf & in0 = access.load(base_cache_cell + 0 * simd_4.shift);
const vVvf & in1 = access.load(base_cache_cell + 1 * simd_4.shift);
const vVvf & in2 = access.load(base_cache_cell + 2 * simd_4.shift);
const vVvf & in3 = access.load(base_cache_cell + 3 * simd_4.shift);
Type * const base_buff = buffer + icache;
access.store(base_buff + 0 * simd_4.shift, in0);
access.store(base_buff + 1 * simd_4.shift, in1);
access.store(base_buff + 2 * simd_4.shift, in2);
access.store(base_buff + 3 * simd_4.shift, in3);
}
for (mwIndex icache = end_of_unrolled_buffer; icache < end_of_buffer; icache++)
{
buffer[icache] = first_cell_data[icache];
}
// Now we start summing each other volume
for (mwIndex icell = 1; icell < num_cells; icell++)
{
const Type * const cell_data = inputs[icell] + ibase;
// First we initialize with the first volume
for (mwIndex icache = 0; icache < end_of_unrolled_buffer; icache += simd_4.block)
{
const Type * const base_cache_cell = cell_data + icache;
const vVvf & in0_i = access.load(base_cache_cell + 0 * simd_4.shift);
const vVvf & in1_i = access.load(base_cache_cell + 1 * simd_4.shift);
const vVvf & in2_i = access.load(base_cache_cell + 2 * simd_4.shift);
const vVvf & in3_i = access.load(base_cache_cell + 3 * simd_4.shift);
Type * const base_buff = buffer + icache;
const vVvf & in0_b = access.load(base_buff + 0 * simd_4.shift);
const vVvf & in1_b = access.load(base_buff + 1 * simd_4.shift);
const vVvf & in2_b = access.load(base_buff + 2 * simd_4.shift);
const vVvf & in3_b = access.load(base_buff + 3 * simd_4.shift);
access.store(base_buff + 0 * simd_4.shift, add_op(in0_b, in0_i));
access.store(base_buff + 1 * simd_4.shift, add_op(in1_b, in1_i));
access.store(base_buff + 2 * simd_4.shift, add_op(in2_b, in2_i));
access.store(base_buff + 3 * simd_4.shift, add_op(in3_b, in3_i));
}
for (mwIndex icache = end_of_unrolled_buffer; icache < end_of_buffer; icache++)
{
buffer[icache] += cell_data[icache];
}
}
Type * const base_output = output + ibase;
// Finally we save back the result
for (mwIndex icache = 0; icache < end_of_unrolled_buffer; icache += simd_4.block)
{
const Type * const base_buff = buffer + icache;
const vVvf & in0 = access.load(base_buff + 0 * simd_4.shift);
const vVvf & in1 = access.load(base_buff + 1 * simd_4.shift);
const vVvf & in2 = access.load(base_buff + 2 * simd_4.shift);
const vVvf & in3 = access.load(base_buff + 3 * simd_4.shift);
Type * const base_cache_output = base_output + icache;
access.store(base_cache_output + 0 * simd_4.shift, in0);
access.store(base_cache_output + 1 * simd_4.shift, in1);
access.store(base_cache_output + 2 * simd_4.shift, in2);
access.store(base_cache_output + 3 * simd_4.shift, in3);
}
for (mwIndex icache = end_of_unrolled_buffer; icache < end_of_buffer; icache++)
{
base_output[icache] = buffer[icache];
}
}
}
} // namespace GT_MATHS
#endif /* ZUTIL_CXX_INCLUDE_GTCXXMATHSSUMCELLVOLUMES_H_ */
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment