Mex: added cell volumes accumulation function

Signed-off-by: Nicola Viganò <ben.vighy@gmail.com>

Mex: added cell volumes accumulation function
7097d4ff · Nicola Viganò · 9a0cfb0a · 7097d4ff · 7097d4ff
Commit 7097d4ff authored 8 years ago by Nicola Viganò
--- a/zUtil_Cxx/3D_ops/gtCxxMathsSumCellVolumes.cpp
+++ b/zUtil_Cxx/3D_ops/gtCxxMathsSumCellVolumes.cpp
+/*
+ * gtCxxMathsSumCellVolumes.cpp
+ *
+ *  Created on: 04 mag 2016
+ *      Author: nicola
+ */
+
+#include "gtCxxMathsSumCellVolumes.h"
+
+void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[])
+{
+  if (nrhs < 1) {
+    mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
+        "No volumes passed!");
+    return;
+  }
+
+  const mxArray * const volumes = prhs[0];
+
+  if (!mxIsCell(volumes)) {
+    mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
+        "The argument should be a cell of volumes!");
+    return;
+  }
+
+  mwSize num_volumes = mxGetNumberOfElements(volumes);
+  if (!num_volumes) {
+    mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
+        "No volumes passed!");
+    return;
+  }
+
+  // Getting information about
+  const mxArray * first_volume = mxGetCell(volumes, 0);
+  const mwSize num_dims = mxGetNumberOfDimensions(first_volume);
+  const mwSize * dims = mxGetDimensions(first_volume);
+  const mxClassID type = mxGetClassID(first_volume);
+  if (!(type == mxDOUBLE_CLASS || type == mxSINGLE_CLASS)) {
+    mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
+        "Volumes should be in either double or single floating point types!");
+    return;
+  }
+
+  for (mwIndex vol_id = 1; vol_id < num_volumes; vol_id++) {
+    const mxArray * curr_volume = mxGetCell(volumes, vol_id);
+    const mwSize * curr_vol_dims = mxGetDimensions(curr_volume);
+    if (mxGetNumberOfDimensions(curr_volume) != num_dims) {
+      mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
+          "All volumes should have the same dimensions!");
+      return;
+    }
+    for (mwIndex dim_id = 0; dim_id < num_dims; dim_id++) {
+      if (dims[dim_id] != curr_vol_dims[dim_id]) {
+        mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
+            "All volumes should have the same dimensions!");
+        return;
+      }
+    }
+    if (mxGetClassID(curr_volume) != type) {
+      mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
+          "All volumes should be of the same floating point type!");
+      return;
+    }
+  }
+
+  if (nrhs >= 2)
+  {
+    const mxArray * const num_threads = prhs[1];
+    initialize_multithreading(mxGetScalar(num_threads));
+  }
+  else
+  {
+    initialize_multithreading();
+  }
+
+  plhs[0] = mxCreateUninitNumericArray(num_dims, (mwSize *)dims, type, mxREAL);
+
+  switch (type)
+  {
+    case mxDOUBLE_CLASS:
+    {
+      {
+        GT_MATHS::reduce_volumes<double>(plhs[0], volumes);
+      }
+      break;
+    }
+    case mxSINGLE_CLASS:
+    {
+      {
+        GT_MATHS::reduce_volumes<float>(plhs[0], volumes);
+      }
+      break;
+    }
+    default:
+    {
+      mexErrMsgIdAndTxt(GT_MATHS::reduce_volumes_error_id,
+          "The argument needs to be a Cell array of floating point numbers");
+      return;
+    }
+  }
+}
+
--- a/zUtil_Cxx/include/gtCxxMathsSumCellVolumes.h
+++ b/zUtil_Cxx/include/gtCxxMathsSumCellVolumes.h
+/*
+ * gtCxxMathsSumCellVolumes.h
+ *
+ *  Created on: 05 mag 2016
+ *      Author: nicola
+ */
+
+#ifndef ZUTIL_CXX_INCLUDE_GTCXXMATHSSUMCELLVOLUMES_H_
+#define ZUTIL_CXX_INCLUDE_GTCXXMATHSSUMCELLVOLUMES_H_
+
+#include "internal_cell_defs.h"
+
+namespace GT_MATHS {
+
+const char * reduce_volumes_error_id = "gtCxxMathsSumCellVolumes:wrong_argument";
+
+template<typename Type>
+void
+reduce_volumes(mxArray * const mxOutput, const mxArray * const mxInputCell)
+{
+  const mwSize num_elements = mxGetNumberOfElements(mxOutput);
+
+  // Deciding the cache size: we want to use maximum 8KB per thread
+  const mwSize buffer_size = 8 * 1024 / sizeof(Type);
+
+  const SIMDUnrolling<Type> simd_4(4);
+
+  typedef typename SIMDUnrolling<Type>::vVvf vVvf;
+
+  // Output pointer
+  Type * output = (Type *)mxGetData(mxOutput);
+
+  // Taking input pointers
+  const mwSize num_cells = mxGetNumberOfElements(mxInputCell);
+  std::vector<const Type *> inputs(num_cells);
+  for (mwIndex icell = 0; icell < num_cells; icell++)
+  {
+    const mxArray * mxCell = mxGetCell(mxInputCell, icell);
+    inputs[icell] = (const Type *)mxGetData(mxCell);
+  }
+
+  const AccessAligned<Type> access;
+  const inner_sum<Type> add_op;
+
+#pragma omp parallel for
+  for (mwIndex ibase = 0; ibase < num_elements; ibase += buffer_size)
+  {
+    const mwSize end_of_buffer = std::min(buffer_size, num_elements - ibase);
+    const mwSize end_of_unrolled_buffer = simd_4.get_unroll(end_of_buffer);
+
+    register Type buffer[end_of_buffer];
+
+    const Type * const first_cell_data = inputs[0] + ibase;
+
+    // First we initialize with the first volume
+    for (mwIndex icache = 0; icache < end_of_unrolled_buffer; icache += simd_4.block)
+    {
+      const Type * const base_cache_cell = first_cell_data + icache;
+      const vVvf & in0 = access.load(base_cache_cell + 0 * simd_4.shift);
+      const vVvf & in1 = access.load(base_cache_cell + 1 * simd_4.shift);
+      const vVvf & in2 = access.load(base_cache_cell + 2 * simd_4.shift);
+      const vVvf & in3 = access.load(base_cache_cell + 3 * simd_4.shift);
+
+      Type * const base_buff = buffer + icache;
+      access.store(base_buff + 0 * simd_4.shift, in0);
+      access.store(base_buff + 1 * simd_4.shift, in1);
+      access.store(base_buff + 2 * simd_4.shift, in2);
+      access.store(base_buff + 3 * simd_4.shift, in3);
+    }
+    for (mwIndex icache = end_of_unrolled_buffer; icache < end_of_buffer; icache++)
+    {
+      buffer[icache] = first_cell_data[icache];
+    }
+
+    // Now we start summing each other volume
+    for (mwIndex icell = 1; icell < num_cells; icell++)
+    {
+      const Type * const cell_data = inputs[icell] + ibase;
+
+      // First we initialize with the first volume
+      for (mwIndex icache = 0; icache < end_of_unrolled_buffer; icache += simd_4.block)
+      {
+        const Type * const base_cache_cell = cell_data + icache;
+        const vVvf & in0_i = access.load(base_cache_cell + 0 * simd_4.shift);
+        const vVvf & in1_i = access.load(base_cache_cell + 1 * simd_4.shift);
+        const vVvf & in2_i = access.load(base_cache_cell + 2 * simd_4.shift);
+        const vVvf & in3_i = access.load(base_cache_cell + 3 * simd_4.shift);
+
+        Type * const base_buff = buffer + icache;
+        const vVvf & in0_b = access.load(base_buff + 0 * simd_4.shift);
+        const vVvf & in1_b = access.load(base_buff + 1 * simd_4.shift);
+        const vVvf & in2_b = access.load(base_buff + 2 * simd_4.shift);
+        const vVvf & in3_b = access.load(base_buff + 3 * simd_4.shift);
+
+        access.store(base_buff + 0 * simd_4.shift, add_op(in0_b, in0_i));
+        access.store(base_buff + 1 * simd_4.shift, add_op(in1_b, in1_i));
+        access.store(base_buff + 2 * simd_4.shift, add_op(in2_b, in2_i));
+        access.store(base_buff + 3 * simd_4.shift, add_op(in3_b, in3_i));
+      }
+      for (mwIndex icache = end_of_unrolled_buffer; icache < end_of_buffer; icache++)
+      {
+        buffer[icache] += cell_data[icache];
+      }
+    }
+
+    Type * const base_output = output + ibase;
+
+    // Finally we save back the result
+    for (mwIndex icache = 0; icache < end_of_unrolled_buffer; icache += simd_4.block)
+    {
+      const Type * const base_buff = buffer + icache;
+      const vVvf & in0 = access.load(base_buff + 0 * simd_4.shift);
+      const vVvf & in1 = access.load(base_buff + 1 * simd_4.shift);
+      const vVvf & in2 = access.load(base_buff + 2 * simd_4.shift);
+      const vVvf & in3 = access.load(base_buff + 3 * simd_4.shift);
+
+      Type * const base_cache_output = base_output + icache;
+      access.store(base_cache_output + 0 * simd_4.shift, in0);
+      access.store(base_cache_output + 1 * simd_4.shift, in1);
+      access.store(base_cache_output + 2 * simd_4.shift, in2);
+      access.store(base_cache_output + 3 * simd_4.shift, in3);
+    }
+    for (mwIndex icache = end_of_unrolled_buffer; icache < end_of_buffer; icache++)
+    {
+      base_output[icache] = buffer[icache];
+    }
+  }
+}
+
+}  // namespace GT_MATHS
+
+#endif /* ZUTIL_CXX_INCLUDE_GTCXXMATHSSUMCELLVOLUMES_H_ */