Commit 466842d1 authored by Pierre Paleo's avatar Pierre Paleo
Browse files

Some clean-up

parent fbf9b946
......@@ -91,152 +91,17 @@ __global__ void backproj(
}
d_slice[y*(n_x) + x] = sum1 * scale_factor;
if (Gx + x < n_x)
d_slice[y*(n_x) + Gx + x] = sum2 * scale_factor;
if (Gy + y < n_y) {
d_slice[(y+Gy)*(n_x) + x] = sum3 * scale_factor;
if (Gx + x < n_x)
d_slice[(y+Gy)*(n_x) + Gx + x] = sum4 * scale_factor;
}
}
/**
*
* Old stuff / tinkering
*
*
**/
/*
#define BLOCK_SIZE 512
#define BLOCK_SIZ2 BLOCK_SIZE*2
__global__ void backproj_cust(
float* d_slice,
int num_projs,
int num_bins,
float axis_position,
int n_x,
int n_y,
float scale_factor
)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if ((x >= n_x) || (y >= n_y)) return;
int xr = (x - axis_position), yr = (y - axis_position);
float sum = 0.0f;
for (int proj = 0; proj < num_projs; proj++) {
// h = axis_pos + x*cos - y*sin
// = (cos, -sin) .* (x, y) + axis_pos
#if USE_F2 == 1
float2 cos_msin = tex2D(tex_sincos_lut, proj + 0.5f, 0.5f);
float h = cos_msin.x * xr + cos_msin.y * yr + axis_position; // TODO dot() ?
//~ float h = __cosf(3.141592f/500*proj) * xr - __sinf(3.141592f/500*proj) * yr + axis_position; // fastest (3x with __intrinsics)
//~ float h = xr*0.1f - yr*0.2f + axis_position;
#else
//~ float pcos = tex2D(tex_cos_lut, proj + 0.5f, 0.5f);
//~ float pmsin = tex2D(tex_msin_lut, proj + 0.5f, 0.5f);
//~ float h = pcos * xr + pmsin * yr + axis_position; // TODO dot() ?
float h = cosf(3.141592f/500*proj) * xr - sinf(3.141592f/500*proj) * yr + axis_position;
#endif
// if(h >= 0 && h < num_bins)
sum += tex2D(tex_projections, h + 0.5f, proj + 0.5f);
if (Gx + x < n_x)
d_slice[(y+Gy)*(n_x) + Gx + x] = sum4 * scale_factor;
}
d_slice[y*num_bins + x] = sum * scale_factor;
}
__global__ void backproj_cust2(
float* d_slice,
int num_projs,
int num_bins,
float axis_position,
int n_x,
int n_y,
float* d_cos,
float* d_msin,
float scale_factor
)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if ((x >= n_x) || (y >= n_y)) return;
float xr = (x - axis_position), yr = (y - axis_position);
volatile __shared__ float shared_mem[BLOCK_SIZ2];
volatile float* s_cos = shared_mem;
volatile float* s_msin = shared_mem + BLOCK_SIZE;
// Fetch "blockDim.x" values to shared memory
int tid = threadIdx.x;
s_cos[tid] = d_cos[tid];
s_msin[tid] = d_msin[tid];
//
//
int next_fetch = BLOCK_SIZE;
__syncthreads();
// ------------
float sum = 0.0f;
int proj = 0, proj_loc = 0;
for (proj = 0, proj_loc = 0; proj < num_projs; proj++, proj_loc++) {
//~ if (proj == next_fetch) {
//~ if (tid + next_fetch < num_projs) {
//~ s_cos[tid] = d_cos[next_fetch + tid];
//~ s_msin[tid] = d_msin[next_fetch + tid];
//~ __syncthreads();
//~ }
//~ next_fetch += BLOCK_SIZE;
//~ proj_loc = 0;
//~ }
// h = axis_pos + x*cos - y*sin
// = (cos, -sin) .* (x, y) + axis_pos
//~ float h = s_cos[proj_loc] * xr + s_msin[proj_loc] * yr + axis_position; // TODO dot() ?
float h =
fmaf(s_cos[proj_loc], xr, axis_position) +
fmaf(s_msin[proj_loc], yr, 0.0f);
//~ if (h >= 0 && h < num_bins)
sum += tex2D(tex_projections, h + 0.5f, proj + 0.5f);
}
d_slice[y*n_x + x] = sum * scale_factor;
//~ d_slice[y*n_x + x] = d_cos[min(x, 500)];
}
// les threads font trop d'i/O par rapport au calcul
// hst_backproj fait travailler les threads "4x plus" => meilleure occupancy
// chaque thread ne traite pas un seul pixel (x, y) de l'image, il traite (x +- 1, y +- 1)
*/
......@@ -74,8 +74,8 @@ class Backprojector(CudaProcessing):
def compute_angles(self):
self.h_cos[0] = np.cos(self.angles).astype("f")
self.h_msin[0] = (-np.sin(self.angles)).astype("f")
self._d_msin = garray.to_gpu(self.h_msin[0]) ###
self._d_cos = garray.to_gpu(self.h_cos[0]) ###
self._d_msin = garray.to_gpu(self.h_msin[0])
self._d_cos = garray.to_gpu(self.h_cos[0])
def compile_kernels(self):
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment