/*
 * Per-thread body shared by all CudaData_Upload_Kernel_* kernels.
 *
 * Each thread handles one flat element index
 *   i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x
 * i.e. the kernels expect a 2D grid of 1D thread blocks. Threads whose index
 * is >= the element count return without touching memory.
 *
 * The element count is nx, multiplied by ny and nz when those are non-zero,
 * so a dimension of 0 means "unused".
 *
 * Copy modes (mode x is treated as xx):
 *   xx, xy, xyz : dev_data[i] = buffer[i] (layout preserved)
 *   yx          : buffer is read as [nx][ny], written as [ny][nx] (transpose)
 *   xzy         : buffer is read as [nx][ny][nz], written as [nx][nz][ny]
 *                 (last two axes swapped)
 * Any other mode value copies nothing.
 *
 * Preconditions: mode yx divides by ny and mode xzy divides by ny * nz and nz,
 * so those dimensions must be non-zero for these modes.
 *
 * Every case ends with an explicit break. Without it a request for xx would
 * fall through into the yx / xyz / xzy cases, overwriting the result and
 * dividing by dimensions that may be zero.
 *
 * buffer   source array, dereferenced on the device
 * dev_data destination array, dereferenced on the device
 * nx,ny,nz logical extents of the data (0 = dimension unused for ny / nz)
 * mode     layout conversion to apply
 */
template <typename SrcT, typename DstT>
static __device__ __forceinline__ void CudaData_Upload_CopyElement(
    const SrcT* buffer, DstT* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx;

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  const unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  // The grid may contain more threads than elements.
  if(i >= length) return;

  switch(mode) {
    case xx:
    case xy:
    case xyz:
      // Source and destination share the same layout: straight element copy.
      dev_data[i] = static_cast<DstT>(buffer[i]);
      break;

    case yx: {
      // Decompose i into (j, k) of the [nx][ny] source and write transposed.
      const unsigned j = i / ny;
      const unsigned k = i % ny;
      dev_data[k * nx + j] = static_cast<DstT>(buffer[j * ny + k]);
      break;
    }

    case xzy: {
      // Decompose i into (j, k, l) of the [nx][ny][nz] source and swap the
      // two trailing axes on write.
      const unsigned plane = ny * nz;
      const unsigned j = i / plane;
      const unsigned k = (i % plane) / nz;
      const unsigned l = i % nz;
      dev_data[j * plane + l * ny + k] = static_cast<DstT>(buffer[j * plane + k * nz + l]);
      break;
    }

    default:
      // Unknown mode: nothing to copy.
      break;
  }
}

/*
 * Copies (and optionally re-lays-out) a double source buffer into a float
 * destination, one element per thread. Values are narrowed from double to
 * float. See CudaData_Upload_CopyElement for launch layout, modes and
 * preconditions.
 */
__global__ void CudaData_Upload_Kernel_DoubleFloat(double* buffer, float* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  CudaData_Upload_CopyElement<double, float>(buffer, dev_data, nx, ny, nz, mode);
}

/*
 * Copies (and optionally re-lays-out) a double source buffer into a double
 * destination, one element per thread. See CudaData_Upload_CopyElement for
 * launch layout, modes and preconditions.
 */
__global__ void CudaData_Upload_Kernel_DoubleDouble(double* buffer, double* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  CudaData_Upload_CopyElement<double, double>(buffer, dev_data, nx, ny, nz, mode);
}

/*
 * Copies (and optionally re-lays-out) a float source buffer into a double
 * destination, one element per thread. Values are widened from float to
 * double. See CudaData_Upload_CopyElement for launch layout, modes and
 * preconditions.
 */
__global__ void CudaData_Upload_Kernel_FloatDouble(float* buffer, double* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  CudaData_Upload_CopyElement<float, double>(buffer, dev_data, nx, ny, nz, mode);
}

/*
 * Copies (and optionally re-lays-out) a float source buffer into a float
 * destination, one element per thread. See CudaData_Upload_CopyElement for
 * launch layout, modes and preconditions.
 */
__global__ void CudaData_Upload_Kernel_FloatFloat(float* buffer, float* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  CudaData_Upload_CopyElement<float, float>(buffer, dev_data, nx, ny, nz, mode);
}

/*
 * Copies (and optionally re-lays-out) an int source buffer into an int
 * destination, one element per thread. See CudaData_Upload_CopyElement for
 * launch layout, modes and preconditions.
 */
__global__ void CudaData_Upload_Kernel_IntInt(int* buffer, int* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  CudaData_Upload_CopyElement<int, int>(buffer, dev_data, nx, ny, nz, mode);
}