enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet

#include "cuda_data_cu.h"
#include "cuda_wrapper_cu.h"
#include "cuda_data_kernel.cu"
#include <stdio.h>

// NOTE(review): this file was recovered from a copy in which every span between
// a '<' and a later '>' had been stripped (include target, loop conditions,
// kernel launch heads, and the seam between the first two functions).
// Identifiers that are not visible in the damaged copy are flagged below with
// NOTE(review) and must be confirmed against cuda_data_kernel.cu and
// cuda_wrapper_cu.h.

// Computes the number of elements described by n[0..2] and the launch geometry
// shared by every upload wrapper in this file.
//
//   n        three extents; n[1] and n[2] only contribute when they are > 0
//   grid     out: block counts (x capped at 32000, y grown to cover the rest)
//   threads  out: 32/64/128/256 threads in x depending on the element count
//
// Returns the element count as int (same arithmetic as the original code).
static int CudaData_Upload_LaunchConfig(const unsigned* n, dim3 &grid, dim3 &threads)
{
  int size = n[0];

  if(n[1] > 0) size *= n[1];

  if(n[2] > 0) size *= n[2];

  threads.x = 1;
  threads.y = 1;
  threads.z = 1;
  grid.x = 1;
  grid.y = 1;
  grid.z = 1;

  if(size <= 128 * 30)
    threads.x = 32;
  else if(size <= 256 * 30)
    threads.x = 64;
  else if(size <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  // ceil(size / threads.x)
  grid.x = ((size - 1) + threads.x) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  // NOTE(review): condition tail and body reconstructed; grid.y is the only
  // dimension left to grow once grid.x has been capped.
  while(grid.x * grid.y * threads.x < size) grid.y++;

  return size;
}

// Uploads host doubles into a float device array.
//
//   host_data  source array on the host (read as double)
//   dev_data   destination passed to the kernel as float*
//   n          three extents, see CudaData_Upload_LaunchConfig
//   mode       layout selector forwarded unchanged to the kernel
//   buffer     staging area passed to the kernel as double*; presumably device
//              memory large enough for the whole source array - confirm with caller
//
// After the conversion this function downloads the result again and prints a
// "debugdata: " line; that diagnostic is part of the original code.
void CudaData_Upload_DoubleFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  dim3 grid, threads;
  const int count = CudaData_Upload_LaunchConfig(n, grid, threads);

  // A zero-sized array would produce grid.x == 0, which is not a valid launch.
  if(count <= 0) return;

  // Byte size of the source; the visible "size/2" download below implies the
  // original scaled size by sizeof(double) at this point.
  int size = count * (int) sizeof(double);

  // Scratch for the read-back check, on the heap so large arrays cannot
  // overflow the stack.
  float* debugdata = new float[count];

  // NOTE(review): call reconstructed; argument order mirrors the visible
  // CudaWrapper_DownloadCudaData(host, device, nbytes) call.
  CudaWrapper_UploadCudaData(host_data, buffer, size);
  // NOTE(review): kernel name reconstructed from the wrapper name and casts.
  CudaData_Upload_Kernel_DoubleFloat<<<grid, threads>>>((double*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();

  CudaWrapper_DownloadCudaData(debugdata, dev_data, size / 2);
  double sum = 0;
  printf("debugdata: ");

  // NOTE(review): loop bound, loop body and the final printf were lost in the
  // damaged copy; this accumulates the float-vs-double round-trip difference.
  for(int i = 0; i < count; i++)
    sum += (debugdata[i] - ((double*) host_data)[i]);

  printf("%lf \n", sum);

  delete[] debugdata;
}

// Uploads host doubles into a double device array.
// Parameters as for CudaData_Upload_DoubleFloat; buffer and dev_data are both
// handed to the kernel as double*.
// NOTE(review): the function name and signature fell inside a stripped span;
// they are reconstructed from the visible (double*)/(double*) kernel arguments
// and the signatures of the sibling wrappers.
void CudaData_Upload_DoubleDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  dim3 grid, threads;
  const int count = CudaData_Upload_LaunchConfig(n, grid, threads);

  // A zero-sized array would produce grid.x == 0, which is not a valid launch.
  if(count <= 0) return;

  // NOTE(review): staging upload and kernel name reconstructed.
  CudaWrapper_UploadCudaData(host_data, buffer, count * (int) sizeof(double));
  CudaData_Upload_Kernel_DoubleDouble<<<grid, threads>>>((double*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}

// Uploads host floats into a double device array.
// Parameters as for CudaData_Upload_DoubleFloat; buffer is handed to the kernel
// as float*, dev_data as double*.
void CudaData_Upload_FloatDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  dim3 grid, threads;
  const int count = CudaData_Upload_LaunchConfig(n, grid, threads);

  // A zero-sized array would produce grid.x == 0, which is not a valid launch.
  if(count <= 0) return;

  // NOTE(review): staging upload and kernel name reconstructed.
  CudaWrapper_UploadCudaData(host_data, buffer, count * (int) sizeof(float));
  CudaData_Upload_Kernel_FloatDouble<<<grid, threads>>>((float*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}

// Uploads host floats into a float device array.
// Parameters as for CudaData_Upload_DoubleFloat; buffer and dev_data are both
// handed to the kernel as float*.
void CudaData_Upload_FloatFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  dim3 grid, threads;
  const int count = CudaData_Upload_LaunchConfig(n, grid, threads);

  // A zero-sized array would produce grid.x == 0, which is not a valid launch.
  if(count <= 0) return;

  // NOTE(review): staging upload and kernel name reconstructed.
  CudaWrapper_UploadCudaData(host_data, buffer, count * (int) sizeof(float));
  CudaData_Upload_Kernel_FloatFloat<<<grid, threads>>>((float*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}

// Uploads host ints into an int device array.
// Parameters as for CudaData_Upload_DoubleFloat; buffer and dev_data are both
// handed to the kernel as int*.
void CudaData_Upload_IntInt(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  dim3 grid, threads;
  const int count = CudaData_Upload_LaunchConfig(n, grid, threads);

  // A zero-sized array would produce grid.x == 0, which is not a valid launch.
  if(count <= 0) return;

  // NOTE(review): staging upload and kernel name reconstructed.
  CudaWrapper_UploadCudaData(host_data, buffer, count * (int) sizeof(int));
  CudaData_Upload_Kernel_IntInt<<<grid, threads>>>((int*)buffer, (int*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}

// Download counterpart of the upload wrappers. The body is empty in the
// original as well: calling it does nothing.
void CudaData_Download(void* host_data, void* dev_data, int host_size, int dev_size, unsigned* n, copy_mode mode, void* buffer)
{
}