/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include "cuda_shared.h"
#include "cuda_common.h"
#include "cuda_wrapper_cu.h"
#include "cuda_wrapper_kernel.cu"

// bookkeeping for device memory usage and host<->device transfer timings
static int CudaWrapper_total_gpu_mem = 0;
static double CudaWrapper_total_upload_time = 0;
static double CudaWrapper_total_download_time = 0;
static double CudaWrapper_cpubuffer_upload_time = 0;
static double CudaWrapper_cpubuffer_download_time = 0;
static cudaStream_t* streams;
static int nstreams = 0;

void CudaWrapper_Init(int argc, char** argv, int me, int ppn, int* devicelist)
{
  MYDBG(printf("# CUDA: debug mode on\n");)

#if __DEVICE_EMULATION__

  printf("# CUDA: emulation mode on\n");

#else

  // modified from cutil.h
  static int deviceCount = 0;
  static bool sharedmode = false;

  if(deviceCount && !sharedmode) return;

  if(deviceCount && sharedmode) cudaThreadExit();

  CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount));

  if(deviceCount == 0) {
    fprintf(stderr, "cutil error: no devices supporting CUDA.\n");
    exit(EXIT_FAILURE);
  }

  MYDBG(printf("# CUDA There are %i devices supporting CUDA in this system.\n", deviceCount);)

  // query the properties of every device and build a device list sorted by
  // multiprocessor count, so the fastest GPUs are preferred
  cudaDeviceProp deviceProp[deviceCount];

  for(int i = 0; i < deviceCount; i++)
    CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&(deviceProp[i]), i));

  int dev_list[deviceCount];

  for(int i = 0; i < deviceCount; i++) dev_list[i] = i;

  for(int i = 0; i < deviceCount; i++)
    for(int j = 0; j < deviceCount - 1 - i; j++)
      if(deviceProp[dev_list[j]].multiProcessorCount < deviceProp[dev_list[j + 1]].multiProcessorCount) {
        int tmp = dev_list[j];
        dev_list[j] = dev_list[j + 1];
        dev_list[j + 1] = tmp;
      }

  // if any device runs in default (shared) compute mode, each MPI rank has to
  // pick its GPU explicitly; otherwise the driver assigns exclusive devices
  for(int i = 0; i < deviceCount; i++)
    if(deviceProp[dev_list[i]].computeMode == cudaComputeModeDefault) sharedmode = true;

  if(sharedmode) {
    if(ppn && (me % ppn + 1) > deviceCount) {
      printf("Asking for more GPUs per node than there are. Reduce the gpu/node setting.\n");
      exit(0);
    }

    int devicea = me % ppn;

    if(devicelist) devicea = devicelist[devicea];
    else devicea = dev_list[devicea];

    if(devicea >= deviceCount) {
      printf("Asking for non-existent GPU %i. Found only %i GPUs.\n", devicea, deviceCount);
      exit(0);
    }
    MYDBG(printf(" # CUDA myid: %i take device: %i\n", me, devicea);)
    CUDA_SAFE_CALL(cudaSetDevice(devicea));
  } else {
    CUDA_SAFE_CALL(cudaSetValidDevices(dev_list, deviceCount));
  }

  cudaThreadSynchronize();

  int dev;
  CUDA_SAFE_CALL(cudaGetDevice(&dev));

  if(deviceProp[dev].major < 1) {
    fprintf(stderr, "CUDA error: device does not support CUDA.\n");
    exit(EXIT_FAILURE);
  } else if((deviceProp[dev].major == 1) && (deviceProp[dev].minor != 3)) {
    fprintf(stderr, "CUDA error: You need a device with compute capability 1.3 or higher (Device %i is a %s with CC %i.%i)\n",
            dev, deviceProp[dev].name, deviceProp[dev].major, deviceProp[dev].minor);
    exit(EXIT_FAILURE);
  }

  if((deviceProp[dev].major == 2) && (CUDA_ARCH < 20)) {
    fprintf(stderr, "CUDA warning: You are using a compute %i.%i or higher GPU while LAMMPScuda has been compiled for architecture 1.3\n",
            deviceProp[dev].major, deviceProp[dev].minor);
  }

  if((deviceProp[dev].major == 1) && (CUDA_ARCH >= 20)) {
    fprintf(stderr, "CUDA error: You are using a compute 1.3 GPU while LAMMPScuda has been compiled for architecture %i\n", CUDA_ARCH);
    exit(EXIT_FAILURE);
  }

  fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);
  MYDBG(fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);)

  MYDBG(
    printf("name = %s\n", deviceProp[dev].name);
    printf("totalGlobalMem = %u\n", deviceProp[dev].totalGlobalMem);
    printf("sharedMemPerBlock = %i\n", deviceProp[dev].sharedMemPerBlock);
    printf("regsPerBlock = %i\n", deviceProp[dev].regsPerBlock);
    printf("warpSize = %i\n", deviceProp[dev].warpSize);
    printf("memPitch = %i\n", deviceProp[dev].memPitch);
    printf("maxThreadsPerBlock = %i\n", deviceProp[dev].maxThreadsPerBlock);
    printf("maxThreadsDim = [%i, %i, %i]\n", deviceProp[dev].maxThreadsDim[0], deviceProp[dev].maxThreadsDim[1], deviceProp[dev].maxThreadsDim[2]);
    printf("maxGridSize = [%i, %i, %i]\n", deviceProp[dev].maxGridSize[0], deviceProp[dev].maxGridSize[1], deviceProp[dev].maxGridSize[2]);
    printf("totalConstMem = %i\n", deviceProp[dev].totalConstMem);
    printf("major . minor = %i . %i\n", deviceProp[dev].major, deviceProp[dev].minor);
    printf("clockRate = %i\n", deviceProp[dev].clockRate);
    printf("textureAlignment = %i\n", deviceProp[dev].textureAlignment);
    printf("deviceOverlap = %i\n", deviceProp[dev].deviceOverlap);
    printf("multiProcessorCount = %i\n", deviceProp[dev].multiProcessorCount);
    printf("computeMode = %i\n", deviceProp[dev].computeMode);
  )

#endif
}

void* CudaWrapper_AllocCudaData(unsigned nbytes)
{
  void* dev_data;
  CUDA_SAFE_CALL(cudaMalloc((void**)&dev_data, nbytes));
  MYDBG(printf("# CUDA: allocated %u bytes on device at dev%p\n", nbytes, dev_data);)
  CudaWrapper_total_gpu_mem += nbytes;
  return dev_data;
}

void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes)
{
  MYDBG(printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data, host_data);)
  cudaThreadSynchronize();

  // time the blocking host-to-device copy
  timespec time1, time2;
  clock_gettime(CLOCK_REALTIME, &time1);
  CUDA_SAFE_CALL(cudaMemcpy(dev_data, host_data, nbytes, cudaMemcpyHostToDevice));
  clock_gettime(CLOCK_REALTIME, &time2);
  CudaWrapper_total_upload_time +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
}

void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream)
{
  MYDBG(printf("# CUDA: uploading %u bytes to device at dev%p\n", nbytes, dev_data);)
  cudaMemcpyAsync(dev_data, host_data, nbytes, cudaMemcpyHostToDevice, streams[stream]);
}

void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes)
{
  MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
  cudaThreadSynchronize();

  // time the blocking device-to-host copy
  timespec time1, time2;
  clock_gettime(CLOCK_REALTIME, &time1);
  CUDA_SAFE_CALL(cudaMemcpy(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost));
  clock_gettime(CLOCK_REALTIME, &time2);
  CudaWrapper_total_download_time +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
}

void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream)
{
  MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
  cudaMemcpyAsync(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost, streams[stream]);
}

void CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes)
{
  MYDBG(printf("# CUDA: freeing memory at dev%p with %i bytes (last address: %p)\n", dev_data, nbytes, (char*)dev_data + nbytes);)
  CUDA_SAFE_CALL(cudaFree(dev_data));
  CudaWrapper_total_gpu_mem -= nbytes;
}

void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes)
{
  MYDBG(printf("# CUDA: setting %u bytes to %i at dev%p\n", nbytes, value, dev_data);)
  CUDA_SAFE_CALL(cudaMemset(dev_data, value, nbytes));
}

void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes)
{
  MYDBG(printf("# CUDA: copy %u bytes from dev%p to dev%p\n", nbytes, dev_source, dev_dest);)
  CUDA_SAFE_CALL(cudaMemcpy(dev_dest, dev_source, nbytes, cudaMemcpyDeviceToDevice));
}

void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped, bool writeCombined)
{
  void* host_data;
  int flags = 0;

  if(mapped) flags = flags | cudaHostAllocMapped;

  if(writeCombined) flags = flags | cudaHostAllocWriteCombined;

  CUDA_SAFE_CALL(cudaHostAlloc((void**)&host_data, nbytes, flags));
  // CUDA_SAFE_CALL( cudaMallocHost((void**)&host_data, nbytes) );
  MYDBG(printf("# CUDA: allocated %u bytes pinned memory on host at %p\n", nbytes, host_data);)
  return host_data;
}

void CudaWrapper_FreePinnedHostData(void* host_data)
{
  MYDBG(printf("# CUDA: freeing pinned host memory at %p\n", host_data);)

  if(host_data)
    CUDA_SAFE_CALL(cudaFreeHost(host_data));
}
void cuda_check_error(char* comment)
{
  printf("ERROR-CUDA %s %s\n", comment, cudaGetErrorString(cudaGetLastError()));
}

int CudaWrapper_CheckMemUseage()
{
  size_t free, total;
  cudaMemGetInfo(&free, &total);
  return total - free; // possible with CUDA 3.0?
  //return CudaWrapper_total_gpu_mem;
}

double CudaWrapper_CheckUploadTime(bool reset)
{
  if(reset) CudaWrapper_total_upload_time = 0.0;

  return CudaWrapper_total_upload_time;
}

double CudaWrapper_CheckDownloadTime(bool reset)
{
  if(reset) CudaWrapper_total_download_time = 0.0;

  return CudaWrapper_total_download_time;
}

double CudaWrapper_CheckCPUBufUploadTime(bool reset)
{
  if(reset) CudaWrapper_cpubuffer_upload_time = 0.0;

  return CudaWrapper_cpubuffer_upload_time;
}

double CudaWrapper_CheckCPUBufDownloadTime(bool reset)
{
  if(reset) CudaWrapper_cpubuffer_download_time = 0.0;

  return CudaWrapper_cpubuffer_download_time;
}

void CudaWrapper_AddCPUBufUploadTime(double dt)
{
  CudaWrapper_cpubuffer_upload_time += dt;
}

void CudaWrapper_AddCPUBufDownloadTime(double dt)
{
  CudaWrapper_cpubuffer_download_time += dt;
}

void CudaWrapper_Sync()
{
  cudaThreadSynchronize();
}

void CudaWrapper_SyncStream(int stream)
{
  cudaStreamSynchronize(streams[stream]);
}

void CudaWrapper_AddStreams(int n)
{
  // grow the stream array by n newly created streams, keeping the existing ones
  cudaStream_t* new_streams = new cudaStream_t[nstreams + n];

  for(int i = 0; i < nstreams; i++) new_streams[i] = streams[i];

  for(int i = nstreams; i < nstreams + n; i++) cudaStreamCreate(&new_streams[i]);

  if(nstreams > 0)
    delete [] streams;

  streams = new_streams;
  nstreams += n;
}

void* CudaWrapper_returnStreams()
{
  return (void*) streams;
}

int CudaWrapper_returnNStreams()
{
  return nstreams;
}
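/* ----------------------------------------------------------------------
   Minimal usage sketch of the wrappers defined above, assuming only the
   declarations from cuda_wrapper_cu.h. The function name example_roundtrip
   and the buffer size are illustrative, not part of the package:

   void example_roundtrip(int me, int ppn)
   {
     const unsigned nbytes = 1024 * sizeof(double);

     CudaWrapper_Init(0, NULL, me, ppn, NULL);                 // select a GPU for this MPI rank

     double* host_buf = (double*) CudaWrapper_AllocPinnedHostData(nbytes, false, false);
     void*   dev_buf  = CudaWrapper_AllocCudaData(nbytes);

     for(unsigned i = 0; i < 1024; i++) host_buf[i] = 1.0 * i;

     CudaWrapper_UploadCudaData(host_buf, dev_buf, nbytes);    // host -> device (timed)
     CudaWrapper_Memset(dev_buf, 0, nbytes);                   // clear the device buffer
     CudaWrapper_DownloadCudaData(host_buf, dev_buf, nbytes);  // device -> host (timed)

     printf("upload: %g s  download: %g s  device mem in use: %i bytes\n",
            CudaWrapper_CheckUploadTime(false),
            CudaWrapper_CheckDownloadTime(false),
            CudaWrapper_CheckMemUseage());

     CudaWrapper_FreeCudaData(dev_buf, nbytes);
     CudaWrapper_FreePinnedHostData(host_buf);
   }
------------------------------------------------------------------------- */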