diff --git a/src/USER-CUDA/atom_vec_atomic_cuda.cpp b/src/USER-CUDA/atom_vec_atomic_cuda.cpp index b6bb14422c..f6b2d76b85 100644 --- a/src/USER-CUDA/atom_vec_atomic_cuda.cpp +++ b/src/USER-CUDA/atom_vec_atomic_cuda.cpp @@ -286,14 +286,14 @@ int AtomVecAtomicCuda::pack_exchange(int dim, double *buf) } if(max_nsend==0) grow_copylist(200); - + int nsend_atoms = Cuda_AtomVecAtomicCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer); if(nsend_atoms>max_nsend) {grow_copylist(nsend_atoms+100);} if(nsend_atoms*NCUDAEXCHANGE>*maxsend) { grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0); - Cuda_AtomVecAtomicCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer); + Cuda_AtomVecAtomicCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer); } int nlocal=atom->nlocal-nsend_atoms; @@ -395,6 +395,7 @@ int AtomVecAtomicCuda::unpack_exchange(double *buf) } } cuda->shared_data.atom.nlocal=nlocal; + if(atom->nlocal!=nlocal) cuda->shared_data.atom.update_nlocal=2; atom->nlocal=nlocal; mfirst+=m; diff --git a/src/USER-CUDA/cuda.cpp b/src/USER-CUDA/cuda.cpp index 819357bc16..438312ee81 100644 --- a/src/USER-CUDA/cuda.cpp +++ b/src/USER-CUDA/cuda.cpp @@ -130,11 +130,11 @@ Cuda::Cuda(LAMMPS *lmp) : Pointers(lmp) downloadtime=0; dotiming=false; - dotestatom = false; - testatom = 0; + dotestatom = false; + testatom = 0; oncpu = true; - self_comm = 0; + self_comm = 0; MYDBG( printf("# CUDA: Cuda::Cuda Done...\n");) //cCudaData } @@ -267,10 +267,10 @@ void Cuda::accelerator(int narg, char** arg) cu_virial = 0; cu_eatom = 0; cu_vatom = 0; - cu_radius = 0; + cu_radius = 0; cu_density = 0; - cu_omega = 0; - cu_torque = 0; + cu_omega = 0; + cu_torque = 0; cu_special = 0; cu_nspecial = 0; @@ -299,8 +299,11 @@ void Cuda::setSharedDataZero() shared_data.atom.q_flag = 0; shared_data.atom.need_eatom = 0; shared_data.atom.need_vatom = 0; + shared_data.atom.update_nmax = 1; + shared_data.atom.update_nlocal = 1; + shared_data.atom.update_neigh = 1; - shared_data.pair.cudable_force = 0; + shared_data.pair.cudable_force = 0; shared_data.pair.collect_forces_later = 0; shared_data.pair.use_block_per_atom = 0; shared_data.pair.override_block_per_atom = -1; @@ -429,14 +432,6 @@ void Cuda::checkResize() if(cu_atom->q_flag) {delete cu_q; cu_q = new cCudaData ((double*)atom->q, & cu_atom->q , atom->nmax );}// cu_q->set_buffer(&(copy_buffer),&(copy_buffersize),true);} -/* - if(force->pair) - if(force->pair->eatom) - {delete cu_eatom; cu_eatom = new cCudaData (force->pair->eatom, & cu_atom->eatom , atom->nmax );}// cu_eatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);} - if(force->pair) - if(force->pair->vatom) - {delete cu_vatom; cu_vatom = new cCudaData ((double*)force->pair->vatom, & cu_atom->vatom , atom->nmax,6 );}// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);} -*/ if(atom->radius) { delete cu_radius; cu_radius = new cCudaData (atom->radius , & cu_atom->radius , atom->nmax ); @@ -444,11 +439,6 @@ void Cuda::checkResize() delete cu_omega_rmass; cu_omega_rmass = new cCudaData (omega_rmass , & cu_atom->omega_rmass , atom->nmax*4); } - /* - if(atom->density) - {delete cu_density; cu_density = new cCudaData (atom->density , & cu_atom->density , atom->nmax );} - */ - if(atom->omega) {delete cu_omega; cu_omega = new cCudaData (((double*) atom->omega) , & cu_atom->omega , atom->nmax,3 );} @@ -464,12 +454,10 @@ void Cuda::checkResize() shared_data.atom.special_flag = neighbor->special_flag; shared_data.atom.molecular = atom->molecular; - cu_atom->update_nmax = 2; - cu_atom->nmax = atom->nmax; + cu_atom->update_nmax = 2; + cu_atom->nmax = atom->nmax; - //delete [] x_type; x_type = new X_FLOAT4[atom->nmax]; delete cu_x_type; cu_x_type = new cCudaData (x_type , & cu_atom->x_type , atom->nmax*4); - // shared_data.buffer_new = 2; } if(((cu_xhold==NULL)||(cu_xhold->get_dim()[0]maxhold))&&neighbor->xhold) @@ -488,6 +476,12 @@ void Cuda::checkResize() { cu_map_array = new cCudaData (atom->get_map_array() , & cu_atom->map_array , atom->get_map_size() ); } + else + if(cu_map_array->dev_size()/sizeof(int)get_map_size()) + { + delete cu_map_array; + cu_map_array = new cCudaData (atom->get_map_array() , & cu_atom->map_array , atom->get_map_size() ); + } } @@ -512,11 +506,6 @@ void Cuda::checkResize() if(atom->radius) if(cu_radius->get_host_data() != atom->radius) cu_radius->set_host_data((double*) (atom->radius)); - /* - if(atom->density) - if(cu_density->get_host_data() != atom->density) cu_density->set_host_data((double*) (atom->density)); - */ - if(atom->omega) if(cu_omega->get_host_data() != atom->omega) cu_omega->set_host_data((double*) (atom->omega)); @@ -558,7 +547,7 @@ void Cuda::evsetup_eatom_vatom(int eflag_atom,int vflag_atom) if(not cu_vatom) cu_vatom = new cCudaData ((double*)force->pair->vatom, & (shared_data.atom.vatom) , atom->nmax ,6 );// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);} cu_vatom->set_host_data((double*)force->pair->vatom); - cu_vatom->memset_device(0); + cu_vatom->memset_device(0); } } @@ -579,16 +568,9 @@ void Cuda::uploadAll() cu_image->upload(); if(shared_data.atom.q_flag) cu_q ->upload(); - //printf("A3\n"); - //if(shared_data.atom.need_eatom) cu_eatom->upload(); - //printf("A4\n"); - //if(shared_data.atom.need_vatom) cu_vatom->upload(); - //printf("A5\n"); - if(atom->rmass) cu_rmass->upload(); if(atom->radius) cu_radius->upload(); - // if(atom->density) cu_density->upload(); if(atom->omega) cu_omega->upload(); if(atom->torque) cu_torque->upload(); if(atom->special) cu_special->upload(); @@ -631,7 +613,6 @@ void Cuda::downloadAll() if(atom->rmass) cu_rmass->download(); if(atom->radius) cu_radius->download(); - // if(atom->density) cu_density->download(); if(atom->omega) cu_omega->download(); if(atom->torque) cu_torque->download(); if(atom->special) cu_special->download(); @@ -747,13 +728,13 @@ void Cuda::setTimingsZero() shared_data.cuda_timings.neigh_special = 0; //PPPM - shared_data.cuda_timings.pppm_particle_map; - shared_data.cuda_timings.pppm_make_rho; - shared_data.cuda_timings.pppm_brick2fft; - shared_data.cuda_timings.pppm_poisson; - shared_data.cuda_timings.pppm_fillbrick; - shared_data.cuda_timings.pppm_fieldforce; - shared_data.cuda_timings.pppm_compute; + shared_data.cuda_timings.pppm_particle_map = 0; + shared_data.cuda_timings.pppm_make_rho = 0; + shared_data.cuda_timings.pppm_brick2fft = 0; + shared_data.cuda_timings.pppm_poisson = 0; + shared_data.cuda_timings.pppm_fillbrick = 0; + shared_data.cuda_timings.pppm_fieldforce = 0; + shared_data.cuda_timings.pppm_compute = 0; CudaWrapper_CheckUploadTime(true); CudaWrapper_CheckDownloadTime(true); @@ -789,8 +770,8 @@ void Cuda::print_timings() printf(" Exchange MPI \t %lf \n",shared_data.cuda_timings.comm_exchange_mpi); printf(" Exchange Kernel Pack \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_pack); printf(" Exchange Kernel Unpack \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_unpack); - printf(" Exchange Kernel Fill \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_fill); - printf(" Exchange CPU Pack \t %lf \n",shared_data.cuda_timings.comm_exchange_cpu_pack); + printf(" Exchange Kernel Fill \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_fill); + printf(" Exchange CPU Pack \t %lf \n",shared_data.cuda_timings.comm_exchange_cpu_pack); printf(" Exchange Upload \t %lf \n",shared_data.cuda_timings.comm_exchange_upload); printf(" Exchange Download \t %lf \n",shared_data.cuda_timings.comm_exchange_download); printf("\n"); diff --git a/src/USER-CUDA/cuda_neigh_list.cpp b/src/USER-CUDA/cuda_neigh_list.cpp index ef9edf5ef3..01f8e0c6a8 100644 --- a/src/USER-CUDA/cuda_neigh_list.cpp +++ b/src/USER-CUDA/cuda_neigh_list.cpp @@ -111,6 +111,7 @@ void CudaNeighList::dev_alloc() neighbors_inner = new int[sneighlist.maxlocal*sneighlist.maxneighbors]; cu_neighbors_inner = new cCudaData (neighbors_inner , & sneighlist.neighbors_inner , sneighlist.maxlocal*sneighlist.maxneighbors ); } + cuda->shared_data.atom.update_neigh=2; MYDBG( printf("# CUDA: CudaNeighList::dev_alloc() ... end\n"); ) } diff --git a/src/USER-CUDA/neigh_full_cuda.cpp b/src/USER-CUDA/neigh_full_cuda.cpp index 61c9897f4a..14fe153ec9 100644 --- a/src/USER-CUDA/neigh_full_cuda.cpp +++ b/src/USER-CUDA/neigh_full_cuda.cpp @@ -250,6 +250,7 @@ void NeighborCuda::full_bin_cuda(NeighList *list) }*/ list->cuda_list->cu_numneigh->download(); list->cuda_list->cu_ilist->download(); + cuda->shared_data.atom.update_neigh=2; //printf("Done\n"); MYDBG(printf(" # CUDA::NeighFullBinCuda ... end\n");) diff --git a/src/USER-CUDA/verlet_cuda.cpp b/src/USER-CUDA/verlet_cuda.cpp index fbaa1800a5..0a3ba3ff40 100644 --- a/src/USER-CUDA/verlet_cuda.cpp +++ b/src/USER-CUDA/verlet_cuda.cpp @@ -564,6 +564,7 @@ void VerletCuda::run(int n) cuda->shared_data.atom.reneigh_flag=0; cuda->shared_data.atom.update_nlocal=1; cuda->shared_data.atom.update_nmax=1; + cuda->shared_data.atom.update_neigh=1; cuda->shared_data.domain.update=1; cuda->shared_data.buffer_new=1; cuda->uploadtime=0; @@ -627,14 +628,12 @@ void VerletCuda::run(int n) //start force calculation asynchronus cuda->shared_data.comm.comm_phase=1; - // printf("Pre Force Compute\n"); force->pair->compute(eflag, vflag); timer->stamp(TIME_PAIR); //CudaWrapper_Sync(); //download comm buffers from GPU, perform MPI communication and upload buffers again clock_gettime(CLOCK_REALTIME,&starttime); - // printf("Pre forward_comm(2)\n"); comm->forward_comm(2); clock_gettime(CLOCK_REALTIME,&endtime); cuda->shared_data.cuda_timings.comm_forward_total+= @@ -642,16 +641,13 @@ void VerletCuda::run(int n) timer->stamp(TIME_COMM); //wait for force calculation - //printf("Pre Synch\n"); CudaWrapper_Sync(); timer->stamp(TIME_PAIR); //unpack communication buffers clock_gettime(CLOCK_REALTIME,&starttime); - // printf("Pre forward_comm(3)\n"); comm->forward_comm(3); clock_gettime(CLOCK_REALTIME,&endtime); - // printf("Post forward_comm(3)\n"); cuda->shared_data.cuda_timings.comm_forward_total+= endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; @@ -663,11 +659,9 @@ void VerletCuda::run(int n) else { //perform standard forward communication - //printf("Forward_comm\n"); clock_gettime(CLOCK_REALTIME,&starttime); comm->forward_comm(); clock_gettime(CLOCK_REALTIME,&endtime); - //printf("Forward_comm_done\n"); cuda->shared_data.cuda_timings.comm_forward_total+= endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; timer->stamp(TIME_COMM); @@ -677,13 +671,13 @@ void VerletCuda::run(int n) else { int nlocalold=cuda->shared_data.atom.nlocal; - //if(firstreneigh) + if(firstreneigh) { cuda->shared_data.atom.update_nlocal=1; - cuda->shared_data.atom.update_nmax=1; + cuda->shared_data.atom.update_nmax=1; firstreneigh=0; } - cuda->shared_data.buffer_new=1; + cuda->shared_data.buffer_new=1; MYDBG( printf("# CUDA VerletCuda::iterate: neighbor\n"); ) cuda->setDomainParams(); if(n_pre_exchange) modify->pre_exchange(); @@ -759,10 +753,10 @@ void VerletCuda::run(int n) cuda->shared_data.cuda_timings.test2+= endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; - //rebuild neighbor list - test_atom(testatom,"Pre Neighbor"); + //rebuild neighbor list + test_atom(testatom,"Pre Neighbor"); neighbor->build(); - timer->stamp(TIME_NEIGHBOR); + timer->stamp(TIME_NEIGHBOR); MYDBG( printf("# CUDA VerletCuda::iterate: neighbor done\n"); ) //if bonded interactions are used (in this case collect_forces_later is true), transfer data which only changes upon exchange/border routines from GPU to CPU @@ -772,7 +766,7 @@ void VerletCuda::run(int n) cuda->cu_tag->download(); cuda->cu_type->download(); cuda->cu_mask->download(); - if(cuda->cu_q) cuda->cu_q->download(); + if(cuda->cu_q) cuda->cu_q->download(); } cuda->shared_data.comm.comm_phase=3; } @@ -969,14 +963,16 @@ void VerletCuda::run(int n) test_atom(testatom,"post output"); if(cuda->shared_data.atom.update_nlocal>0) - cuda->shared_data.atom.update_nlocal--; - if(cuda->shared_data.atom.update_nmax>0) - cuda->shared_data.atom.update_nmax--; - if(cuda->shared_data.domain.update>0) + cuda->shared_data.atom.update_nlocal--; + if(cuda->shared_data.atom.update_nmax>0) + cuda->shared_data.atom.update_nmax--; + if(cuda->shared_data.atom.update_neigh>0) + cuda->shared_data.atom.update_neigh--; + if(cuda->shared_data.domain.update>0) cuda->shared_data.domain.update--; - if(cuda->shared_data.buffer_new>0) + if(cuda->shared_data.buffer_new>0) cuda->shared_data.buffer_new--; - cuda->shared_data.atom.reneigh_flag=0; + cuda->shared_data.atom.reneigh_flag=0; } @@ -984,6 +980,7 @@ void VerletCuda::run(int n) cuda->downloadAllNeighborLists(); cuda->shared_data.atom.update_nlocal=1; cuda->shared_data.atom.update_nmax=1; + cuda->shared_data.atom.update_neigh=1; cuda->shared_data.buffer_new=1; cuda->shared_data.domain.update=1; cuda->oncpu = true;