/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

//#define CUDA_PRECISION 1
#include "cuda_precision.h"
#include "cuda_common.h"

// Complex value as stored in the FFT buffers: two FFT_FLOAT scalars laid out
// back to back, which is why the wrappers below may reinterpret an FFT_DATA*
// as an FFT_FLOAT*.
struct FFT_DATA {
  FFT_FLOAT re;
  FFT_FLOAT im;
};

#include "fft3d_cuda_cu.h"
#include "fft3d_cuda_kernel.cu"
#include <stdio.h>  // printf, used by the MYDBG error reports

// NOTE(review): every wrapper in this file synchronizes with
// cudaThreadSynchronize(), which the CUDA runtime documents as deprecated in
// favour of cudaDeviceSynchronize(). It is kept here so the file still builds
// with the toolkit versions this package targets -- confirm before switching.
//
// NOTE(review): the launch configurations use one thread per element (or per
// scalar) along the fast axis in a single block, so the launch fails when
// that count exceeds the device's per-block thread limit. In non-debug builds
// nothing reports the failure -- presumably MYDBG compiles away; confirm.

/* Launches initfftdata_kernel on (in, out).
 *
 *   in, out  pointers handed to the kernel unchanged
 *   nfast    threads per block (block = nfast x 1 x 1)
 *   nmid     grid.y
 *   nslow    grid.x
 *
 * The device is synchronized both before and after the launch. In debug
 * builds (MYDBG) the last CUDA error string is printed afterwards.
 */
void initfftdata(double* in, FFT_FLOAT* out, int nfast, int nmid, int nslow)
{
  dim3 grid(nslow, nmid, 1);
  dim3 threads(nfast, 1, 1);

  cudaThreadSynchronize();
  initfftdata_kernel <<< grid, threads, 0>>>(in, out);
  cudaThreadSynchronize();
  MYDBG(printf("ERROR-CUDA initfftdata_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
}

/* Launches permute_kernel on (in, out), both reinterpreted as FFT_FLOAT*.
 *
 *   nfast    block = (2 * nfast) x 1 x 1 -- presumably one thread per
 *            re/im scalar along the fast axis
 *   nmid     grid.y
 *   nslow    grid.x
 *
 * Synchronizes after the launch; in debug builds (MYDBG) prints the last
 * CUDA error string.
 */
void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
{
  dim3 grid(nslow, nmid, 1);
  dim3 threads(nfast * 2, 1, 1);

  permute_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out);
  cudaThreadSynchronize();
  MYDBG(printf("ERROR-CUDA permute_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
}

/* Same launch as permute(): permute_kernel with grid (nslow, nmid, 1) and
 * block (2 * nfast, 1, 1), followed by a device synchronization.
 *
 * NOTE(review): despite the name, no scaling is visible in this wrapper --
 * it launches the plain permute_kernel. Confirm whether a dedicated scaling
 * kernel was intended.
 *
 * In debug builds (MYDBG) the last CUDA error string is printed, matching
 * the other wrappers.
 */
void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
{
  dim3 grid(nslow, nmid, 1);
  dim3 threads(nfast * 2, 1, 1);

  permute_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out);
  cudaThreadSynchronize();
  MYDBG(printf("ERROR-CUDA permute_scale (permute_kernel): %s\n", cudaGetErrorString(cudaGetLastError())));
}

/* Launches permute_part_kernel over an inclusive index sub-range.
 *
 *   in, out              buffers, reinterpreted as FFT_FLOAT*
 *   nfast, nmid, nslow   forwarded to the kernel unchanged
 *   ihi, ilo             grid.x = ihi - ilo + 1
 *   jhi, jlo             grid.y = jhi - jlo + 1
 *   khi, klo             block.x = 2 * (khi - klo + 1)
 *
 * Note the argument order: each upper bound precedes its lower bound.
 * Synchronizes after the launch; in debug builds (MYDBG) prints the last
 * CUDA error string.
 */
void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
{
  dim3 grid(ihi - ilo + 1, jhi - jlo + 1, 1);
  dim3 threads((khi - klo + 1) * 2, 1, 1);

  permute_part_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out, nfast, nmid, nslow, ihi, ilo, jhi, jlo, khi, klo);
  cudaThreadSynchronize();
  MYDBG(printf("ERROR-CUDA permute_part_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
}

/* Host-side barrier: returns once cudaThreadSynchronize() returns. */
void FFTsyncthreads()
{
  cudaThreadSynchronize();
}