diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index 6f0c7c8433..1b2900f97f 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -515,8 +515,8 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
       const numtyp4 pol3j = polar3[j];
       numtyp qkyz = pol3j.x; // rpole[j][9];
       numtyp qkzz = pol3j.y; // rpole[j][12];
-      int jtype = pol3j.z; // amtype[j];
-      int jgroup =  pol3j.w; // amgroup[j];
+      //int jtype = pol3j.z; // amtype[j];
+      //int jgroup =  pol3j.w; // amgroup[j];
 
       const numtyp4 sp_pol = sp_amoeba[sbmask15(jextra)];
       numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)];
@@ -546,18 +546,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
       numtyp dirx = diy*zr - diz*yr;
       numtyp diry = diz*xr - dix*zr;
       numtyp dirz = dix*yr - diy*xr;
-      numtyp dkrx = dky*zr - dkz*yr;
-      numtyp dkry = dkz*xr - dkx*zr;
-      numtyp dkrz = dkx*yr - dky*xr;
       numtyp dikx = diy*dkz - diz*dky;
       numtyp diky = diz*dkx - dix*dkz;
       numtyp dikz = dix*dky - diy*dkx;
       numtyp qirx = qiz*yr - qiy*zr;
       numtyp qiry = qix*zr - qiz*xr;
       numtyp qirz = qiy*xr - qix*yr;
-      numtyp qkrx = qkz*yr - qky*zr;
-      numtyp qkry = qkx*zr - qkz*xr;
-      numtyp qkrz = qky*xr - qkx*yr;
       numtyp qikx = qky*qiz - qkz*qiy;
       numtyp qiky = qkz*qix - qkx*qiz;
       numtyp qikz = qkx*qiy - qky*qix;
@@ -570,18 +564,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
       numtyp qikrx = qizk*yr - qiyk*zr;
       numtyp qikry = qixk*zr - qizk*xr;
       numtyp qikrz = qiyk*xr - qixk*yr;
-      numtyp qkirx = qkzi*yr - qkyi*zr;
-      numtyp qkiry = qkxi*zr - qkzi*xr;
-      numtyp qkirz = qkyi*xr - qkxi*yr;
       numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz;
       numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz;
       numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz;
       numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz;
       numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz;
       numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz;
-      numtyp diqkrx = diqkz*yr - diqky*zr;
-      numtyp diqkry = diqkx*zr - diqkz*xr;
-      numtyp diqkrz = diqky*xr - diqkx*yr;
       numtyp dkqirx = dkqiz*yr - dkqiy*zr;
       numtyp dkqiry = dkqix*zr - dkqiz*xr;
       numtyp dkqirz = dkqiy*xr - dkqix*yr;
@@ -735,7 +723,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
   atom_info(t_per_atom,ii,tid,offset);
 
   int n_stride;
-  local_allocate_store_charge();
+  //local_allocate_store_charge();
 
   acctyp _fieldp[6];
   for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;
@@ -751,8 +739,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
               n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //numtyp qtmp; fetch(qtmp,i,q_tex);
-    //int itype=ix.w;
 
     // recalculate numj and nbor_end for use of the short nbor list
     if (dev_packed==dev_nbor) {
@@ -762,21 +748,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
       nbor_mem = dev_short_nbor;
     }
 
-    //numtyp bn[4],bcn[3];
-    //numtyp fid[3],fip[3];
-
-    const numtyp4 pol1i = polar1[i];
-    numtyp dix = pol1i.y;    // rpole[i][1];
-    numtyp diy = pol1i.z;    // rpole[i][2];
-    numtyp diz = pol1i.w;    // rpole[i][3];
-    const numtyp4 pol2i = polar2[i];
-    numtyp qixx = pol2i.x;   // rpole[i][4];
-    numtyp qixy = pol2i.y;   // rpole[i][5];
-    numtyp qixz = pol2i.z;   // rpole[i][6];
-    numtyp qiyy = pol2i.w;   // rpole[i][8];
     const numtyp4 pol3i = polar3[i];
-    numtyp qiyz = pol3i.x;   // rpole[i][9];
-    numtyp qizz = pol3i.y;   // rpole[i][12];
     int itype  = pol3i.z;    // amtype[i];
     int igroup = pol3i.w;    // amgroup[i];
 
@@ -843,11 +815,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
 
       // intermediates involving moments and separation distance
 
-      numtyp dir = dix*xr + diy*yr + diz*zr;
-      numtyp qix = qixx*xr + qixy*yr + qixz*zr;
-      numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
-      numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
-      numtyp qir = qix*xr + qiy*yr + qiz*zr;
       numtyp dkr = dkx*xr + dky*yr + dkz*zr;
       numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
       numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
@@ -959,7 +926,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
   atom_info(t_per_atom,ii,tid,offset);
 
   int n_stride;
-  local_allocate_store_charge();
+  //local_allocate_store_charge();
 
   acctyp _fieldp[6];
   for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;
@@ -977,8 +944,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
               n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //numtyp qtmp; fetch(qtmp,i,q_tex);
-    //int itype=ix.w;
 
     // recalculate numj and nbor_end for use of the short nbor list
     if (dev_packed==dev_nbor) {
@@ -989,9 +954,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
     }
 
     int itype,igroup;
-    //numtyp bn[4],bcn[3];
-    //numtyp fid[3],fip[3];
-
     itype  = polar3[i].z; // amtype[i];
     igroup = polar3[i].w; // amgroup[i];
 
@@ -1008,7 +970,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
       int j = jextra & NEIGHMASK15;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      //int jtype=jx.w;
 
       // Compute r12
       numtyp xr = jx.x - ix.x;
@@ -1171,23 +1132,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int itype,igroup;
-    /*
-    numtyp bfac;
-    numtyp psc3,psc5,psc7;
-    numtyp dsc3,dsc5,dsc7;
-    numtyp usc3,usc5;
-    numtyp psr3,psr5,psr7;
-    numtyp dsr3,dsr5,dsr7;
-    numtyp usr5;
-    numtyp term1,term2,term3;
-    numtyp term4,term5;
-    numtyp term6,term7;
-    numtyp rc3[3],rc5[3],rc7[3];
-    numtyp prc3[3],prc5[3],prc7[3];
-    numtyp drc3[3],drc5[3],drc7[3];
-    numtyp urc3[3],urc5[3];
-    numtyp bn[5];
-    */
     numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;
 
     int numj, nbor, nbor_end;
@@ -1196,8 +1140,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
               n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //numtyp qtmp; fetch(qtmp,i,q_tex);
-    //int itype=ix.w;
 
     // recalculate numj and nbor_end for use of the short nbor list
     if (dev_packed==dev_nbor) {
@@ -1303,7 +1245,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
       numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
       numtyp qkr = qkx*xr + qky*yr + qkz*zr;
       numtyp uir = uix*xr + uiy*yr + uiz*zr;
-      numtyp uirp = uixp*xr + uiyp*yr + uizp*zr;
+      //numtyp uirp = uixp*xr + uiyp*yr + uizp*zr;
       numtyp ukr = ukx*xr + uky*yr + ukz*zr;
       numtyp ukrp = ukxp*xr + ukyp*yr + ukzp*zr;
 
diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp
index be183b284d..304159e571 100644
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@@ -162,6 +162,10 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
                               eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
 }
 
+void amoeba_setup_fft(const int size, const int element_type) {
+  AMOEBAMF.setup_fft(size, element_type);
+}
+
 void amoeba_compute_fft1d(void** in, void** out, const int mode) {
   AMOEBAMF.compute_fft1d(in, out, mode);
 }
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index b0d6ecee68..05a48f9588 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -579,7 +579,17 @@ double BaseAmoebaT::host_memory_usage_atomic() const {
 }
 
 // ---------------------------------------------------------------------------
-// Compute FFT
+// Setup the FFT plan
+// ---------------------------------------------------------------------------
+
+template <class numtyp, class acctyp>
+void BaseAmoebaT::setup_fft(const int size, const int element_type)
+{
+
+}
+
+// ---------------------------------------------------------------------------
+// Compute FFT on the device
 // ---------------------------------------------------------------------------
 
 template <class numtyp, class acctyp>
diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h
index cf767be96e..2bff362f29 100644
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@@ -190,7 +190,12 @@ class BaseAmoeba {
     _fieldp.update_host(_max_fieldp_size*8,false);
   }
 
+  /// setup a plan for FFT, where size is the number of elements
+
+  void setup_fft(const int size, const int element_type=0);
+
   /// compute forward/backward FFT on the device
+
   void compute_fft1d(void** in, void** out, const int mode);
 
   // -------------------------- DEVICE DATA -------------------------
diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp
index ad52df3d4b..f514a50620 100644
--- a/src/GPU/amoeba_convolution_gpu.cpp
+++ b/src/GPU/amoeba_convolution_gpu.cpp
@@ -21,10 +21,12 @@ using namespace LAMMPS_NS;
 
 #define SCALE 0
 
+enum {FORWARD,BACKWARD};
+
 // External functions from GPU library
 
-int amoeba_setup_fft(const int size);
-int amoeba_compute_fft1d(FFT_SCALAR* in, FFT_SCALAR* out, const int mode);
+int amoeba_setup_fft(const int size, const int element_type);
+int amoeba_compute_fft1d(void* in, void* out, const int mode);
 
 /* ----------------------------------------------------------------------
    partition an FFT grid across processors
@@ -64,6 +66,7 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
   debug_scalar(GRIDBRICK_IN,"PRE Convo / POST GridComm");
   debug_file(GRIDBRICK_IN,"pre.convo.post.gridcomm");
 #endif
+
   // copy owned 4d brick grid values to FFT grid
 
   n = 0;
@@ -88,6 +91,8 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
 
   fft1->compute(cfft,cfft,FFT3d::FORWARD);
 
+  //amoeba_compute_fft1d(cfft,cfft,FORWARD);
+
   if (SCALE) {
     double scale = 1.0/nfft_global;
     for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index 734ca53bba..29db1b4c1b 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -38,6 +38,7 @@
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
+// same as in amoeba_induce.cpp
 enum{INDUCE,RSD,SETUP_AMOEBA,SETUP_HIPPO,KMPOLE,AMGROUP};   // forward comm
 enum{FIELD,ZRSD,TORQUE,UFLD};                               // reverse comm
 enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG};
@@ -46,6 +47,7 @@ enum{GEAR,ASPC,LSQR};
 enum{BUILD,APPLY};
 enum{GORDON1,GORDON2};
 
+// same as in pair_amoeba.cpp
 enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC};
 
 #define DEBYE 4.80321    // conversion factor from q-Angs (real units) to Debye
@@ -188,7 +190,6 @@ void PairAmoebaGPU::init_style()
       ic_kspace =
         new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC);
     }
-
   }
 }