From a243be2dc9c25e7fc175d899b2f2d796e9cb7bd6 Mon Sep 17 00:00:00 2001 From: Aidan Thompson Date: Sun, 21 Apr 2019 22:10:03 -0600 Subject: [PATCH] Added bare-bones yarray algorithm, 2x speedup --- src/SNAP/pair_snap.cpp | 84 ++++++++++++++++------------ src/SNAP/sna.cpp | 124 +++++++++++++++++++++++++++++++++++++++++ src/SNAP/sna.h | 3 + 3 files changed, 174 insertions(+), 37 deletions(-) diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp index 8b547e6e73..73faaa71f7 100644 --- a/src/SNAP/pair_snap.cpp +++ b/src/SNAP/pair_snap.cpp @@ -158,9 +158,12 @@ PairSNAP::~PairSNAP() void PairSNAP::compute(int eflag, int vflag) { - if (use_optimized) - compute_optimized(eflag, vflag); - else +// if (use_optimized) +// compute_optimized(eflag, vflag); +// else + +// hard-code compute_regular() + compute_regular(eflag, vflag); } @@ -248,51 +251,58 @@ void PairSNAP::compute_regular(int eflag, int vflag) double* coeffi = coeffelem[ielem]; + // omit beta0 from beta vector + + double* beta = coeffi+1; + snaptr->compute_yi(beta); + for (int jj = 0; jj < ninside; jj++) { int j = snaptr->inside[jj]; snaptr->compute_duidrj(snaptr->rij[jj], snaptr->wj[jj],snaptr->rcutij[jj]); - snaptr->compute_dbidrj(); - snaptr->copy_dbi2dbvec(); +// snaptr->compute_dbidrj(); +// snaptr->copy_dbi2dbvec(); - fij[0] = 0.0; - fij[1] = 0.0; - fij[2] = 0.0; +// fij[0] = 0.0; +// fij[1] = 0.0; +// fij[2] = 0.0; - // linear contributions +// // linear contributions - for (int k = 1; k <= ncoeff; k++) { - double bgb = coeffi[k]; - fij[0] += bgb*snaptr->dbvec[k-1][0]; - fij[1] += bgb*snaptr->dbvec[k-1][1]; - fij[2] += bgb*snaptr->dbvec[k-1][2]; - } +// for (int k = 1; k <= ncoeff; k++) { +// double bgb = coeffi[k]; +// fij[0] += bgb*snaptr->dbvec[k-1][0]; +// fij[1] += bgb*snaptr->dbvec[k-1][1]; +// fij[2] += bgb*snaptr->dbvec[k-1][2]; +// } - // quadratic contributions +// // quadratic contributions - if (quadraticflag) { - int k = ncoeff+1; - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - double bveci = snaptr->bvec[icoeff]; - double fack = coeffi[k]*bveci; - double* dbveci = snaptr->dbvec[icoeff]; - fij[0] += fack*dbveci[0]; - fij[1] += fack*dbveci[1]; - fij[2] += fack*dbveci[2]; - k++; - for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { - double facki = coeffi[k]*bveci; - double fackj = coeffi[k]*snaptr->bvec[jcoeff]; - double* dbvecj = snaptr->dbvec[jcoeff]; +// if (quadraticflag) { +// int k = ncoeff+1; +// for (int icoeff = 0; icoeff < ncoeff; icoeff++) { +// double bveci = snaptr->bvec[icoeff]; +// double fack = coeffi[k]*bveci; +// double* dbveci = snaptr->dbvec[icoeff]; +// fij[0] += fack*dbveci[0]; +// fij[1] += fack*dbveci[1]; +// fij[2] += fack*dbveci[2]; +// k++; +// for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { +// double facki = coeffi[k]*bveci; +// double fackj = coeffi[k]*snaptr->bvec[jcoeff]; +// double* dbvecj = snaptr->dbvec[jcoeff]; - fij[0] += facki*dbvecj[0]+fackj*dbveci[0]; - fij[1] += facki*dbvecj[1]+fackj*dbveci[1]; - fij[2] += facki*dbvecj[2]+fackj*dbveci[2]; - k++; - } - } - } +// fij[0] += facki*dbvecj[0]+fackj*dbveci[0]; +// fij[1] += facki*dbvecj[1]+fackj*dbveci[1]; +// fij[2] += facki*dbvecj[2]+fackj*dbveci[2]; +// k++; +// } +// } +// } + + snaptr->compute_deidrj(fij); f[i][0] += fij[0]; f[i][1] += fij[1]; diff --git a/src/SNAP/sna.cpp b/src/SNAP/sna.cpp index 7ed1bc1e23..d30d94dc9d 100644 --- a/src/SNAP/sna.cpp +++ b/src/SNAP/sna.cpp @@ -522,6 +522,124 @@ void SNA::compute_zi_omp(int sub_threads) } } +/* ---------------------------------------------------------------------- + compute Yi by summing over products of beta and Zi +------------------------------------------------------------------------- */ + +void SNA::compute_yi(double* beta) +{ + int j; + int idxz_count; + double **jjjzarray_r, **jjjzarray_i; + + for(int j = 0; j <= twojmax; j++) { + for(int mb = 0; 2*mb <= j; mb++) + for(int ma = 0; ma <= j; ma++) { + yarray_r[j][ma][mb] = 0.0; + yarray_i[j][ma][mb] = 0.0; + } // end loop over ma, mb + } // end loop over j + + for(int JJ = 0; JJ < idxj_max; JJ++) { + const int j1 = idxj[JJ].j1; + const int j2 = idxj[JJ].j2; + const int j3 = idxj[JJ].j; + + j = j3; + jjjzarray_r = zarray_r[j1][j2][j3]; + jjjzarray_i = zarray_i[j1][j2][j3]; + for(int mb = 0; 2*mb <= j; mb++) + for(int ma = 0; ma <= j; ma++) { + yarray_r[j][ma][mb] += beta[JJ]*jjjzarray_r[ma][mb]; + yarray_i[j][ma][mb] += beta[JJ]*jjjzarray_i[ma][mb]; + } // end loop over ma, mb + + j = j1; + jjjzarray_r = zarray_r[j3][j2][j1]; + jjjzarray_i = zarray_i[j3][j2][j1]; + double j1fac = (j3+1)/(j+1.0); + for(int mb = 0; 2*mb <= j; mb++) + for(int ma = 0; ma <= j; ma++) { + yarray_r[j][ma][mb] += beta[JJ]*jjjzarray_r[ma][mb]*j1fac; + yarray_i[j][ma][mb] += beta[JJ]*jjjzarray_i[ma][mb]*j1fac; + } // end loop over ma, mb + + j = j2; + jjjzarray_r = zarray_r[j3][j1][j2]; + jjjzarray_i = zarray_i[j3][j1][j2]; + double j2fac = (j3+1)/(j+1.0); + for(int mb = 0; 2*mb <= j; mb++) + for(int ma = 0; ma <= j; ma++) { + yarray_r[j][ma][mb] += beta[JJ]*jjjzarray_r[ma][mb]*j2fac; + yarray_i[j][ma][mb] += beta[JJ]*jjjzarray_i[ma][mb]*j2fac; + } // end loop over ma, mb + + } // end loop over jjb + +} + +/* ---------------------------------------------------------------------- + compute dEidRj +------------------------------------------------------------------------- */ + +void SNA::compute_deidrj(double* dedr) +{ + + for(int k = 0; k < 3; k++) + dedr[k] = 0.0; + + for(int j = 0; j <= twojmax; j++) { + + for(int mb = 0; 2*mb < j; mb++) + for(int ma = 0; ma <= j; ma++) { + + double* dudr_r = duarray_r[j][ma][mb]; + double* dudr_i = duarray_i[j][ma][mb]; + double jjjmambyarray_r = yarray_r[j][ma][mb]; + double jjjmambyarray_i = yarray_i[j][ma][mb]; + for(int k = 0; k < 3; k++) + dedr[k] += + dudr_r[k] * jjjmambyarray_r + + dudr_i[k] * jjjmambyarray_i; + + } //end loop over ma mb + + // For j even, handle middle column + + if (j%2 == 0) { + + int mb = j/2; + for(int ma = 0; ma < mb; ma++) { + double* dudr_r = duarray_r[j][ma][mb]; + double* dudr_i = duarray_i[j][ma][mb]; + double jjjmambyarray_r = yarray_r[j][ma][mb]; + double jjjmambyarray_i = yarray_i[j][ma][mb]; + for(int k = 0; k < 3; k++) + dedr[k] += + dudr_r[k] * jjjmambyarray_r + + dudr_i[k] * jjjmambyarray_i; + + } + + int ma = mb; + double* dudr_r = duarray_r[j][ma][mb]; + double* dudr_i = duarray_i[j][ma][mb]; + double jjjmambyarray_r = yarray_r[j][ma][mb]; + double jjjmambyarray_i = yarray_i[j][ma][mb]; + for(int k = 0; k < 3; k++) + dedr[k] += + (dudr_r[k] * jjjmambyarray_r + + dudr_i[k] * jjjmambyarray_i)*0.5; + + } // end if jeven + + } // End loop over j + + for(int k = 0; k < 3; k++) + dedr[k] *= 2.0; + +} + /* ---------------------------------------------------------------------- compute Bi by summing conj(Ui)*Zi ------------------------------------------------------------------------- */ @@ -1535,6 +1653,10 @@ void SNA::create_twojmax_arrays() "sna:uarraytot"); memory->create(zarray_i, jdim, jdim, jdim, jdim, jdim, "sna:zarray"); + memory->create(yarray_r, jdim, jdim, jdim, + "sna:yarray"); + memory->create(yarray_i, jdim, jdim, jdim, + "sna:yarray"); } } @@ -1563,6 +1685,8 @@ void SNA::destroy_twojmax_arrays() memory->destroy(zarray_r); memory->destroy(uarraytot_i); memory->destroy(zarray_i); + memory->destroy(yarray_r); + memory->destroy(yarray_i); } } diff --git a/src/SNAP/sna.h b/src/SNAP/sna.h index d05ad0fb84..2c90da1d30 100644 --- a/src/SNAP/sna.h +++ b/src/SNAP/sna.h @@ -47,6 +47,7 @@ public: void compute_ui_omp(int, int); void compute_zi(); void compute_zi_omp(int); + void compute_yi(double*); void compute_bi(); void copy_bi2bvec(); @@ -54,6 +55,7 @@ public: void compute_duidrj(double*, double, double); void compute_dbidrj(); + void compute_deidrj(double*); void compute_dbidrj_nonsymm(); void copy_dbi2dbvec(); double compute_sfac(double, double); @@ -80,6 +82,7 @@ public: int twojmax, diagonalstyle; double*** uarraytot_r, *** uarraytot_i; double***** zarray_r, ***** zarray_i; + double*** yarray_r, *** yarray_i; double*** uarraytot_r_b, *** uarraytot_i_b; double***** zarray_r_b, ***** zarray_i_b; double*** uarray_r, *** uarray_i;