From cf24dd026520a5283589fd89150e49da3e4f0bc2 Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Mon, 2 Oct 2017 04:53:17 -0700 Subject: [PATCH 1/6] Adding pair style dpd/intel and dihedral style fourier/intel Adding raw performance numbers for Skylake xeon server. Fixes for using older Intel compilers and compiling without OpenMP. Fix adding in hooks for using USER-INTEL w/ minimization. --- doc/src/JPG/user_intel.png | Bin 20491 -> 19528 bytes doc/src/accelerate_intel.txt | 8 +- doc/src/dihedral_fourier.txt | 1 + doc/src/pair_dpd.txt | 1 + src/USER-INTEL/README | 16 +- src/USER-INTEL/TEST/README | 20 +- src/USER-INTEL/TEST/in.intel.dpd | 48 ++ src/USER-INTEL/dihedral_fourier_intel.cpp | 441 +++++++++++++ src/USER-INTEL/dihedral_fourier_intel.h | 82 +++ src/USER-INTEL/fix_intel.cpp | 1 + src/USER-INTEL/fix_intel.h | 3 + src/USER-INTEL/intel_buffers.cpp | 3 +- src/USER-INTEL/intel_preprocess.h | 9 + src/USER-INTEL/npair_full_bin_ghost_intel.cpp | 2 - src/USER-INTEL/npair_intel.cpp | 2 - src/USER-INTEL/pair_dpd_intel.cpp | 617 ++++++++++++++++++ src/USER-INTEL/pair_dpd_intel.h | 110 ++++ 17 files changed, 1345 insertions(+), 19 deletions(-) create mode 100644 src/USER-INTEL/TEST/in.intel.dpd create mode 100644 src/USER-INTEL/dihedral_fourier_intel.cpp create mode 100644 src/USER-INTEL/dihedral_fourier_intel.h create mode 100644 src/USER-INTEL/pair_dpd_intel.cpp create mode 100644 src/USER-INTEL/pair_dpd_intel.h diff --git a/doc/src/JPG/user_intel.png b/doc/src/JPG/user_intel.png index 7ec83b3207b06c4bbda7d56f2a7d9d94a15d115d..5061f1af2e26d9c2c1110390143d9ebf96946bd4 100755 GIT binary patch literal 19528 zcmeHu1z1#V*X|$)f(jz3)Jvy?;2^D(fOLpRNeC+49ZJX0NP~1J1Bi5YNJtMYT?0rr zbM}Dxs^9z7^`HNo|D5Z)E^%Pb-s@TGUTfWJt!MA;ColWpD%MRb5D0WtQbJS_1iFw0 z0-*$5x&R#MI`ZZOK2WR_9|(h>U8GCEA83a6W$uGO1;Lk(buI#bW135-S%E;;t&u+{ z69yFaAkf2cNzwbt_FAh60v~31!_tiyt2`78uciloegEKj*`p9)wwYMSXWiUChAid? z=LOO`dM(K~v3n(XNo7I$phlc-L(`Zm*Z#o4A^qxm6n;?RcCw<4AOaQxg$sL~nr#mu zP=ViT)8vyM6kq_q8K5_j10T_(_)fsbQ%iYx#njPCu+ZuL;k;^4deg?tf|==cxZ#{b zAue!T#r=Zy3#zF#)k|^;@S2)Z4aDgI_4~1YJ0rfXI0ztN0^8r^!G6=JJ5~vF^w6bF z5!NXV9L~dNs-~9puR02Wj;Qn86x;+PQJlS8Ms=vwYLG@)=@Q)QPYiJ6=m`n6; z9Zg?#C$_4jWhX%afhtLU*arON-gf`Ya5{|N4gWv&<>B-?Vt)a#uZ^H5@Aq*+4MFi2 z0D-nI4s6z(jJh>?9GaVfZr=lxbgu^fZ$hg3Kk>J9@AW(&OlDT%w0YjViKGy6vziHV z01Kjl*e@{bzP-z^tNG>S_7L|8BjBo$<(>5;A?sCece4)&)C_7O$45p)rLNnP1EP+? z4N8#v{c~g2FR8sBr?*llTS|XFHb1<74OGkU7UnVi7G@{&ogxWpL)(E{5HzYmB?gMw{O*_!2I_fDFV|-J1Tb+{d402)2x!FM*z3lZW5807Bfo zz49q@Z;Fi*-j~>Y*j=aIOCVw_d?)$%VgHl$bu;JKi=Zo4{RJ4h_?ujhq$^zRlV~@f zg90&;hq#(fkBqX{j-%e~?%xK3o=PBPW-#s^t0e{q2o9ryye|UQ8OG|$`BKO>;G}#% zkfDnM0&%j779fsKDOz(|~q*zDxr(t(3;jfqqDB zr0cG!F>qTc!ZO9dc^g|*ZPe?F_Ou>TNA4Ebi7$D*V${b|`l~4W#2Vj1S zkbjM`gR&E-HtWxm6$-R#OoNjeFhEzjen;=;ybi!B{_KaI4uZpOZmZhmnIgyYWT2HJjhV)_W z*FTkzDAT-Khw1{Q4QYSKChcUYzcfvz8f%B1BW=Sm5LVxsO3^o9tX znAYh82Aq?7_JzH^(ZLTlNGrhiu((xRQZq*qTM#H77j0|>zyL>k{AnOs(`lA@%Am}> zkFY+;o?1o<2b}T;n&eQ~C@!SvlOB1l?`O?*|8ewqDvw$*m=4ESg8uqCPX^= zM3fGgDC{&XiBt|>s#W?~5^53oXkCmsh_ne4J+oioh5(!u?@eA)(W{RfX4)WA(9q z@T3yz7pq@HjR1V6tt66-@fPiuwXJ~igpdBSAUu4h0~F0L6}X$5+YGe;DE(*co8KqF z`;6Cd->`i_G`rnIy@&GiDjco_B{ffzj+Y!yUmUOcY;N@JgYZE=3+E(oI@sV>t1ROr za^k@UnG+f3i4oT~pJ#bxF$`}r?u23Iw~NeJfpo0@8CE@iU9)f%_?djPU*x-dsA=A% zLzyhZ_zh%}f)=?MRC)5>7Thv)IOY-rU99`bMa}q46LbfJ_ku(LxIn+awMy>gUAM5>=r5 zcX=E&WHm)x9a4~Z~I z!jF&KPV8LB%;--1iIR>bi``pBA}vl%5wRQlhNqZkQComBEqA| zW1+ECiWM@3T@e1;zd8rgzlwQ;aG@#AEn|Ig>Zbe(%6{}WeK`DbRt&by^n!%_si?H~ z+sHxk0e|(C0y0e(zblzlZYqud&!5>^tQ)*uh59+<-@590Q?*ee4h4EJZ5J6r+^miZ?jzo(<#`D2cDXAC`c%X&Kftl(rHWyx-1COm%Lnajd}_& zYsFiH3Wh&>qv+o6XI}vt8yQ|`;Hb}NkI+<~RlLw^x3GZ9VHn0q99m(ED;8c|h9}Ij$ZcjiNO zYSY^4(}B)cBhn3vZYh3eh%jw*EKW9)df|(Sb}7TNYuB|)fpO>Z-mYq6J@|ZodolrfAWVoWVRlJYNP&gv!#FJY#ZL3lnAWRw0NhQF~Ri#{yC0V|z8 za2`DW`B9o>$9GPq@fWba@C$Z{60Gx#|55b+T|_7OYTSTLk6@wd>VvtuJ))w#)cesvD8P@rrA~sAglg(_cmX7!KPp@uCz4{;_^$b^y9TP@q6{+ ziCd9aJ!e+rG59k-tF=q}SK8OilSak*M*4mXM)HvzDlJv&AvMeP_b#7vA&|X)M*%9T zFk7>U7A1K@{3BM}^a|SoH}`+z@)Tb{f30M!b!z64oRt+-I50Q$-27M51NEBg2ko4b zN9}uhg#|eI%l}$Vg#lRzbLvSW$T^q&^y3u&I$h4u&4Fn6repyHAqUG!-mwE9Tob8(gmDho zlF~l~DC{{*9ru?wQWGVL8UH#j&XNAVYQRX=-_{ltlUW(AzlSK~Zws9KfcS)H>fb@= z-&sJ#L?2h7)F`4D4AjP~mwLFC$X@t+$@_b1{THqR+&?ms)flZX&`dRp9GJfM5`a(Q zp5p_3j)IB~wp8$q9nlXm$kGWMr%GWEF!)%c;Kz}g0}*RQjz7p!`+EMJ?CJ#_{<39L zr87Jj`s#m+$TLO%a>@H!Dw(#g*b_BuO0r*4o24j3ywFiriT3E#6Bh_IRzjBd9PkHU z20#-N6J_0Bv3Yxtqf~K^SPoh~s$3Mx-NVrS@%PYEAyE^>2XX@dmcu}~sNmAsaD*g< z&&~tcntue+tB#wT+x+Yau?oTb+wEu|??)q$El3W8g~}h$M4cp(m@{YOvFF(wfoxfx zS1 zxE$~%-A%C=LU9!@9spc`o51Nnn*?NPfN#onbkCqTZ8HD>A?4ZFjqF2X^LUFcdv46; zRLH21IN(82q%Z%L&Hxl*q6&^3&y7;A>xrw71cV?b@-uSnR81JUSZ{ARFrJYEsq_<9 z$nfep>(10f0i>@0W7WUl+jC-Op#m}+9T@t5jUi|MCr=Fk1A#L7i(AgiTYv;WA1EZ@ z48Qs!DTOfl2B2Fn-mi!PeJFqk6_TTBn`wT7F*EC;fjAI7+ek1t0eb<^knxkq{ugU; zU;u&i-FtEhOGZG$e;o*mbNm5UpOuggzDW86=DjoA0LrWX9_0J@k<|uI2WbDlSjMHr z+3loY^Uqt11-F@8I%{(t0ucv31;K-39QGzfO$OBYh$Kx#XumN(i(~9<_b0v{O zVS(VkuLs2*r6#_G&&wy1SEC`vDRlq71hUlrDAwH?*OL@d_9-s{F~AC7cVztu3ZVb7 zN{cEB1fT#Q*LeR4EL2o4e%wzb`sMEFm>1C_WB}^s0uj2YN~T@F1#lWb9!r5K zK-=;cSqKn<&NphV*3Nq3leBc>fc2>mm_*3qT%59|&3kJ9v!v+~UEDkvYuzEX+$rzSXE^RNQ@OKEfFFNXnS z*`cGSj%@sX&f2rxztUTZuXS;B&AY^bL~s72N_NJj96k_}-}&Thw{p5z0=UEITgO9! zbUmB4%^B(ci-1%*4-}kLr+>}NjZKx7voiW0<$-5gWIeNGXK&LaGUyzI3-u$_NKD(1KM#u;} zv_+URst(*n-132r1JB5W@cmmg$t6#I>hy44FE5ZS+Hgk1^xvyc3Dbfx0qrtu&*qR% z|A_9dIL>zqA!bv9mb7W`2~ohvf3JRQE|L4GngS2(NO+#&J@v)+f+TtPv0eb~ zD(Zt30k*ZCV(>^)-c5V7bE@nt{VNm{72S<@5pJfx;wbveuz zjE_mBt#O~~;nEd`I;Oz(uw1~CFu{TfIzez(B3A~mk^K89HDti9RBdy2ohT8Ym(FdH zY1#4+*<~+Qtg(hVW0*pq$S}!AuCcg<&o|3lkQCD1TV-8ySOlFc@ z!aer%8#YTqD0J#Jx%v3zCYJB^9o5IQlPG*gswgO;rpTghj*a?X6=XbIJvU7Y-J-s& ze^jaSQ^(x;>W(XBg4KfbuP7Q~?NpZ{6$;^7bCt&Em{42$L~_)G4LCZwOPtw5f7wr! zNen{s$>7)ZL$F@pjTN`M&<~KzoE{$6GVM-kPpDj4zQt`|Py<5&eAYMooHoR0bn^iz zh1y(Db@gk~PlmtB_)0*YRI}&OzZ&BS6CP^VLVX9@Qid`pZ0+Nfn5DV1pQ&E*me`Sn@q^Np@k^P%Dw7fC79{mLG+IHcC4rtZi{bPzz3U zS9OuFBLm)R`{Wdt3hdZkpJyr5B*HvC+6Z0Mk@SkZE%(Sv->Wr$qk##Ja;2R@_G1hb zx{;qaKh=lvJ`rMRR@!9abz30nS#4fg#Q6A63vrYhwoJMl#qJ^EE_1N-rRTt42TZSY{Bdx@zCN@4T)C{s> zvBA&ZS#n={(nVs@xhA}Dfen5+SAq$@yXxuwm$*O1EK0*iK^81d*|?N9z&D3r5xtDn z+1Fl93h~l3KfrStTdf`Eb{RF24uPPI2AUXaVIjt*Mi$<0-PsgF5?BeIBq%iaHo=KO z!DZuwphrTT!TR$=4w$ac&zi1=c+=yHqEkIB6`uQ|B|9NACq@VH{#;2tkLHR@*#evK zI!`??wrhIBu?Fy-2xhVB6BpcAbXd~L)qGYs+4M()_{>cfqndQRb%Sr}td0j+2yU4y zZA(27kH`GLac^!yP`#@rp|kW)uXZ_HfM1Rqwp2>XPD&nqm#Sinr)9Zyd3lkxva?Ls z;sAABoE|G!aGusT*N1dn`_qDQnggnO!GoIzRA&s! zBLj+bZFv})!3t@Z(PO*zVvUx&CZwU8M^52PhlyK3dOargowv@L=J(es@Cuk zVHN9>+&dGC*uu2^?PM~C+DnpNhnQC>x}+bMmnXTvy2{Z7yAo!+u8=^C@>wjD>{`di z@bey|a0t>g8{(}_dTN!;q=%|plML@gx@7>uqeonOK^UvQ)0UnAc_vzI zkfQ3}`CgZogpF8?c6(-zvg5pRmh`z@kS<>CdZ7V7!TG~l```egC2reun@XFrhq=k% z5r-G#!%J;;tftc|XZdp|=ic^6lergbPIif@1`)Xiqx09D#8ra%>L!VO%dv0g+{P!T zbs6S@tW)H*tnA%r!iE>m1iBrxXc{cK?fTR@GQP%tF^9>`;y3+lDAD>6IqQw$v`6N< z^F~*8@YvaoJM%t0Tf?vhvk%I|*KVGrU2O)J#62wH&_aMXD@AXdui zXI=`<=Y}^vpglDk;V0=vo1{yY3KQztWRBJMKLj83>O5PYe;;t-PF|uQBE9BUKb>)q z-!2BTaDHi=0a5Wgf_c=_DAH7{-6el09iS22vN4>sBX;HzC`=*>Q%QM(qibTT-cqaW zyv77Yf^;#B1D_kJ*YrbJDn^zjUeNUkmp-|Y<*cDlL1&haV zK3MI~<(hM!@IKY)!*zY_kBCT9fR|N2z~B-H zem1#4$f$(J?3`(vjF3U)1ILs(d)|ya+TPcF52Lo-J{xoKv_*s)p!RUOeHQ)}zwgo5 zAMFg%gm2sxM3*0JA5sk|wq7fR3pQ5VsJaw(LL)XeFm|Vr&74nXYQ3L)q(ht$27mEl zwC(W&#jlneE~pl&YfJ4-a;E97V^gJAxGJ7fyS za7uQuEqeGlRi-k`7Ea%-qp0N$Z=5^ z6mugJKKLvdMh(C03DlQcUYkpvMxIm2h4YKhKnwa3l;|Jt>?X*hO38ePf8OXOQX_U!xc1Nz`8|H$G7Z zmu!8P-d2L_#B0cyQI1ur^6#{1@CO~hLu*AtideIQ-<*)2c2+)o4`W=`?5y!-?2T$D z*;pf7&BjLUU6HtC>+|{~MT>6*Pb5f$kvc=F7a_%s@89wWXU)N;5=XJ?+UVDYhb?Qd z#)3=JxP?9B7Pqw@a28!k2yGdQ9Uvf3!|M!gciXr)-ieL+THUE+r)%W2h4?m!Oxg>r z3;89}wHCx?+Zw@E0yLW@X~@pIMGB2bGhBFEH}q>(&#CSO!#g9XoA^jyZWXqo%wx<@B63=QRN`cv^J zy9?CM^;lCwO;idB!lRbK9Wn)y@o2qkRhBK=QF-lmoLz1x4ypT}PA@IVx8q=}&&qtP zP=!+{vzx9?+7f(Xdo*3n-`pXNH`TAZ>gTk{S zq4l(8y&C6Qi%CruvHTWC{f-cJQ-ipQq-m~ox$4A#faxi5dG=>^lM4GU$qRvZne8+U zYBDOT7v}s57)j<#hBgv&ywZa=@cCb>zrps=m&GJcfikqkKbJ$BY<2qbbx@I*G9pjL zLX(`HY6|`mwWd;8>=>rSy=b$FPcejyOfoPCo-^@KEcl9zLl?~ zF{tXCasg{a&!2_6PZet2yXOzt7@Z3!Grsp{jiywz7AJYJGCb+Cs%=d(Hbhf`*KatQ zG#mBYbbu2WkG+c&=HouSyJ|YN`RU}ATO2~7lX^yDAn@d61>7m@UHG$KrraTdapgYEPgyeksWY!lKX6QyF}Wt$Sk7_DJut}RedYZwpe%dgv^#tu+gcll}#3} zziXiF!x?{ko1y(zuujH%?%!%GuyS?FZTXZu#6uc~4ir=!6MP=(%QdW<_Y1hMcRmzy zoyttJpG$&8|Jo*$LPNlHU-7z&E6T|>_aQ8MR~5^265(|gz_>{~C?0BdI^|eNaIm^g zz}L5X-^j;g%vHf9P!~t*9s-knILD~zsu%PvTgBT^96NhG<)iEx?KSE=cVGZ71-HMq zSSq5LP;zuL^PL?V9gnnhySGlGO8;xmv&O(bj1fLY;sab$3!dmKk-J;5pg$~eHrah! z%l{eH6-XeIf8flKXF4CnX*#s%Wr%9-wEbZKgsF z-7HFQ#7>@Dj}7n|2k7e+x6=dqp)dITWzq#~mbjliPZvym;je#}(uWwk8mjr*#tYEF zB~?&ZQbv?DNAl^-mR>^=t5cqb3zDio(5ApA@nWpkXLIy zJM3Np9wYo^CAD-IVgb|H@jk)~oxCUfata2*ZUp-!0b)CuxxM9CbtmHEK5u4;`rC?2 zV``bxwZ~C6_U>2Tv&c1p^?bD6S+}*^@lf$!wx&msMdK8X{OBJC?XK^{*VKmvF_73% zi6}-8E!`-yZEH=pGGz$t8u5$P+@5oQhWMuo(3r|@+v?4D3r}xc>mDIaSo9JL#!n)@ zo?a=@P!K$SI4_DJPDYBix)OXvGL$XMF-RNvAb;7h;YfgXjhkzmh3SVO(|$^eCTZOg z>iVO*S9|=%CIv`eV%lB4Bdx{AFLP_dS&ilB#&&rksQAm(salIudmkDKs&Ep<@)13C zOIy#FHDP-2_Mmx7|ibSaoKd;cEeW@!E z)=MSRvL>l#D*f?UzjyzKT|A%NkSo9#+CASiAuxaHhRccr*U*_w%Qb#u+~7xznWaKB zIWWB=XL=htZw}>Dx5_ek5jfU$byzNMe=lfSK$*R@CZxQ_LNW20$o|NbjgTaW(!QbQ zm7=8QElE@6D->^hN1o%fCBJWFStUBqPbrcK@+79_R>YTW!Xm$3BKm>%VMzX9*ls># zP?R$xPE=pQQ;G_63iiOtMLqdN1e=&pT}+82QbBV6lSdNcQ;xHSrS?T}(>sr$Q(v#> z4~F|h4i0F{2=3q2@_uQuF}k|oyYsPPtc$Xtc87oa8h>l=AAES$FNv&3E8b;M^Ja57 zhk&ZRFAjX194h)`SXOW1HPygN1x~!$_BLO_P$L9XFE*XiMS_)DNJx|l=}b0eDlhIO zzrJV8ukI2gqF~?a2*ppQk5!nq@)pga>9vCs-`9wA;3HX8%NR^m^S6eH+(z!fRP5{5 z2+X@yCTM&$kdFAA`T*bXL)NGHfr#PlmoK=Bnufh&Kex?MRiFR!RfwDX5v zyh1`>g%=@`!^-ccC@s{|Lhf^ae5Say8O3WNr0Oq)XW~y!B|@cq-C-HFQ&ye(wS$3v z>dTUek0Z7s9F@M39CtCSUrwXGGWNEU)3cWWg%iQrXz__QIkPvk({0Rp#C=W=cECPj zP!FHVfhx4muel?t-d%UCdu*a$n#nNE_*EvkLjOtuBg<`O6!H%il&*a1a5YmJ#yI~w z{4}fqX>t4ZC0F7{#RQFKnzQNOykZW9E$U2iEW@6KmJlS86s01sMmG5v47;b-pB|GN zl2YeVzcLzw+L6YMOv{&8XZ`dU1^xjD1)$B0Gem|c7vFGY z^ooAciV>m0CyJ4`(41ZMdm|-tb~UnkyAU2dGx6W6J?;f|lD-m~_UlYdncLs-;p;Z` zFg8(WRJp`0T>Mz|+DpP>!IXqJ{q7T~QxQ`Z&08MzEv50G;-`Dj&d)X)N2{cl%ElxV z+-^Ri27A#zc~42iz!G+impjI?5fvOOyNY#-!`5_GQIO!Flf+o`B3XG!ghlK4myb@c z=0;H`EN;kr$Yp%O7kqSH+Jp{zDYsHtyo3Yp%w7Qc0>emM9P|`K83|Rz)0qXJK7vRB z*kd65cul9P+d_6C`b59&bOXD*|1I=w7^90=NOJ4_Rx^|x%Fuy#aQF59>lNT^BfWe4 z+VDjZAq+fCG~g|zHsiC_D)Vpcw(8eKsGmN#dxYRHKS1D?ij|Ez$kjP7oCNbVEed5G zsY?8^g%7}oSV!zk8hl;4_^xYJRF2h`d{Wy<`z3)_n3kWTv>2B~C%>#&$;?^>ZcVzq zKUoDV06{8=h?3MSt#{BWu`+U-lIydYR&u9E+D9C8Lz#j3gZ#6>3zO z%06Q|A3qWIoC+x0z#CJ*3-w!ZGN*^Yy_$zXR^-s}ERkvdm`2kD^UNT=P%Mn?-gv}GL*npoOE)CMK|7-pZ6ku=Ar58_8#4y=qNp=`JxcJ znU&=_e>>B5`X#sGYL1F-MiXkC!}_&(0>OffStP1SXQ;xHVP*r}WY7oIs?!5& zzC(It-Xd2<)a1wopQu_vCc@N%qmrMf!);9$E$CmlxY(7F_7DUUG7e7hp1>v%S>Y|z zEXG9mtDguUp(GSE?iJSNJu|M2wu8nU>G{GuktwKTFIJ%PBjW1a^i*uj*4%tD{Irfl z#mQZU7ULpzQDioJ>Qim-1uMOjgF(YWTmD9i+Z;6Q0$#7s#2r3f4!F{mYz&s-5nZPI zP^vTI?ZUT*Ru>-&IY82qrt^o%L<^w_vM)sZK9`+x$OKY8i|r|RI^3D#s(LaaW5O}k zR5s+OuIpf;?dIfvQbb)ETDs_{<}k9dI_WC#B+>3<#Jk{D_sx%_OwOsT1);OM3Xk;R2y$;;b zOdLtT`o?omGwiWOTJ{2GXcU1so*XS+!g$jxkC}OQ!&SyQ;1Sj2M8<$mR7-S%iC5$X z>39Tb#4eMtVOj$!QOI>S4BI>MU7wqAjDe^kt20n(UdF2aUTI3}beoG6c6Y{;`8L&G z@4bgY;zYY6In~Bw-N?q5ym}7AFTjYfIF9D91+y~0N`JZIa#=;pyUNtJ63YI@UMt{k zmoq~+2|2kZx1d6)mE1Bt6+{gvHc`o~#9`APgdKwAU%$|E20FGv1*+|_ux5boJ4Pk9 zY&)81&gnb}wKFR6jx*dqsj_=23fpwVyTv>e8xiDpDR16!sCiBK&Y{=@qhOqy9$Z8^y*`0a!{>MfGyBql!NS8&fOK2GL0dXelJMf;{?qqg5l zWHuqvlqOMr5&MQ|0R>ak3YORF8LeTgyllNSSm}MsVkdH{IZjVzZUF~>Pv0lfZo*rLe2?fp+=3Q1wXvjhB-t~K93n6a#@w>7bM8#H` zQ+_;r8PTa5IrknogkCnv{M;74#<3=C?K~J*sD@)BexS^?(?Y@2bTRZ3q`pku@6aJ0 z-+SKw#5kp);;|JIx&D-;;2XlW#hnj0ZFkn+nu8O0=tlC4D*p&`#MVMiGY6fvE-pZk zqQxk4(5_JMArTa}Ad2cMBJF!6REIL^6u!=e!FYNfrWX{jYW6&ce)A&S7bDUobxgwj zz(b&NQ)zQ1tV0CNhB677!JOpF@x9s4 zmxG0rW2Hmiu^%dp-5Yo0(+brctXt`^DTp%pXjTZi6AVy=Tf<&yB1#-(Is9t zMNF2EDTwux~#%j z8DPq35PbcmU70fc1+}?X;vs!r4K?dC8}mJ(Sg;Bvl-}Jf4o|iSCPz3^UcWDJh2zv+ zdVyamfF{c$9Cw7Qc-Gt~rQ5KRsK7=>P&x2Sb3tYFC{f2l=K^7$hM^Q$-oOkz=ty+A z%~*yy-5VlFd7>eS1z`c_HLUbPiwFGvLiNr$cad7-G`TVdu9*v5UpRr*y{L9?*nA=2 zuzy1f@?jcw}n#!`Fu&d}%yUG$%ym`ozi8-`amfkT11f=yjJ<2ZBd5CTI$nkl<_vUk@ zIQy)-b4oq^W?%y=$+UPzJ9TZZ5`MYcU%g?Uz;t1VkbBPwxWI-{m<+di?p8+euvxl=9?Ng%7Sx5%A0#ifh+dr7R~+v_;k$l ze)+UMBv{mibZESb>BSwEg0dSxEXP3(G_Ermw&NM`%#(k)H@f7#@N5p^7c_Zz&*^>Q z(giOuEXv-kNGGNM!U= z)m7}PSD~wGc8&x$zjQImjKxkD#_uMoO&2*SO0VcTze#y@^#zAvH-2iR1ck$4P&F_- zia(=?8z4+KXNo;c#ffa74OLY1*5f-=AWWe~qsY{4ysnK}RSu_6dYq;-UGUDj*l;+5 zF&@HPt*5G3Pltr93T7lk*tm45MPlW7Ls6?!Zc7G}q&;OlxtJzrKlx{gfutu_F!_#N z9I0K`kOOVN7wkf{_JbqrV?%-!xhmDAI-a>)25eo_&-z+kyMna@#!U@tBXnU1uRqTr zAQF482CdMOjo#I7+Zfgrl5A>}IqutD-o61m+Ae9B=VTy%mxSn}-Y}yD^7c8B*t!uX z+R7s;^3+LMiAGK=cka}`TV1_D!Dk;r?CMWqBfZ73G&S%hyaQbSi1sU6H!Rh%pWKs( z(4T0qQ1~O9E%xMHyd^z~XWA<3T66M;as6TYI*8W_mW-9L7mIjo{EK)wi%`D)*#yB- zgIS0lI4}&WX-OuTG&zx4Wbv|<%b^MF+2ji zCbN^i??^;++KKS*(4ZNk8xfJ>Yni*Z)!ZQ11ksn{(ghInYyF4Ra}1r18HfHce*ZR^ z?!8!Lw~?&Z-O4wPxUd^*bkAO_Ci>%105hH4jxO@(SK*275L$jBIsv+2C@&&o7q{TO zx4m~-;^?KZH?P0!k?(DL`<;|H3cdPZ)fxxFhsYS?cuXvh^}d>hPm_>O;WN;ekVcO z_mtYioZay?B5Mm4@*Y?6tTn#9VarlvCu8}=-FmLC03C|;xa*;;M1L8W)xCEPxb;Oq zotYvKmIxgPMgV0v4^0L9qzq_FB)}qY07{)9f)xOVfg(;6m~Gka|J7Ru+lTfxbN9A} z_DWc}c^B@ye@uvFkmCKu&~D4#>S419Piaj;n4kI#5#Az%2$x-Y=NKso!Bx)p7yvo(V7A7E2DDX?Rh6LkcJZ_~Zykkl1jKp{2JjJSEL1#S z@M*Hf4Jm$!3#QHpq*P;_o0mL#iDbuQf85tQ7oTgYWM7HE6suKPkMwae?K$rYB2yatb;2zAUx4avj zv^Ch<8gVTkyHrmKtPgq*k)&O2joWzua7_%z6;d@8--|&53>*U}7CL#MO>7O<+3$#8 z^zuSN2p4VH;Jo)Ht6vw=Mn}53u>&I;Of6E)t@@oDA;Bu$bY=&rfg%B~bQg|7!31_X zQ2ZtHBFSj3>KOhouOOuD6qUqNV$in1h?T)NOq{qkdlIm$Tl&faUyil5 zZWsEX;xWKHD?@L>%XgMtXAB+<0K_Y{!!aB#pWvvs5`TvduI93}o+nUl<`Nby3X2fmegM;BODp8$;VvEItL zk)osyLixov{21kxnK)iX_>g!95bSaDh1+>ngAx@u-dN8qmfg>ND_j$8xxfu`BXgoh zUD|Y>_ILr9?(tRbFuTnW!m0f+P_!b>*s76490ODc&(E$uk2n|) z;`n(rT)QNdC-%S|5=WQ*ob40q_A2W_s+YJIbJEa90B_dzL0^NO3V%0j03Ymy`%Y+X zZs*iy41gHL*B>#05u`ixz(9gNi1mxTH*GIL$`^oj&Ul_J_LMkkKm6muPF8~au>Og4 zGCI@tE5ISHsZ$b9sei%)zeR~z3LX6Ux1fW{|LG#n8SDt{dffsxzXyPm^2a-OVnwC# zUQfjHfd%mYSimG7Ve-izlOy)U#r{X$Q_$2y^^9oz8_!M6R zLMLN+E|1fenKxX~1FN#PM!e+TY@`Qh?F-SLv#`Qg7`K1yb^W=D;2iV6Of_-fNnW2>?FKfr%^Rc=!VSNd(SV2U!+!_DC2RpnCy?o__Ybkyn z|E)6|6s6_-=`6Wx<7$)0^ySPo|EoPqLK(iBM=sGXuY2zvXCHWIpgriLOBADK^YLfu zH(uCEZ(3i+&uPx~mhf*6ReQf9Icjli&obKeBDI|v?HBD@;q378!hH8hXAeqwFLQ)# zOy`v2C^+uD)k?T1lz6v`d*K%6*H+NmB0pDc)+G$(16(@ebCGwIDa9N)$sNVPZ}vTF zIucC11h)S^q8oNT&eL+uSVxY$eP}q6>(pVJ)yDCcW5+jdK8nsKc-eHsFE>B{5-AYDDR;b3K?-fQaf@d#7;@Z4Y(_Iul^-VjS zS~z1~d@B1&1xhHe$LkgX$!MlYH`hOH%mdIU`|~u@;I(#ZW^Fk)xbx}U0T@C&WCsj= zlAZX%W$KrEZ2;ZjQ-BA%!37W@4>*Psx@9VGv$6P(>-bIdZ?3mQxb76wr=F?qYt&Gb zzT@=%scR5#-_!2XgBA((R@KGmTrbBxOztw3#C*zM!{07y^AfFDY9o@6K9j&Xk}&co z!nrm$izizku{|~#3Cx?n-d4VDmZV)=v)j08WbWcFq5bx+Z*Qk30V3py`s+c&oZ(h`!HtEF^x1U3-i}l=nz3>2& z$%um^aX@McYZ27;cU316EQ#TQMMj0)dQUtTbnP2tNs`e01(x*Gw zaWb1@1RRffn%XMd1#|=4HeH?BfHxWSp$<|MJ^VTtVB@cRaS# z#JyirkCI>X5>rNGR8=AM-I#ZC`KckX&yr;65^uCzR_b>3o15$e6%&L-t2nv{Hd zM6*t3s<7fOi!nvm#&24Mg{h-75oKLr?M}(T!CakJ^3mb^C#z`&CE()&=^(q<0E^7jhmsYbI8WFsPuM`Qtu3zF zd+#2(-*A=#f!UvYl`<300!q$ULN3IyKp-OkenkFRR}16sTacmi>9)v(_ika=gf=_J zGkq0C{>{CB6IkJpxA-@@_L<~CP9h~KrNw(LUu5u>)IL@wXb zXhZyS-hru?Fp4&~2O(IOa!(a=#u)MJ7BlhUl`zvRv0jD)=oo~$9-b_kxj!uLW`&+F z1PdL|LH#3vQm@fSj)zn8j&auShbn2zBbaLw6)6)e`JZa9W!bS*6j=jaMIbuBK{jK6 zWaYNF71u2EIIQOW_|v>e)Ezj6{H=y4dpwyfM<1Z_gu-P=&2H}9XS1mGPJjQGb@Y_* zhkt*0z|HMMZ}1+?K>|gcG6pUJE3b`N>@FCvNN_r+sElU>5(Ao24Xu6>Qmy>&FP^VN zhws`Q!1+1d#WzhIdCa_4r^N;C-0>~xH;S168%(@An&oh}cV+sGDwaeUNOZejNhB6L zn@o@@ph8WpB`W}(f$X+IYLkXaf)UN943Y2va1rg@Y^NqhAa|D5fnIuL}CN?5s0HO z75f)mAw-QM1~e?(g}rx8*U8?!zIuEQQ!o)aa+Ta>vy6Qbw=cfI?nuw5kb-b zR)2{t{)-gumjuvTp#nBm)4g_Yaeb6A7RaPKh4XNqA~-@_VONH^q!s2b(zss>fPYUS z7t_1A7$;ENah~t~<*o)q(J8|Tpq%#QRGkv2AK;z_#wvDiC*;Bi|DTNbo(vHk+EZrp zZ@v*|%>W?KUR&bolYd$KZ!TPD6Z2g6A{Xy2dG-%3$1#heqr|IOLjb8d;P8)JAiMAR z#lXHTUR_iR(7{h;;yV|U2CCG5l+V6h0p&J=#ymamm_k6vi{M~@lLIPIh+YXDy!qQW z?7!@IZz{QK_*o1+C#d0-?jScCz=ps47C!1WN8X`3)AvuMdI}Q~7@-Ad3L|=MlXpAy zo8*1|#zS>vkhv8zXY8a|RX_)VNq*bqM?3)Pv(+I;+u`3sWB)~je?|0F;#osX$ta<6 z1B;j-+hIv_GN^wZ)L#qo8`+kuZR#ag`din3UGQ&95lHH9L-~El0BO?YL$-NP0^mpE zumq;68v$ZxhWJmg{69AAzVX~7@z&}Sb{@6{F-r8-ls`0c-+TZ&aw`UnyoElTfL#@3 zuPwA~B>2o}t9`r9Om4@IHilaEfxTty81#4P9|29rf0Aq3BlxXZP0Ni4v(WWO4Y9U# z3Q*whs_t*{Sta8znX_(vQ>ZnN&{9 z2mA-<{$$KQi`w4+`hP{W{M;=3huXCB)@7c9q7yK|XT^*u=KiX*r^4<2jplG!P7aRT z0=kR(2iwX(uTyl2hfMPq&HqIL|1X-xxPV*@Q0gy{dFb#5fTNb?f6he_Tui^ST`qwb zD5+ELuu{|ahdb}pU)KF@e$m#%fcqI0Ua`hqCN$m;udmMTD*$!Tipu1_f?b%JU@mMI z+xX!B&$l^cgzSe1@ab8x-$6@VAL8$oKxm5rNCzteBwjnI4YZSt@J&f(=&^Fxb3++JEriVP0DeXYKRkiU$iQJrx zby@oZuET$Bi;(&5LZ2IjwcjWL^osv@StQM$k9}u|`#)b8?bSQ%@e>CGnI?GbKU~`D z6WQN|B*pyi7RTR|o9k^tc*+)fc4uuxDseG%|AFlJrx|AK2a3cGq_!}X{MDPY-?3UH zbW}_q0YUnb4yv_act;Mvz=Jsr+9ugty9pe;SF!IOy!SydbmnTLca;Ap_zi*fpYpzo z;>z}jRi=L?`3p(@5t^QTXE;RO32h3b2u17PeUpJccA%GpfU_-#O&|DwBMrd2#O#o# zJb*X-qC{1uGmM|na|@EdYGXl4Pc*(Wy5r9+&o4F5cZ9sWV@CY*ocG>t2uRACx{WCRXN9q0-%aBxo_b|2sk0n3WYyQz<=DbMaKqmZm%>KKu_GPw2%5wI|ouSA! zFY8I>k>JNOUrqlUiU$cN?78|v2{pnHPSEDRAq%YF&X2_~O>C+Wx`J(G0=|GP{x9mv zBDcjSnK zB`WT3if<7ngX2I4_Vt_M`Byx?rPk5X_L~R&cdGva_@mney6xrJHXvlPyRq4m zS4sF_RgcUseF4yL?{}rDFxrFjgWq)ap0L;ITnR#++C7Y@Oe&XkA-Vs_f+&7`x~s?H ze-eaa9vXRI_}ZxbPE)X`L&TAKLdAG}*6eK^o zxQ|z9iY!NnyU5C6XeVu=&gaQP>W|4n{V+8fZ9L&^RX0DEz%?qZQ+Gw3S7!WXULO}r z3R4n3&mwZ#l5~{6Q6997lO}w6tdKL#)A=#a)FY%hF#eN2N7%(KZaxEs;~pm5S1o8( zfUc$o2B}coj@ocOT}FR+_VQ^ZOzmvLbrtE>Te(mMnCY~;Kn9`V`I!H^bL0C zSg3`L?+?I{bm$9~UHzCt4bnO31*t&r5FIaiZm2yj5xFpV4RePd(w$xZ*=^DXdOv)& z&)rU-y>uj{4Ql0~$r;enXXK`Jf3mK}fDo_E{rHvQG?o0aSF5_1@m=^*oH0urz7L98 z*~E)T!yXiltQ>~~7YpZF8bLEpbPt{-)dr2%npDi2u>lQ1nisMv*)_jFr(aP~cg|@0IG=8c zE3_)oR7;P)ICvi9jXjq`d62~fe$k{MByVB)qzEmvZ`sx<(K^XlifBo!Oh``-E5ESVTGZWW2*_mTe$s?TRtOiqDY$ieco z%S5oY9hC>?P(b1AD6C(3Ux!_pt4jNFfwoAT<>BZjmzWnZK_piXrw(NJ5=Z!EIjAr@ zL`aVyWKQG8y6UZ}#KM?0RiDGj*7TN-K5PgzEf?ttVkXnitdr}k{qU6|$i)rO?+oLj z4s&*3&d1DdJ2QMMH2p)bZv~iFUGKDTx}1G-v(nU23p61~(pm>an##>TQLrC72E(X0 z4T5Qnuxwy{+gR!%ItLf<0;e&mo*ah|Wh}o%S1Iu2B6qu@d#xPCymxfty5P5sk1Mn0W|oOLmx&cDx~o%C@;y0)jWep(9Az-+_etDb*N?%6=-*|2v;u=}R<2KIjRxWa}TPwFNVc#a04IBX5A zF61C%q{jOi)PnKWnv|+NM9U#XiFAb&Ord5S3v9zTH7yp$*ebPI(CeQtgZtCRs~t<( zJOauP>VuU{6@nA>2kQW>DjB^-B(k&;d$A;6qm|u9+rg4Opci5md|S{pR21vEVeiZM3w z)UpZDg5nZ^my2sWtPv6xUOpii?KlXhP> zgf$l|gRJFLaZj`;b|xc(BpB5eHzCQ<+P?Ax16mS!IQ?oR%pe@h#aBPE!Bj0fi0E=Z zc?Jefi|?FvjaZDG<)iy_HkIPqx$-lxmP%bTiPY&Tu?dsQA{`o@c7p`wC3H8fw7~WS z+~v3dgxIWO<_1Md_FV|~lOjWd!8`LD#ZgBxM0T8R<&$)wQIOn61~;`%LG(J`9)5+e zcLudL+API-W@e1aG!9X6rRI>5vh}Dayqc)X^ka9PT#U^{Acn#;(mSSQq^hc{9trq)RrVc*U;C&+OJ9>S3<#mg?$PIYqglosk$i@x{&_jyZ{tYjh z_{fH>F++SDe6B_)F!^)hLYYZfsLu6r3Y{Venqs)Z{hPgspvpDX78hrIsNIW+G1tiH zE_xn;dGksgbYHaY*IcT7?ISFVdC5}IN*h8odDfCfA9Q-QWc>Bb`|@@l2m^CI9P&F~ zKT$v91xDE+Pf0~+#q-(pxUN>lo$Om{Im~g)T6?Qd_G=pH6`bU9QA>KgM+DirN4CbAjVccj zh7}%2Tf^5WaM; z{;*TiKjO$$!6V|_MTUwaVXzrJnTlj67oHdJr3&ray-n%3@xlvR_^KIs@tV1K{Y+5L ztRuWeBo>zxuA88HKa0N?Bp&~iYV6qwR_$Wfc8|AUu@iplNi-=5^FtL>Ii95>JftlR zwClWZOE1<0{c|eH^cIeobB$EI=h!AQP0QcsVgv;PwdqSTg@uOnY&GdFe?!rj`W9P} zznV|MNvmWjL(!UEE9k>iG7FPy51&0ev*k6Q!7vk6naPbFvhTY<=hfeuSz4b1A9)0# z(A(KXEovuPm$p!i()zY@+cPw)u`srLiYya24MKL+bCzapezJs(vh9#jUsyJm94+;j zept|*IL~&{U4o`FRCw;%APpocN`TR0BVl8@a{BAEv;A9p2^uy^W0Ud%Tj(Mat5JEu zyX^s^KD(3OkbJI3WQxAh&CDe1U@MI!__hm>~)x|^6=bbR(TnDf!vD2G&i zN5OR?qd!2TPq^N*xA^sX56aC^G#4r7>y(Ua)TtEjG+~CX=ZMn_Kbj_P@Yf3GPpJ9Q zsG#d_-(hM89>_zz74?uZd);rt;`RWjEVeq9lvNqEb;pP!lNoeu4HL$@8ihKUxe=vAlnGf~;5T zN&QP6<|v`EsQ7fqxw)=vk=;Be4k?;v=L9RS@UJ~O1r zvLqtC^pT`ZBOxHexR(A@Y9{(FHF)+F^Kps>P9ua!a6gGo$%lx=twXYkJm!Lxk($IB z$`RY01Y^`1q}i`wmqW>tR+l)amVFv=?x*5d&amHEGLoRT?APFp8qGmx>OikL$79li zd4o3_HU`J)BY6cjN&|Uq5}XV!gsV2VERSOk8}g0|_Z3zrD3}moi6)iXkWf3 z6iL0NHv4S6m9jj2{aJG6{jEo*HEV;}qrU{SA)1F*EnhO~H#3jB@{U5*!mk0g$r@Qg zLMUN?c(aW<&kbPO!jaASaru5)Fi=a^py+|nwP%vZeB-xTBPmGEWcQi(w7Jy*9y@(GZKtJ~7S2@(OY`1lU7)W1K2kPOcc_wJf!@?M#*jV!>&F1zm6*)lH8tJq zFXgE1JBv(Oc7)kXCM=K(K0PsR+*L8(~YrNsS7L(N#GsYR7*X~ zV8KmYsw(_~Sn@`WYM@|>n|J?5Sc87~Wfr8=d6rPeF$c{U!51GF(hux5)I%(8JOX3X zK+|3KcA-_y6tWm;TA;-k>`4LG?N}hj^cpxK`rb;hHLk$NSof9ZpuDfqR_PV9pf;b) zug*JUgl&7tO8q2450cJt?t1<`u>8VEHlx1&rq!Vn?Q(+!Dv6JPeIHd(h>OnZg1yB3 zv<9yg9S0|xEF_&W*UeqOgB|S13Ys&w6T)G(5)gN&LX;DFzD2bpvf4Ng%mjVwlI_lG ztlm!5EYV=sw5VZUxa6O)U_=rpS{BAC26RTUScgs7Z~F>3GipuaXV?w+ysSBGe(7p2g<+ zbNg>+corQ~Eb!L$t_6->cCjLBo7Pm8q;-B$5?F(88&=?S1b7j>D4j}wI=LAE|n!ndn4X#=me;ES54-M zVT3Hiu(cZx1+}`OtG%>54_$Ie#OW$R?%gPt`AB!A!{Uvq@Jn#fq3;Vc1HSEz>XZki->^(Q#aHH(B$D7>h4W(pW zvx)HTkL`94m_aA^im5K99YP_NsV9?rp2%F)OA?ln%zeT&v!#)=W!o4{LOpx<8Jn~J zct?HFK-i*Uw~SFdGhPISUA~W%HEIA~Oa^nWSR(fHNj9-$y8TXXN}cewJAs%u&We{3 ztu&!LbcS?tD;nvt0dl@86oE6M=@ic!6-mQJ`&bo7RqlLzeVa!!kv)7+pEI?zAPg$8 zlqfJ~_c^EoEx}){67zwqp={-uDc2J2Lc68!oX%xgrd;!w!jh;^O3NPGyENr5CR&eR z2ZSLv`?EQ@7}*yn&1iL{H$Ge}*=_c^rS{o@eEsqc%K7m%Q~bjG_((XWt3I{ut58j= ztp!e1<*s}EVWq14)<_H5Q^I6V93KfLbsvJ?&W~A#D4<2A9h7Pce8CE+diBxQh4NIBs0g~NoM9Hhe%Q_uns0; zvvPzy?{6&(kJb}4z+b;rH7qi1R~5KMQP)SyubJp6V1DVvkgU$*5j+2h9V4;ppw3UV zF*j!qRph75K8t~EZoGE6an{5_&>gb#go$TiBXPLsnRi-+ zc&v?`jPP;R3-{mix_NVh=0ExSc^80hJQ-+*^c{Y{dTxymI#TFd3y?P(wEgmpNl3Y_ z;iv7M9+jCQ;#8Rt<&#oTtPY84^wtVu;8=d9=dp8R?c+6;B$n2tkR|Q->-17|xgx6{ z#wBv;sM5YP2c6Uj?s3B_H^1Ne-NP<{0vL=Z3_>kzEFhR8mmKlDb!iKmyoYkn^NC9bmivx zvzlQnYU``^%CuP#mQ(kuD&@t}^vo(i*otjuo3Ev#5!U)?Vf6>0z>EAf(@QVrT(cRB zS$rgv-!YTM8GhboRAKh9F0ji+yfNc)P;Pt1d(CLb>9alc>?4%HQk6(p|KRIR#`|8p zcb>jVMrnL(SZz*phu4aR7g|*(sxDQiOckjrUiL>9L?$_WRL*%2S_-ayUn|$a*u^QE zqQbA)pKC=qckB$+Vr>>De-hh*q8V*`H@H1$`Sh#I!iq#fAVYl^t2jj~TNYdeXfe*% zjJ?e(?Y43Zs>R2?HR6nVb4yF}p~g6Frcmqi50&XWg)k1QTRffM0*1(vRMCd95d(_~ zht=W!*)3T_*VMFxBK)&?BxoA%kprcoU72p!W);X5^<6oRw`2%^OLKi)sFk&-0ar1- z5rnFP+DvK8m)h~1U{4osXl&N8M#k}*((2}hD;qXfe!f31u{09t=bZ;;S)90`GBnH0 z*afc3w2CIJojJ^#3xahBv)Vpj>v;U8wWUMBR7|B4SMN_#trnoI7Z<$lZ{(a?2N0y1 zCZ;hGW_M+X-vUXZQf`)(yJk{cL)rq1;SC)Kx3Rl4%-dw!tYgS|9bU7w#xN1?{Pygm z@C|#hQO3^h?sqssZ4r8t{|$X7xR8_kopgGcYDLlS0!%o)o?d;TBK*=T$^EAX0VSwu z@)VcLc(Jt~;vW!)XY}6j(F1(vp>(-&GdxLbk@G}rT9P{YAXYrwzUn6`hCJ@vlKHHxrH&UD$>RZkRxW`D=E(+!72Xy3F<%_cpyV!iEw*oVC?Q&HG&v zt$S7N<%Qp2ho3ClUUSsT*|#@g);4~{e#;8T&_>L(HebXNMskeDs(2%6uHg(bt@&}! zO>>$MV)(H7F+yya%2vLf>H(k-(HTd!P}ckH4u1ACT8n z(0QNHEchwHkF|Kdlv}lF0MpXIG0t&=J^hM@Hy3r%^@Y{l=2_ZiTHet((TB}FxlZN| zLv#UV4iFWJ8>*??Rw5tm{Oqr{WtuqjTMS-O7JSVkz#vY4_qlD7laOgdw$+8`_O;PT zWY@7v$job@FVCC7SH?Z-&1~F9WZ#$vAzIa>W>ppKd_if`$@y0lB|DtlLstZ`VHv|E zmXxf>zMDaZdoFVYCV_^8trUdu$h7!(x>@1b!jKcxFLhv>rSOW(7tdK&ubu9sf#iN^X&zS3o|?$uQ?EDuxHWJ#v4dFgFhwKmn}iJd@` z?he>1N=x62;dRRvS_%yk*-Qu1?uIP^E6z%Gc&^o`cgqz26}q{BrR11F8LCLDzP^?^ z`f)vNg{#@u%aw*&nkJje=DJk9qIH(cm{C$wdE8>CxDWiUql}z%$E$G*n75~I9aGh|otW-5p zylyv1Dzv?~o|`m$c3QNawJ7t+Pou<|gl!^883pvGh(`|7UR?^Cc^wsu z`TC+PY_QG)TyD96$BbL_l;~QdlU2bA8`k`KvRn&EEsY<7t%T=Gr7bb?FP~TH_J?6> z3{qYfb~k0ukB>Qy>%ASKb{)A4w6xZH;dpt^!dkvld&u;xdF>O;aV$e=^JQj&>SkgP z{B@(k=sY1kj4{JC2$3LdU+mJPtx-VN7vN~tQ^dj^eXfgFleALz3khV!0pUEb zw9q84)`uY;Q7P#@>F~R6H)@2}mEXAaCOW@}w`We#3XWov4W5b3D~}2nE+7vYGYiVs zuFV9x3+)$#1kH^Qu0ISe7aIbGwB0Kr`Hu%J#z?JSIb z!gt&z%T30`YxEBSUMXnGn3MWSU@p>+JZ0|!<*#Z%G$;g8J?=7~x%*QD{Cl$_;?aVDrAu~evr?d@4jA;_unc6GjONHE?gkubPbmh|oQ%hR> z7{B^rj;q(%c8~;bT5DT!N%Dknl$+GHW%|3{g2lK5$&n#1$6d4aFDorBg7?pIQx{6k zpG%i0f4R5?3D`7&_8gPZraXL^?!XxT*~a64*O>mfu`oQu0UIV#7r?{S!rQuqXJ~ zOT2m6e*@A$LtW^9Gc8E-N>t6tNKzNW`N^hrcGuyHnn5Oe&x6_H3M;A0TYz2$F8!7Y zQr03&t;97wkN_%82WP}&M)-ZgeJYFhRuSgKaFMNHiwIBOfwpONtr{P-S4Qz{@>4zzCf)^U z*<@`Q-m6xrR!myvkHdRRJsuxx>8_o5ymr_Z2R>!vjLRorE2)A;_)l= z!1jsQcQVw}3|I9!zVvk^tdYmCKp}4Q@KA%$h89#a*J|CUlCHCzl?qPURevmbxL|U5 zT4HWt+#{jW^s>drRIn#MWl|@@`P4vo!Pcqy6F@~iv&E?%3RF2wb!+qE4V_f;O5_kwV}_=3b`q@n(@wENj46jM^XjoRj%l1CH}y6w(6D)4*rC$3{=sE(V&3 z_j7~=OpkvMU~%P>*-oc9udhLIb>WUey4Odi*Incp0G~W5oe#a{Ry<+eSi?HDYP#_u z9eM7(ogS!%T8V|}Li;8=gl;^`MF=~mTKPn@iSKRB7bIyBb;oPj>&c43zU-JH6ImM` z{{+e-k?`_7tz^W>O+$7tyQ@dwC){6YImdCMS`JYuI6m`>aD;O0+o~wz- zpybk6vm(}xuZ3Bi3$@Ce4>T9t(kok|8#Ab3PeKW#p7CWBNVDl52TqQzJWcS+5a}Fd zFJXhcHhj%6zjc#;eqQ&_ZL_r+zd4z- m!AP(@hNdL%-Oas2H$O~2cP3(Q1^$2^k&GnxZjOY( 0" then "processors * * * grid numa" + +units lj +atom_style atomic +comm_modify mode single vel yes + +lattice fcc 3.0 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +create_box 1 box +create_atoms 1 box +mass 1 1.0 + +velocity all create 1.0 87287 loop geom + +pair_style dpd 1.0 1.0 928948 +pair_coeff 1 1 25.0 4.5 + +neighbor 0.5 bin +neigh_modify delay 0 every 1 + +fix 1 all nve +timestep 0.04 + +thermo 1000 + +if "$p > 0" then "run_style verlet/power" + +if "$w > 0" then "run $w" +run ${rr} diff --git a/src/USER-INTEL/dihedral_fourier_intel.cpp b/src/USER-INTEL/dihedral_fourier_intel.cpp new file mode 100644 index 0000000000..805ffc0e25 --- /dev/null +++ b/src/USER-INTEL/dihedral_fourier_intel.cpp @@ -0,0 +1,441 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include +#include +#include "dihedral_fourier_intel.h" +#include "atom.h" +#include "comm.h" +#include "memory.h" +#include "neighbor.h" +#include "domain.h" +#include "force.h" +#include "pair.h" +#include "update.h" +#include "error.h" + +#include "suffix.h" +using namespace LAMMPS_NS; + +#define PTOLERANCE (flt_t)1.05 +#define MTOLERANCE (flt_t)-1.05 +typedef struct { int a,b,c,d,t; } int5_t; + +/* ---------------------------------------------------------------------- */ + +DihedralFourierIntel::DihedralFourierIntel(class LAMMPS *lmp) + : DihedralFourier(lmp) +{ + suffix_flag |= Suffix::INTEL; +} + +/* ---------------------------------------------------------------------- */ + +void DihedralFourierIntel::compute(int eflag, int vflag) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_use_base) { + DihedralFourier::compute(eflag, vflag); + return; + } + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralFourierIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) { + ev_setup(eflag,vflag); + } else evflag = 0; + + if (evflag) { + if (vflag && !eflag) { + if (force->newton_bond) + eval<0,1,1>(vflag, buffers, fc); + else + eval<0,1,0>(vflag, buffers, fc); + } else { + if (force->newton_bond) + eval<1,1,1>(vflag, buffers, fc); + else + eval<1,1,0>(vflag, buffers, fc); + } + } else { + if (force->newton_bond) + eval<0,0,1>(vflag, buffers, fc); + else + eval<0,0,0>(vflag, buffers, fc); + } +} + +template +void DihedralFourierIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) + +{ + const int inum = neighbor->ndihedrallist; + if (inum == 0) return; + + ATOM_T * _noalias const x = buffers->get_x(0); + const int nlocal = atom->nlocal; + const int nall = nlocal + atom->nghost; + + int f_stride; + if (NEWTON_BOND) f_stride = buffers->get_stride(nall); + else f_stride = buffers->get_stride(nlocal); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + + acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5; + if (EFLAG) oedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(f_start,f_stride,fc) \ + reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif + + FORCE_T * _noalias const f = f_start + (tid * f_stride); + if (fix->need_zero(tid)) + memset(f, 0, f_stride * sizeof(FORCE_T)); + + const int5_t * _noalias const dihedrallist = + (int5_t *) neighbor->dihedrallist[0]; + + #ifdef LMP_INTEL_USE_SIMDOFF + acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5; + if (EFLAG) sedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n = nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif + const int i1 = dihedrallist[n].a; + const int i2 = dihedrallist[n].b; + const int i3 = dihedrallist[n].c; + const int i4 = dihedrallist[n].d; + const int type = dihedrallist[n].t; + + // 1st bond + + const flt_t vb1x = x[i1].x - x[i2].x; + const flt_t vb1y = x[i1].y - x[i2].y; + const flt_t vb1z = x[i1].z - x[i2].z; + + // 2nd bond + + const flt_t vb2xm = x[i2].x - x[i3].x; + const flt_t vb2ym = x[i2].y - x[i3].y; + const flt_t vb2zm = x[i2].z - x[i3].z; + + // 3rd bond + + const flt_t vb3x = x[i4].x - x[i3].x; + const flt_t vb3y = x[i4].y - x[i3].y; + const flt_t vb3z = x[i4].z - x[i3].z; + + // c,s calculation + + const flt_t ax = vb1y*vb2zm - vb1z*vb2ym; + const flt_t ay = vb1z*vb2xm - vb1x*vb2zm; + const flt_t az = vb1x*vb2ym - vb1y*vb2xm; + const flt_t bx = vb3y*vb2zm - vb3z*vb2ym; + const flt_t by = vb3z*vb2xm - vb3x*vb2zm; + const flt_t bz = vb3x*vb2ym - vb3y*vb2xm; + + const flt_t rasq = ax*ax + ay*ay + az*az; + const flt_t rbsq = bx*bx + by*by + bz*bz; + const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm; + const flt_t rg = sqrt(rgsq); + + flt_t rginv, ra2inv, rb2inv; + rginv = ra2inv = rb2inv = (flt_t)0.0; + if (rg > 0) rginv = (flt_t)1.0/rg; + if (rasq > 0) ra2inv = (flt_t)1.0/rasq; + if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq; + const flt_t rabinv = sqrt(ra2inv*rb2inv); + + flt_t c = (ax*bx + ay*by + az*bz)*rabinv; + const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z); + + // error check + #ifndef LMP_INTEL_USE_SIMDOFF + if (c > PTOLERANCE || c < MTOLERANCE) { + int me = comm->me; + + if (screen) { + char str[128]; + sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + me,tid,update->ntimestep, + atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", + me,x[i1].x,x[i1].y,x[i1].z); + fprintf(screen," 2nd atom: %d %g %g %g\n", + me,x[i2].x,x[i2].y,x[i2].z); + fprintf(screen," 3rd atom: %d %g %g %g\n", + me,x[i3].x,x[i3].y,x[i3].z); + fprintf(screen," 4th atom: %d %g %g %g\n", + me,x[i4].x,x[i4].y,x[i4].z); + } + } + #endif + + if (c > (flt_t)1.0) c = (flt_t)1.0; + if (c < (flt_t)-1.0) c = (flt_t)-1.0; + + flt_t deng; + flt_t df = (flt_t)0.0; + if (EFLAG) deng = (flt_t)0.0; + + for (int j = 0; j < nterms[type]; j++) { + const flt_t tcos_shift = fc.bp[j][type].cos_shift; + const flt_t tsin_shift = fc.bp[j][type].sin_shift; + const flt_t tk = fc.bp[j][type].k; + const int m = fc.bp[j][type].multiplicity; + + flt_t p = (flt_t)1.0; + flt_t ddf1, df1; + ddf1 = df1 = (flt_t)0.0; + + for (int i = 0; i < m; i++) { + ddf1 = p*c - df1*s; + df1 = p*s + df1*c; + p = ddf1; + } + + p = p*tcos_shift + df1*tsin_shift; + df1 = df1*tcos_shift - ddf1*tsin_shift; + df1 *= -m; + p += (flt_t)1.0; + + if (m == 0) { + p = (flt_t)1.0 + tcos_shift; + df1 = (flt_t)0.0; + } + + if (EFLAG) deng += tk * p; + df -= tk * df1; + } + + const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm; + const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm; + const flt_t fga = fg*ra2inv*rginv; + const flt_t hgb = hg*rb2inv*rginv; + const flt_t gaa = -ra2inv*rg; + const flt_t gbb = rb2inv*rg; + + const flt_t dtfx = gaa*ax; + const flt_t dtfy = gaa*ay; + const flt_t dtfz = gaa*az; + const flt_t dtgx = fga*ax - hgb*bx; + const flt_t dtgy = fga*ay - hgb*by; + const flt_t dtgz = fga*az - hgb*bz; + const flt_t dthx = gbb*bx; + const flt_t dthy = gbb*by; + const flt_t dthz = gbb*bz; + + const flt_t sx2 = df*dtgx; + const flt_t sy2 = df*dtgy; + const flt_t sz2 = df*dtgz; + + flt_t f1x = df*dtfx; + flt_t f1y = df*dtfy; + flt_t f1z = df*dtfz; + + const flt_t f2x = sx2 - f1x; + const flt_t f2y = sy2 - f1y; + const flt_t f2z = sz2 - f1z; + + flt_t f4x = df*dthx; + flt_t f4y = df*dthy; + flt_t f4z = df*dthz; + + const flt_t f3x = -sx2 - f4x; + const flt_t f3y = -sy2 - f4y; + const flt_t f3z = -sz2 - f4z; + + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, + f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal, + sv0, sv1, sv2, sv3, sv4, sv5); + #else + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, + f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal, + ov0, ov1, ov2, ov3, ov4, ov5); + #endif + } + + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } + + if (NEWTON_BOND || i4 < nlocal) { + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; + } + } + } // for n + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oedihedral += sedihedral; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; + } + #endif + } // omp parallel + + if (EFLAG) energy += oedihedral; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + } + + fix->set_reduce_flag(); +} + +/* ---------------------------------------------------------------------- */ + +void DihedralFourierIntel::init_style() +{ + DihedralFourier::init_style(); + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + _use_base = 0; + if (fix->offload_balance() != 0.0) { + _use_base = 1; + return; + } + #endif + + fix->bond_init_check(); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralFourierIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + const int bp1 = atom->ndihedraltypes + 1; + fc.set_ntypes(bp1, setflag, nterms, memory); + + for (int i = 1; i < bp1; i++) { + if (setflag[i]) { + for (int j = 0; j < nterms[i]; j++) { + fc.bp[j][i].cos_shift = cos_shift[i][j]; + fc.bp[j][i].sin_shift = sin_shift[i][j]; + fc.bp[j][i].k = k[i][j]; + fc.bp[j][i].multiplicity = multiplicity[i][j]; + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralFourierIntel::ForceConst::set_ntypes(const int nbondtypes, + int *setflag, + int *nterms, + Memory *memory) { + if (nbondtypes != _nbondtypes) { + if (_nbondtypes > 0) + _memory->destroy(bp); + + if (nbondtypes > 0) { + _maxnterms = 1; + for (int i = 1; i <= nbondtypes; i++) + if (setflag[i]) _maxnterms = MAX(_maxnterms, nterms[i]); + + _memory->create(bp, _maxnterms, nbondtypes, "dihedralfourierintel.bp"); + } + } + _nbondtypes = nbondtypes; + _memory = memory; +} diff --git a/src/USER-INTEL/dihedral_fourier_intel.h b/src/USER-INTEL/dihedral_fourier_intel.h new file mode 100644 index 0000000000..a775e129f4 --- /dev/null +++ b/src/USER-INTEL/dihedral_fourier_intel.h @@ -0,0 +1,82 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef DIHEDRAL_CLASS + +DihedralStyle(fourier/intel,DihedralFourierIntel) + +#else + +#ifndef LMP_DIHEDRAL_FOURIER_INTEL_H +#define LMP_DIHEDRAL_FOURIER_INTEL_H + +#include "dihedral_fourier.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + +class DihedralFourierIntel : public DihedralFourier { + + public: + DihedralFourierIntel(class LAMMPS *lmp); + virtual void compute(int, int); + void init_style(); + + private: + FixIntel *fix; + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + #ifdef _LMP_INTEL_OFFLOAD + int _use_base; + #endif + + template + class ForceConst { + public: + typedef struct { flt_t cos_shift, sin_shift, k; + int multiplicity; } fc_packed1; + + fc_packed1 **bp; + + ForceConst() : _nbondtypes(0) {} + ~ForceConst() { set_ntypes(0, NULL, NULL, NULL); } + + void set_ntypes(const int nbondtypes, int *setflag, int *nterms, + Memory *memory); + + private: + int _nbondtypes, _maxnterms; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp index 637fc0d06e..eac48b8510 100644 --- a/src/USER-INTEL/fix_intel.cpp +++ b/src/USER-INTEL/fix_intel.cpp @@ -285,6 +285,7 @@ int FixIntel::setmask() { int mask = 0; mask |= PRE_REVERSE; + mask |= MIN_PRE_REVERSE; #ifdef _LMP_INTEL_OFFLOAD mask |= POST_FORCE; mask |= MIN_POST_FORCE; diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h index 068e5ed890..d7093e79bb 100644 --- a/src/USER-INTEL/fix_intel.h +++ b/src/USER-INTEL/fix_intel.h @@ -43,6 +43,7 @@ class FixIntel : public Fix { virtual int setmask(); virtual void init(); virtual void setup(int); + inline void min_setup(int in) { setup(in); } void setup_pre_reverse(int eflag = 0, int vflag = 0); void pair_init_check(const bool cdmessage=false); @@ -50,6 +51,8 @@ class FixIntel : public Fix { void kspace_init_check(); void pre_reverse(int eflag = 0, int vflag = 0); + inline void min_pre_reverse(int eflag = 0, int vflag = 0) + { pre_reverse(eflag, vflag); } // Get all forces, calculation results from coprocesser void sync_coprocessor(); diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp index b4b664cb94..ac208f5a0c 100644 --- a/src/USER-INTEL/intel_buffers.cpp +++ b/src/USER-INTEL/intel_buffers.cpp @@ -409,6 +409,7 @@ void IntelBuffers::grow_ccache(const int off_flag, IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0); lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef"); #endif + memset(_ccachei, 0, vsize * sizeof(int)); memset(_ccachej, 0, vsize * sizeof(int)); #ifdef _LMP_INTEL_OFFLOAD @@ -425,7 +426,7 @@ void IntelBuffers::grow_ccache(const int off_flag, #pragma offload_transfer target(mic:_cop) \ nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \ nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \ - nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \ + in(ccachei:length(vsize) alloc_if(1) free_if(0)) \ in(ccachej:length(vsize) alloc_if(1) free_if(0)) } #ifdef LMP_USE_AVXCD diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h index a7663d54a6..d49d0d8b00 100644 --- a/src/USER-INTEL/intel_preprocess.h +++ b/src/USER-INTEL/intel_preprocess.h @@ -292,6 +292,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, ito = inum; \ } +#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \ + nthr, vecsize) \ + { \ + tid = 0; \ + ifrom = 0; \ + ip = 1; \ + ito = inum; \ + } + #endif #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \ diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp index 12101712f1..e6d45d7b2c 100644 --- a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp +++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp @@ -319,7 +319,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, const int bstart = binhead[ibin + binstart[k]]; const int bend = binhead[ibin + binend[k]]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) @@ -341,7 +340,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, const int bstart = binhead[ibin + stencil[k]]; const int bend = binhead[ibin + stencil[k] + 1]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp index 79dc75366e..0068e02635 100644 --- a/src/USER-INTEL/npair_intel.cpp +++ b/src/USER-INTEL/npair_intel.cpp @@ -273,7 +273,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, const int bstart = binhead[ibin + binstart[k]]; const int bend = binhead[ibin + binend[k]]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) @@ -307,7 +306,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, const int bstart = binhead[ibin]; const int bend = binhead[ibin + 1]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) { diff --git a/src/USER-INTEL/pair_dpd_intel.cpp b/src/USER-INTEL/pair_dpd_intel.cpp new file mode 100644 index 0000000000..0b5760a7b0 --- /dev/null +++ b/src/USER-INTEL/pair_dpd_intel.cpp @@ -0,0 +1,617 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + This software is distributed under the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) + Shun Xu (Computer Network Information Center, CAS) +------------------------------------------------------------------------- */ + +#include +#include "pair_dpd_intel.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "memory.h" +#include "modify.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "suffix.h" +using namespace LAMMPS_NS; + +#define LMP_MKL_RNG VSL_BRNG_MT19937 +#define FC_PACKED1_T typename ForceConst::fc_packed1 +#define IEPSILON 1.0e10 + +/* ---------------------------------------------------------------------- */ + +PairDPDIntel::PairDPDIntel(LAMMPS *lmp) : + PairDPD(lmp) +{ + suffix_flag |= Suffix::INTEL; + respa_enable = 0; + random_thread = NULL; + _nrandom_thread = 0; +} + +/* ---------------------------------------------------------------------- */ + +PairDPDIntel::~PairDPDIntel() +{ + #if defined(_OPENMP) + if (_nrandom_thread) { + #ifdef LMP_NO_MKL_RNG + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; + #else + for (int i = 0; i < _nrandom_thread; i++) + vslDeleteStream(&random_thread[i]); + #endif + } + #endif + delete []random_thread; +} + +/* ---------------------------------------------------------------------- */ + +void PairDPDIntel::compute(int eflag, int vflag) +{ + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); + + fix->balance_stamp(); + vflag_fdotr = 0; +} + +template +void PairDPDIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) { + ev_setup(eflag, vflag); + } else evflag = vflag_fdotr = 0; + + const int inum = list->inum; + const int nthreads = comm->nthreads; + const int host_start = fix->host_start_pair(); + const int offload_end = fix->offload_end_pair(); + const int ago = neighbor->ago; + + if (ago != 0 && fix->separate_buffers() == 0) { + fix->start_watch(TIME_PACK); + + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; + #if defined(_OPENMP) + #pragma omp parallel if(packthreads > 1) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + packthreads, sizeof(ATOM_T)); + buffers->thr_pack(ifrom,ito,ago); + } + fix->stop_watch(TIME_PACK); + } + + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (_onetype) { + if (eflag) { + if (force->newton_pair) { + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); + } + } else { + if (force->newton_pair) { + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); + } + } + } else { + if (eflag) { + if (force->newton_pair) { + eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum); + } + } else { + if (force->newton_pair) { + eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum); + } + } + } +} + +template +void PairDPDIntel::eval(const int offload, const int vflag, + IntelBuffers *buffers, + const ForceConst &fc, + const int astart, const int aend) +{ + const int inum = aend - astart; + if (inum == 0) return; + int nlocal, nall, minlocal; + fix->get_buffern(offload, nlocal, nall, minlocal); + + const int ago = neighbor->ago; + IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall); + + ATOM_T * _noalias const x = buffers->get_x(offload); + typedef struct { double x, y, z; } lmp_vt; + lmp_vt *v = (lmp_vt *)atom->v[0]; + const flt_t dtinvsqrt = 1.0/sqrt(update->dt); + + const int * _noalias const numneigh = list->numneigh; + const int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int * _noalias const firstneigh = buffers->firstneigh(list); + const FC_PACKED1_T * _noalias const param = fc.param[0]; + const flt_t * _noalias const special_lj = fc.special_lj; + int * _noalias const rngi_thread = fc.rngi; + const int rng_size = buffers->get_max_nbors(); + + const int ntypes = atom->ntypes + 1; + const int eatom = this->eflag_atom; + + // Determine how much data to transfer + int x_size, q_size, f_stride, ev_size, separate_flag; + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + int *overflow = fix->get_off_overflow_flag(); + { + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime(); + #endif + + IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, + f_stride, x, 0); + + acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; + if (EFLAG) oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + + // loop over neighbors of my atoms + #if defined(_OPENMP) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); + iifrom += astart; + iito += astart; + + #ifdef LMP_NO_MKL_RNG + RanMars *my_random = random_thread[tid]; + #else + VSLStreamStatePtr *my_random = &(random_thread[tid]); + #endif + flt_t *my_rand_buffer = fc.rand_buffer_thread[tid]; + int rngi = rngi_thread[tid]; + + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + + flt_t icut, a0, gamma, sigma; + if (ONETYPE) { + icut = param[3].icut; + a0 = param[3].a0; + gamma = param[3].gamma; + sigma = param[3].sigma; + } + for (int i = iifrom; i < iito; i += iip) { + int itype, ptr_off; + const FC_PACKED1_T * _noalias parami; + if (!ONETYPE) { + itype = x[i].w; + ptr_off = itype * ntypes; + parami = param + ptr_off; + } + + const int * _noalias const jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + + acc_t fxtmp, fytmp, fztmp, fwtmp; + acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; + + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + const flt_t vxtmp = v[i].x; + const flt_t vytmp = v[i].y; + const flt_t vztmp = v[i].z; + fxtmp = fytmp = fztmp = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + + if (rngi + jnum > rng_size) { + #ifdef LMP_NO_MKL_RNG + for (int jj = 0; jj < rngi; jj++) + my_rand_buffer[jj] = my_random->gaussian(); + #else + if (sizeof(flt_t) == sizeof(float)) + vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, + (float*)my_rand_buffer, (float)0.0, (float)1.0 ); + else + vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, + (double*)my_rand_buffer, 0.0, 1.0 ); + #endif + rngi = 0; + } + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) + #endif + for (int jj = 0; jj < jnum; jj++) { + flt_t forcelj, evdwl; + forcelj = evdwl = (flt_t)0.0; + + int j, jtype, sbindex; + if (!ONETYPE) { + sbindex = jlist[jj] >> SBBITS & 3; + j = jlist[jj] & NEIGHMASK; + } else + j = jlist[jj]; + + const flt_t delx = xtmp - x[j].x; + const flt_t dely = ytmp - x[j].y; + const flt_t delz = ztmp - x[j].z; + if (!ONETYPE) { + jtype = x[j].w; + icut = parami[jtype].icut; + } + const flt_t rsq = delx * delx + dely * dely + delz * delz; + const flt_t rinv = (flt_t)1.0/sqrt(rsq); + + if (rinv > icut) { + flt_t factor_dpd; + if (!ONETYPE) factor_dpd = special_lj[sbindex]; + + flt_t delvx = vxtmp - v[j].x; + flt_t delvy = vytmp - v[j].y; + flt_t delvz = vztmp - v[j].z; + flt_t dot = delx*delvx + dely*delvy + delz*delvz; + flt_t randnum = my_rand_buffer[jj]; + + flt_t iwd = rinv - icut; + if (rinv > (flt_t)IEPSILON) iwd = (flt_t)0.0; + + if (!ONETYPE) { + a0 = parami[jtype].a0; + gamma = parami[jtype].gamma; + sigma = parami[jtype].sigma; + } + flt_t fpair = a0 - iwd * gamma * dot + sigma * randnum * dtinvsqrt; + if (!ONETYPE) fpair *= factor_dpd; + fpair *= iwd; + + const flt_t fpx = fpair * delx; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * dely; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * delz; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + if (EFLAG) { + flt_t cut = (flt_t)1.0/icut; + flt_t r = (flt_t)1.0/rinv; + evdwl = (flt_t)0.5 * a0 * (cut - (flt_t)2.0*r + rsq * icut); + if (!ONETYPE) evdwl *= factor_dpd; + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl; + } + } + + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); + } // if rsq + } // for jj + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + + IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); + rngi += jnum; + } // for ii + + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); + rngi_thread[tid] = rngi; + } // end omp + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; + } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; + } + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // end offload + + if (offload) + fix->stop_watch(TIME_OFFLOAD_LATENCY); + else + fix->stop_watch(TIME_HOST_PAIR); + + if (EFLAG || vflag) + fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); + else + fix->add_result_array(f_start, 0, offload); +} + +/* ---------------------------------------------------------------------- + global settings + ------------------------------------------------------------------------- */ + +void PairDPDIntel::settings(int narg, char **arg) { + #if defined(_OPENMP) + if (_nrandom_thread) { + #ifdef LMP_NO_MKL_RNG + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; + #else + for (int i = 0; i < _nrandom_thread; i++) + vslDeleteStream(&random_thread[i]); + #endif + } + delete []random_thread; + #endif + PairDPD::settings(narg,arg); + _nrandom_thread = comm->nthreads; + + #ifdef LMP_NO_MKL_RNG + + random_thread =new RanMars*[comm->nthreads]; + random_thread[0] = random; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + if (tid > 0) + random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid); + } + #endif + + #else + + random_thread=new VSLStreamStatePtr[comm->nthreads]; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + vslNewStream(&random_thread[tid], LMP_MKL_RNG, + seed + comm->me + comm->nprocs * tid ); + } + #endif + + #endif +} + +/* ---------------------------------------------------------------------- */ + +void PairDPDIntel::init_style() +{ + PairDPD::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } + neighbor->requests[neighbor->nrequest-1]->intel = 1; + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + fix->pair_init_check(); + #ifdef _LMP_INTEL_OFFLOAD + if (fix->offload_balance() != 0.0) + error->all(FLERR, + "Offload for dpd/intel is not yet available. Set balance to 0."); + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void PairDPDIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + _onetype = 0; + if (atom->ntypes == 1 && !atom->molecular) _onetype = 1; + + int tp1 = atom->ntypes + 1; + fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop); + buffers->set_ntypes(tp1); + flt_t **cutneighsq = buffers->get_cutneighsq(); + + // Repeat cutsq calculation because done after call to init_style + double cut, cutneigh; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cutneigh = cut + neighbor->skin; + cutsq[i][j] = cutsq[j][i] = cut*cut; + cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; + double icut = 1.0 / cut; + fc.param[i][j].icut = fc.param[j][i].icut = icut; + } else { + cut = init_one(i,j); + double icut = 1.0 / cut; + fc.param[i][j].icut = fc.param[j][i].icut = icut; + } + } + } + + for (int i = 0; i < 4; i++) { + fc.special_lj[i] = force->special_lj[i]; + fc.special_lj[0] = 1.0; + } + + for (int i = 0; i < tp1; i++) { + for (int j = 0; j < tp1; j++) { + fc.param[i][j].a0 = a0[i][j]; + fc.param[i][j].gamma = gamma[i][j]; + fc.param[i][j].sigma = sigma[i][j]; + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +void PairDPDIntel::ForceConst::set_ntypes(const int ntypes, + const int nthreads, + const int max_nbors, + Memory *memory, + const int cop) { + if (ntypes != _ntypes) { + if (_ntypes > 0) { + _memory->destroy(param); + _memory->destroy(rand_buffer_thread); + _memory->destroy(rngi); + } + if (ntypes > 0) { + _cop = cop; + memory->create(param,ntypes,ntypes,"fc.param"); + memory->create(rand_buffer_thread, nthreads, max_nbors, + "fc.rand_buffer_thread"); + memory->create(rngi,nthreads,"fc.param"); + for (int i = 0; i < nthreads; i++) rngi[i] = max_nbors; + } + } + _ntypes = ntypes; + _memory = memory; +} + +/* ---------------------------------------------------------------------- + proc 0 reads from restart file, bcasts + ------------------------------------------------------------------------- */ + +void PairDPDIntel::read_restart_settings(FILE *fp) +{ + #if defined(_OPENMP) + if (_nrandom_thread) { + #ifdef LMP_NO_MKL_RNG + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; + #else + for (int i = 0; i < _nrandom_thread; i++) + vslDeleteStream(&random_thread[i]); + #endif + } + delete []random_thread; + #endif + PairDPD::read_restart_settings(fp); + _nrandom_thread = comm->nthreads; + + #ifdef LMP_NO_MKL_RNG + + random_thread =new RanMars*[comm->nthreads]; + random_thread[0] = random; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + if (tid > 0) + random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid); + } + #endif + + #else + + random_thread=new VSLStreamStatePtr[comm->nthreads]; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + vslNewStream(&random_thread[tid], LMP_MKL_RNG, + seed + comm->me + comm->nprocs * tid ); + } + #endif + + #endif +} diff --git a/src/USER-INTEL/pair_dpd_intel.h b/src/USER-INTEL/pair_dpd_intel.h new file mode 100644 index 0000000000..9181ff38f4 --- /dev/null +++ b/src/USER-INTEL/pair_dpd_intel.h @@ -0,0 +1,110 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) + Shun Xu (Computer Network Information Center, CAS) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(dpd/intel,PairDPDIntel) + +#else + +#ifndef LMP_PAIR_DPD_INTEL_H +#define LMP_PAIR_DPD_INTEL_H + +#include "pair_dpd.h" +#include "fix_intel.h" + +#ifdef LMP_NO_MKL_RNG +#include "random_mars.h" +#else +#include "mkl_vsl.h" +#endif + +namespace LAMMPS_NS { + +class PairDPDIntel : public PairDPD { + + public: + PairDPDIntel(class LAMMPS *); + ~PairDPDIntel(); + + virtual void compute(int, int); + void settings(int, char **); + void init_style(); + void read_restart_settings(FILE *); + + private: + FixIntel *fix; + int _cop, _onetype, _nrandom_thread; + + #ifdef LMP_NO_MKL_RNG + RanMars **random_thread; + #else + VSLStreamStatePtr *random_thread; + #endif + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int offload, const int vflag, + IntelBuffers * buffers, + const ForceConst &fc, const int astart, const int aend); + + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + // ---------------------------------------------------------------------- + + template + class ForceConst { + public: + typedef struct { flt_t icut, a0, gamma, sigma; } fc_packed1; + + _alignvar(flt_t special_lj[4],64); + fc_packed1 **param; + flt_t **rand_buffer_thread; + int *rngi; + + ForceConst() : _ntypes(0) {} + ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); } + + void set_ntypes(const int ntypes, const int nthreads, const int max_nbors, + Memory *memory, const int cop); + + private: + int _ntypes, _cop; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: The 'package intel' command is required for /intel styles + +Self-explanatory. + +*/ From 2a24cbfe0c2f4158aeac7fa833f59f918dcfe811 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 2 Oct 2017 21:13:51 -0400 Subject: [PATCH 2/6] reverse logic for using MKL pRNG: this way, make serial and make mpi will compile LAMMPS with USER-INTEL installed --- src/MAKE/OPTIONS/Makefile.intel_coprocessor | 2 +- src/MAKE/OPTIONS/Makefile.intel_cpu | 3 +- src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi | 3 +- .../OPTIONS/Makefile.intel_knl_coprocessor | 6 +- src/USER-INTEL/pair_dpd_intel.cpp | 90 +++++++++---------- src/USER-INTEL/pair_dpd_intel.h | 16 ++-- 6 files changed, 62 insertions(+), 58 deletions(-) mode change 100755 => 100644 src/MAKE/OPTIONS/Makefile.intel_cpu diff --git a/src/MAKE/OPTIONS/Makefile.intel_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_coprocessor index a717be93ff..75e4d89170 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_coprocessor +++ b/src/MAKE/OPTIONS/Makefile.intel_coprocessor @@ -10,7 +10,7 @@ CC = mpiicpc MIC_OPT = -qoffload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\"" CCFLAGS = -g -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \ -xHost -fno-alias -ansi-alias -restrict -DLMP_INTEL_USELRT \ - -qoverride-limits $(MIC_OPT) + -qoverride-limits $(MIC_OPT) -DLMP_USE_MKL_RNG SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu b/src/MAKE/OPTIONS/Makefile.intel_cpu old mode 100755 new mode 100644 index b7db064574..2c3cc51249 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu @@ -9,7 +9,8 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) + -fno-alias -ansi-alias -restrict $(OPTFLAGS) \ + -DLMP_USE_MKL_RNG SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi index 8a45b781f8..ff2d0cc5c2 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi @@ -9,7 +9,8 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT + -fno-alias -ansi-alias -restrict $(OPTFLAGS) \ + -DLMP_USE_MKL_RNG -DLMP_INTEL_USELRT SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor index 406e98b36d..769c166105 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor +++ b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor @@ -9,8 +9,10 @@ SHELL = /bin/sh CC = mpiicpc MIC_OPT = -qoffload-arch=mic-avx512 -fp-model fast=2 CCFLAGS = -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \ - -xHost -fno-alias -ansi-alias -restrict \ - -qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT + -xHost -fno-alias -ansi-alias -restrict \ + -qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT \ + -DLMP_USE_MKL_RNG + SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/USER-INTEL/pair_dpd_intel.cpp b/src/USER-INTEL/pair_dpd_intel.cpp index 0b5760a7b0..c7cddfccc1 100644 --- a/src/USER-INTEL/pair_dpd_intel.cpp +++ b/src/USER-INTEL/pair_dpd_intel.cpp @@ -47,12 +47,12 @@ PairDPDIntel::~PairDPDIntel() { #if defined(_OPENMP) if (_nrandom_thread) { - #ifdef LMP_NO_MKL_RNG - for (int i = 1; i < _nrandom_thread; i++) - delete random_thread[i]; - #else + #ifdef LMP_USE_MKL_RNG for (int i = 0; i < _nrandom_thread; i++) vslDeleteStream(&random_thread[i]); + #else + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; #endif } #endif @@ -216,10 +216,10 @@ void PairDPDIntel::eval(const int offload, const int vflag, iifrom += astart; iito += astart; - #ifdef LMP_NO_MKL_RNG - RanMars *my_random = random_thread[tid]; - #else + #ifdef LMP_USE_MKL_RNG VSLStreamStatePtr *my_random = &(random_thread[tid]); + #else + RanMars *my_random = random_thread[tid]; #endif flt_t *my_rand_buffer = fc.rand_buffer_thread[tid]; int rngi = rngi_thread[tid]; @@ -264,16 +264,16 @@ void PairDPDIntel::eval(const int offload, const int vflag, if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; if (rngi + jnum > rng_size) { - #ifdef LMP_NO_MKL_RNG - for (int jj = 0; jj < rngi; jj++) - my_rand_buffer[jj] = my_random->gaussian(); - #else + #ifdef LMP_USE_MKL_RNG if (sizeof(flt_t) == sizeof(float)) vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, (float*)my_rand_buffer, (float)0.0, (float)1.0 ); else vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, (double*)my_rand_buffer, 0.0, 1.0 ); + #else + for (int jj = 0; jj < rngi; jj++) + my_rand_buffer[jj] = my_random->gaussian(); #endif rngi = 0; } @@ -420,12 +420,12 @@ void PairDPDIntel::eval(const int offload, const int vflag, void PairDPDIntel::settings(int narg, char **arg) { #if defined(_OPENMP) if (_nrandom_thread) { - #ifdef LMP_NO_MKL_RNG - for (int i = 1; i < _nrandom_thread; i++) - delete random_thread[i]; - #else + #ifdef LMP_USE_MKL_RNG for (int i = 0; i < _nrandom_thread; i++) vslDeleteStream(&random_thread[i]); + #else + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; #endif } delete []random_thread; @@ -433,7 +433,19 @@ void PairDPDIntel::settings(int narg, char **arg) { PairDPD::settings(narg,arg); _nrandom_thread = comm->nthreads; - #ifdef LMP_NO_MKL_RNG + #ifdef LMP_USE_MKL_RNG + + random_thread=new VSLStreamStatePtr[comm->nthreads]; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + vslNewStream(&random_thread[tid], LMP_MKL_RNG, + seed + comm->me + comm->nprocs * tid ); + } + #endif + + #else random_thread =new RanMars*[comm->nthreads]; random_thread[0] = random; @@ -446,18 +458,6 @@ void PairDPDIntel::settings(int narg, char **arg) { } #endif - #else - - random_thread=new VSLStreamStatePtr[comm->nthreads]; - #if defined(_OPENMP) - #pragma omp parallel - { - int tid = omp_get_thread_num(); - vslNewStream(&random_thread[tid], LMP_MKL_RNG, - seed + comm->me + comm->nprocs * tid ); - } - #endif - #endif } @@ -575,12 +575,12 @@ void PairDPDIntel::read_restart_settings(FILE *fp) { #if defined(_OPENMP) if (_nrandom_thread) { - #ifdef LMP_NO_MKL_RNG - for (int i = 1; i < _nrandom_thread; i++) - delete random_thread[i]; - #else + #ifdef LMP_USE_MKL_RNG for (int i = 0; i < _nrandom_thread; i++) vslDeleteStream(&random_thread[i]); + #else + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; #endif } delete []random_thread; @@ -588,7 +588,19 @@ void PairDPDIntel::read_restart_settings(FILE *fp) PairDPD::read_restart_settings(fp); _nrandom_thread = comm->nthreads; - #ifdef LMP_NO_MKL_RNG + #ifdef LMP_USE_MKL_RNG + + random_thread=new VSLStreamStatePtr[comm->nthreads]; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + vslNewStream(&random_thread[tid], LMP_MKL_RNG, + seed + comm->me + comm->nprocs * tid ); + } + #endif + + #else random_thread =new RanMars*[comm->nthreads]; random_thread[0] = random; @@ -601,17 +613,5 @@ void PairDPDIntel::read_restart_settings(FILE *fp) } #endif - #else - - random_thread=new VSLStreamStatePtr[comm->nthreads]; - #if defined(_OPENMP) - #pragma omp parallel - { - int tid = omp_get_thread_num(); - vslNewStream(&random_thread[tid], LMP_MKL_RNG, - seed + comm->me + comm->nprocs * tid ); - } - #endif - #endif } diff --git a/src/USER-INTEL/pair_dpd_intel.h b/src/USER-INTEL/pair_dpd_intel.h index 9181ff38f4..416d873c00 100644 --- a/src/USER-INTEL/pair_dpd_intel.h +++ b/src/USER-INTEL/pair_dpd_intel.h @@ -28,10 +28,10 @@ PairStyle(dpd/intel,PairDPDIntel) #include "pair_dpd.h" #include "fix_intel.h" -#ifdef LMP_NO_MKL_RNG -#include "random_mars.h" -#else +#ifdef LMP_USE_MKL_RNG #include "mkl_vsl.h" +#else +#include "random_mars.h" #endif namespace LAMMPS_NS { @@ -46,15 +46,15 @@ class PairDPDIntel : public PairDPD { void settings(int, char **); void init_style(); void read_restart_settings(FILE *); - + private: FixIntel *fix; int _cop, _onetype, _nrandom_thread; - #ifdef LMP_NO_MKL_RNG - RanMars **random_thread; - #else + #ifdef LMP_USE_MKL_RNG VSLStreamStatePtr *random_thread; + #else + RanMars **random_thread; #endif template class ForceConst; @@ -86,7 +86,7 @@ class PairDPDIntel : public PairDPD { ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); } void set_ntypes(const int ntypes, const int nthreads, const int max_nbors, - Memory *memory, const int cop); + Memory *memory, const int cop); private: int _ntypes, _cop; From 466fde6443bf2c7c7b96502cc3ceecb0a24c979f Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 2 Oct 2017 21:20:26 -0400 Subject: [PATCH 3/6] update documentation for the reversal in the INTEL_MKL_RNG define --- doc/src/accelerate_intel.txt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt index c858ca0940..e585209cf5 100644 --- a/doc/src/accelerate_intel.txt +++ b/doc/src/accelerate_intel.txt @@ -82,10 +82,11 @@ this order :l The {newton} setting applies to all atoms, not just atoms shared between MPI tasks :l Vectorization can change the order for adding pairwise forces :l -Unless specified otherwise at build time, the random number -generator for dissipative particle dynamics uses the Mersenne -Twister generator (that should be more robust than the standard -generator) :l +When using the -DLMP_USE_MKL_RNG define (all included intel optimized +makefiles do) at build time, the random number generator for +dissipative particle dynamics (pair style dpd/intel) uses the Mersenne +Twister generator included in the Intel MKL library (that should be +more robust than the default Masaglia random number generator) :l :ule The precision mode (described below) used with the USER-INTEL From d2aa05cb3661497c70204ae8ea0822689123ebff Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 2 Oct 2017 21:24:51 -0400 Subject: [PATCH 4/6] update README in USER-INTEL for recent LRT logic reversal --- src/USER-INTEL/README | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README index 35cde38f15..edfc69120c 100644 --- a/src/USER-INTEL/README +++ b/src/USER-INTEL/README @@ -42,11 +42,11 @@ available. This allow for running most styles in LAMMPS with threading. ----------------------------------------------------------------------------- -The Long-Range Thread mode (LRT) in the Intel package currently uses -pthreads by default. If pthreads are not supported in the build environment, -the compile flag "-DLMP_INTEL_NOLRT" will disable the feature to allow for -builds without pthreads. Alternatively, "-DLMP_INTEL_LRT11" can be used to -build with compilers that support threads using the C++11 standard. When using +The Long-Range Thread mode (LRT) in the Intel package is enabled through the +-DLMP_INTEL_USELRT define at compile time. All intel optimized makefiles +include this define. This feature will use pthreads by default. +Alternatively, "-DLMP_INTEL_LRT11" can be used to build with compilers that +support threads intrinsically using the C++11 standard. When using LRT mode, you might need to disable OpenMP affinity settings (e.g. export KMP_AFFINITY=none). LAMMPS will generate a warning if the settings need to be changed. From 5e89269631263f7b800e6db09546f580d93b03a9 Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Mon, 2 Oct 2017 23:41:14 -0700 Subject: [PATCH 5/6] Minor adjustments to intel makefiles and documentation based on the reversed preprocessor logic and default memory align. Removing knl_coprocessor makefile. --- doc/src/accelerate_intel.txt | 42 +++--- src/MAKE/MACHINES/Makefile.cori2 | 7 +- src/MAKE/OPTIONS/Makefile.intel_cpu | 9 +- src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi | 5 +- src/MAKE/OPTIONS/Makefile.intel_cpu_mpich | 6 +- src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi | 8 +- .../OPTIONS/Makefile.intel_knl_coprocessor | 125 ------------------ src/MAKE/OPTIONS/Makefile.knl | 6 +- src/USER-INTEL/README | 11 +- src/USER-INTEL/verlet_lrt_intel.cpp | 2 +- src/USER-INTEL/verlet_lrt_intel.h | 5 +- 11 files changed, 50 insertions(+), 176 deletions(-) delete mode 100644 src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt index e585209cf5..aaa38d7de2 100644 --- a/doc/src/accelerate_intel.txt +++ b/doc/src/accelerate_intel.txt @@ -27,12 +27,12 @@ LAMMPS to run on the CPU cores and coprocessor cores simultaneously. Angle Styles: charmm, harmonic :ulb,l Bond Styles: fene, fourier, harmonic :l Dihedral Styles: charmm, harmonic, opls :l -Fixes: nve, npt, nvt, nvt/sllod :l +Fixes: nve, npt, nvt, nvt/sllod, nve/asphere :l Improper Styles: cvff, harmonic :l Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long, buck, dpd, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, -lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo, -sw, tersoff :l +lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, +rebo, sw, tersoff :l K-Space Styles: pppm, pppm/disp :l :ule @@ -54,11 +54,12 @@ warmup run (for use with offload benchmarks). :c,image(JPG/user_intel.png) Results are speedups obtained on Intel Xeon E5-2697v4 processors -(code-named Broadwell) and Intel Xeon Phi 7250 processors -(code-named Knights Landing) with "June 2017" LAMMPS built with -Intel Parallel Studio 2017 update 2. Results are with 1 MPI task -per physical core. See {src/USER-INTEL/TEST/README} for the raw -simulation rates and instructions to reproduce. +(code-named Broadwell), Intel Xeon Phi 7250 processors (code-named +Knights Landing), and Intel Xeon Gold 6148 processors (code-named +Skylake) with "June 2017" LAMMPS built with Intel Parallel Studio +2017 update 2. Results are with 1 MPI task per physical core. See +{src/USER-INTEL/TEST/README} for the raw simulation rates and +instructions to reproduce. :line @@ -113,7 +114,7 @@ $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l For some of the simple 2-body potentials without long-range electrostatics, performance and scalability can be better with the "newton off" setting added to the input script :l -For simulations on higher node counts, add "processors * * * grid +For simulations on higher node counts, add "processors * * * grid numa" to the beginning of the input script for better scalability :l If using {kspace_style pppm} in the input script, add "kspace_modify diff ad" for better performance :l @@ -124,8 +125,8 @@ For Intel Xeon Phi CPUs: Runs should be performed using MCDRAM. :ulb,l :ule -For simulations using {kspace_style pppm} on Intel CPUs -supporting AVX-512: +For simulations using {kspace_style pppm} on Intel CPUs supporting +AVX-512: Add "kspace_modify diff ad" to the input script :ulb,l The command-line option should be changed to @@ -242,14 +243,17 @@ However, if you do not have coprocessors on your system, building without offload support will produce a smaller binary. The general requirements for Makefiles with the USER-INTEL package -are as follows. "-DLAMMPS_MEMALIGN=64" is required for CCFLAGS. When -using Intel compilers, "-restrict" is required and "-qopenmp" is -highly recommended for CCFLAGS and LINKFLAGS. LIB should include -"-ltbbmalloc". For builds supporting offload, "-DLMP_INTEL_OFFLOAD" -is required for CCFLAGS and "-qoffload" is required for LINKFLAGS. -Other recommended CCFLAG options for best performance are -"-O2 -fno-alias -ansi-alias -qoverride-limits fp-model fast=2 --no-prec-div". +are as follows. When using Intel compilers, "-restrict" is required +and "-qopenmp" is highly recommended for CCFLAGS and LINKFLAGS. +CCFLAGS should include "-DLMP_INTEL_USELRT" (unless POSIX Threads +are not supported in the build environment) and "-DLMP_USE_MKL_RNG" +(unless Intel Math Kernel Library (MKL) is not available in the build +environment). For Intel compilers, LIB should include "-ltbbmalloc" +or if the library is not available, "-DLMP_INTEL_NO_TBB" can be added +to CCFLAGS. For builds supporting offload, "-DLMP_INTEL_OFFLOAD" is +required for CCFLAGS and "-qoffload" is required for LINKFLAGS. Other +recommended CCFLAG options for best performance are "-O2 -fno-alias +-ansi-alias -qoverride-limits fp-model fast=2 -no-prec-div". NOTE: The vectorization and math capabilities can differ depending on the CPU. For Intel compilers, the "-x" flag specifies the type of diff --git a/src/MAKE/MACHINES/Makefile.cori2 b/src/MAKE/MACHINES/Makefile.cori2 index a367d54080..45e1ab1f8a 100755 --- a/src/MAKE/MACHINES/Makefile.cori2 +++ b/src/MAKE/MACHINES/Makefile.cori2 @@ -15,13 +15,14 @@ SHELL = /bin/sh CC = CC OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_NO_TBB +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG -DLMP_INTEL_NO_TBB \ + $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = CC -LINKFLAGS = -g -qopenmp $(OPTFLAGS) +LINKFLAGS = -qopenmp $(OPTFLAGS) LIB = SIZE = size diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu b/src/MAKE/OPTIONS/Makefile.intel_cpu index 2c3cc51249..41d0f959fe 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu @@ -8,15 +8,14 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) \ - -DLMP_USE_MKL_RNG +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpiicpc -LINKFLAGS = -g -qopenmp $(OPTFLAGS) -LIB = -ltbbmalloc -ltbbmalloc_proxy +LINKFLAGS = -qopenmp $(OPTFLAGS) +LIB = -ltbbmalloc SIZE = size ARCHIVE = ar diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi index ff2d0cc5c2..ef514f43c6 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi @@ -8,9 +8,8 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) \ - -DLMP_USE_MKL_RNG -DLMP_INTEL_USELRT +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich index 40d517bce4..68f879860a 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich @@ -8,13 +8,13 @@ SHELL = /bin/sh CC = mpicxx -cxx=icc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpicxx -cxx=icc -LINKFLAGS = -g -qopenmp $(OPTFLAGS) +LINKFLAGS = -qopenmp $(OPTFLAGS) LIB = SIZE = size diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi index fe1be99e58..457a64b223 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi @@ -9,14 +9,14 @@ SHELL = /bin/sh export OMPI_CXX = icc CC = mpicxx OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpicxx -LINKFLAGS = -g -qopenmp $(OPTFLAGS) -LIB = -ltbbmalloc -ltbbmalloc_proxy +LINKFLAGS = -qopenmp $(OPTFLAGS) +LIB = -ltbbmalloc SIZE = size ARCHIVE = ar diff --git a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor deleted file mode 100644 index 769c166105..0000000000 --- a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor +++ /dev/null @@ -1,125 +0,0 @@ -# intel_phi = USER-INTEL with Phi x200 (KNL) offload support,Intel MPI,MKL FFT - -SHELL = /bin/sh - -# --------------------------------------------------------------------- -# compiler/linker settings -# specify flags and libraries needed for your compiler - -CC = mpiicpc -MIC_OPT = -qoffload-arch=mic-avx512 -fp-model fast=2 -CCFLAGS = -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \ - -xHost -fno-alias -ansi-alias -restrict \ - -qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT \ - -DLMP_USE_MKL_RNG - -SHFLAGS = -fPIC -DEPFLAGS = -M - -LINK = mpiicpc -LINKFLAGS = -g -O3 -xHost -qopenmp -qoffload $(MIC_OPT) -LIB = -ltbbmalloc -SIZE = size - -ARCHIVE = ar -ARFLAGS = -rc -SHLIBFLAGS = -shared - -# --------------------------------------------------------------------- -# LAMMPS-specific settings, all OPTIONAL -# specify settings for LAMMPS features you will use -# if you change any -D setting, do full re-compile after "make clean" - -# LAMMPS ifdef settings -# see possible settings in Section 2.2 (step 4) of manual - -LMP_INC = -DLAMMPS_GZIP -DLAMMPS_JPEG - -# MPI library -# see discussion in Section 2.2 (step 5) of manual -# MPI wrapper compiler/linker can provide this info -# can point to dummy MPI library in src/STUBS as in Makefile.serial -# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts -# INC = path for mpi.h, MPI compiler settings -# PATH = path for MPI library -# LIB = name of MPI library - -MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -MPI_PATH = -MPI_LIB = - -# FFT library -# see discussion in Section 2.2 (step 6) of manaul -# can be left blank to use provided KISS FFT library -# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings -# PATH = path for FFT library -# LIB = name of FFT library - -FFT_INC = -DFFT_MKL -DFFT_SINGLE -FFT_PATH = -FFT_LIB = -L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core - -# JPEG and/or PNG library -# see discussion in Section 2.2 (step 7) of manual -# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC -# INC = path(s) for jpeglib.h and/or png.h -# PATH = path(s) for JPEG library and/or PNG library -# LIB = name(s) of JPEG library and/or PNG library - -JPG_INC = -JPG_PATH = -JPG_LIB = -ljpeg - -# --------------------------------------------------------------------- -# build rules and dependencies -# do not edit this section - -include Makefile.package.settings -include Makefile.package - -EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC) -EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH) -EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) -EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS) -EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS) - -# Path to src files - -vpath %.cpp .. -vpath %.h .. - -# Link target - -$(EXE): $(OBJ) $(EXTRA_LINK_DEPENDS) - $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) - $(SIZE) $(EXE) - -# Library targets - -lib: $(OBJ) $(EXTRA_LINK_DEPENDS) - $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) - -shlib: $(OBJ) $(EXTRA_LINK_DEPENDS) - $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ - $(OBJ) $(EXTRA_LIB) $(LIB) - -# Compilation rules - -%.o:%.cpp $(EXTRA_CPP_DEPENDS) - $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< - -%.d:%.cpp $(EXTRA_CPP_DEPENDS) - $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ - -%.o:%.cu $(EXTRA_CPP_DEPENDS) - $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< - -# Individual dependencies - -depend : fastdep.exe $(SRC) - @./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1 - -fastdep.exe: ../DEPEND/fastdep.c - cc -O -o $@ $< - -sinclude .depend diff --git a/src/MAKE/OPTIONS/Makefile.knl b/src/MAKE/OPTIONS/Makefile.knl index 881c51f0e4..8e266a4fce 100644 --- a/src/MAKE/OPTIONS/Makefile.knl +++ b/src/MAKE/OPTIONS/Makefile.knl @@ -8,13 +8,13 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpiicpc -LINKFLAGS = -g -qopenmp $(OPTFLAGS) +LINKFLAGS = -qopenmp $(OPTFLAGS) LIB = -ltbbmalloc SIZE = size diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README index edfc69120c..871d881f39 100644 --- a/src/USER-INTEL/README +++ b/src/USER-INTEL/README @@ -53,12 +53,11 @@ need to be changed. ----------------------------------------------------------------------------- -The random number generator for Dissipative Particle Dynamics (DPD) in the -Intel package uses the Mersenne Twister pseudorandom number generator as -implemented in the Intel Math Kernel Library (MKL). This generator is faster -and more robust with a significantly longer period than the default DPD -generator. However, if MKL is not installed, the standard random number -generator can be used by adding the compile flag "-DLMP_NO_MKL_RNG". +Unless Intel Math Kernel Library (MKL) is unavailable, -DLMP_USE_MKL_RNG +should be added to the compile flags. This will enable using the MKL Mersenne +Twister random number generator (RNG) for Dissipative Particle Dynamics +(DPD). This RNG can allow significantly faster performance and it also has a +significantly longer period than the standard RNG for DPD. ----------------------------------------------------------------------------- diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp index 81f4586143..9ff5f85176 100644 --- a/src/USER-INTEL/verlet_lrt_intel.cpp +++ b/src/USER-INTEL/verlet_lrt_intel.cpp @@ -68,7 +68,7 @@ void VerletLRTIntel::init() _intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0)); - #ifdef LMP_INTEL_NOLRT + #ifndef LMP_INTEL_USELRT error->all(FLERR, "LRT otion for Intel package disabled at compile time"); #endif diff --git a/src/USER-INTEL/verlet_lrt_intel.h b/src/USER-INTEL/verlet_lrt_intel.h index 813cd53605..0d7154ff64 100644 --- a/src/USER-INTEL/verlet_lrt_intel.h +++ b/src/USER-INTEL/verlet_lrt_intel.h @@ -23,10 +23,7 @@ IntegrateStyle(verlet/lrt/intel,VerletLRTIntel) #include "verlet.h" #include "pppm_intel.h" -#ifndef LMP_INTEL_USELRT -#define LMP_INTEL_NOLRT -#else - +#ifdef LMP_INTEL_USELRT #ifdef LMP_INTEL_LRT11 #define _LMP_INTEL_LRT_11 #include From 9dc42fd4db713cb74d52697d0e1af2f6404867e3 Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Mon, 2 Oct 2017 23:53:05 -0700 Subject: [PATCH 6/6] intel_simd.h is currently also needed by dihedral/charmm, not just sw. --- src/USER-INTEL/Install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/USER-INTEL/Install.sh b/src/USER-INTEL/Install.sh index f7163e6791..da553d158a 100644 --- a/src/USER-INTEL/Install.sh +++ b/src/USER-INTEL/Install.sh @@ -46,7 +46,7 @@ action nbin_intel.h action nbin_intel.cpp action npair_intel.h action npair_intel.cpp -action intel_simd.h pair_sw_intel.cpp +action intel_simd.h action intel_intrinsics.h pair_tersoff_intel.cpp action intel_intrinsics_airebo.h pair_airebo_intel.cpp