Merge pull request #497 from lammps/add-user-reaxc-omp

Add USER-OMP compatible OpenMP support to USER-REAXC
2017-06-15 08:24:03 -06:00
parent 34cfc7bd51 2f6bbcfbbc
commit 3c7417fb59
38 changed files with 6481 additions and 215 deletions
--- a/doc/src/Section_commands.txt
+++ b/doc/src/Section_commands.txt
@ -717,7 +717,7 @@ package"_Section_start.html#start_3.
 "phonon"_fix_phonon.html,
 "pimd"_fix_pimd.html,
 "qbmsst"_fix_qbmsst.html,
-"qeq/reax"_fix_qeq_reax.html,
+"qeq/reax (ko)"_fix_qeq_reax.html,
 "qmmm"_fix_qmmm.html,
 "qtb"_fix_qtb.html,
 "reax/c/bonds"_fix_reax_bonds.html,
@ -1057,7 +1057,7 @@ package"_Section_start.html#start_3.
 "oxdna2/excv"_pair_oxdna2.html,
 "oxdna2/stk"_pair_oxdna2.html,
 "quip"_pair_quip.html,
-"reax/c (k)"_pair_reaxc.html,
+"reax/c (ko)"_pair_reaxc.html,
 "smd/hertz"_pair_smd_hertz.html,
 "smd/tlsph"_pair_smd_tlsph.html,
 "smd/triangulated/surface"_pair_smd_triangulated_surface.html,
--- a/doc/src/fix_qeq_reax.txt
+++ b/doc/src/fix_qeq_reax.txt
@ -8,17 +8,19 @@

 fix qeq/reax command :h3
 fix qeq/reax/kk command :h3
+fix qeq/reax/omp command :h3

 [Syntax:]

-fix ID group-ID qeq/reax Nevery cutlo cuthi tolerance params :pre
+fix ID group-ID qeq/reax Nevery cutlo cuthi tolerance params args :pre

 ID, group-ID are documented in "fix"_fix.html command
 qeq/reax = style name of this fix command
 Nevery = perform QEq every this many steps
 cutlo,cuthi = lo and hi cutoff for Taper radius
 tolerance = precision to which charges will be equilibrated
-params = reax/c or a filename :ul
+params = reax/c or a filename
+args   = {dual} (optional) :ul

 [Examples:]

@ -59,6 +61,10 @@ potential file, except that eta is defined here as twice the eta value
 in the ReaxFF file. Note that unlike the rest of LAMMPS, the units
 of this fix are hard-coded to be A, eV, and electronic charge.

+The optional {dual} keyword allows to perform the optimization
+of the S and T matrices in parallel. This is only supported for
+the {qeq/reax/omp} style. Otherwise they are processed separately.
+
 [Restart, fix_modify, output, run start/stop, minimize info:]

 No information about this fix is written to "binary restart
--- a/doc/src/pair_reaxc.txt
+++ b/doc/src/pair_reaxc.txt
@ -8,6 +8,7 @@

 pair_style reax/c command :h3
 pair_style reax/c/kk command :h3
+pair_style reax/c/omp command :h3

 [Syntax:]

--- a/src/Depend.sh
+++ b/src/Depend.sh
@ -126,4 +126,5 @@ fi

 if (test $1 = "USER-REAXC") then
  depend KOKKOS
+  depend USER-OMP
 fi
--- a/src/USER-OMP/angle_dipole_omp.cpp
+++ b/src/USER-OMP/angle_dipole_omp.cpp
@ -122,14 +122,14 @@ void AngleDipoleOMP::eval(int nfrom, int nto, ThrData * const thr)
    delTx = tangle * (dely*mu[iDip][2] - delz*mu[iDip][1]);
    delTy = tangle * (delz*mu[iDip][0] - delx*mu[iDip][2]);
    delTz = tangle * (delx*mu[iDip][1] - dely*mu[iDip][0]);
-    
+
    torque[iDip][0] += delTx;
    torque[iDip][1] += delTy;
    torque[iDip][2] += delTz;
 	
    // Force couple that counterbalances dipolar torque
    fx = dely*delTz - delz*delTy; // direction (fi): - r x (-T)
-    fy = delz*delTx - delx*delTz; 
+    fy = delz*delTx - delx*delTz;
    fz = delx*delTy - dely*delTx;

    fmod = sqrt(delTx*delTx + delTy*delTy + delTz*delTz) / r; // magnitude
@ -142,11 +142,11 @@ void AngleDipoleOMP::eval(int nfrom, int nto, ThrData * const thr)
    fj[0] = -fi[0];
    fj[1] = -fi[1];
    fj[2] = -fi[2];
-    
+
    f[iDip][0] += fj[0];
    f[iDip][1] += fj[1];
    f[iDip][2] += fj[2];
-    
+
    f[iRef][0] += fi[0];
    f[iRef][1] += fi[1];
    f[iRef][2] += fi[2];
--- a/src/USER-OMP/fix_qeq_reax_omp.cpp
+++ b/src/USER-OMP/fix_qeq_reax_omp.cpp
--- a/src/USER-OMP/fix_qeq_reax_omp.h
+++ b/src/USER-OMP/fix_qeq_reax_omp.h
@ -0,0 +1,76 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Hasan Metin Aktulga, Purdue University
+   (now at Lawrence Berkeley National Laboratory, hmaktulga@lbl.gov)
+
+   Please cite the related publication:
+   H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+   "Parallel Reactive Molecular Dynamics: Numerical Methods and
+   Algorithmic Techniques", Parallel Computing, in press.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(qeq/reax/omp,FixQEqReaxOMP)
+
+#else
+
+#ifndef LMP_FIX_QEQ_REAX_OMP_H
+#define LMP_FIX_QEQ_REAX_OMP_H
+
+#include "fix_qeq_reax.h"
+
+namespace LAMMPS_NS {
+
+class FixQEqReaxOMP : public FixQEqReax {
+
+ public:
+  FixQEqReaxOMP(class LAMMPS *, int, char **);
+  ~FixQEqReaxOMP();
+  virtual void init();
+  virtual void init_storage();
+  virtual void pre_force(int);
+  virtual void post_constructor();
+
+ protected:
+  double **b_temp;
+
+  int do_aspc;
+  int aspc_order, aspc_order_max;
+  double aspc_omega;
+  double * aspc_b;
+
+  virtual void allocate_storage();
+  virtual void deallocate_storage();
+  virtual void init_matvec();
+  virtual void compute_H();
+
+  virtual int CG(double*,double*);
+  virtual void sparse_matvec(sparse_matrix*,double*,double*);
+  virtual void calculate_Q();
+
+  virtual void vector_sum(double*,double,double*,double,double*,int);
+  virtual void vector_add(double*, double, double*,int);
+
+  // dual CG support
+  virtual int dual_CG(double*,double*,double*,double*);
+  virtual void dual_sparse_matvec(sparse_matrix*,double*,double*,double*);
+  virtual void dual_sparse_matvec(sparse_matrix*,double*,double*);
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/pair_reaxc_omp.cpp
+++ b/src/USER-OMP/pair_reaxc_omp.cpp
@ -0,0 +1,603 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Hasan Metin Aktulga, Purdue University
+   (now at Lawrence Berkeley National Laboratory, hmaktulga@lbl.gov)
+   Per-atom energy/virial added by Ray Shan (Sandia)
+   Fix reax/c/bonds and fix reax/c/species for pair_style reax/c added by
+   	Ray Shan (Sandia)
+------------------------------------------------------------------------- */
+
+#include "pair_reaxc_omp.h"
+#include "atom.h"
+#include "update.h"
+#include "force.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "modify.h"
+#include "fix.h"
+#include "fix_reaxc.h"
+#include "citeme.h"
+#include "memory.h"
+#include "error.h"
+#include "timer.h"
+
+#include "reaxc_types.h"
+#include "reaxc_allocate.h"
+#include "reaxc_control.h"
+#include "reaxc_ffield.h"
+#include "reaxc_forces_omp.h"
+#include "reaxc_init_md_omp.h"
+#include "reaxc_io_tools.h"
+#include "reaxc_list.h"
+#include "reaxc_lookup.h"
+#include "reaxc_reset_tools.h"
+#include "reaxc_tool_box.h"
+#include "reaxc_traj.h"
+#include "reaxc_vector.h"
+#include "fix_reaxc_bonds.h"
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+using namespace LAMMPS_NS;
+
+#ifdef OMP_TIMING
+double ompTimingData[LASTTIMINGINDEX];
+int ompTimingCount[LASTTIMINGINDEX];
+int ompTimingCGCount[LASTTIMINGINDEX];
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+PairReaxCOMP::PairReaxCOMP(LAMMPS *lmp) : PairReaxC(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  system->pair_ptr = this;
+
+  num_nbrs_offset = NULL;
+
+#ifdef OMP_TIMING
+  for (int i=0;i<LASTTIMINGINDEX;i++) {
+    ompTimingData[i] = 0;
+    ompTimingCount[i] = 0;
+    ompTimingCGCount[i] = 0;
+  }
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairReaxCOMP::~PairReaxCOMP()
+{
+  reax_list * bonds = lists+BONDS;
+  for (int i=0; i<bonds->num_intrs; ++i)
+    sfree(bonds->select.bond_list[i].bo_data.CdboReduction, "CdboReduction");
+
+  memory->destroy(num_nbrs_offset);
+
+#ifdef OMP_TIMING
+  int myrank;
+
+  MPI_Comm_rank(mpi_data->world,&myrank);
+
+  // Write screen output
+  if (timer->has_full() && myrank == 0 && screen) {
+    fprintf(screen,"\n\nWrite_Lists    took %11.3lf seconds", ompTimingData[COMPUTEWLINDEX]);
+
+    fprintf(screen,"\n\nCompute_Forces took %11.3lf seconds:", ompTimingData[COMPUTEINDEX]);
+    fprintf(screen,"\n ->Initial Forces: %11.3lf seconds", ompTimingData[COMPUTEIFINDEX]);
+    fprintf(screen,"\n ->Bond Order:     %11.3lf seconds", ompTimingData[COMPUTEBOINDEX]);
+    fprintf(screen,"\n ->Atom Energy:    %11.3lf seconds", ompTimingData[COMPUTEATOMENERGYINDEX]);
+    fprintf(screen,"\n ->Bond:           %11.3lf seconds", ompTimingData[COMPUTEBONDSINDEX]);
+    fprintf(screen,"\n ->Hydrogen bonds: %11.3lf seconds", ompTimingData[COMPUTEHBONDSINDEX]);
+    fprintf(screen,"\n ->Torsion Angles: %11.3lf seconds", ompTimingData[COMPUTETORSIONANGLESBOINDEX]);
+    fprintf(screen,"\n ->Valence Angles: %11.3lf seconds", ompTimingData[COMPUTEVALENCEANGLESBOINDEX]);
+    fprintf(screen,"\n ->Non-Bonded For: %11.3lf seconds", ompTimingData[COMPUTENBFINDEX]);
+    fprintf(screen,"\n ->Total Forces:   %11.3lf seconds", ompTimingData[COMPUTETFINDEX]);
+
+    fprintf(screen,"\n\nfixQEQ:          %11.3lf seconds", ompTimingData[COMPUTEQEQINDEX]);
+    fprintf(screen,"\n ->QEQ init:       %11.3lf seconds", ompTimingData[COMPUTEINITMVINDEX]);
+
+    double avg = double(ompTimingCGCount[COMPUTECG1INDEX]) / double(ompTimingCount[COMPUTECG1INDEX]);
+    fprintf(screen,"\n ->QEQ CG1:        %11.3lf seconds with %4.1lf iterations on average.", ompTimingData[COMPUTECG1INDEX], avg);
+
+    avg = double(ompTimingCGCount[COMPUTECG2INDEX]) / double(ompTimingCount[COMPUTECG2INDEX]);
+    fprintf(screen,"\n ->QEQ CG2:        %11.3lf seconds with %4.1lf iterations on average.", ompTimingData[COMPUTECG2INDEX], avg);
+    fprintf(screen,"\n ->QEQ CalcQ:      %11.3lf seconds\n", ompTimingData[COMPUTECALCQINDEX]);
+  }
+
+  // Write logfile output
+  if (timer->has_full() && myrank == 0 && logfile) {
+    fprintf(logfile,"\n\nWrite_Lists    took %11.3lf seconds", ompTimingData[COMPUTEWLINDEX]);
+
+    fprintf(logfile,"\n\nCompute_Forces took %11.3lf seconds:", ompTimingData[COMPUTEINDEX]);
+    fprintf(logfile,"\n ->Initial Forces: %11.3lf seconds", ompTimingData[COMPUTEIFINDEX]);
+    fprintf(logfile,"\n ->Bond Order:     %11.3lf seconds", ompTimingData[COMPUTEBOINDEX]);
+    fprintf(logfile,"\n ->Atom Energy:    %11.3lf seconds", ompTimingData[COMPUTEATOMENERGYINDEX]);
+    fprintf(logfile,"\n ->Bond:           %11.3lf seconds", ompTimingData[COMPUTEBONDSINDEX]);
+    fprintf(logfile,"\n ->Hydrogen bonds: %11.3lf seconds", ompTimingData[COMPUTEHBONDSINDEX]);
+    fprintf(logfile,"\n ->Torsion Angles: %11.3lf seconds", ompTimingData[COMPUTETORSIONANGLESBOINDEX]);
+    fprintf(logfile,"\n ->Valence Angles: %11.3lf seconds", ompTimingData[COMPUTEVALENCEANGLESBOINDEX]);
+    fprintf(logfile,"\n ->Non-Bonded For: %11.3lf seconds", ompTimingData[COMPUTENBFINDEX]);
+    fprintf(logfile,"\n ->Total Forces:   %11.3lf seconds", ompTimingData[COMPUTETFINDEX]);
+
+    fprintf(logfile,"\n\nfixQEQ:          %11.3lf seconds", ompTimingData[COMPUTEQEQINDEX]);
+    fprintf(logfile,"\n ->QEQ init:       %11.3lf seconds", ompTimingData[COMPUTEINITMVINDEX]);
+
+    double avg = double(ompTimingCGCount[COMPUTECG1INDEX]) / double(ompTimingCount[COMPUTECG1INDEX]);
+    fprintf(logfile,"\n ->QEQ CG1:        %11.3lf seconds with %4.1lf iterations on average.", ompTimingData[COMPUTECG1INDEX], avg);
+
+    avg = double(ompTimingCGCount[COMPUTECG2INDEX]) / double(ompTimingCount[COMPUTECG2INDEX]);
+    fprintf(logfile,"\n ->QEQ CG2:        %11.3lf seconds with %4.1lf iterations on average.", ompTimingData[COMPUTECG2INDEX], avg);
+    fprintf(logfile,"\n ->QEQ CalcQ:      %11.3lf seconds\n", ompTimingData[COMPUTECALCQINDEX]);
+  }
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairReaxCOMP::compute(int eflag, int vflag)
+{
+  double evdwl,ecoul;
+  double t_start, t_end;
+
+  // communicate num_bonds once every reneighboring
+  // 2 num arrays stored by fix, grab ptr to them
+
+  if (neighbor->ago == 0) comm->forward_comm_fix(fix_reax);
+  int *num_bonds = fix_reax->num_bonds;
+  int *num_hbonds = fix_reax->num_hbonds;
+
+  evdwl = ecoul = 0.0;
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else ev_unset();
+
+  if (vflag_global) control->virial = 1;
+  else control->virial = 0;
+
+  system->n = atom->nlocal; // my atoms
+  system->N = atom->nlocal + atom->nghost; // mine + ghosts
+  system->bigN = static_cast<int> (atom->natoms);  // all atoms in the system
+
+  system->big_box.V = 0;
+  system->big_box.box_norms[0] = 0;
+  system->big_box.box_norms[1] = 0;
+  system->big_box.box_norms[2] = 0;
+  if( comm->me == 0 ) t_start = MPI_Wtime();
+  // setup data structures
+
+  setup();
+
+  Reset( system, control, data, workspace, &lists, world );
+
+  // Why not update workspace like in MPI-only code?
+  // Using the MPI-only way messes up the hb energy
+  //workspace->realloc.num_far = write_reax_lists();
+  write_reax_lists();
+
+  // timing for filling in the reax lists
+  if( comm->me == 0 ) {
+    t_end = MPI_Wtime();
+    data->timing.nbrs = t_end - t_start;
+  }
+
+  // forces
+
+#ifdef OMP_TIMING
+  double startTimeBase,endTimeBase;
+  startTimeBase = MPI_Wtime();
+#endif
+
+  Compute_ForcesOMP(system,control,data,workspace,&lists,out_control,mpi_data);
+  read_reax_forces(vflag);
+
+#ifdef OMP_TIMING
+  endTimeBase = MPI_Wtime();
+  ompTimingData[COMPUTEINDEX] += (endTimeBase-startTimeBase);
+#endif
+
+#if defined(_OPENMP)
+#pragma omp parallel for schedule(static)
+#endif
+  for(int k = 0; k < system->N; ++k) {
+    num_bonds[k] = system->my_atoms[k].num_bonds;
+    num_hbonds[k] = system->my_atoms[k].num_hbonds;
+  }
+
+  // energies and pressure
+
+  if (eflag_global) {
+    evdwl += data->my_en.e_bond;
+    evdwl += data->my_en.e_ov;
+    evdwl += data->my_en.e_un;
+    evdwl += data->my_en.e_lp;
+    evdwl += data->my_en.e_ang;
+    evdwl += data->my_en.e_pen;
+    evdwl += data->my_en.e_coa;
+    evdwl += data->my_en.e_hb;
+    evdwl += data->my_en.e_tor;
+    evdwl += data->my_en.e_con;
+    evdwl += data->my_en.e_vdW;
+
+    ecoul += data->my_en.e_ele;
+    ecoul += data->my_en.e_pol;
+
+    // Store the different parts of the energy
+    // in a list for output by compute pair command
+
+    pvector[0] = data->my_en.e_bond;
+    pvector[1] = data->my_en.e_ov + data->my_en.e_un;
+    pvector[2] = data->my_en.e_lp;
+    pvector[3] = 0.0;
+    pvector[4] = data->my_en.e_ang;
+    pvector[5] = data->my_en.e_pen;
+    pvector[6] = data->my_en.e_coa;
+    pvector[7] = data->my_en.e_hb;
+    pvector[8] = data->my_en.e_tor;
+    pvector[9] = data->my_en.e_con;
+    pvector[10] = data->my_en.e_vdW;
+    pvector[11] = data->my_en.e_ele;
+    pvector[12] = 0.0;
+    pvector[13] = data->my_en.e_pol;
+  }
+
+  if (vflag_fdotr) virial_fdotr_compute();
+
+// Set internal timestep counter to that of LAMMPS
+
+  data->step = update->ntimestep;
+
+  Output_Results( system, control, data, &lists, out_control, mpi_data );
+
+  // populate tmpid and tmpbo arrays for fix reax/c/species
+  int i, j;
+
+  if(fixspecies_flag) {
+    if (system->N > nmax) {
+      memory->destroy(tmpid);
+      memory->destroy(tmpbo);
+      nmax = system->N;
+      memory->create(tmpid,nmax,MAXSPECBOND,"pair:tmpid");
+      memory->create(tmpbo,nmax,MAXSPECBOND,"pair:tmpbo");
+    }
+
+#if defined(_OPENMP)
+#pragma omp parallel for collapse(2) schedule(static) default(shared)
+#endif
+    for (i = 0; i < system->N; i ++)
+      for (j = 0; j < MAXSPECBOND; j ++) {
+        tmpbo[i][j] = 0.0;
+        tmpid[i][j] = 0;
+      }
+
+    FindBond();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairReaxCOMP::init_style( )
+{
+  if (!atom->q_flag)
+    error->all(FLERR,"Pair reax/c/omp requires atom attribute q");
+
+  // firstwarn = 1;
+
+  int iqeq = modify->find_fix_by_style("qeq/reax/omp");
+  if (iqeq < 0 && qeqflag == 1)
+    error->all(FLERR,"Pair reax/c/omp requires use of fix qeq/reax/omp");
+
+  system->n = atom->nlocal; // my atoms
+  system->N = atom->nlocal + atom->nghost; // mine + ghosts
+  system->bigN = static_cast<int> (atom->natoms);  // all atoms in the system
+  system->wsize = comm->nprocs;
+
+  system->big_box.V = 0;
+  system->big_box.box_norms[0] = 0;
+  system->big_box.box_norms[1] = 0;
+  system->big_box.box_norms[2] = 0;
+
+  if (atom->tag_enable == 0)
+    error->all(FLERR,"Pair style reax/c/omp requires atom IDs");
+  if (force->newton_pair == 0)
+    error->all(FLERR,"Pair style reax/c/omp requires newton pair on");
+
+  // need a half neighbor list w/ Newton off and ghost neighbors
+  // built whenever re-neighboring occurs
+
+  int irequest = neighbor->request(this,instance_me);
+  neighbor->requests[irequest]->newton = 2;
+  neighbor->requests[irequest]->ghost = 1;
+
+  cutmax = MAX3(control->nonb_cut, control->hbond_cut, 2*control->bond_cut);
+
+  for( int i = 0; i < LIST_N; ++i )
+    lists[i].allocated = 0;
+
+  if (fix_reax == NULL) {
+    char **fixarg = new char*[3];
+    fixarg[0] = (char *) "REAXC";
+    fixarg[1] = (char *) "all";
+    fixarg[2] = (char *) "REAXC";
+    modify->add_fix(3,fixarg);
+    delete [] fixarg;
+    fix_reax = (FixReaxC *) modify->fix[modify->nfix-1];
+  }
+
+#if defined(_OPENMP)
+  control->nthreads = omp_get_max_threads();
+#else
+  control->nthreads = 1;
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairReaxCOMP::setup( )
+{
+  int oldN;
+  int mincap = system->mincap;
+  double safezone = system->safezone;
+
+  system->n = atom->nlocal; // my atoms
+  system->N = atom->nlocal + atom->nghost; // mine + ghosts
+  oldN = system->N;
+  system->bigN = static_cast<int> (atom->natoms);  // all atoms in the system
+
+  if (system->N > nmax) {
+    memory->destroy(num_nbrs_offset);
+    // Don't update nmax here. It is updated at end of compute().
+    memory->create(num_nbrs_offset, system->N, "pair:num_nbrs_offset");
+  }
+
+  if (setup_flag == 0) {
+
+    setup_flag = 1;
+
+    int *num_bonds = fix_reax->num_bonds;
+    int *num_hbonds = fix_reax->num_hbonds;
+
+    control->vlist_cut = neighbor->cutneighmax;
+
+    // determine the local and total capacity
+
+    system->local_cap = MAX( (int)(system->n * safezone), mincap );
+    system->total_cap = MAX( (int)(system->N * safezone), mincap );
+
+    // initialize my data structures
+
+    PreAllocate_Space( system, control, workspace, world );
+    write_reax_atoms();
+
+    int num_nbrs = estimate_reax_lists();
+    if(!Make_List(system->total_cap, num_nbrs, TYP_FAR_NEIGHBOR,
+                  lists+FAR_NBRS, world))
+      error->all(FLERR,"Pair reax/c problem in far neighbor list");
+
+    write_reax_lists();
+
+    InitializeOMP( system, control, data, workspace, &lists, out_control,
+                mpi_data, world );
+
+    for( int k = 0; k < system->N; ++k ) {
+      num_bonds[k] = system->my_atoms[k].num_bonds;
+      num_hbonds[k] = system->my_atoms[k].num_hbonds;
+    }
+
+  } else {
+
+    // fill in reax datastructures
+
+    write_reax_atoms();
+
+    // reset the bond list info for new atoms
+
+    for(int k = oldN; k < system->N; ++k)
+      Set_End_Index( k, Start_Index( k, lists+BONDS ), lists+BONDS );
+
+    // estimate far neighbor list size
+    // Not present in MPI-only version
+    workspace->realloc.num_far = estimate_reax_lists();
+
+    // check if I need to shrink/extend my data-structs
+
+    ReAllocate( system, control, data, workspace, &lists, mpi_data );
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairReaxCOMP::write_reax_atoms()
+{
+  int *num_bonds = fix_reax->num_bonds;
+  int *num_hbonds = fix_reax->num_hbonds;
+
+  if (system->N > system->total_cap)
+    error->all(FLERR,"Too many ghost atoms");
+
+#if defined(_OPENMP)
+#pragma omp parallel for schedule(static) default(shared)
+#endif
+  for( int i = 0; i < system->N; ++i ){
+    system->my_atoms[i].orig_id = atom->tag[i];
+    system->my_atoms[i].type = map[atom->type[i]];
+    system->my_atoms[i].x[0] = atom->x[i][0];
+    system->my_atoms[i].x[1] = atom->x[i][1];
+    system->my_atoms[i].x[2] = atom->x[i][2];
+    system->my_atoms[i].q = atom->q[i];
+    system->my_atoms[i].num_bonds = num_bonds[i];
+    system->my_atoms[i].num_hbonds = num_hbonds[i];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int PairReaxCOMP::estimate_reax_lists()
+{
+  int i;
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int numall = list->inum + list->gnum;
+  int mincap = system->mincap;
+
+  // for good performance in the OpenMP implementation, each thread needs
+  // to know where to place the neighbors of the atoms it is responsible for.
+  // The sumscan values for the list->numneigh will be used to determine the
+  // neighbor offset of each atom. Note that this may cause some significant
+  // memory overhead if delayed neighboring is used - so it may be desirable
+  // to work on this part to reduce the memory footprint of the far_nbrs list.
+
+  int num_nbrs = 0;
+
+  for (int itr_i = 0; itr_i < numall; ++itr_i) {
+    i = ilist[itr_i];
+    num_nbrs += numneigh[i];
+  }
+
+  int new_estimate = MAX (num_nbrs, mincap*MIN_NBRS);
+
+  return new_estimate;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int PairReaxCOMP::write_reax_lists()
+{
+#ifdef OMP_TIMING
+  double startTimeBase, endTimeBase;
+  startTimeBase = MPI_Wtime();
+#endif
+
+  int itr_i, itr_j, i, j, num_mynbrs;
+  int *jlist;
+  double d_sqr, dist, cutoff_sqr;
+  rvec dvec;
+
+  double **x = atom->x;
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  reax_list *far_nbrs = lists + FAR_NBRS;
+  far_neighbor_data *far_list = far_nbrs->select.far_nbr_list;
+
+  int num_nbrs = 0;
+  int inum = list->inum;
+  int gnum = list->gnum;
+  int numall = inum + gnum;
+
+  // sumscan of the number of neighbors per atom to determine the offsets
+  // most likely, we are overallocating. desirable to work on this part
+  // to reduce the memory footprint of the far_nbrs list.
+
+  num_nbrs = 0;
+
+  for (itr_i = 0; itr_i < numall; ++itr_i) {
+    i = ilist[itr_i];
+    num_nbrs_offset[i] = num_nbrs;
+    num_nbrs += numneigh[i];
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel for schedule(dynamic,50) default(shared)           \
+  private(itr_i, itr_j, i, j, jlist, cutoff_sqr, num_mynbrs, d_sqr, dvec, dist)
+#endif
+  for (itr_i = 0; itr_i < numall; ++itr_i) {
+    i = ilist[itr_i];
+    jlist = firstneigh[i];
+    Set_Start_Index( i, num_nbrs_offset[i], far_nbrs );
+
+    if (i < inum)
+      cutoff_sqr = control->nonb_cut*control->nonb_cut;
+    else
+      cutoff_sqr = control->bond_cut*control->bond_cut;
+
+    num_mynbrs = 0;
+
+    for (itr_j = 0; itr_j < numneigh[i]; ++itr_j) {
+      j = jlist[itr_j];
+      j &= NEIGHMASK;
+      get_distance( x[j], x[i], &d_sqr, &dvec );
+
+      if (d_sqr <= cutoff_sqr) {
+        dist = sqrt( d_sqr );
+        set_far_nbr( &far_list[num_nbrs_offset[i] + num_mynbrs], j, dist, dvec );
+        ++num_mynbrs;
+      }
+    }
+    Set_End_Index( i, num_nbrs_offset[i] + num_mynbrs, far_nbrs );
+  }
+
+#ifdef OMP_TIMING
+  endTimeBase = MPI_Wtime();
+  ompTimingData[COMPUTEWLINDEX] += (endTimeBase-startTimeBase);
+#endif
+
+  return num_nbrs;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairReaxCOMP::read_reax_forces(int vflag)
+{
+#if defined(_OPENMP)
+#pragma omp parallel for schedule(static) default(shared)
+#endif
+  for( int i = 0; i < system->N; ++i ) {
+    system->my_atoms[i].f[0] = workspace->f[i][0];
+    system->my_atoms[i].f[1] = workspace->f[i][1];
+    system->my_atoms[i].f[2] = workspace->f[i][2];
+
+    atom->f[i][0] = -workspace->f[i][0];
+    atom->f[i][1] = -workspace->f[i][1];
+    atom->f[i][2] = -workspace->f[i][2];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairReaxCOMP::FindBond()
+{
+  int i, ii, j, pj, jtag, nj, jtmp, jj;
+  double bo_tmp, bo_cut, rij, rsq;
+
+  bond_data *bo_ij;
+  bo_cut = 0.10;
+
+#if defined(_OPENMP)
+#pragma omp parallel for schedule(static) default(shared)   \
+  private(i, nj, pj, bo_ij, j, bo_tmp)
+#endif
+  for (i = 0; i < system->n; i++) {
+    nj = 0;
+    for( pj = Start_Index(i, lists); pj < End_Index(i, lists); ++pj ) {
+      bo_ij = &( lists->select.bond_list[pj] );
+      j = bo_ij->nbr;
+      if (j < i) continue;
+
+      bo_tmp = bo_ij->bo_data.BO;
+
+      if (bo_tmp >= bo_cut ) {
+        tmpid[i][nj] = j;
+        tmpbo[i][nj] = bo_tmp;
+        nj ++;
+        if (nj > MAXSPECBOND) error->all(FLERR,"Increase MAXSPECBOND in fix_reaxc_species.h");
+      }
+    }
+  }
+}
+
--- a/src/USER-OMP/pair_reaxc_omp.h
+++ b/src/USER-OMP/pair_reaxc_omp.h
@ -0,0 +1,113 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Hasan Metin Aktulga, Purdue University
+   (now at Lawrence Berkeley National Laboratory, hmaktulga@lbl.gov)
+
+   Please cite the related publication:
+   H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+   "Parallel Reactive Molecular Dynamics: Numerical Methods and
+   Algorithmic Techniques", Parallel Computing, in press.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(reax/c/omp,PairReaxCOMP)
+
+#else
+
+#ifndef LMP_PAIR_REAXC_OMP_H
+#define LMP_PAIR_REAXC_OMP_H
+
+#include "pair_reaxc.h"
+#include "thr_omp.h"
+#include "suffix.h"
+
+namespace LAMMPS_NS {
+
+class PairReaxCOMP : public PairReaxC, public ThrOMP {
+ public:
+  PairReaxCOMP(class LAMMPS *);
+  ~PairReaxCOMP();
+  virtual void compute(int, int);
+  virtual void init_style();
+
+  inline FixOMP *getFixOMP() {
+	return fix;
+  };
+
+  inline void ev_setup_thr_proxy(int eflagparm, int vflagparm, int nallparm,
+				 double *eatomparm, double **vatomparm, ThrData *thrparm) {
+    ev_setup_thr(eflagparm, vflagparm, nallparm, eatomparm, vatomparm, thrparm);
+  };
+
+  // reduce per thread data as needed
+  inline void reduce_thr_proxy(void * const styleparm, const int eflagparm,
+			       const int vflagparm, ThrData * const thrparm) {
+    reduce_thr(styleparm, eflagparm, vflagparm, thrparm);
+  }
+
+  inline void ev_tally_thr_proxy(Pair * const pairparm, const int iparm, const int jparm,
+				 const int nlocalparm, const int newton_pairparm,
+				 const double evdwlparm, const double ecoulparm,
+				 const double fpairparm, const double delxparm,
+				 const double delyparm, const double delzparm,
+				 ThrData * const thrparm) {
+    ev_tally_thr(pairparm, iparm, jparm, nlocalparm, newton_pairparm,
+		 evdwlparm, ecoulparm, fpairparm, delxparm, delyparm, delzparm, thrparm);
+  }
+
+  inline void ev_tally_xyz_thr_proxy(Pair * const pairparm, const int iparm, const int jparm,
+				     const int nlocalparm, const int newton_pairparm,
+				     const double evdwlparm, const double ecoulparm,
+				     const double fxparm, const double fyparm, const double fzparm,
+				     const double delxparm, const double delyparm,
+				     const double delzparm, ThrData * const thrparm) {
+    ev_tally_xyz_thr(pairparm, iparm, jparm, nlocalparm, newton_pairparm,
+		     evdwlparm, ecoulparm, fxparm, fyparm, fzparm,
+		     delxparm, delyparm, delzparm, thrparm);
+  }
+
+  inline void ev_tally3_thr_proxy(Pair * const pairparm,int i, int j, int k,
+				  double evdwl, double ecoul, double *fj, double *fk,
+				  double *drji, double *drki, ThrData * const thrparm) {
+    ev_tally3_thr(pairparm, i, j, k, evdwl, ecoul, fj, fk, drji, drki, thrparm);
+  }
+
+ protected:
+  virtual void setup();
+  virtual void write_reax_atoms();
+  virtual int estimate_reax_lists();
+  virtual int write_reax_lists();
+  virtual void read_reax_forces(int);
+  virtual void FindBond();
+
+  // work array used in write_reax_lists()
+  int * num_nbrs_offset;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Too many ghost atoms
+
+Number of ghost atoms has increased too much during simulation and has exceeded
+the size of reax/c arrays.  Increase safe_zone and min_cap in pair_style reax/c
+command
+
+*/
--- a/src/USER-OMP/reaxc_bond_orders_omp.cpp
+++ b/src/USER-OMP/reaxc_bond_orders_omp.cpp
@ -0,0 +1,736 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "pair_reaxc_omp.h"
+#include "reaxc_types.h"
+#include "reaxc_bond_orders_omp.h"
+#include "reaxc_list.h"
+#include "reaxc_vector.h"
+
+#if defined(_OPENMP)
+#include  <omp.h>
+#endif
+
+using namespace LAMMPS_NS;
+
+void Add_dBond_to_ForcesOMP( reax_system *system, int i, int pj,
+                             storage *workspace, reax_list **lists ) {
+  reax_list *bonds = (*lists) + BONDS;
+  bond_data *nbr_j, *nbr_k;
+  bond_order_data *bo_ij, *bo_ji;
+  dbond_coefficients coef;
+  int pk, k, j;
+
+  PairReaxCOMP *pair_reax_ptr = static_cast<class PairReaxCOMP*>(system->pair_ptr);
+
+#if defined(_OPENMP)
+  int tid = omp_get_thread_num();
+#else
+  int tid = 0;
+#endif
+  ThrData *thr = pair_reax_ptr->getFixOMP()->get_thr(tid);
+  long reductionOffset = (system->N * tid);
+
+  /* Virial Tallying variables */
+  double f_scaler;
+  rvec fi_tmp, fj_tmp, fk_tmp, delij, delji, delki, delkj, temp;
+
+  /* Initializations */
+  nbr_j = &(bonds->select.bond_list[pj]);
+  j = nbr_j->nbr;
+  bo_ij = &(nbr_j->bo_data);
+  bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+
+  double c = bo_ij->Cdbo + bo_ji->Cdbo;
+  coef.C1dbo = bo_ij->C1dbo * c;
+  coef.C2dbo = bo_ij->C2dbo * c;
+  coef.C3dbo = bo_ij->C3dbo * c;
+
+  c = bo_ij->Cdbopi + bo_ji->Cdbopi;
+  coef.C1dbopi = bo_ij->C1dbopi * c;
+  coef.C2dbopi = bo_ij->C2dbopi * c;
+  coef.C3dbopi = bo_ij->C3dbopi * c;
+  coef.C4dbopi = bo_ij->C4dbopi * c;
+
+  c = bo_ij->Cdbopi2 + bo_ji->Cdbopi2;
+  coef.C1dbopi2 = bo_ij->C1dbopi2 * c;
+  coef.C2dbopi2 = bo_ij->C2dbopi2 * c;
+  coef.C3dbopi2 = bo_ij->C3dbopi2 * c;
+  coef.C4dbopi2 = bo_ij->C4dbopi2 * c;
+
+  c = workspace->CdDelta[i] + workspace->CdDelta[j];
+  coef.C1dDelta = bo_ij->C1dbo * c;
+  coef.C2dDelta = bo_ij->C2dbo * c;
+  coef.C3dDelta = bo_ij->C3dbo * c;
+
+  // The same "c" refactoring here can be replicated below in Add_dBond_to_Forces_NPTOMP(), but
+  // I'd prefer to wait for a test to verify changes before doing so (just to be safe).
+
+  // forces on i
+  // rvec_Scale(     temp, coef.C1dbo,    bo_ij->dBOp );
+  // rvec_ScaledAdd( temp, coef.C2dbo,    workspace->dDeltap_self[i] );
+  // rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp );
+  // rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );
+  // rvec_ScaledAdd( temp, coef.C1dbopi,  bo_ij->dln_BOp_pi );
+  // rvec_ScaledAdd( temp, coef.C2dbopi,  bo_ij->dBOp );
+  // rvec_ScaledAdd( temp, coef.C3dbopi,  workspace->dDeltap_self[i]);
+  // rvec_ScaledAdd( temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+  // rvec_ScaledAdd( temp, coef.C2dbopi2, bo_ij->dBOp );
+  // rvec_ScaledAdd( temp, coef.C3dbopi2, workspace->dDeltap_self[i] );
+
+  c = (coef.C1dbo + coef.C1dDelta + coef.C2dbopi + coef.C2dbopi2);
+  rvec_Scale(     temp, c,    bo_ij->dBOp );
+
+  c = (coef.C2dbo + coef.C2dDelta + coef.C3dbopi + coef.C3dbopi2);
+  rvec_ScaledAdd( temp, c,    workspace->dDeltap_self[i] );
+
+  rvec_ScaledAdd( temp, coef.C1dbopi,  bo_ij->dln_BOp_pi );
+  rvec_ScaledAdd( temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+
+  rvec_Add(workspace->forceReduction[reductionOffset+i],temp );
+
+  if( system->pair_ptr->vflag_atom) {
+    rvec_Scale(fi_tmp, -1.0, temp);
+    rvec_ScaledSum( delij, 1., system->my_atoms[i].x,-1., system->my_atoms[j].x );
+
+    pair_reax_ptr->ev_tally_xyz_thr_proxy(system->pair_ptr,i,j,system->N,0,0,0,
+					  fi_tmp[0],fi_tmp[1],fi_tmp[2],
+					  delij[0],delij[1],delij[2],thr);
+  }
+
+  // forces on j
+  // rvec_Scale(     temp, -coef.C1dbo,    bo_ij->dBOp );
+  // rvec_ScaledAdd( temp,  coef.C3dbo,    workspace->dDeltap_self[j] );
+  // rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp );
+  // rvec_ScaledAdd( temp,  coef.C3dDelta, workspace->dDeltap_self[j]);
+  // rvec_ScaledAdd( temp, -coef.C1dbopi,  bo_ij->dln_BOp_pi );
+  // rvec_ScaledAdd( temp, -coef.C2dbopi,  bo_ij->dBOp );
+  // rvec_ScaledAdd( temp,  coef.C4dbopi,  workspace->dDeltap_self[j]);
+  // rvec_ScaledAdd( temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+  // rvec_ScaledAdd( temp, -coef.C2dbopi2, bo_ij->dBOp );
+  // rvec_ScaledAdd( temp,  coef.C4dbopi2, workspace->dDeltap_self[j]);
+
+
+  c = -(coef.C1dbo + coef.C1dDelta + coef.C2dbopi + coef.C2dbopi2);
+  rvec_Scale(     temp, c,    bo_ij->dBOp );
+
+  c = (coef.C3dbo + coef.C3dDelta + coef.C4dbopi + coef.C4dbopi2);
+  rvec_ScaledAdd( temp,  c,    workspace->dDeltap_self[j] );
+
+  rvec_ScaledAdd( temp, -coef.C1dbopi,  bo_ij->dln_BOp_pi );
+  rvec_ScaledAdd( temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+
+
+  rvec_Add(workspace->forceReduction[reductionOffset+j],temp );
+
+  if( system->pair_ptr->vflag_atom) {
+    rvec_Scale(fj_tmp, -1.0, temp);
+    rvec_ScaledSum( delji, 1., system->my_atoms[j].x,-1., system->my_atoms[i].x );
+
+    pair_reax_ptr->ev_tally_xyz_thr_proxy(system->pair_ptr,j,i,system->N,0,0,0,
+					  fj_tmp[0],fj_tmp[1],fj_tmp[2],
+					  delji[0],delji[1],delji[2],thr);
+  }
+
+  // forces on k: i neighbor
+  for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
+    nbr_k = &(bonds->select.bond_list[pk]);
+    k = nbr_k->nbr;
+
+    // rvec_Scale(     temp, -coef.C2dbo,    nbr_k->bo_data.dBOp);
+    // rvec_ScaledAdd( temp, -coef.C2dDelta, nbr_k->bo_data.dBOp);
+    // rvec_ScaledAdd( temp, -coef.C3dbopi,  nbr_k->bo_data.dBOp);
+    // rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp);
+
+    const double c = -(coef.C2dbo + coef.C2dDelta + coef.C3dbopi + coef.C3dbopi2);
+    rvec_Scale(temp, c, nbr_k->bo_data.dBOp);
+
+    rvec_Add(workspace->forceReduction[reductionOffset+k],temp );
+
+    if( system->pair_ptr->vflag_atom ) {
+      rvec_Scale(fk_tmp, -1.0, temp);
+      rvec_ScaledSum(delki,1.,system->my_atoms[k].x,-1.,system->my_atoms[i].x);
+
+      pair_reax_ptr->ev_tally_xyz_thr_proxy(system->pair_ptr,k,i,system->N,0,0,0,
+					    fk_tmp[0],fk_tmp[1],fk_tmp[2],
+					    delki[0],delki[1],delki[2],thr);
+      rvec_ScaledSum(delkj,1.,system->my_atoms[k].x,-1.,system->my_atoms[j].x);
+
+      pair_reax_ptr->ev_tally_xyz_thr_proxy(system->pair_ptr,k,j,system->N,0,0,0,
+					    fk_tmp[0],fk_tmp[1],fk_tmp[2],
+					    delkj[0],delkj[1],delkj[2],thr);
+    }
+  }
+
+  // forces on k: j neighbor
+  for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) {
+    nbr_k = &(bonds->select.bond_list[pk]);
+    k = nbr_k->nbr;
+
+    // rvec_Scale(     temp, -coef.C3dbo,    nbr_k->bo_data.dBOp );
+    // rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp);
+    // rvec_ScaledAdd( temp, -coef.C4dbopi,  nbr_k->bo_data.dBOp);
+    // rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp);
+
+    const double c = -(coef.C3dbo + coef.C3dDelta + coef.C4dbopi + coef.C4dbopi2);
+    rvec_Scale(temp, c, nbr_k->bo_data.dBOp);
+
+    rvec_Add(workspace->forceReduction[reductionOffset+k],temp );
+
+    if( system->pair_ptr->vflag_atom ) {
+      rvec_Scale(fk_tmp, -1.0, temp);
+      rvec_ScaledSum(delki,1.,system->my_atoms[k].x,-1.,system->my_atoms[i].x);
+
+      pair_reax_ptr->ev_tally_xyz_thr_proxy(system->pair_ptr,k,i,system->N,0,0,0,
+					    fk_tmp[0],fk_tmp[1],fk_tmp[2],
+					    delki[0],delki[1],delki[2],thr);
+
+      rvec_ScaledSum(delkj,1.,system->my_atoms[k].x,-1.,system->my_atoms[j].x);
+
+      pair_reax_ptr->ev_tally_xyz_thr_proxy(system->pair_ptr,k,j,system->N,0,0,0,
+					    fk_tmp[0],fk_tmp[1],fk_tmp[2],
+					    delkj[0],delkj[1],delkj[2],thr);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void Add_dBond_to_Forces_NPTOMP( reax_system *system, int i, int pj, simulation_data *data,
+				 storage *workspace, reax_list **lists ) {
+  reax_list *bonds = (*lists) + BONDS;
+  bond_data *nbr_j, *nbr_k;
+  bond_order_data *bo_ij, *bo_ji;
+  dbond_coefficients coef;
+  rvec temp, ext_press;
+  ivec rel_box;
+  int pk, k, j;
+
+  PairReaxCOMP *pair_reax_ptr = static_cast<class PairReaxCOMP*>(system->pair_ptr);
+
+#if defined(_OPENMP)
+  int tid = omp_get_thread_num();
+#else
+  int tid = 0;
+#endif
+  ThrData *thr = pair_reax_ptr->getFixOMP()->get_thr(tid);
+  long reductionOffset = (system->N * tid);
+
+  /* Initializations */
+  nbr_j = &(bonds->select.bond_list[pj]);
+  j = nbr_j->nbr;
+  bo_ij = &(nbr_j->bo_data);
+  bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+
+  coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+  coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+  coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+
+  coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+  coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+  coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+  coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+
+  coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+  coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+  coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+  coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+
+  coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+  coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+  coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+
+
+  /************************************
+   * forces related to atom i          *
+   * first neighbors of atom i         *
+   ************************************/
+  for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
+    nbr_k = &(bonds->select.bond_list[pk]);
+    k = nbr_k->nbr;
+
+    rvec_Scale(temp, -coef.C2dbo, nbr_k->bo_data.dBOp);       /*2nd, dBO*/
+    rvec_ScaledAdd(temp, -coef.C2dDelta, nbr_k->bo_data.dBOp);/*dDelta*/
+    rvec_ScaledAdd(temp, -coef.C3dbopi, nbr_k->bo_data.dBOp); /*3rd, dBOpi*/
+    rvec_ScaledAdd(temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp);/*3rd, dBOpi2*/
+
+    /* force */
+    rvec_Add(workspace->forceReduction[reductionOffset+k],temp );
+
+    /* pressure */
+    rvec_iMultiply( ext_press, nbr_k->rel_box, temp );
+    rvec_Add( workspace->my_ext_pressReduction[tid], ext_press );
+  }
+
+  /* then atom i itself  */
+  rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp );                      /*1st,dBO*/
+  rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] );   /*2nd,dBO*/
+  rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp );               /*1st,dBO*/
+  rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd,dBO*/
+  rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi );        /*1st,dBOpi*/
+  rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp );              /*2nd,dBOpi*/
+  rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i]);/*3rd,dBOpi*/
+
+  rvec_ScaledAdd( temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2 );  /*1st,dBO_pi2*/
+  rvec_ScaledAdd( temp, coef.C2dbopi2, bo_ij->dBOp );         /*2nd,dBO_pi2*/
+  rvec_ScaledAdd( temp, coef.C3dbopi2, workspace->dDeltap_self[i] );/*3rd*/
+
+  /* force */
+  rvec_Add(workspace->forceReduction[reductionOffset+i],temp );
+
+  for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) {
+    nbr_k = &(bonds->select.bond_list[pk]);
+    k = nbr_k->nbr;
+
+    rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp );      /*3rd,dBO*/
+    rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp);/*dDelta*/
+    rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp); /*4th,dBOpi*/
+    rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp);/*4th,dBOpi2*/
+
+    /* force */
+    rvec_Add(workspace->forceReduction[reductionOffset+k],temp );
+
+    /* pressure */
+    if( k != i ) {
+      ivec_Sum( rel_box, nbr_k->rel_box, nbr_j->rel_box ); //rel_box(k, i)
+      rvec_iMultiply( ext_press, rel_box, temp );
+      rvec_Add( workspace->my_ext_pressReduction[tid], ext_press );
+    }
+  }
+
+  /* then atom j itself */
+  rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp );                    /*1st, dBO*/
+  rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] );  /*2nd, dBO*/
+  rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp );             /*1st, dBO*/
+  rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j]);/*2nd, dBO*/
+
+  rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi );       /*1st,dBOpi*/
+  rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp );             /*2nd,dBOpi*/
+  rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j]);/*3rd,dBOpi*/
+
+  rvec_ScaledAdd( temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );    /*1st,dBOpi2*/
+  rvec_ScaledAdd( temp, -coef.C2dbopi2, bo_ij->dBOp );           /*2nd,dBOpi2*/
+  rvec_ScaledAdd( temp,coef.C4dbopi2,workspace->dDeltap_self[j]);/*3rd,dBOpi2*/
+
+  /* force */
+  rvec_Add(workspace->forceReduction[reductionOffset+j],temp );
+
+  /* pressure */
+  rvec_iMultiply( ext_press, nbr_j->rel_box, temp );
+    rvec_Add( workspace->my_ext_pressReduction[tid], ext_press );
+}
+
+/* ---------------------------------------------------------------------- */
+
+int BOp_OMP( storage *workspace, reax_list *bonds, double bo_cut,
+	     int i, int btop_i, far_neighbor_data *nbr_pj,
+	     single_body_parameters *sbp_i, single_body_parameters *sbp_j,
+	     two_body_parameters *twbp,
+	     int btop_j, double C12, double C34, double C56, double BO, double BO_s, double BO_pi, double BO_pi2) {
+  int j;
+  double rr2;
+  double Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
+  bond_data *ibond, *jbond;
+  bond_order_data *bo_ij, *bo_ji;
+
+  j = nbr_pj->nbr;
+  rr2 = 1.0 / SQR(nbr_pj->d);
+
+  // Top portion of BOp() moved to reaxc_forces_omp.cpp::Init_Forces_noQEq_OMP()
+
+  /* Initially BO values are the uncorrected ones, page 1 */
+
+  /****** bonds i-j and j-i ******/
+  ibond = &( bonds->select.bond_list[btop_i] );
+  jbond = &( bonds->select.bond_list[btop_j] );
+
+  ibond->nbr = j;
+  jbond->nbr = i;
+  ibond->d = nbr_pj->d;
+  jbond->d = nbr_pj->d;
+  rvec_Copy( ibond->dvec, nbr_pj->dvec );
+  rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
+  ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
+  ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
+  ibond->dbond_index = btop_i;
+  jbond->dbond_index = btop_i;
+  ibond->sym_index = btop_j;
+  jbond->sym_index = btop_i;
+
+  bo_ij = &( ibond->bo_data );
+  bo_ji = &( jbond->bo_data );
+  bo_ji->BO     = bo_ij->BO     = BO;
+  bo_ji->BO_s   = bo_ij->BO_s   = BO_s;
+  bo_ji->BO_pi  = bo_ij->BO_pi  = BO_pi;
+  bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2;
+
+  /* Bond Order page2-3, derivative of total bond order prime */
+  Cln_BOp_s   = twbp->p_bo2 * C12 * rr2;
+  Cln_BOp_pi  = twbp->p_bo4 * C34 * rr2;
+  Cln_BOp_pi2 = twbp->p_bo6 * C56 * rr2;
+
+  /* Only dln_BOp_xx wrt. dr_i is stored here, note that
+     dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
+  rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
+  rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
+  rvec_Scale(bo_ij->dln_BOp_pi2,
+	     -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
+  rvec_Scale(bo_ji->dln_BOp_s,   -1., bo_ij->dln_BOp_s);
+  rvec_Scale(bo_ji->dln_BOp_pi,  -1., bo_ij->dln_BOp_pi );
+  rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
+
+  rvec_Scale( bo_ij->dBOp,
+	      -(bo_ij->BO_s * Cln_BOp_s +
+		bo_ij->BO_pi * Cln_BOp_pi +
+		bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+  rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp );
+
+  bo_ij->BO_s -= bo_cut;
+  bo_ij->BO   -= bo_cut;
+  bo_ji->BO_s -= bo_cut;
+  bo_ji->BO   -= bo_cut;
+
+  bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
+  bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
+
+  return 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BOOMP( reax_system *system, control_params *control, simulation_data *data,
+	    storage *workspace, reax_list **lists, output_controls *out_control )
+{
+#ifdef OMP_TIMING
+  double endTimeBase, startTimeBase;
+  startTimeBase = MPI_Wtime();
+#endif
+
+  double p_lp1 = system->reax_param.gp.l[15];
+  int  num_bonds = 0;
+  double p_boc1 = system->reax_param.gp.l[0];
+  double p_boc2 = system->reax_param.gp.l[1];
+  reax_list *bonds = (*lists) + BONDS;
+  int  natoms = system->N;
+  int  nthreads = control->nthreads;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(shared)
+#endif
+  {
+    int  i, j, pj, type_i, type_j;
+    int  start_i, end_i, sym_index;
+    double val_i, Deltap_i, Deltap_boc_i;
+    double val_j, Deltap_j, Deltap_boc_j;
+    double f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5;
+    double exp_p1i, exp_p2i, exp_p1j, exp_p2j, explp1;
+    double temp, u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji;
+    double Cf45_ij, Cf45_ji; //u_ij, u_ji
+    double A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    bond_order_data *bo_ij, *bo_ji;
+
+#if defined(_OPENMP)
+    int tid = omp_get_thread_num();
+#else
+    int tid = 0;
+#endif
+    /* Calculate Deltaprime, Deltaprime_boc values */
+#if defined(_OPENMP)
+#pragma omp for schedule(static)
+#endif
+    for (i = 0; i < system->N; ++i) {
+      type_i = system->my_atoms[i].type;
+      if(type_i < 0) continue;
+      sbp_i = &(system->reax_param.sbp[type_i]);
+      workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency;
+      workspace->Deltap_boc[i] =
+	workspace->total_bond_order[i] - sbp_i->valency_val;
+
+      workspace->total_bond_order[i] = 0;
+    }
+
+    // Wait till initialization complete
+#if defined(_OPENMP)
+#pragma omp barrier
+#endif
+    
+    /* Corrected Bond Order calculations */
+#if defined(_OPENMP)
+#pragma omp for schedule(guided)
+#endif
+    for (i = 0; i < system->N; ++i) {
+      type_i = system->my_atoms[i].type;
+      if(type_i < 0) continue;
+      sbp_i = &(system->reax_param.sbp[type_i]);
+      val_i = sbp_i->valency;
+      Deltap_i = workspace->Deltap[i];
+      Deltap_boc_i = workspace->Deltap_boc[i];
+      start_i = Start_Index(i, bonds);
+      end_i = End_Index(i, bonds);
+
+      for (pj = start_i; pj < end_i; ++pj) {
+	j = bonds->select.bond_list[pj].nbr;
+	type_j = system->my_atoms[j].type;
+	if(type_j < 0) continue;
+	bo_ij = &( bonds->select.bond_list[pj].bo_data );
+
+	if( i < j || workspace->bond_mark[j] > 3) {
+	  twbp = &( system->reax_param.tbp[type_i][type_j] );
+
+	  if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) {
+	    bo_ij->C1dbo = 1.000000;
+	    bo_ij->C2dbo = 0.000000;
+	    bo_ij->C3dbo = 0.000000;
+	
+	    bo_ij->C1dbopi = bo_ij->BO_pi;
+	    bo_ij->C2dbopi = 0.000000;
+	    bo_ij->C3dbopi = 0.000000;
+	    bo_ij->C4dbopi = 0.000000;
+	
+	    bo_ij->C1dbopi2 = bo_ij->BO_pi2;
+	    bo_ij->C2dbopi2 = 0.000000;
+	    bo_ij->C3dbopi2 = 0.000000;
+	    bo_ij->C4dbopi2 = 0.000000;
+	
+	  }
+	  else {
+	    val_j = system->reax_param.sbp[type_j].valency;
+	    Deltap_j = workspace->Deltap[j];
+	    Deltap_boc_j = workspace->Deltap_boc[j];
+	
+	    /* on page 1 */
+	    if( twbp->ovc >= 0.001 ) {
+	      /* Correction for overcoordination */
+	      exp_p1i = exp( -p_boc1 * Deltap_i );
+	      exp_p2i = exp( -p_boc2 * Deltap_i );
+	      exp_p1j = exp( -p_boc1 * Deltap_j );
+	      exp_p2j = exp( -p_boc2 * Deltap_j );
+	
+	      f2 = exp_p1i + exp_p1j;
+	      f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i  + exp_p2j ) );
+	      f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) +
+			   ( val_j + f2 )/( val_j + f2 + f3 ) );
+	
+	      /* Now come the derivates */
+	      /* Bond Order pages 5-7, derivative of f1 */
+	      temp = f2 + f3;
+	      u1_ij = val_i + temp;
+	      u1_ji = val_j + temp;
+	      Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) +
+				    1.0 / SQR( u1_ji ));
+	      Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) +
+				( u1_ji - f3 ) / SQR( u1_ji ));
+	
+	      Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij -
+				((val_i+f2) / SQR(u1_ij)) *
+				( -p_boc1 * exp_p1i +
+				  exp_p2i / ( exp_p2i + exp_p2j ) ) +
+				-p_boc1 * exp_p1i / u1_ji -
+				((val_j+f2) / SQR(u1_ji)) *
+				( -p_boc1 * exp_p1i +
+				  exp_p2i / ( exp_p2i + exp_p2j ) ));
+	
+	
+	      Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j +
+		Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j );
+	    }
+	    else {
+	      /* No overcoordination correction! */
+	      f1 = 1.0;
+	      Cf1_ij = Cf1_ji = 0.0;
+	    }
+
+	    if( twbp->v13cor >= 0.001 ) {
+	      /* Correction for 1-3 bond orders */
+	      exp_f4 =exp(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
+			    Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5);
+	      exp_f5 =exp(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
+			    Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5);
+	
+	      f4 = 1. / (1. + exp_f4);
+	      f5 = 1. / (1. + exp_f5);
+	      f4f5 = f4 * f5;
+	
+	      /* Bond Order pages 8-9, derivative of f4 and f5 */
+	      Cf45_ij = -f4 * exp_f4;
+	      Cf45_ji = -f5 * exp_f5;
+	    }
+	    else {
+	      f4 = f5 = f4f5 = 1.0;
+	      Cf45_ij = Cf45_ji = 0.0;
+	    }
+	
+	    /* Bond Order page 10, derivative of total bond order */
+	    A0_ij = f1 * f4f5;
+	    A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO *
+	      (Cf45_ij + Cf45_ji);
+	    A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij;
+	    A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji;
+	    A3_ij = A2_ij + Cf1_ij / f1;
+	    A3_ji = A2_ji + Cf1_ji / f1;
+	
+	    /* find corrected bond orders and their derivative coef */
+	    bo_ij->BO    = bo_ij->BO    * A0_ij;
+	    bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1;
+	    bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1;
+	    bo_ij->BO_s  = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 );
+	
+	    bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij;
+	    bo_ij->C2dbo = bo_ij->BO * A2_ij;
+	    bo_ij->C3dbo = bo_ij->BO * A2_ji;
+	
+	    bo_ij->C1dbopi = f1*f1*f4*f5;
+	    bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij;
+	    bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij;
+	    bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji;
+	
+	    bo_ij->C1dbopi2 = f1*f1*f4*f5;
+	    bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij;
+	    bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij;
+	    bo_ij->C4dbopi2 = bo_ij->BO_pi2 * A3_ji;
+	  }
+	
+	  /* neglect bonds that are < 1e-10 */
+	  if( bo_ij->BO < 1e-10 )
+	    bo_ij->BO = 0.0;
+	  if( bo_ij->BO_s < 1e-10 )
+	    bo_ij->BO_s = 0.0;
+	  if( bo_ij->BO_pi < 1e-10 )
+	    bo_ij->BO_pi = 0.0;
+	  if( bo_ij->BO_pi2 < 1e-10 )
+	    bo_ij->BO_pi2 = 0.0;
+	
+	  workspace->total_bond_order[i] += bo_ij->BO; //now keeps total_BO
+	}
+	// else {
+	//   /* We only need to update bond orders from bo_ji
+	//      everything else is set in uncorrected_bo calculations */
+	//   sym_index = bonds->select.bond_list[pj].sym_index;
+	//   bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data);
+	//   bo_ij->BO = bo_ji->BO;
+	//   bo_ij->BO_s = bo_ji->BO_s;
+	//   bo_ij->BO_pi = bo_ji->BO_pi;
+	//   bo_ij->BO_pi2 = bo_ji->BO_pi2;
+	
+	//   workspace->total_bond_order[i] += bo_ij->BO;// now keeps total_BO
+	// }
+      }
+
+    }
+
+    // Wait for bo_ij to be updated
+#if defined(_OPENMP)
+#pragma omp barrier
+#endif
+    // Try to combine the following for-loop back into the for-loop above
+    /*-------------------------*/
+#if defined(_OPENMP)
+#pragma omp for schedule(guided)
+#endif
+    for (i = 0; i < system->N; ++i) {
+      type_i = system->my_atoms[i].type;
+      if(type_i < 0) continue;
+      start_i = Start_Index(i, bonds);
+      end_i = End_Index(i, bonds);
+
+      for (pj = start_i; pj < end_i; ++pj) {
+	j = bonds->select.bond_list[pj].nbr;
+	type_j = system->my_atoms[j].type;
+	if(type_j < 0) continue;
+
+	if( i < j || workspace->bond_mark[j] > 3) {
+	  // Computed in previous for-loop
+	} else {
+	  /* We only need to update bond orders from bo_ji
+	     everything else is set in uncorrected_bo calculations */
+	  sym_index = bonds->select.bond_list[pj].sym_index;
+	
+	  bo_ij = &( bonds->select.bond_list[pj].bo_data );
+	  bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data);
+	  bo_ij->BO = bo_ji->BO;
+	  bo_ij->BO_s = bo_ji->BO_s;
+	  bo_ij->BO_pi = bo_ji->BO_pi;
+	  bo_ij->BO_pi2 = bo_ji->BO_pi2;
+	
+	  workspace->total_bond_order[i] += bo_ij->BO;// now keeps total_BO
+	}
+      }
+
+    }
+
+    /*-------------------------*/
+
+    // Need to wait for total_bond_order to be accumulated.
+#if defined(_OPENMP)
+#pragma omp barrier
+#endif
+    /* Calculate some helper variables that are  used at many places
+       throughout force calculations */
+#if defined(_OPENMP)
+#pragma omp for schedule(guided)
+#endif
+    for(j = 0; j < system->N; ++j ) {
+      type_j = system->my_atoms[j].type;
+      if(type_j < 0) continue;
+      sbp_j = &(system->reax_param.sbp[ type_j ]);
+
+      workspace->Delta[j] = workspace->total_bond_order[j] - sbp_j->valency;
+      workspace->Delta_e[j] = workspace->total_bond_order[j] - sbp_j->valency_e;
+      workspace->Delta_boc[j] = workspace->total_bond_order[j] -
+	sbp_j->valency_boc;
+      workspace->Delta_val[j] = workspace->total_bond_order[j] -
+	sbp_j->valency_val;
+
+      workspace->vlpex[j] = workspace->Delta_e[j] -
+	2.0 * (int)(workspace->Delta_e[j]/2.0);
+      explp1 = exp(-p_lp1 * SQR(2.0 + workspace->vlpex[j]));
+      workspace->nlp[j] = explp1 - (int)(workspace->Delta_e[j] / 2.0);
+      workspace->Delta_lp[j] = sbp_j->nlp_opt - workspace->nlp[j];
+      workspace->Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace->vlpex[j]);
+      workspace->dDelta_lp[j] = workspace->Clp[j];
+
+      if( sbp_j->mass > 21.0 ) {
+	workspace->nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency);
+	workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
+	workspace->dDelta_lp_temp[j] = 0.;
+      }
+      else {
+	workspace->nlp_temp[j] = workspace->nlp[j];
+	workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
+	workspace->dDelta_lp_temp[j] = workspace->Clp[j];
+      }
+    }
+
+  } // parallel region
+
+#ifdef OMP_TIMING
+	endTimeBase = MPI_Wtime();
+	ompTimingData[COMPUTEBOINDEX] += (endTimeBase-startTimeBase);
+
+#endif
+}
--- a/src/USER-OMP/reaxc_bond_orders_omp.h
+++ b/src/USER-OMP/reaxc_bond_orders_omp.h
@ -0,0 +1,43 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __BOND_ORDERS_OMP_H_
+#define __BOND_ORDERS_OMP_H_
+
+#include "reaxc_types.h"
+#include "reaxc_bond_orders.h"
+
+void Add_dBond_to_ForcesOMP( reax_system*, int, int, storage*, reax_list** );
+void Add_dBond_to_Forces_NPTOMP( reax_system *system, int, int, simulation_data*,
+                              storage*, reax_list** );
+
+int BOp_OMP(storage*, reax_list*, double, int, int, far_neighbor_data*,
+	    single_body_parameters*, single_body_parameters*, two_body_parameters*,
+	    int, double, double, double, double, double, double, double);
+
+void BOOMP( reax_system*, control_params*, simulation_data*,
+         storage*, reax_list**, output_controls* );
+#endif
--- a/src/USER-OMP/reaxc_bonds_omp.cpp
+++ b/src/USER-OMP/reaxc_bonds_omp.cpp
@ -0,0 +1,186 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "pair_reaxc_omp.h"
+
+#include "reaxc_bonds_omp.h"
+#include "reaxc_bond_orders_omp.h"
+#include "reaxc_list.h"
+#include "reaxc_tool_box.h"
+#include "reaxc_vector.h"
+
+#if defined(_OPENMP)
+#include  <omp.h>
+#endif
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+void BondsOMP( reax_system *system, control_params *control,
+            simulation_data *data, storage *workspace, reax_list **lists,
+            output_controls *out_control )
+{
+#ifdef OMP_TIMING
+  double endTimeBase, startTimeBase;
+  startTimeBase = MPI_Wtime();
+#endif
+
+  int natoms = system->n;
+  int nthreads = control->nthreads;
+  reax_list *bonds = (*lists) + BONDS;
+  double gp3 = system->reax_param.gp.l[3];
+  double gp4 = system->reax_param.gp.l[4];
+  double gp7 = system->reax_param.gp.l[7];
+  double gp10 = system->reax_param.gp.l[10];
+  double gp37 = (int) system->reax_param.gp.l[37];
+  double total_Ebond = 0.0;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(shared) reduction(+: total_Ebond)
+#endif
+  {
+  int  i, j, pj;
+  int start_i, end_i;
+  int type_i, type_j;
+  double ebond, ebond_thr=0.0, pow_BOs_be2, exp_be12, CEbo;
+  double gp3, gp4, gp7, gp10, gp37;
+  double exphu, exphua1, exphub1, exphuov, hulpov, estriph, estriph_thr=0.0;
+  double decobdbo, decobdboua, decobdboub;
+  single_body_parameters *sbp_i, *sbp_j;
+  two_body_parameters *twbp;
+  bond_order_data *bo_ij;
+  
+#if defined(_OPENMP)
+  int tid = omp_get_thread_num();
+#else
+  int tid = 0;
+#endif
+  long reductionOffset = (system->N * tid);
+
+  class PairReaxCOMP *pair_reax_ptr;
+  pair_reax_ptr = static_cast<class PairReaxCOMP*>(system->pair_ptr);
+  class ThrData *thr = pair_reax_ptr->getFixOMP()->get_thr(tid);
+
+  pair_reax_ptr->ev_setup_thr_proxy(system->pair_ptr->eflag_either,
+				    system->pair_ptr->vflag_either, natoms,
+				    system->pair_ptr->eatom, system->pair_ptr->vatom, thr);
+
+#if defined(_OPENMP)
+#pragma omp for schedule(guided)
+#endif
+  for (i = 0; i < natoms; ++i) {
+    start_i = Start_Index(i, bonds);
+    end_i = End_Index(i, bonds);
+
+    for (pj = start_i; pj < end_i; ++pj) {
+      j = bonds->select.bond_list[pj].nbr;
+
+      if( system->my_atoms[i].orig_id > system->my_atoms[j].orig_id ) continue;
+
+      if( system->my_atoms[i].orig_id == system->my_atoms[j].orig_id ) {
+        if (system->my_atoms[j].x[2] <  system->my_atoms[i].x[2]) continue;
+      	if (system->my_atoms[j].x[2] == system->my_atoms[i].x[2] &&
+      	    system->my_atoms[j].x[1] <  system->my_atoms[i].x[1]) continue;
+        if (system->my_atoms[j].x[2] == system->my_atoms[i].x[2] &&
+      	    system->my_atoms[j].x[1] == system->my_atoms[i].x[1] &&
+      	    system->my_atoms[j].x[0] <  system->my_atoms[i].x[0]) continue;
+      }
+
+      /* set the pointers */
+      type_i = system->my_atoms[i].type;
+      type_j = system->my_atoms[j].type;
+      sbp_i = &( system->reax_param.sbp[type_i] );
+      sbp_j = &( system->reax_param.sbp[type_j] );
+      twbp = &( system->reax_param.tbp[type_i][type_j] );
+      bo_ij = &( bonds->select.bond_list[pj].bo_data );
+
+      /* calculate the constants */
+      pow_BOs_be2 = pow( bo_ij->BO_s, twbp->p_be2 );
+      exp_be12 = exp( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) );
+      CEbo = -twbp->De_s * exp_be12 *
+	( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
+
+      /* calculate the Bond Energy */
+      total_Ebond += ebond =
+	-twbp->De_s * bo_ij->BO_s * exp_be12
+	-twbp->De_p * bo_ij->BO_pi
+	-twbp->De_pp * bo_ij->BO_pi2;
+
+      /* tally into per-atom energy */
+      if (system->pair_ptr->evflag)
+	pair_reax_ptr->ev_tally_thr_proxy(system->pair_ptr, i, j, natoms, 1,
+					  ebond, 0.0, 0.0, 0.0, 0.0, 0.0, thr);
+
+      /* calculate derivatives of Bond Orders */
+      bo_ij->Cdbo += CEbo;
+      bo_ij->Cdbopi -= (CEbo + twbp->De_p);
+      bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp);
+
+      /* Stabilisation terminal triple bond */
+      if (bo_ij->BO >= 1.00) {
+	if (gp37 == 2 ||
+	    (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) ||
+	    (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990)) {
+	  exphu = exp( -gp7 * SQR(bo_ij->BO - 2.50) );
+	  exphua1 = exp(-gp3 * (workspace->total_bond_order[i]-bo_ij->BO));
+	  exphub1 = exp(-gp3 * (workspace->total_bond_order[j]-bo_ij->BO));
+	  exphuov = exp(gp4 * (workspace->Delta[i] + workspace->Delta[j]));
+	  hulpov = 1.0 / (1.0 + 25.0 * exphuov);
+	
+	  estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
+	  total_Ebond += estriph;
+	
+	  decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) *
+	    ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
+	  decobdboua = -gp10 * exphu * hulpov *
+	    (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+	  decobdboub = -gp10 * exphu * hulpov *
+	    (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+	
+	  /* tally into per-atom energy */
+	  if (system->pair_ptr->evflag)
+	    pair_reax_ptr->ev_tally_thr_proxy(system->pair_ptr, i, j, natoms, 1,
+					      estriph, 0.0, 0.0, 0.0, 0.0, 0.0, thr);
+	
+	  bo_ij->Cdbo += decobdbo;
+	  workspace->CdDelta[i] += decobdboua;
+	  workspace->CdDeltaReduction[reductionOffset+j] += decobdboub;
+        }
+      }
+    }
+  } // for(i)
+
+ } // omp
+
+ data->my_en.e_bond += total_Ebond;
+
+#ifdef OMP_TIMING
+ endTimeBase = MPI_Wtime();
+ ompTimingData[COMPUTEBONDSINDEX] += (endTimeBase-startTimeBase);
+#endif
+
+}
--- a/src/USER-OMP/reaxc_bonds_omp.h
+++ b/src/USER-OMP/reaxc_bonds_omp.h
@ -0,0 +1,35 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __BONDS_OMP_H_
+#define __BONDS_OMP_H_
+
+#include "reaxc_types.h"
+
+void BondsOMP( reax_system*, control_params*, simulation_data*,
+            storage*, reax_list**, output_controls* );
+
+#endif
--- a/src/USER-OMP/reaxc_forces_omp.cpp
+++ b/src/USER-OMP/reaxc_forces_omp.cpp
@ -0,0 +1,649 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "pair_reaxc_omp.h"
+#include "thr_data.h"
+
+#include "reaxc_forces_omp.h"
+#include "reaxc_bond_orders_omp.h"
+#include "reaxc_bonds_omp.h"
+#include "reaxc_hydrogen_bonds_omp.h"
+#include "reaxc_io_tools.h"
+#include "reaxc_list.h"
+#include "reaxc_lookup.h"
+#include "reaxc_multi_body_omp.h"
+#include "reaxc_nonbonded_omp.h"
+#include "reaxc_tool_box.h"
+#include "reaxc_torsion_angles_omp.h"
+#include "reaxc_valence_angles_omp.h"
+#include "reaxc_vector.h"
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+using namespace LAMMPS_NS;
+
+// Functions defined in reaxc_forces.cpp
+extern interaction_function Interaction_Functions[];
+extern double Compute_H(double, double, double*);
+extern double Compute_tabH(double, int, int);
+extern void Dummy_Interaction(reax_system*, control_params*, simulation_data*, storage*, reax_list**, output_controls*);
+
+/* ---------------------------------------------------------------------- */
+
+void Init_Force_FunctionsOMP( control_params *control )
+{
+  Interaction_Functions[0] = BOOMP;
+  Interaction_Functions[1] = BondsOMP; //Dummy_Interaction;
+  Interaction_Functions[2] = Atom_EnergyOMP; //Dummy_Interaction;
+  Interaction_Functions[3] = Valence_AnglesOMP; //Dummy_Interaction;
+  Interaction_Functions[4] = Torsion_AnglesOMP; //Dummy_Interaction;
+  if( control->hbond_cut > 0 )
+    Interaction_Functions[5] = Hydrogen_BondsOMP;
+  else Interaction_Functions[5] = Dummy_Interaction;
+  Interaction_Functions[6] = Dummy_Interaction; //empty
+  Interaction_Functions[7] = Dummy_Interaction; //empty
+  Interaction_Functions[8] = Dummy_Interaction; //empty
+  Interaction_Functions[9] = Dummy_Interaction; //empty
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Only difference with MPI-only version is inclusion of OMP_TIMING statements
+void Compute_Bonded_ForcesOMP( reax_system *system, control_params *control,
+                            simulation_data *data, storage *workspace,
+                            reax_list **lists, output_controls *out_control,
+                            MPI_Comm comm )
+{
+  int i;
+
+#ifdef OMP_TIMING
+  double startTimeBase, endTimeBase;
+  startTimeBase = MPI_Wtime();
+#endif
+
+  /* Implement all force calls as function pointers */
+  for( i = 0; i < NUM_INTRS; i++ ) {
+    (Interaction_Functions[i])( system, control, data, workspace,
+                                lists, out_control );
+  }
+
+#ifdef OMP_TIMING
+  endTimeBase = MPI_Wtime();
+  ompTimingData[COMPUTEBFINDEX] += (endTimeBase-startTimeBase);
+#endif
+
+}
+
+// Only difference with MPI-only version is inclusion of OMP_TIMING statements
+void Compute_NonBonded_ForcesOMP( reax_system *system, control_params *control,
+                               simulation_data *data, storage *workspace,
+                               reax_list **lists, output_controls *out_control,
+                               MPI_Comm comm )
+{
+  /* van der Waals and Coulomb interactions */
+#ifdef OMP_TIMING
+  double endTimeBase, startTimeBase;
+  startTimeBase = MPI_Wtime();
+#endif
+
+  if( control->tabulate == 0 )
+    vdW_Coulomb_Energy_OMP( system, control, data, workspace,
+			    lists, out_control );
+  else
+    Tabulated_vdW_Coulomb_Energy_OMP( system, control, data, workspace,
+				      lists, out_control );
+
+#ifdef OMP_TIMING
+  endTimeBase = MPI_Wtime();
+  ompTimingData[COMPUTENBFINDEX] += (endTimeBase-startTimeBase);
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* this version of Compute_Total_Force computes forces from
+   coefficients accumulated by all interaction functions.
+   Saves enormous time & space! */
+void Compute_Total_ForceOMP( reax_system *system, control_params *control,
+                          simulation_data *data, storage *workspace,
+                          reax_list **lists, mpi_datatypes *mpi_data )
+{
+#ifdef OMP_TIMING
+  double startTimeBase,endTimeBase;
+  startTimeBase = MPI_Wtime();
+#endif
+
+  int natoms = system->N;
+  int nthreads = control->nthreads;
+  long totalReductionSize = system->N * nthreads;
+  reax_list *bonds = (*lists) + BONDS;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(shared) //default(none)
+#endif
+  {
+    int i, j, k, pj, pk, start_j, end_j;
+#if defined(_OPENMP)
+    int tid = omp_get_thread_num();
+#else
+    int tid = 0;
+#endif
+    bond_order_data *bo_jk;
+
+    class PairReaxCOMP *pair_reax_ptr;
+    pair_reax_ptr = static_cast<class PairReaxCOMP*>(system->pair_ptr);
+    class ThrData *thr = pair_reax_ptr->getFixOMP()->get_thr(tid);
+
+    pair_reax_ptr->ev_setup_thr_proxy(0, 1, natoms, system->pair_ptr->eatom,
+ 				      system->pair_ptr->vatom, thr);
+
+#if defined(_OPENMP)
+#pragma omp for schedule(guided)
+#endif
+    for (i = 0; i < system->N; ++i) {
+      for (j = 0; j < nthreads; ++j)
+	workspace->CdDelta[i] += workspace->CdDeltaReduction[system->N*j+i];
+    }
+
+#if defined(_OPENMP)
+#pragma omp for schedule(dynamic,50)
+#endif
+    for (j = 0; j < system->N; ++j) {
+      start_j = Start_Index(j, bonds);
+      end_j = End_Index(j, bonds);
+
+      for (pk = start_j; pk < end_j; ++pk) {
+ 	bo_jk = &( bonds->select.bond_list[pk].bo_data );
+ 	for (k = 0; k < nthreads; ++k)
+ 	  bo_jk->Cdbo += bo_jk->CdboReduction[k];
+      }
+    }
+
+// #pragma omp for schedule(guided) //(dynamic,50)
+//     for (i = 0; i < system->N; ++i)
+//       for (pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj)
+//  	if (i < bonds->select.bond_list[pj].nbr) {
+//  	  if (control->virial == 0)
+// 	    Add_dBond_to_ForcesOMP( system, i, pj, workspace, lists );
+// 	  else
+//  	    Add_dBond_to_Forces_NPTOMP(system, i, pj, data, workspace, lists );
+//  	}
+
+    if(control->virial == 0) {
+
+#if defined(_OPENMP)
+#pragma omp for schedule(dynamic,50)
+#endif
+      for (i = 0; i < system->N; ++i) {
+	const int startj = Start_Index(i, bonds);
+	const int endj  = End_Index(i, bonds);
+	for (pj = startj; pj < endj; ++pj)
+	  if (i < bonds->select.bond_list[pj].nbr)
+	    Add_dBond_to_ForcesOMP( system, i, pj, workspace, lists );
+      }
+
+    } else {
+
+#if defined(_OPENMP)
+#pragma omp for schedule(dynamic,50)
+#endif
+      for (i = 0; i < system->N; ++i) {
+	const int startj = Start_Index(i, bonds);
+	const int endj  = End_Index(i, bonds);
+	for (pj = startj; pj < endj; ++pj)
+	  if (i < bonds->select.bond_list[pj].nbr)
+	    Add_dBond_to_Forces_NPTOMP(system, i, pj, data, workspace, lists );
+      }
+
+    } // if(virial == 0)
+
+    pair_reax_ptr->reduce_thr_proxy(system->pair_ptr, 0, 1, thr);
+
+#if defined(_OPENMP)
+#pragma omp for schedule(guided)
+#endif
+    for (i = 0; i < system->N; ++i) {
+      for (j = 0; j < nthreads; ++j)
+ 	rvec_Add( workspace->f[i], workspace->forceReduction[system->N*j+i] );
+    }
+
+
+#if defined(_OPENMP)
+#pragma omp for schedule(guided)
+#endif
+    for (i = 0; i < totalReductionSize; i++) {
+      workspace->forceReduction[i][0] = 0;
+      workspace->forceReduction[i][1] = 0;
+      workspace->forceReduction[i][2] = 0;
+      workspace->CdDeltaReduction[i] = 0;
+    }
+  } // parallel region
+
+  if (control->virial)
+    for (int i=0; i < nthreads; ++i) {
+      rvec_Add(data->my_ext_press, workspace->my_ext_pressReduction[i]);
+      workspace->my_ext_pressReduction[i][0] = 0;
+      workspace->my_ext_pressReduction[i][1] = 0;
+      workspace->my_ext_pressReduction[i][2] = 0;
+    }
+
+#ifdef OMP_TIMING
+  endTimeBase = MPI_Wtime();
+  ompTimingData[COMPUTETFINDEX] += (endTimeBase-startTimeBase);
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+void Validate_ListsOMP( reax_system *system, storage *workspace, reax_list **lists,
+                     int step, int n, int N, int numH, MPI_Comm comm )
+{
+  int i, comp, Hindex;
+  reax_list *bonds, *hbonds;
+  reallocate_data *realloc = &(workspace->realloc);
+  double saferzone = system->saferzone;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(shared) private(i, comp, Hindex)
+#endif
+  {
+
+  /* bond list */
+  if( N > 0 ) {
+    bonds = *lists + BONDS;
+
+#if defined(_OPENMP)
+#pragma omp for schedule(guided)
+#endif
+    for( i = 0; i < N; ++i ) {
+      system->my_atoms[i].num_bonds = MAX(Num_Entries(i,bonds)*2, MIN_BONDS);
+
+      if( i < N-1 )
+        comp = Start_Index(i+1, bonds);
+      else comp = bonds->num_intrs;
+
+      if( End_Index(i, bonds) > comp ) {
+        fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
+                 step, i, End_Index(i,bonds), comp );
+        MPI_Abort( comm, INSUFFICIENT_MEMORY );
+      }
+    }
+  }
+
+
+  /* hbonds list */
+  if( numH > 0 ) {
+    hbonds = *lists + HBONDS;
+
+#if defined(_OPENMP)
+#pragma omp for schedule(guided)
+#endif
+    for( i = 0; i < n; ++i ) {
+      Hindex = system->my_atoms[i].Hindex;
+      if( Hindex > -1 ) {
+        system->my_atoms[i].num_hbonds =
+          (int)(MAX( Num_Entries(Hindex, hbonds)*saferzone, MIN_HBONDS ));
+
+        if( Hindex < numH-1 )
+          comp = Start_Index(Hindex+1, hbonds);
+        else comp = hbonds->num_intrs;
+
+        if( End_Index(Hindex, hbonds) > comp ) {
+          fprintf(stderr,"step%d-hbondchk failed: H=%d end(H)=%d str(H+1)=%d\n",
+                  step, Hindex, End_Index(Hindex,hbonds), comp );
+          MPI_Abort( comm, INSUFFICIENT_MEMORY );
+        }
+      }
+    }
+  }
+
+  } // omp parallel
+}
+
+
+void Init_Forces_noQEq_OMP( reax_system *system, control_params *control,
+			    simulation_data *data, storage *workspace,
+			    reax_list **lists, output_controls *out_control,
+			    MPI_Comm comm ) {
+#ifdef OMP_TIMING
+  double startTimeBase, endTimeBase;
+  startTimeBase = MPI_Wtime();
+#endif
+
+  int i, j, pi, pj;
+  int start_i, end_i, start_j, end_j;
+  int type_i, type_j;
+  int ihb, jhb, ihb_top, jhb_top;
+  int local, flag;
+  double r_ij, cutoff;
+  single_body_parameters *sbp_i, *sbp_j;
+  two_body_parameters *twbp;
+  far_neighbor_data *nbr_pj;
+  reax_atom *atom_i, *atom_j;
+  bond_data *ibond, *jbond;
+  reax_list *far_nbrs = *lists + FAR_NBRS;
+  reax_list *bonds = *lists + BONDS;
+  reax_list *hbonds = *lists + HBONDS;
+  int num_bonds = 0;
+  int num_hbonds = 0;
+  int btop_i = 0;
+  int btop_j = 0;
+  int renbr = (data->step-data->prev_steps) % control->reneighbor == 0;
+
+  // We will use CdDeltaReduction as a temporary (double) buffer to accumulate total_bond_order
+  // This is safe because CdDeltaReduction is currently zeroed and its accumulation doesn't start until BondsOMP()
+  double * tmp_bond_order = workspace->CdDeltaReduction;
+
+  // We do the same with forceReduction as a temporary (rvec) buffer to accumulate dDeltap_self
+  // This is safe because forceReduction is currently zeroed and its accumulation does start until Hydrogen_BondsOMP()
+  rvec * tmp_ddelta = workspace->forceReduction;
+
+  /* uncorrected bond orders */
+  cutoff = control->bond_cut;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(shared) \
+  private(i, atom_i, type_i, pi, start_i, end_i, sbp_i, btop_i, ibond, ihb, ihb_top, \
+          j, atom_j, type_j, pj, start_j, end_j, sbp_j, nbr_pj, jbond, jhb, twbp)
+#endif
+  {
+
+    int nthreads = control->nthreads;
+#if defined(_OPENMP)
+    int tid = omp_get_thread_num();
+#else
+    int tid = 0;
+#endif
+    long reductionOffset = system->N * tid;
+    long totalReductionSize = system->N * nthreads;
+
+#if defined(_OPENMP)
+#pragma omp for schedule(dynamic,50) reduction(+ : num_bonds)
+#endif
+  for (i = 0; i < system->N; ++i) {
+    atom_i = &(system->my_atoms[i]);
+    type_i  = atom_i->type;
+    sbp_i = &(system->reax_param.sbp[type_i]);
+
+    start_i = Start_Index(i, far_nbrs);
+    end_i   = End_Index(i, far_nbrs);
+
+    for( pj = start_i; pj < end_i; ++pj ) {
+      nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+      if (nbr_pj->d <= cutoff) {
+	j = nbr_pj->nbr;
+	atom_j = &(system->my_atoms[j]);
+	type_j = atom_j->type;
+	sbp_j = &(system->reax_param.sbp[type_j]);
+	twbp = &(system->reax_param.tbp[type_i][type_j]);
+
+// #pragma omp critical
+// 	{
+// 	  btop_i = End_Index(i, bonds);
+// 	  if( BOp(workspace, bonds, control->bo_cut, i, btop_i, nbr_pj, sbp_i, sbp_j, twbp) ) {
+//             num_bonds++;
+//             btop_i++;
+//             Set_End_Index(i, btop_i, bonds);
+// 	  }
+	
+// 	}
+	
+	// Trying to minimize time spent in critical section by moving initial part of BOp()
+	// outside of critical section.
+
+	// Start top portion of BOp()
+	int jj = nbr_pj->nbr;
+	double C12, C34, C56;
+	double BO, BO_s, BO_pi, BO_pi2;
+	double bo_cut = control->bo_cut;
+
+	if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 ) {
+	  C12 = twbp->p_bo1 * pow( nbr_pj->d / twbp->r_s, twbp->p_bo2 );
+	  BO_s = (1.0 + bo_cut) * exp( C12 );
+	}
+	else BO_s = C12 = 0.0;
+	
+	if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0 ) {
+	  C34 = twbp->p_bo3 * pow( nbr_pj->d / twbp->r_p, twbp->p_bo4 );
+	  BO_pi = exp( C34 );
+	}
+	else BO_pi = C34 = 0.0;
+	
+	if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0 ) {
+	  C56 = twbp->p_bo5 * pow( nbr_pj->d / twbp->r_pp, twbp->p_bo6 );
+	  BO_pi2= exp( C56 );
+	}
+	else BO_pi2 = C56 = 0.0;
+	
+	/* Initially BO values are the uncorrected ones, page 1 */
+	BO = BO_s + BO_pi + BO_pi2;
+	// End top portion of BOp()
+	
+	if(BO >= bo_cut) {
+	  int btop_j;
+	
+	  // Update indices in critical section
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+ 	  {
+	    btop_i = End_Index( i, bonds );
+	    btop_j = End_Index( j, bonds );
+	    Set_End_Index( j, btop_j+1, bonds );
+	    Set_End_Index( i, btop_i+1, bonds );
+ 	  } // omp critical
+	
+	  // Finish remaining BOp() work
+	  BOp_OMP(workspace, bonds, bo_cut,
+		  i , btop_i, nbr_pj, sbp_i, sbp_j, twbp, btop_j,
+		  C12, C34, C56, BO, BO_s, BO_pi, BO_pi2);
+	
+ 	  bond_data * ibond = &(bonds->select.bond_list[btop_i]);
+ 	  bond_order_data * bo_ij = &(ibond->bo_data);
+	
+ 	  bond_data * jbond = &(bonds->select.bond_list[btop_j]);
+ 	  bond_order_data * bo_ji = &(jbond->bo_data);
+	
+	  workspace->total_bond_order[i]      += bo_ij->BO;
+	  tmp_bond_order[reductionOffset + j] += bo_ji->BO;
+	
+	  rvec_Add(workspace->dDeltap_self[i],      bo_ij->dBOp);
+	  rvec_Add(tmp_ddelta[reductionOffset + j], bo_ji->dBOp);
+	
+	  btop_i++;
+	  num_bonds++;
+	} // if(BO>=bo_cut)
+	
+      } // if(cutoff)
+
+    } // for(pj)
+  } // for(i)
+
+  // Need to wait for all indices and tmp arrays accumulated.
+#if defined(_OPENMP)
+#pragma omp barrier
+#endif
+
+#if defined(_OPENMP)
+#pragma omp for schedule(dynamic,50)
+#endif
+  for(i=0; i<system->N; i++)
+    for(int t=0; t<nthreads; t++) {
+      const int indx = t*system->N + i;
+      workspace->dDeltap_self[i][0]  += tmp_ddelta[indx][0];
+      workspace->dDeltap_self[i][1]  += tmp_ddelta[indx][1];
+      workspace->dDeltap_self[i][2]  += tmp_ddelta[indx][2];
+      workspace->total_bond_order[i] += tmp_bond_order[indx];
+    }
+
+  /* hydrogen bond list */
+  if (control->hbond_cut > 0) {
+    cutoff = control->hbond_cut;
+
+#if defined(_OPENMP)
+#pragma omp for schedule(dynamic,50) reduction(+ : num_hbonds)
+#endif
+     for (i = 0; i < system->n; ++i) {
+       atom_i = &(system->my_atoms[i]);
+       type_i  = atom_i->type;
+       sbp_i = &(system->reax_param.sbp[type_i]);
+       ihb = sbp_i->p_hbond;
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+       {
+
+       if (ihb == 1 || ihb == 2) {
+	 start_i = Start_Index(i, far_nbrs);
+	 end_i   = End_Index(i, far_nbrs);
+	
+	 for (pj = start_i; pj < end_i; ++pj) {
+	   nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+	   j = nbr_pj->nbr;
+	   atom_j = &(system->my_atoms[j]);
+	   type_j = atom_j->type;
+	   if(type_j < 0) continue;
+	   sbp_j = &(system->reax_param.sbp[type_j]);
+	   jhb = sbp_j->p_hbond;
+
+	   if (nbr_pj->d <= control->hbond_cut) {
+	     int iflag = 0;
+	     int jflag = 0;
+	
+	     if(ihb==1 && jhb==2) iflag = 1;
+	     else if(j<system->n && ihb == 2 && jhb == 1) jflag = 1;
+	
+	     if(iflag || jflag) {
+		 if(iflag) {
+		   ihb_top = End_Index(atom_i->Hindex, hbonds);
+		   Set_End_Index(atom_i->Hindex, ihb_top+1, hbonds);
+		 } else if(jflag) {
+		   jhb_top = End_Index(atom_j->Hindex, hbonds);
+		   Set_End_Index(atom_j->Hindex, jhb_top+1, hbonds);
+		 }
+
+	       if(iflag) {
+		 hbonds->select.hbond_list[ihb_top].nbr = j;
+		 hbonds->select.hbond_list[ihb_top].scl = 1;
+		 hbonds->select.hbond_list[ihb_top].ptr = nbr_pj;
+	       } else if(jflag) {
+		 hbonds->select.hbond_list[jhb_top].nbr = i;
+		 hbonds->select.hbond_list[jhb_top].scl = -1;
+		 hbonds->select.hbond_list[jhb_top].ptr = nbr_pj;
+	       }
+	
+	       num_hbonds++;
+	     } // if(iflag || jflag)
+
+	   }
+	 }
+       }
+
+       } // omp critical
+     }
+
+  } // if(control->hbond > 0)
+
+  // Zero buffers for others to use as intended.
+#if defined(_OPENMP)
+#pragma omp for schedule(guided)
+#endif
+  for(i=0; i<totalReductionSize; i++) {
+    tmp_ddelta[i][0]  = 0.0;
+    tmp_ddelta[i][1]  = 0.0;
+    tmp_ddelta[i][2]  = 0.0;
+    tmp_bond_order[i] = 0.0;
+  }
+
+  } // omp
+
+  workspace->realloc.num_bonds = num_bonds;
+  workspace->realloc.num_hbonds = num_hbonds;
+
+  Validate_ListsOMP( system, workspace, lists, data->step,
+                  system->n, system->N, system->numH, comm );
+
+#ifdef OMP_TIMING
+  endTimeBase = MPI_Wtime();
+  ompTimingData[COMPUTEIFINDEX] += (endTimeBase-startTimeBase);
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+void Compute_ForcesOMP( reax_system *system, control_params *control,
+                     simulation_data *data, storage *workspace,
+                     reax_list **lists, output_controls *out_control,
+                     mpi_datatypes *mpi_data )
+{
+  int qeq_flag;
+  MPI_Comm comm = mpi_data->world;
+
+  // Init Forces
+#if defined(LOG_PERFORMANCE)
+  double t_start = 0;
+  if( system->my_rank == MASTER_NODE )
+    t_start = Get_Time( );
+#endif
+
+  Init_Forces_noQEq_OMP( system, control, data, workspace,
+			 lists, out_control, comm );
+
+#if defined(LOG_PERFORMANCE)
+  //MPI_Barrier( comm );
+  if( system->my_rank == MASTER_NODE )
+    Update_Timing_Info( &t_start, &(data->timing.init_forces) );
+#endif
+
+  // Bonded Interactions
+  Compute_Bonded_ForcesOMP( system, control, data, workspace,
+                         lists, out_control, mpi_data->world );
+
+#if defined(LOG_PERFORMANCE)
+  if( system->my_rank == MASTER_NODE )
+    Update_Timing_Info( &t_start, &(data->timing.bonded) );
+#endif
+
+  // Nonbonded Interactions
+  Compute_NonBonded_ForcesOMP( system, control, data, workspace,
+                            lists, out_control, mpi_data->world );
+
+#if defined(LOG_PERFORMANCE)
+  if( system->my_rank == MASTER_NODE )
+    Update_Timing_Info( &t_start, &(data->timing.nonb) );
+#endif
+
+  // Total Force
+  Compute_Total_ForceOMP( system, control, data, workspace, lists, mpi_data );
+
+#if defined(LOG_PERFORMANCE)
+  if( system->my_rank == MASTER_NODE )
+    Update_Timing_Info( &t_start, &(data->timing.bonded) );
+#endif
+}
--- a/src/USER-OMP/reaxc_forces_omp.h
+++ b/src/USER-OMP/reaxc_forces_omp.h
@ -0,0 +1,36 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __FORCES_OMP_H_
+#define __FORCES_OMP_H_
+
+#include "reaxc_types.h"
+#include "reaxc_defs.h"
+
+void Init_Force_FunctionsOMP( control_params* );
+void Compute_ForcesOMP( reax_system*, control_params*, simulation_data*,
+                     storage*, reax_list**, output_controls*, mpi_datatypes* );
+#endif
--- a/src/USER-OMP/reaxc_hydrogen_bonds_omp.cpp
+++ b/src/USER-OMP/reaxc_hydrogen_bonds_omp.cpp
@ -0,0 +1,252 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "pair_reaxc_omp.h"
+
+#include "reaxc_hydrogen_bonds_omp.h"
+#include "reaxc_bond_orders_omp.h"
+#include "reaxc_list.h"
+#include "reaxc_valence_angles.h"     // To access Calculate_Theta()
+#include "reaxc_valence_angles_omp.h" // To access Calculate_dCos_ThetaOMP()
+#include "reaxc_vector.h"
+
+#if defined(_OPENMP)
+#include  <omp.h>
+#endif
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+void Hydrogen_BondsOMP( reax_system *system, control_params *control,
+                     simulation_data *data, storage *workspace,
+                     reax_list **lists, output_controls *out_control )
+{
+#ifdef OMP_TIMING
+  double endTimeBase, startTimeBase;
+  startTimeBase = MPI_Wtime();
+#endif
+
+  const int nthreads = control->nthreads;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(shared) //default(none)
+#endif
+  {
+  int  i, j, k, pi, pk;
+  int  type_i, type_j, type_k;
+  int  start_j, end_j, hb_start_j, hb_end_j;
+  int  hblist[MAX_BONDS];
+  int  itr, top;
+  int  num_hb_intrs = 0;
+  ivec rel_jk;
+  double r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
+  double e_hb, e_hb_thr = 0.0, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
+  rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
+  rvec dvec_jk, force, ext_press;
+  hbond_parameters *hbp;
+  bond_order_data *bo_ij;
+  bond_data *pbond_ij;
+  far_neighbor_data *nbr_jk;
+  reax_list *bonds, *hbonds;
+  bond_data *bond_list;
+  hbond_data *hbond_list;
+
+  // tally variables
+  double fi_tmp[3], fk_tmp[3], delij[3], delkj[3];
+
+  bonds = (*lists) + BONDS;
+  bond_list = bonds->select.bond_list;
+  hbonds = (*lists) + HBONDS;
+  hbond_list = hbonds->select.hbond_list;
+
+  int natoms = system->n;
+#if defined(_OPENMP)
+  int tid = omp_get_thread_num();
+#else
+  int tid = 0;
+#endif
+  const int idelta = 1 + natoms/nthreads;
+  int ifrom = tid*idelta;
+  int ito   = ((ifrom + idelta) > natoms) ? natoms : ifrom + idelta;
+
+  long reductionOffset = (system->N * tid);
+
+  class PairReaxCOMP *pair_reax_ptr;
+  pair_reax_ptr = static_cast<class PairReaxCOMP*>(system->pair_ptr);
+
+  class ThrData *thr = pair_reax_ptr->getFixOMP()->get_thr(tid);
+
+  pair_reax_ptr->ev_setup_thr_proxy(system->pair_ptr->eflag_either,
+				    system->pair_ptr->vflag_either,
+				    natoms, system->pair_ptr->eatom,
+				    system->pair_ptr->vatom, thr);
+
+  /* loops below discover the Hydrogen bonds between i-j-k triplets.
+     here j is H atom and there has to be some bond between i and j.
+     Hydrogen bond is between j and k.
+     so in this function i->X, j->H, k->Z when we map
+     variables onto the ones in the handout.*/
+  //  for( j = 0; j < system->n; ++j )
+  for( j = ifrom; j < ito; ++j ) {
+    /* j has to be of type H */
+    if( system->reax_param.sbp[system->my_atoms[j].type].p_hbond == 1 ) {
+      /*set j's variables */
+      type_j     = system->my_atoms[j].type;
+      start_j    = Start_Index(j, bonds);
+      end_j      = End_Index(j, bonds);
+      hb_start_j = Start_Index( system->my_atoms[j].Hindex, hbonds );
+      hb_end_j   = End_Index( system->my_atoms[j].Hindex, hbonds );
+      if(type_j < 0) continue;
+
+      top = 0;
+      for( pi = start_j; pi < end_j; ++pi )  {
+        pbond_ij = &( bond_list[pi] );
+        i = pbond_ij->nbr;
+        type_i = system->my_atoms[i].type;
+	if(type_i < 0) continue;
+        bo_ij = &(pbond_ij->bo_data);
+
+        if( system->reax_param.sbp[type_i].p_hbond == 2 &&
+            bo_ij->BO >= HB_THRESHOLD )
+          hblist[top++] = pi;
+      }
+
+      for( pk = hb_start_j; pk < hb_end_j; ++pk ) {
+        /* set k's varibles */
+        k = hbond_list[pk].nbr;
+        type_k = system->my_atoms[k].type;
+	if(type_k < 0) continue;
+        nbr_jk = hbond_list[pk].ptr;
+        r_jk = nbr_jk->d;
+        rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
+
+        for( itr = 0; itr < top; ++itr ) {
+          pi = hblist[itr];
+          pbond_ij = &( bonds->select.bond_list[pi] );
+          i = pbond_ij->nbr;
+
+          if( system->my_atoms[i].orig_id != system->my_atoms[k].orig_id ) {
+            bo_ij = &(pbond_ij->bo_data);
+            type_i = system->my_atoms[i].type;
+	    if(type_i < 0) continue;
+            hbp = &(system->reax_param.hbp[ type_i ][ type_j ][ type_k ]);
+            ++num_hb_intrs;
+
+            Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                             &theta, &cos_theta );
+            /* the derivative of cos(theta) */
+            Calculate_dCos_ThetaOMP( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+				     &dcos_theta_di, &dcos_theta_dj,
+				     &dcos_theta_dk );
+
+            /* hydrogen bond energy*/
+            sin_theta2 = sin( theta/2.0 );
+            sin_xhz4 = SQR(sin_theta2);
+            sin_xhz4 *= sin_xhz4;
+            cos_xhz1 = ( 1.0 - cos_theta );
+            exp_hb2 = exp( -hbp->p_hb2 * bo_ij->BO );
+            exp_hb3 = exp( -hbp->p_hb3 * ( hbp->r0_hb / r_jk +
+                                           r_jk / hbp->r0_hb - 2.0 ) );
+
+            e_hb_thr += e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
+
+            CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
+            CEhb2 = -hbp->p_hb1/2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
+            CEhb3 = -hbp->p_hb3 *
+              (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb;
+
+            /* hydrogen bond forces */
+            bo_ij->Cdbo += CEhb1; // dbo term
+
+            if( control->virial == 0 ) {
+	      // dcos terms
+              rvec_ScaledAdd(workspace->forceReduction[reductionOffset+i], +CEhb2, dcos_theta_di );
+              rvec_ScaledAdd(workspace->forceReduction[reductionOffset+j], +CEhb2, dcos_theta_dj );
+              rvec_ScaledAdd(workspace->forceReduction[reductionOffset+k], +CEhb2, dcos_theta_dk );
+              // dr terms
+              rvec_ScaledAdd(workspace->forceReduction[reductionOffset+j], -CEhb3/r_jk, dvec_jk );
+              rvec_ScaledAdd(workspace->forceReduction[reductionOffset+k], +CEhb3/r_jk, dvec_jk );
+            }
+            else {
+              /* for pressure coupling, terms that are not related to bond order
+                 derivatives are added directly into pressure vector/tensor */
+              rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
+	      rvec_Add(workspace->forceReduction[reductionOffset+i], force );
+              rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+              rvec_ScaledAdd( workspace->my_ext_pressReduction[tid],1.0, ext_press );
+
+              rvec_ScaledAdd(workspace->forceReduction[reductionOffset+j], +CEhb2, dcos_theta_dj );
+
+              ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
+              rvec_Scale( force, +CEhb2, dcos_theta_dk );
+              rvec_Add(workspace->forceReduction[reductionOffset+k], force );
+              rvec_iMultiply( ext_press, rel_jk, force );
+              rvec_ScaledAdd( workspace->my_ext_pressReduction[tid],1.0, ext_press );
+              // dr terms
+              rvec_ScaledAdd(workspace->forceReduction[reductionOffset+j],-CEhb3/r_jk, dvec_jk );
+
+              rvec_Scale( force, CEhb3/r_jk, dvec_jk );
+              rvec_Add(workspace->forceReduction[reductionOffset+k], force );
+              rvec_iMultiply( ext_press, rel_jk, force );
+              rvec_ScaledAdd( workspace->my_ext_pressReduction[tid],1.0, ext_press );
+            }
+
+            /* tally into per-atom virials */
+            if (system->pair_ptr->vflag_atom || system->pair_ptr->evflag) {
+              rvec_ScaledSum( delij, 1., system->my_atoms[j].x,
+                                    -1., system->my_atoms[i].x );
+              rvec_ScaledSum( delkj, 1., system->my_atoms[j].x,
+                                     -1., system->my_atoms[k].x );
+
+              rvec_Scale(fi_tmp, CEhb2, dcos_theta_di);
+              rvec_Scale(fk_tmp, CEhb2, dcos_theta_dk);
+              rvec_ScaledAdd(fk_tmp, CEhb3/r_jk, dvec_jk);
+
+              pair_reax_ptr->ev_tally3_thr_proxy(system->pair_ptr,i,j,k,e_hb,0.0,fi_tmp,fk_tmp,delij,delkj,thr);
+	    }
+          }
+        }
+      }
+
+    }
+  }
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+  {
+    data->my_en.e_hb += e_hb_thr;
+  }
+
+  pair_reax_ptr->reduce_thr_proxy(system->pair_ptr, system->pair_ptr->eflag_either,
+				  system->pair_ptr->vflag_either, thr);
+}
+
+#ifdef OMP_TIMING
+  endTimeBase = MPI_Wtime();
+  ompTimingData[COMPUTEHBONDSINDEX] += (endTimeBase-startTimeBase);
+#endif
+}
--- a/src/USER-OMP/reaxc_hydrogen_bonds_omp.h
+++ b/src/USER-OMP/reaxc_hydrogen_bonds_omp.h
@ -0,0 +1,35 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __HBONDS_OMP_H_
+#define __HBONDS_OMP_H_
+
+#include "reaxc_types.h"
+
+void Hydrogen_BondsOMP( reax_system*, control_params*, simulation_data*,
+                     storage*, reax_list**, output_controls* );
+
+#endif
--- a/src/USER-OMP/reaxc_init_md_omp.cpp
+++ b/src/USER-OMP/reaxc_init_md_omp.cpp
@ -0,0 +1,186 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "pair_reaxc_omp.h"
+#include "reaxc_init_md_omp.h"
+#include "reaxc_allocate.h"
+#include "reaxc_forces.h"
+#include "reaxc_forces_omp.h"
+#include "reaxc_io_tools.h"
+#include "reaxc_list.h"
+#include "reaxc_lookup.h"
+#include "reaxc_reset_tools.h"
+#include "reaxc_system_props.h"
+#include "reaxc_tool_box.h"
+#include "reaxc_vector.h"
+
+// Functions definedd in reaxc_init_md.cpp
+extern int Init_MPI_Datatypes(reax_system*, storage*, mpi_datatypes*, MPI_Comm, char*);
+extern int Init_System(reax_system*, control_params*, char*);
+extern int Init_Simulation_Data(reax_system*, control_params*, simulation_data*, char*);
+extern int Init_Workspace(reax_system*, control_params*, storage*, MPI_Comm, char*);
+
+/* ---------------------------------------------------------------------- */
+
+int Init_ListsOMP( reax_system *system, control_params *control,
+                 simulation_data *data, storage *workspace, reax_list **lists,
+                 mpi_datatypes *mpi_data, char *msg )
+{
+  int i, total_hbonds, total_bonds, bond_cap, num_3body, cap_3body, Htop;
+  int *hb_top, *bond_top;
+  MPI_Comm comm;
+
+  int mincap = system->mincap;
+  double safezone = system->safezone;
+  double saferzone = system->saferzone;
+
+  comm = mpi_data->world;
+  bond_top = (int*) calloc( system->total_cap, sizeof(int) );
+  hb_top = (int*) calloc( system->local_cap, sizeof(int) );
+  Estimate_Storages( system, control, lists,
+                     &Htop, hb_top, bond_top, &num_3body, comm );
+
+  if( control->hbond_cut > 0 ) {
+    /* init H indexes */
+    total_hbonds = 0;
+    for( i = 0; i < system->n; ++i ) {
+      system->my_atoms[i].num_hbonds = hb_top[i];
+      total_hbonds += hb_top[i];
+    }
+    total_hbonds = (int)(MAX( total_hbonds*saferzone, mincap*MIN_HBONDS ));
+
+    if( !Make_List( system->Hcap, total_hbonds, TYP_HBOND,
+                    *lists+HBONDS, comm ) ) {
+      fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
+      MPI_Abort( comm, INSUFFICIENT_MEMORY );
+    }
+  }
+
+  total_bonds = 0;
+  for( i = 0; i < system->N; ++i ) {
+    system->my_atoms[i].num_bonds = bond_top[i];
+    total_bonds += bond_top[i];
+  }
+  bond_cap = (int)(MAX( total_bonds*safezone, mincap*MIN_BONDS ));
+
+  if( !Make_List( system->total_cap, bond_cap, TYP_BOND,
+                  *lists+BONDS, comm ) ) {
+    fprintf( stderr, "not enough space for bonds list. terminating!\n" );
+    MPI_Abort( comm, INSUFFICIENT_MEMORY );
+  }
+
+  int nthreads = control->nthreads;
+  reax_list *bonds = (*lists)+BONDS;
+
+  for (i = 0; i < bonds->num_intrs; ++i)
+    bonds->select.bond_list[i].bo_data.CdboReduction =
+      (double*) smalloc(sizeof(double)*nthreads, "CdboReduction", comm);
+
+  /* 3bodies list */
+  cap_3body = (int)(MAX( num_3body*safezone, MIN_3BODIES ));
+  if( !Make_List( bond_cap, cap_3body, TYP_THREE_BODY,
+		  *lists+THREE_BODIES, comm ) ){
+
+    fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
+    MPI_Abort( comm, INSUFFICIENT_MEMORY );
+  }
+
+  free( hb_top );
+  free( bond_top );
+
+  return SUCCESS;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// The only difference with the MPI-only function is calls to Init_ListsOMP and Init_Force_FunctionsOMP().
+void InitializeOMP( reax_system *system, control_params *control,
+                 simulation_data *data, storage *workspace,
+                 reax_list **lists, output_controls *out_control,
+                 mpi_datatypes *mpi_data, MPI_Comm comm )
+{
+  char msg[MAX_STR];
+
+
+  if( Init_MPI_Datatypes(system, workspace, mpi_data, comm, msg) == FAILURE ) {
+    fprintf( stderr, "p%d: init_mpi_datatypes: could not create datatypes\n",
+             system->my_rank );
+    fprintf( stderr, "p%d: mpi_data couldn't be initialized! terminating.\n",
+             system->my_rank );
+    MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
+  }
+
+  if( Init_System(system, control, msg) == FAILURE ){
+    fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+    fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
+             system->my_rank );
+    MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
+  }
+
+  if( Init_Simulation_Data( system, control, data, msg ) == FAILURE ) {
+    fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+    fprintf( stderr, "p%d: sim_data couldn't be initialized! terminating.\n",
+             system->my_rank );
+    MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
+  }
+
+  if( Init_Workspace( system, control, workspace, mpi_data->world, msg ) ==
+      FAILURE ) {
+    fprintf( stderr, "p%d:init_workspace: not enough memory\n",
+             system->my_rank );
+    fprintf( stderr, "p%d:workspace couldn't be initialized! terminating.\n",
+             system->my_rank );
+    MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
+  }
+
+  if( Init_ListsOMP( system, control, data, workspace, lists, mpi_data, msg ) ==
+      FAILURE ) {
+      fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+      fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
+               system->my_rank );
+      MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
+    }
+
+  if( Init_Output_Files(system,control,out_control,mpi_data,msg)== FAILURE) {
+    fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+    fprintf( stderr, "p%d: could not open output files! terminating...\n",
+             system->my_rank );
+    MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
+  }
+
+  if( control->tabulate ) {
+    if( Init_Lookup_Tables( system, control, workspace, mpi_data, msg ) == FAILURE ) {
+      fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+      fprintf( stderr, "p%d: couldn't create lookup table! terminating.\n",
+               system->my_rank );
+      MPI_Abort( mpi_data->world, CANNOT_INITIALIZE );
+    }
+  }
+
+
+  Init_Force_FunctionsOMP( control );
+}
+
--- a/src/USER-OMP/reaxc_init_md_omp.h
+++ b/src/USER-OMP/reaxc_init_md_omp.h
@ -0,0 +1,34 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __INIT_MD_OMP_H_
+#define __INIT_MD_OMP_H_
+
+#include "reaxc_types.h"
+
+void InitializeOMP( reax_system*, control_params*, simulation_data*, storage*,
+                 reax_list**, output_controls*, mpi_datatypes*, MPI_Comm );
+#endif
--- a/src/USER-OMP/reaxc_multi_body_omp.cpp
+++ b/src/USER-OMP/reaxc_multi_body_omp.cpp
@ -0,0 +1,297 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "pair_reaxc_omp.h"
+#include "thr_data.h"
+
+#include "reaxc_multi_body_omp.h"
+#include "reaxc_bond_orders_omp.h"
+#include "reaxc_list.h"
+#include "reaxc_vector.h"
+
+#if defined(_OPENMP)
+#include  <omp.h>
+#endif
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+void Atom_EnergyOMP( reax_system *system, control_params *control,
+                  simulation_data *data, storage *workspace, reax_list **lists,
+                  output_controls *out_control )
+{
+#ifdef OMP_TIMING
+  double endTimeBase, startTimeBase;
+  startTimeBase = MPI_Wtime();
+#endif
+
+  /* Initialize parameters */
+  double p_lp1 = system->reax_param.gp.l[15];
+  double p_lp3 = system->reax_param.gp.l[5];
+  double p_ovun3 = system->reax_param.gp.l[32];
+  double p_ovun4 = system->reax_param.gp.l[31];
+  double p_ovun6 = system->reax_param.gp.l[6];
+  double p_ovun7 = system->reax_param.gp.l[8];
+  double p_ovun8 = system->reax_param.gp.l[9];
+
+  int natoms = system->n;
+  int nthreads = control->nthreads;
+  reax_list *bonds = (*lists) + BONDS;
+
+  double total_Elp = 0.0;
+  double total_Eun = 0.0;
+  double total_Eov = 0.0;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(shared) reduction(+:total_Elp, total_Eun, total_Eov)
+#endif
+{
+  int i, j, pj, type_i, type_j;
+  double Delta_lpcorr, dfvl;
+  double e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi;
+  double e_lph, Di, vov3, deahu2dbo, deahu2dsbo;
+  double e_ov, CEover1, CEover2, CEover3, CEover4;
+  double exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2;
+  double exp_ovun2n, exp_ovun6, exp_ovun8;
+  double inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
+  double e_un, CEunder1, CEunder2, CEunder3, CEunder4;
+  double eng_tmp, f_tmp;
+  double p_lp2, p_ovun2, p_ovun5;
+  int numbonds;
+
+  single_body_parameters *sbp_i, *sbp_j;
+  two_body_parameters *twbp;
+  bond_data *pbond;
+  bond_order_data *bo_ij;
+
+#if defined(_OPENMP)
+  int tid = omp_get_thread_num();
+#else
+  int tid = 0;
+#endif
+
+  long reductionOffset = (system->N * tid);
+  class PairReaxCOMP *pair_reax_ptr;
+  pair_reax_ptr = static_cast<class PairReaxCOMP*>(system->pair_ptr);
+  class ThrData *thr = pair_reax_ptr->getFixOMP()->get_thr(tid);
+
+  pair_reax_ptr->ev_setup_thr_proxy(system->pair_ptr->eflag_either,
+				    system->pair_ptr->vflag_either, natoms,
+				    system->pair_ptr->eatom, system->pair_ptr->vatom, thr);
+
+#if defined(_OPENMP)
+#pragma omp for schedule(guided)
+#endif
+  for ( i = 0; i < system->n; ++i) {
+    type_i = system->my_atoms[i].type;
+    if(type_i < 0) continue;
+    sbp_i = &(system->reax_param.sbp[ type_i ]);
+
+    /* lone-pair Energy */
+    p_lp2 = sbp_i->p_lp2;
+    expvd2 = exp( -75 * workspace->Delta_lp[i] );
+    inv_expvd2 = 1. / (1. + expvd2 );
+
+    numbonds = 0;
+    e_lp = 0.0;
+    for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+      numbonds ++;
+
+    /* calculate the energy */
+    if(numbonds > 0)
+      total_Elp += e_lp =
+	p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
+
+    dElp = p_lp2 * inv_expvd2 +
+      75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
+    CElp = dElp * workspace->dDelta_lp[i];
+
+    if(numbonds > 0) workspace->CdDelta[i] += CElp;  // lp - 1st term
+
+    /* tally into per-atom energy */
+    if( system->pair_ptr->evflag)
+      pair_reax_ptr->ev_tally_thr_proxy(system->pair_ptr, i, i, system->n, 1,
+					e_lp, 0.0, 0.0, 0.0, 0.0, 0.0, thr);
+
+    /* correction for C2 */
+    if( p_lp3 > 0.001 && !strcmp(system->reax_param.sbp[type_i].name, "C") )
+      for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) {
+        j = bonds->select.bond_list[pj].nbr;
+        type_j = system->my_atoms[j].type;
+	if(type_j < 0) continue;
+
+        if( !strcmp( system->reax_param.sbp[type_j].name, "C" ) ) {
+          twbp = &( system->reax_param.tbp[type_i][type_j]);
+          bo_ij = &( bonds->select.bond_list[pj].bo_data );
+          Di = workspace->Delta[i];
+          vov3 = bo_ij->BO - Di - 0.040*pow(Di, 4.);
+	
+          if( vov3 > 3. ) {
+	    total_Elp += e_lph = p_lp3 * SQR(vov3-3.0);
+
+            deahu2dbo = 2.*p_lp3*(vov3 - 3.);
+            deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*pow(Di, 3.));
+	
+            bo_ij->Cdbo += deahu2dbo;
+            workspace->CdDelta[i] += deahu2dsbo;
+	
+            /* tally into per-atom energy */
+            if( system->pair_ptr->evflag)
+              pair_reax_ptr->ev_tally_thr_proxy(system->pair_ptr, i, j, system->n, 1,
+						e_lph, 0.0, 0.0, 0.0, 0.0, 0.0, thr);
+          }
+        }
+      }
+  }
+#if defined(_OPENMP)
+#pragma omp barrier
+#pragma omp for schedule(guided)
+#endif
+  for (i = 0; i < system->n; ++i) {
+    type_i = system->my_atoms[i].type;
+    if(type_i < 0) continue;
+    sbp_i = &(system->reax_param.sbp[ type_i ]);
+
+    /* over-coordination energy */
+    if( sbp_i->mass > 21.0 )
+      dfvl = 0.0;
+    else dfvl = 1.0; // only for 1st-row elements
+
+    p_ovun2 = sbp_i->p_ovun2;
+    sum_ovun1 = sum_ovun2 = 0;
+    for (pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj) {
+      j = bonds->select.bond_list[pj].nbr;
+      type_j = system->my_atoms[j].type;
+      if(type_j < 0) continue;
+      bo_ij = &(bonds->select.bond_list[pj].bo_data);
+      twbp = &(system->reax_param.tbp[ type_i ][ type_j ]);
+
+      sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
+      sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
+	( bo_ij->BO_pi + bo_ij->BO_pi2 );
+    }
+
+    exp_ovun1 = p_ovun3 * exp( p_ovun4 * sum_ovun2 );
+    inv_exp_ovun1 = 1.0 / (1 + exp_ovun1);
+    Delta_lpcorr  = workspace->Delta[i] -
+      (dfvl * workspace->Delta_lp_temp[i]) * inv_exp_ovun1;
+
+    exp_ovun2 = exp( p_ovun2 * Delta_lpcorr );
+    inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2);
+
+    DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8);
+    CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2;
+
+    total_Eov += e_ov = sum_ovun1 * CEover1;
+
+    CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
+      (1.0 - Delta_lpcorr * ( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ));
+
+    CEover3 = CEover2 * (1.0 - dfvl * workspace->dDelta_lp[i] * inv_exp_ovun1 );
+
+    CEover4 = CEover2 * (dfvl * workspace->Delta_lp_temp[i]) *
+      p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1);
+
+
+    /* under-coordination potential */
+    p_ovun2 = sbp_i->p_ovun2;
+    p_ovun5 = sbp_i->p_ovun5;
+
+    exp_ovun2n = 1.0 / exp_ovun2;
+    exp_ovun6 = exp( p_ovun6 * Delta_lpcorr );
+    exp_ovun8 = p_ovun7 * exp(p_ovun8 * sum_ovun2);
+    inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
+    inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
+
+    numbonds = 0;
+    e_un = 0.0;
+    for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+      numbonds ++;
+
+    if(numbonds > 0) total_Eun += e_un =
+		       -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
+
+    CEunder1 = inv_exp_ovun2n *
+      ( p_ovun5 * p_ovun6 * exp_ovun6 * inv_exp_ovun8 +
+        p_ovun2 * e_un * exp_ovun2n );
+    CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8;
+    CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1);
+    CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) *
+      p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2;
+
+    /* tally into per-atom energy */
+    if (system->pair_ptr->evflag) {
+      eng_tmp = e_ov;
+      if(numbonds > 0) eng_tmp+= e_un;
+      pair_reax_ptr->ev_tally_thr_proxy(system->pair_ptr, i, i, system->n, 1,
+					eng_tmp, 0.0, 0.0, 0.0, 0.0, 0.0, thr);
+    }
+
+    /* forces */
+    workspace->CdDelta[i] += CEover3;   // OvCoor - 2nd term
+    if(numbonds > 0) workspace->CdDelta[i] += CEunder3;  // UnCoor - 1st term
+
+    for (pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj) {
+      pbond = &(bonds->select.bond_list[pj]);
+      j = pbond->nbr;
+      bo_ij = &(pbond->bo_data);
+      twbp  = &(system->reax_param.tbp[ system->my_atoms[i].type ]
+                [system->my_atoms[pbond->nbr].type]);
+
+      bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor-1st
+      workspace->CdDeltaReduction[reductionOffset+j] +=
+	CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor-3a
+
+      bo_ij->Cdbopi += CEover4 *
+        (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]); // OvCoor-3b
+      bo_ij->Cdbopi2 += CEover4 *
+        (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);  // OvCoor-3b
+
+      workspace->CdDeltaReduction[reductionOffset+j] +=
+	CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2);   // UnCoor - 2a
+
+      bo_ij->Cdbopi += CEunder4 *
+        (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);  // UnCoor-2b
+      bo_ij->Cdbopi2 += CEunder4 *
+        (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);  // UnCoor-2b
+    }
+  }
+
+  pair_reax_ptr->reduce_thr_proxy(system->pair_ptr, system->pair_ptr->eflag_either,
+				  system->pair_ptr->vflag_either, thr);
+
+ }
+
+ data->my_en.e_lp += total_Elp;
+ data->my_en.e_ov += total_Eov;
+ data->my_en.e_un += total_Eun;
+
+#ifdef OMP_TIMING
+  endTimeBase = MPI_Wtime();
+  ompTimingData[COMPUTEATOMENERGYINDEX] += (endTimeBase-startTimeBase);
+#endif
+}
--- a/src/USER-OMP/reaxc_multi_body_omp.h
+++ b/src/USER-OMP/reaxc_multi_body_omp.h
@ -0,0 +1,35 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __MULTI_BODY_OMP_H_
+#define __MULTI_BODY_OMP_H_
+
+#include "reaxc_types.h"
+
+void Atom_EnergyOMP( reax_system*, control_params*, simulation_data*,
+                  storage*, reax_list**, output_controls* );
+
+#endif
--- a/src/USER-OMP/reaxc_nonbonded_omp.cpp
+++ b/src/USER-OMP/reaxc_nonbonded_omp.cpp
@ -0,0 +1,400 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "pair_reaxc_omp.h"
+#include "thr_data.h"
+
+#include "reaxc_types.h"
+
+#include "reaxc_nonbonded.h"
+#include "reaxc_nonbonded_omp.h"
+#include "reaxc_bond_orders_omp.h"
+#include "reaxc_list.h"
+#include "reaxc_vector.h"
+
+#if defined(_OPENMP)
+#include  <omp.h>
+#endif
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+void vdW_Coulomb_Energy_OMP( reax_system *system, control_params *control,
+			     simulation_data *data, storage *workspace,
+			     reax_list **lists, output_controls *out_control ) {
+
+  int natoms = system->n;
+  int nthreads = control->nthreads;
+  long totalReductionSize = system->N * nthreads;
+  reax_list *far_nbrs = (*lists) + FAR_NBRS;
+  double p_vdW1 = system->reax_param.gp.l[28];
+  double p_vdW1i = 1.0 / p_vdW1;
+  double total_EvdW = 0.;
+  double total_Eele = 0.;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(shared) reduction(+: total_EvdW, total_Eele)
+#endif
+  {
+#if defined(_OPENMP)
+    int tid = omp_get_thread_num();
+#else
+    int tid = 0;
+#endif
+  int i, j, pj;
+  int start_i, end_i, orig_i, orig_j, flag;
+  double powr_vdW1, powgi_vdW1;
+  double tmp, r_ij, fn13, exp1, exp2;
+  double Tap, dTap, dfn13, CEvd, CEclmb, de_core;
+  double dr3gamij_1, dr3gamij_3;
+  double e_ele, e_ele_thr, e_vdW, e_vdW_thr, e_core, SMALL = 0.0001;
+  double e_lg, de_lg, r_ij5, r_ij6, re6;
+  rvec temp, ext_press;
+  two_body_parameters *twbp;
+  far_neighbor_data *nbr_pj;
+
+  // Tallying variables:
+  double pe_vdw, f_tmp, delij[3];
+
+  long reductionOffset = (system->N * tid);
+
+  class PairReaxCOMP *pair_reax_ptr;
+  pair_reax_ptr = static_cast<class PairReaxCOMP*>(system->pair_ptr);
+  class ThrData *thr = pair_reax_ptr->getFixOMP()->get_thr(tid);
+
+  pair_reax_ptr->ev_setup_thr_proxy(system->pair_ptr->eflag_either,
+				    system->pair_ptr->vflag_either,
+				    natoms, system->pair_ptr->eatom,
+				    system->pair_ptr->vatom, thr);
+  e_core = 0;
+  e_vdW = 0;
+  e_vdW_thr = 0;
+  e_lg = 0;
+  de_lg = 0.0;
+
+#if defined(_OPENMP)
+#pragma omp for schedule(guided)
+#endif
+  for( i = 0; i < natoms; ++i ) {
+    if(system->my_atoms[i].type < 0) continue;
+    start_i = Start_Index(i, far_nbrs);
+    end_i   = End_Index(i, far_nbrs);
+    orig_i  = system->my_atoms[i].orig_id;
+
+    for( pj = start_i; pj < end_i; ++pj ) {
+      nbr_pj = &(far_nbrs->select.far_nbr_list[pj]);
+      j = nbr_pj->nbr;
+      orig_j  = system->my_atoms[j].orig_id;
+
+      flag = 0;
+      if(nbr_pj->d <= control->nonb_cut) {
+	if(j < natoms) flag = 1;
+	else if (orig_i < orig_j) flag = 1;
+	else if (orig_i == orig_j) {
+	  if (nbr_pj->dvec[2] > SMALL) flag = 1;
+	  else if (fabs(nbr_pj->dvec[2]) < SMALL) {
+	    if (nbr_pj->dvec[1] > SMALL) flag = 1;
+	    else if (fabs(nbr_pj->dvec[1]) < SMALL && nbr_pj->dvec[0] > SMALL)
+	      flag = 1;
+	  }
+	}
+      }
+
+      if (flag) {
+	
+	r_ij = nbr_pj->d;
+	twbp = &(system->reax_param.tbp[ system->my_atoms[i].type ]
+		 [ system->my_atoms[j].type ]);
+	
+	/* Calculate Taper and its derivative */
+	// Tap = nbr_pj->Tap;   -- precomputed during compte_H
+	Tap = workspace->Tap[7] * r_ij + workspace->Tap[6];
+	Tap = Tap * r_ij + workspace->Tap[5];
+	Tap = Tap * r_ij + workspace->Tap[4];
+	Tap = Tap * r_ij + workspace->Tap[3];
+	Tap = Tap * r_ij + workspace->Tap[2];
+	Tap = Tap * r_ij + workspace->Tap[1];
+	Tap = Tap * r_ij + workspace->Tap[0];
+	
+	dTap = 7*workspace->Tap[7] * r_ij + 6*workspace->Tap[6];
+	dTap = dTap * r_ij + 5*workspace->Tap[5];
+	dTap = dTap * r_ij + 4*workspace->Tap[4];
+	dTap = dTap * r_ij + 3*workspace->Tap[3];
+	dTap = dTap * r_ij + 2*workspace->Tap[2];
+	dTap += workspace->Tap[1]/r_ij;
+	
+	/*vdWaals Calculations*/
+	if(system->reax_param.gp.vdw_type==1 || system->reax_param.gp.vdw_type==3)
+	  { // shielding
+	    powr_vdW1 = pow(r_ij, p_vdW1);
+	    powgi_vdW1 = pow( 1.0 / twbp->gamma_w, p_vdW1);
+	
+	    fn13 = pow( powr_vdW1 + powgi_vdW1, p_vdW1i );
+	    exp1 = exp( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+	    exp2 = exp( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+	
+	    e_vdW = twbp->D * (exp1 - 2.0 * exp2);
+	    total_EvdW += Tap * e_vdW;
+	
+	    dfn13 = pow( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
+	      pow(r_ij, p_vdW1 - 2.0);
+
+	    CEvd = dTap * e_vdW -
+	      Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+	  }
+	else{ // no shielding
+	  exp1 = exp( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+	  exp2 = exp( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+	
+	  e_vdW = twbp->D * (exp1 - 2.0 * exp2);
+	  total_EvdW += Tap * e_vdW;
+	
+	  CEvd = dTap * e_vdW -
+	      Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) / r_ij;
+	}
+	
+	if(system->reax_param.gp.vdw_type==2 || system->reax_param.gp.vdw_type==3)
+	  { // innner wall
+	    e_core = twbp->ecore * exp(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+	    total_EvdW += Tap * e_core;
+	
+	    de_core = -(twbp->acore/twbp->rcore) * e_core;
+	    CEvd += dTap * e_core + Tap * de_core / r_ij;
+	
+	    //  lg correction, only if lgvdw is yes
+	    if (control->lgflag) {
+	      r_ij5 = pow( r_ij, 5.0 );
+	      r_ij6 = pow( r_ij, 6.0 );
+	      re6 = pow( twbp->lgre, 6.0 );
+	
+	      e_lg = -(twbp->lgcij/( r_ij6 + re6 ));
+	      total_EvdW += Tap * e_lg;
+	
+	      de_lg = -6.0 * e_lg *  r_ij5 / ( r_ij6 + re6 ) ;
+	      CEvd += dTap * e_lg + Tap * de_lg / r_ij;
+	    }
+	
+	  }
+	
+	/*Coulomb Calculations*/
+	dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+	dr3gamij_3 = pow( dr3gamij_1 , 0.33333333333333 );
+	
+	tmp = Tap / dr3gamij_3;
+	total_Eele += e_ele =
+	  C_ele * system->my_atoms[i].q * system->my_atoms[j].q * tmp;
+	
+	CEclmb = C_ele * system->my_atoms[i].q * system->my_atoms[j].q *
+	  ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+	
+	/* tally into per-atom energy */
+	if( system->pair_ptr->evflag || system->pair_ptr->vflag_atom) {
+	  pe_vdw = Tap * (e_vdW + e_core + e_lg);
+	  rvec_ScaledSum( delij, 1., system->my_atoms[i].x,
+			  -1., system->my_atoms[j].x );
+	  f_tmp = -(CEvd + CEclmb);
+	  pair_reax_ptr->ev_tally_thr_proxy( system->pair_ptr, i, j, natoms,
+					     1, pe_vdw, e_ele, f_tmp,
+					     delij[0], delij[1], delij[2], thr);
+	}
+	
+	if( control->virial == 0 ) {
+	  rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
+	  rvec_ScaledAdd( workspace->forceReduction[reductionOffset+j],
+			  +(CEvd + CEclmb), nbr_pj->dvec );
+	}
+	else { /* NPT, iNPT or sNPT */
+	  /* for pressure coupling, terms not related to bond order
+	     derivatives are added directly into pressure vector/tensor */
+	
+	  rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+	  rvec_ScaledAdd( workspace->f[reductionOffset+i], -1., temp );
+	  rvec_Add( workspace->forceReduction[reductionOffset+j], temp);
+	
+	  rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+	
+	  rvec_Add( workspace->my_ext_pressReduction[tid], ext_press );
+	}
+	}
+      }
+    }
+
+  pair_reax_ptr->reduce_thr_proxy(system->pair_ptr, system->pair_ptr->eflag_either,
+				  system->pair_ptr->vflag_either, thr);
+  } // parallel region
+
+  data->my_en.e_vdW = total_EvdW;
+  data->my_en.e_ele = total_Eele;
+
+  Compute_Polarization_Energy( system, data );
+}
+
+/* ---------------------------------------------------------------------- */
+
+void Tabulated_vdW_Coulomb_Energy_OMP(reax_system *system,control_params *control,
+				      simulation_data *data, storage *workspace,
+				      reax_list **lists,
+				      output_controls *out_control ) {
+
+  double SMALL = 0.0001;
+  int  natoms = system->n;
+  reax_list *far_nbrs = (*lists) + FAR_NBRS;
+  int  nthreads = control->nthreads;
+  long totalReductionSize = system->N * nthreads;
+  double total_EvdW = 0.;
+  double total_Eele = 0.;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(shared) reduction(+:total_EvdW, total_Eele)
+#endif
+  {
+  int i, j, pj, r;
+  int type_i, type_j, tmin, tmax;
+  int start_i, end_i, orig_i, orig_j, flag;
+  double r_ij, base, dif;
+  double e_vdW, e_ele;
+  double CEvd, CEclmb;
+  double f_tmp, delij[3];
+  rvec temp, ext_press;
+  far_neighbor_data *nbr_pj;
+  LR_lookup_table *t;
+#if defined(_OPENMP)
+  int tid = omp_get_thread_num();
+  #else
+  int tid = 0;
+#endif
+  long froffset = (system->N * tid);
+
+  class PairReaxCOMP *pair_reax_ptr;
+  pair_reax_ptr = static_cast<class PairReaxCOMP*>(system->pair_ptr);
+  class ThrData *thr = pair_reax_ptr->getFixOMP()->get_thr(tid);
+
+  pair_reax_ptr->ev_setup_thr_proxy(system->pair_ptr->eflag_either,
+				    system->pair_ptr->vflag_either,
+				    natoms, system->pair_ptr->eatom,
+				    system->pair_ptr->vatom, thr);
+
+#if defined(_OPENMP)
+#pragma omp for schedule(guided)
+#endif
+  for (i = 0; i < natoms; ++i) {
+    type_i  = system->my_atoms[i].type;
+    if(type_i < 0) continue;
+    start_i = Start_Index(i,far_nbrs);
+    end_i   = End_Index(i,far_nbrs);
+    orig_i  = system->my_atoms[i].orig_id;
+
+    for (pj = start_i; pj < end_i; ++pj) {
+      nbr_pj = &(far_nbrs->select.far_nbr_list[pj]);
+      j = nbr_pj->nbr;
+      type_j = system->my_atoms[j].type;
+      if(type_j < 0) continue;
+      orig_j  = system->my_atoms[j].orig_id;
+
+      flag = 0;
+      if(nbr_pj->d <= control->nonb_cut) {
+	if(j < natoms) flag = 1;
+	else if (orig_i < orig_j) flag = 1;
+	else if (orig_i == orig_j) {
+	  if (nbr_pj->dvec[2] > SMALL) flag = 1;
+	  else if (fabs(nbr_pj->dvec[2]) < SMALL) {
+	    if (nbr_pj->dvec[1] > SMALL) flag = 1;
+	    else if (fabs(nbr_pj->dvec[1]) < SMALL && nbr_pj->dvec[0] > SMALL)
+	      flag = 1;
+	  }
+	}
+
+      }
+
+      if (flag) {
+	
+	r_ij   = nbr_pj->d;
+	tmin  = MIN( type_i, type_j );
+	tmax  = MAX( type_i, type_j );
+	t = &( LR[tmin][tmax] );
+	
+	/* Cubic Spline Interpolation */
+	r = (int)(r_ij * t->inv_dx);
+	if( r == 0 )  ++r;
+	base = (double)(r+1) * t->dx;
+	dif = r_ij - base;
+	
+	e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif +
+	  t->vdW[r].a;
+	
+	e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif +
+	  t->ele[r].a;
+	e_ele *= system->my_atoms[i].q * system->my_atoms[j].q;
+	
+	total_EvdW += e_vdW;
+	total_Eele += e_ele;
+	
+	CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif +
+	  t->CEvd[r].a;
+	
+	CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif +
+	  t->CEclmb[r].a;
+	CEclmb *= system->my_atoms[i].q * system->my_atoms[j].q;
+	
+	/* tally into per-atom energy */
+	if( system->pair_ptr->evflag || system->pair_ptr->vflag_atom) {
+	  rvec_ScaledSum( delij, 1., system->my_atoms[i].x,
+			  -1., system->my_atoms[j].x );
+	  f_tmp = -(CEvd + CEclmb);
+	  pair_reax_ptr->ev_tally_thr_proxy(system->pair_ptr, i, j, natoms, 1, e_vdW, e_ele,
+					    f_tmp, delij[0], delij[1], delij[2], thr);
+	}
+	
+	if( control->virial == 0 ) {
+	  rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
+	  rvec_ScaledAdd( workspace->forceReduction[froffset+j],
+			  +(CEvd + CEclmb), nbr_pj->dvec );
+	}
+	else { // NPT, iNPT or sNPT
+	  /* for pressure coupling, terms not related to bond order derivatives
+	     are added directly into pressure vector/tensor */
+	  rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+	
+	  rvec_ScaledAdd( workspace->f[i], -1., temp );
+	  rvec_Add( workspace->forceReduction[froffset+j], temp );
+	
+	  rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+	  rvec_Add( workspace->my_ext_pressReduction[tid], ext_press );
+	}
+      }
+    }
+  }
+
+  pair_reax_ptr->reduce_thr_proxy(system->pair_ptr, system->pair_ptr->eflag_either,
+				  system->pair_ptr->vflag_either, thr);
+  } // end omp parallel
+
+  data->my_en.e_vdW = total_EvdW;
+  data->my_en.e_ele = total_Eele;
+
+  Compute_Polarization_Energy( system, data );
+}
--- a/src/USER-OMP/reaxc_nonbonded_omp.h
+++ b/src/USER-OMP/reaxc_nonbonded_omp.h
@ -0,0 +1,38 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __NONBONDED_OMP_H_
+#define __NONBONDED_OMP_H_
+
+#include "reaxc_types.h"
+
+void vdW_Coulomb_Energy_OMP( reax_system*, control_params*, simulation_data*,
+			    storage*, reax_list**, output_controls* );
+
+void Tabulated_vdW_Coulomb_Energy_OMP( reax_system*, control_params*,
+				       simulation_data*, storage*,
+				       reax_list**, output_controls* );
+#endif
--- a/src/USER-OMP/reaxc_torsion_angles_omp.cpp
+++ b/src/USER-OMP/reaxc_torsion_angles_omp.cpp
@ -0,0 +1,471 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "pair_reaxc_omp.h"
+#include "thr_data.h"
+
+#include "reaxc_types.h"
+#include "reaxc_torsion_angles_omp.h"
+#include "reaxc_bond_orders_omp.h"
+#include "reaxc_list.h"
+#include "reaxc_tool_box.h"
+#include "reaxc_vector.h"
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#define MIN_SINE 1e-10
+
+using namespace LAMMPS_NS;
+
+// Functions defined in reaxc_torsion_angles.cpp
+extern double Calculate_Omega(rvec, double, rvec, double, rvec, double, rvec, double,
+			    three_body_interaction_data*, three_body_interaction_data*,
+			    rvec, rvec, rvec, rvec, output_controls*);
+
+/* ---------------------------------------------------------------------- */
+
+void Torsion_AnglesOMP( reax_system *system, control_params *control,
+                     simulation_data *data, storage *workspace,
+                     reax_list **lists, output_controls *out_control )
+{
+#ifdef OMP_TIMING
+  double endTimeBase, startTimeBase;
+  startTimeBase = MPI_Wtime();
+#endif
+
+  int natoms = system->n;
+  reax_list *bonds = (*lists) + BONDS;
+  reax_list *thb_intrs = (*lists) + THREE_BODIES;
+  double p_tor2 = system->reax_param.gp.l[23];
+  double p_tor3 = system->reax_param.gp.l[24];
+  double p_tor4 = system->reax_param.gp.l[25];
+  double p_cot2 = system->reax_param.gp.l[27];
+  double total_Etor = 0;
+  double total_Econ = 0;
+  int  nthreads = control->nthreads;
+
+#pragma omp parallel default(shared) reduction(+: total_Etor, total_Econ)
+ {
+  int i, j, k, l, pi, pj, pk, pl, pij, plk;
+  int type_i, type_j, type_k, type_l;
+  int start_j, end_j;
+  int start_pj, end_pj, start_pk, end_pk;
+  int num_frb_intrs = 0;
+
+  double Delta_j, Delta_k;
+  double r_ij, r_jk, r_kl, r_li;
+  double BOA_ij, BOA_jk, BOA_kl;
+
+  double exp_tor2_ij, exp_tor2_jk, exp_tor2_kl;
+  double exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv;
+  double exp_cot2_jk, exp_cot2_ij, exp_cot2_kl;
+  double fn10, f11_DjDk, dfn11, fn12;
+  double theta_ijk, theta_jkl;
+  double sin_ijk, sin_jkl;
+  double cos_ijk, cos_jkl;
+  double tan_ijk_i, tan_jkl_i;
+  double omega, cos_omega, cos2omega, cos3omega;
+  rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl;
+  double CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4;
+  double CEtors5, CEtors6, CEtors7, CEtors8, CEtors9;
+  double Cconj, CEconj1, CEconj2, CEconj3;
+  double CEconj4, CEconj5, CEconj6;
+  double e_tor, e_con;
+  rvec dvec_li;
+  rvec force, ext_press;
+  ivec rel_box_jl;
+  // rtensor total_rtensor, temp_rtensor;
+  four_body_header *fbh;
+  four_body_parameters *fbp;
+  bond_data *pbond_ij, *pbond_jk, *pbond_kl;
+  bond_order_data *bo_ij, *bo_jk, *bo_kl;
+  three_body_interaction_data *p_ijk, *p_jkl;
+
+  // Virial tallying variables
+  double delil[3], deljl[3], delkl[3];
+  double eng_tmp, fi_tmp[3], fj_tmp[3], fk_tmp[3];
+
+#if defined(_OPENMP)
+  int tid = omp_get_thread_num();
+#else
+  int tid = 0;
+#endif
+  long reductionOffset = (system->N * tid);
+  class PairReaxCOMP *pair_reax_ptr;
+  pair_reax_ptr = static_cast<class PairReaxCOMP*>(system->pair_ptr);
+  class ThrData *thr = pair_reax_ptr->getFixOMP()->get_thr(tid);
+
+  pair_reax_ptr->ev_setup_thr_proxy(system->pair_ptr->eflag_either,
+                                    system->pair_ptr->vflag_either,
+                                    system->N, system->pair_ptr->eatom,
+                                    system->pair_ptr->vatom, thr);
+
+#pragma omp for schedule(static)
+  for (j = 0; j < system->N; ++j) {
+    start_j = Start_Index(j, bonds);
+    end_j = End_Index(j, bonds);
+
+    for (pk = start_j; pk < end_j; ++pk) {
+      bo_jk = &( bonds->select.bond_list[pk].bo_data );
+      for (k = 0; k < nthreads; ++k)
+	bo_jk->CdboReduction[k] = 0.;
+    }
+  }
+
+#pragma omp for schedule(dynamic,50)
+  for (j = 0; j < natoms; ++j) {
+    type_j = system->my_atoms[j].type;
+    Delta_j = workspace->Delta_boc[j];
+    start_j = Start_Index(j, bonds);
+    end_j = End_Index(j, bonds);
+
+    for (pk = start_j; pk < end_j; ++pk) {
+      pbond_jk = &( bonds->select.bond_list[pk] );
+      k = pbond_jk->nbr;
+      bo_jk = &( pbond_jk->bo_data );
+      BOA_jk = bo_jk->BO - control->thb_cut;
+
+      /* see if there are any 3-body interactions involving j&k
+         where j is the central atom. Otherwise there is no point in
+         trying to form a 4-body interaction out of this neighborhood */
+      if (system->my_atoms[j].orig_id < system->my_atoms[k].orig_id &&
+          bo_jk->BO > control->thb_cut/*0*/ && Num_Entries(pk, thb_intrs)) {
+        pj = pbond_jk->sym_index; // pj points to j on k's list
+	
+        /* do the same check as above:
+           are there any 3-body interactions involving k&j
+           where k is the central atom */
+        if (Num_Entries(pj, thb_intrs)) {
+          type_k = system->my_atoms[k].type;
+          Delta_k = workspace->Delta_boc[k];
+          r_jk = pbond_jk->d;
+	
+          start_pk = Start_Index(pk, thb_intrs );
+          end_pk = End_Index(pk, thb_intrs );
+          start_pj = Start_Index(pj, thb_intrs );
+          end_pj = End_Index(pj, thb_intrs );
+	
+          exp_tor2_jk = exp( -p_tor2 * BOA_jk );
+          exp_cot2_jk = exp( -p_cot2 * SQR(BOA_jk - 1.5) );
+          exp_tor3_DjDk = exp( -p_tor3 * (Delta_j + Delta_k) );
+          exp_tor4_DjDk = exp( p_tor4  * (Delta_j + Delta_k) );
+          exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk);
+          f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv;
+	
+	
+          /* pick i up from j-k interaction where j is the central atom */
+          for (pi = start_pk; pi < end_pk; ++pi) {
+            p_ijk = &( thb_intrs->select.three_body_list[pi] );
+	    pij = p_ijk->pthb; // pij is pointer to i on j's bond_list
+            pbond_ij = &( bonds->select.bond_list[pij] );
+            bo_ij = &( pbond_ij->bo_data );
+	
+	    if (bo_ij->BO > control->thb_cut/*0*/) {
+              i = p_ijk->thb;
+              type_i = system->my_atoms[i].type;
+              r_ij = pbond_ij->d;
+              BOA_ij = bo_ij->BO - control->thb_cut;
+
+              theta_ijk = p_ijk->theta;
+              sin_ijk = sin( theta_ijk );
+              cos_ijk = cos( theta_ijk );
+              //tan_ijk_i = 1. / tan( theta_ijk );
+              if( sin_ijk >= 0 && sin_ijk <= MIN_SINE )
+                tan_ijk_i = cos_ijk / MIN_SINE;
+              else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
+                tan_ijk_i = cos_ijk / -MIN_SINE;
+              else tan_ijk_i = cos_ijk / sin_ijk;
+
+              exp_tor2_ij = exp( -p_tor2 * BOA_ij );
+              exp_cot2_ij = exp( -p_cot2 * SQR(BOA_ij -1.5) );
+
+
+              /* pick l up from j-k interaction where k is the central atom */
+              for (pl = start_pj; pl < end_pj; ++pl) {
+                p_jkl = &( thb_intrs->select.three_body_list[pl] );
+                l = p_jkl->thb;
+                plk = p_jkl->pthb; //pointer to l on k's bond_list!
+                pbond_kl = &( bonds->select.bond_list[plk] );
+                bo_kl = &( pbond_kl->bo_data );
+                type_l = system->my_atoms[l].type;
+                fbh = &(system->reax_param.fbp[type_i][type_j]
+                        [type_k][type_l]);
+                fbp = &(system->reax_param.fbp[type_i][type_j]
+                        [type_k][type_l].prm[0]);
+
+		if (i != l && fbh->cnt &&
+                    bo_kl->BO > control->thb_cut/*0*/ &&
+                    bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/) {
+                  ++num_frb_intrs;
+                  //fprintf(stderr,
+		  //      "%5d: %6d %6d %6d %6d\n", num_frb_intrs,
+		  //      system->my_atoms[i].orig_id,system->my_atoms[j].orig_id,
+		  //      system->my_atoms[k].orig_id,system->my_atoms[l].orig_id);
+
+                  r_kl = pbond_kl->d;
+                  BOA_kl = bo_kl->BO - control->thb_cut;
+		
+                  theta_jkl = p_jkl->theta;
+                  sin_jkl = sin( theta_jkl );
+                  cos_jkl = cos( theta_jkl );
+                  //tan_jkl_i = 1. / tan( theta_jkl );
+                  if( sin_jkl >= 0 && sin_jkl <= MIN_SINE )
+                    tan_jkl_i = cos_jkl / MIN_SINE;
+                  else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
+                    tan_jkl_i = cos_jkl / -MIN_SINE;
+                  else tan_jkl_i = cos_jkl /sin_jkl;
+		
+                  rvec_ScaledSum( dvec_li, 1., system->my_atoms[i].x,
+                                  -1., system->my_atoms[l].x );
+                  r_li = rvec_Norm( dvec_li );
+		
+		
+                  /* omega and its derivative */
+                  omega = Calculate_Omega( pbond_ij->dvec, r_ij,
+                                           pbond_jk->dvec, r_jk,
+                                           pbond_kl->dvec, r_kl,
+                                           dvec_li, r_li,
+                                           p_ijk, p_jkl,
+                                           dcos_omega_di, dcos_omega_dj,
+                                           dcos_omega_dk, dcos_omega_dl,
+                                           out_control );
+
+                  cos_omega = cos( omega );
+                  cos2omega = cos( 2. * omega );
+                  cos3omega = cos( 3. * omega );
+                  /* end omega calculations */
+		
+                  /* torsion energy */
+                  exp_tor1 = exp( fbp->p_tor1 *
+                                  SQR(2.0 - bo_jk->BO_pi - f11_DjDk) );
+                  exp_tor2_kl = exp( -p_tor2 * BOA_kl );
+                  exp_cot2_kl = exp( -p_cot2 * SQR(BOA_kl - 1.5) );
+                  fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) *
+                    (1.0 - exp_tor2_kl);
+
+                  CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) +
+                               fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
+                               fbp->V3 * (1.0 + cos3omega) );
+
+		  total_Etor += e_tor = fn10 * sin_ijk * sin_jkl * CV;
+
+                  dfn11 = (-p_tor3 * exp_tor3_DjDk +
+                           (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
+                           (2.0 + exp_tor3_DjDk) * exp_tor34_inv) *
+                    exp_tor34_inv;
+
+                  CEtors1 = sin_ijk * sin_jkl * CV;
+
+                  CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 *
+                    (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) *
+                    sin_ijk * sin_jkl;
+                  CEtors3 = CEtors2 * dfn11;
+
+                  CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij *
+                    (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl);
+                  CEtors5 = CEtors1 * p_tor2 *
+                    (1.0 - exp_tor2_ij) * exp_tor2_jk * (1.0 - exp_tor2_kl);
+                  CEtors6 = CEtors1 * p_tor2 *
+                    (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * exp_tor2_kl;
+
+                  cmn = -fn10 * CV;
+                  CEtors7 = cmn * sin_jkl * tan_ijk_i;
+                  CEtors8 = cmn * sin_ijk * tan_jkl_i;
+
+                  CEtors9 = fn10 * sin_ijk * sin_jkl *
+                    (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
+                     1.5 * fbp->V3 * (cos2omega + 2.0 * SQR(cos_omega)));
+                  /* end  of torsion energy */
+
+
+                  /* 4-body conjugation energy */
+                  fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
+                  //data->my_en.e_con += e_con =
+		  total_Econ += e_con =
+                    fbp->p_cot1 * fn12 *
+                    (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl);
+
+                  Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 *
+                    (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl);
+
+                  CEconj1 = Cconj * (BOA_ij - 1.5e0);
+                  CEconj2 = Cconj * (BOA_jk - 1.5e0);
+                  CEconj3 = Cconj * (BOA_kl - 1.5e0);
+
+                  CEconj4 = -fbp->p_cot1 * fn12 *
+                    (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i;
+                  CEconj5 = -fbp->p_cot1 * fn12 *
+                    (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i;
+                  CEconj6 = 2.0 * fbp->p_cot1 * fn12 *
+                    cos_omega * sin_ijk * sin_jkl;
+                  /* end 4-body conjugation energy */
+
+                  /* FORCES */
+                  bo_jk->Cdbopi += CEtors2;
+                  workspace->CdDelta[j] += CEtors3;
+		  //workspace->CdDelta[k] += CEtors3;
+                  workspace->CdDeltaReduction[reductionOffset+k] += CEtors3;
+                  bo_ij->Cdbo += (CEtors4 + CEconj1);
+                  bo_jk->Cdbo += (CEtors5 + CEconj2);
+                  //bo_kl->Cdbo += (CEtors6 + CEconj3);
+		  bo_kl->CdboReduction[tid] += (CEtors6 + CEconj3);
+
+                  if( control->virial == 0 ) {
+                    /* dcos_theta_ijk */
+                    rvec_ScaledAdd( workspace->f[j],
+                                    CEtors7 + CEconj4, p_ijk->dcos_dj );
+		    rvec_ScaledAdd( workspace->forceReduction[reductionOffset+i],
+                                    CEtors7 + CEconj4, p_ijk->dcos_dk );
+                    rvec_ScaledAdd( workspace->forceReduction[reductionOffset+k],
+                                    CEtors7 + CEconj4, p_ijk->dcos_di );
+
+                    /* dcos_theta_jkl */
+                    rvec_ScaledAdd( workspace->f[j],
+                                    CEtors8 + CEconj5, p_jkl->dcos_di );
+		    rvec_ScaledAdd( workspace->forceReduction[reductionOffset+k],
+                                    CEtors8 + CEconj5, p_jkl->dcos_dj );
+                    rvec_ScaledAdd( workspace->forceReduction[reductionOffset+l],
+                                    CEtors8 + CEconj5, p_jkl->dcos_dk );
+
+                    /* dcos_omega */
+                    rvec_ScaledAdd( workspace->f[j],
+                                    CEtors9 + CEconj6, dcos_omega_dj );
+		    rvec_ScaledAdd( workspace->forceReduction[reductionOffset+i],
+                                    CEtors9 + CEconj6, dcos_omega_di );
+                    rvec_ScaledAdd( workspace->forceReduction[reductionOffset+k],
+                                    CEtors9 + CEconj6, dcos_omega_dk );
+                    rvec_ScaledAdd( workspace->forceReduction[reductionOffset+l],
+                                    CEtors9 + CEconj6, dcos_omega_dl );
+                  }
+                  else {
+                    ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box);
+		
+                    /* dcos_theta_ijk */
+                    rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk );
+		    rvec_Add( workspace->forceReduction[reductionOffset+i], force );
+                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+		    rvec_Add( workspace->my_ext_pressReduction[tid], ext_press );
+		
+                    rvec_ScaledAdd( workspace->f[j],
+                                    CEtors7 + CEconj4, p_ijk->dcos_dj );
+		
+                    rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
+		    rvec_Add( workspace->forceReduction[reductionOffset+k], force );
+                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+		    rvec_Add( workspace->my_ext_pressReduction[tid], ext_press );
+		
+                    /* dcos_theta_jkl */
+                    rvec_ScaledAdd( workspace->f[j],
+                                    CEtors8 + CEconj5, p_jkl->dcos_di );
+		
+                    rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
+		    rvec_Add( workspace->forceReduction[reductionOffset+k], force );
+                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+		    rvec_Add( workspace->my_ext_pressReduction[tid], ext_press );
+		
+                    rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk );
+		    rvec_Add( workspace->forceReduction[reductionOffset+l], force );
+                    rvec_iMultiply( ext_press, rel_box_jl, force );
+		    rvec_Add( workspace->my_ext_pressReduction[tid], ext_press );
+		
+                    /* dcos_omega */
+                    rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
+		    rvec_Add( workspace->forceReduction[reductionOffset+i], force );
+                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+		    rvec_Add( workspace->my_ext_pressReduction[tid], ext_press );
+
+                    rvec_ScaledAdd( workspace->f[j],
+                                    CEtors9 + CEconj6, dcos_omega_dj );
+
+                    rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
+		    rvec_Add( workspace->forceReduction[reductionOffset+k], force );
+                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+		    rvec_Add( workspace->my_ext_pressReduction[tid], ext_press );
+
+                    rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl );
+		    rvec_Add( workspace->forceReduction[reductionOffset+i], force );
+                    rvec_iMultiply( ext_press, rel_box_jl, force );
+		    rvec_Add( workspace->my_ext_pressReduction[tid], ext_press );
+                  }
+
+                  /* tally into per-atom virials */
+                  if( system->pair_ptr->vflag_atom || system->pair_ptr->evflag) {
+		
+                    // acquire vectors
+                    rvec_ScaledSum( delil, 1., system->my_atoms[l].x,
+                                          -1., system->my_atoms[i].x );
+                    rvec_ScaledSum( deljl, 1., system->my_atoms[l].x,
+                                          -1., system->my_atoms[j].x );
+                    rvec_ScaledSum( delkl, 1., system->my_atoms[l].x,
+                                          -1., system->my_atoms[k].x );
+                    // dcos_theta_ijk
+                    rvec_Scale( fi_tmp, CEtors7 + CEconj4, p_ijk->dcos_dk );
+                    rvec_Scale( fj_tmp, CEtors7 + CEconj4, p_ijk->dcos_dj );
+                    rvec_Scale( fk_tmp, CEtors7 + CEconj4, p_ijk->dcos_di );
+		
+                    // dcos_theta_jkl
+                    rvec_ScaledAdd( fj_tmp, CEtors8 + CEconj5, p_jkl->dcos_di );
+                    rvec_ScaledAdd( fk_tmp, CEtors8 + CEconj5, p_jkl->dcos_dj );
+		
+                    // dcos_omega
+                    rvec_ScaledAdd( fi_tmp, CEtors9 + CEconj6, dcos_omega_di );
+                    rvec_ScaledAdd( fj_tmp, CEtors9 + CEconj6, dcos_omega_dj );
+                    rvec_ScaledAdd( fk_tmp, CEtors9 + CEconj6, dcos_omega_dk );
+		
+                    // tally
+                    eng_tmp = e_tor + e_con;
+		
+                    if (system->pair_ptr->evflag)
+		      pair_reax_ptr->ev_tally_thr_proxy(system->pair_ptr, j, k, system->n, 1,
+						        eng_tmp, 0.0, 0.0, 0.0, 0.0, 0.0, thr);
+
+		    // NEED TO MAKE AN OMP VERSION OF THIS CALL!
+                    if (system->pair_ptr->vflag_atom)
+		      system->pair_ptr->v_tally4(i, j, k, l, fi_tmp, fj_tmp, fk_tmp,
+						 delil, deljl, delkl );
+                  }
+		
+                } // pl check ends
+              } // pl loop ends
+            } // pi check ends
+          } // pi loop ends
+        } // k-j neighbor check ends
+      } // j<k && j-k neighbor check ends
+    } // pk loop ends
+  } // j loop
+
+ } // end omp parallel
+
+  data->my_en.e_tor = total_Etor;
+  data->my_en.e_con = total_Econ;
+
+#ifdef OMP_TIMING
+  endTimeBase = MPI_Wtime();
+  ompTimingData[COMPUTETORSIONANGLESBOINDEX] += (endTimeBase-startTimeBase);
+#endif
+}
--- a/src/USER-OMP/reaxc_torsion_angles_omp.h
+++ b/src/USER-OMP/reaxc_torsion_angles_omp.h
@ -0,0 +1,36 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __TORSION_ANGLES_OMP_H_
+#define __TORSION_ANGLES_OMP_H_
+
+#include "reaxc_types.h"
+#include "reaxc_torsion_angles.h"
+
+void Torsion_AnglesOMP( reax_system*, control_params*, simulation_data*,
+			storage*, reax_list**, output_controls* );
+
+#endif
--- a/src/USER-OMP/reaxc_valence_angles_omp.cpp
+++ b/src/USER-OMP/reaxc_valence_angles_omp.cpp
@ -0,0 +1,614 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "pair_reaxc_omp.h"
+#include "thr_data.h"
+
+#include "reaxc_types.h"
+#include "reaxc_valence_angles.h"
+#include "reaxc_valence_angles_omp.h"
+#include "reaxc_bond_orders_omp.h"
+#include "reaxc_list.h"
+#include "reaxc_vector.h"
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+void Calculate_dCos_ThetaOMP( rvec dvec_ji, double d_ji, rvec dvec_jk, double d_jk,
+			      rvec* dcos_theta_di,
+			      rvec* dcos_theta_dj,
+			      rvec* dcos_theta_dk )
+{
+  double sqr_d_ji = SQR(d_ji);
+  double sqr_d_jk = SQR(d_jk);
+  double inv_dists = 1.0 / (d_ji * d_jk);
+  double inv_dists3 = inv_dists * inv_dists * inv_dists;
+  double dot_dvecs = dvec_ji[0]*dvec_jk[0] + dvec_ji[1]*dvec_jk[1] + dvec_ji[2]*dvec_jk[2];
+  double Cdot_inv3 = dot_dvecs * inv_dists3;
+
+  double csqr_jk = Cdot_inv3 * sqr_d_jk;
+  double csqr_ji = Cdot_inv3 * sqr_d_ji;
+
+  // Try to help compiler out by unrolling
+  // x-component
+  double dinv_jk = dvec_jk[0] * inv_dists;
+  double dinv_ji = dvec_ji[0] * inv_dists;
+
+  double cdev_ji = csqr_jk * dvec_ji[0];
+  double cdev_jk = csqr_ji * dvec_jk[0];
+
+  (*dcos_theta_di)[0] =   dinv_jk            - cdev_ji;
+  (*dcos_theta_dj)[0] = -(dinv_jk + dinv_ji) + cdev_ji + cdev_jk;
+  (*dcos_theta_dk)[0] =             dinv_ji            - cdev_jk;
+
+  // y-component
+  dinv_jk = dvec_jk[1] * inv_dists;
+  dinv_ji = dvec_ji[1] * inv_dists;
+
+  cdev_ji = csqr_jk * dvec_ji[1];
+  cdev_jk = csqr_ji * dvec_jk[1];
+
+  (*dcos_theta_di)[1] =   dinv_jk            - cdev_ji;
+  (*dcos_theta_dj)[1] = -(dinv_jk + dinv_ji) + cdev_ji + cdev_jk;
+  (*dcos_theta_dk)[1] =             dinv_ji            - cdev_jk;
+
+  // z-component
+  dinv_jk = dvec_jk[2] * inv_dists;
+  dinv_ji = dvec_ji[2] * inv_dists;
+
+  cdev_ji = csqr_jk * dvec_ji[2];
+  cdev_jk = csqr_ji * dvec_jk[2];
+
+  (*dcos_theta_di)[2] =   dinv_jk            - cdev_ji;
+  (*dcos_theta_dj)[2] = -(dinv_jk + dinv_ji) + cdev_ji + cdev_jk;
+  (*dcos_theta_dk)[2] =             dinv_ji            - cdev_jk;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* this is a 3-body interaction in which the main role is
+   played by j which sits in the middle of the other two. */
+void Valence_AnglesOMP( reax_system *system, control_params *control,
+                        simulation_data *data, storage *workspace,
+                        reax_list **lists, output_controls *out_control )
+{
+
+#ifdef OMP_TIMING
+  double endTimeBase, startTimeBase;
+  startTimeBase = MPI_Wtime();
+#endif
+
+  reax_list *bonds = (*lists) + BONDS;
+  reax_list *thb_intrs =  (*lists) + THREE_BODIES;
+
+  // Precompute and store valence_angle offsets for OpenMP code.
+  int * _my_offset = workspace->valence_angle_atom_myoffset;
+
+  /* global parameters used in these calculations */
+  double p_val6 = system->reax_param.gp.l[14];
+  double p_val8 = system->reax_param.gp.l[33];
+  double p_val9 = system->reax_param.gp.l[16];
+  double p_val10 = system->reax_param.gp.l[17];
+  double total_Eang = 0;
+  double total_Epen = 0;
+  double total_Ecoa = 0;
+
+  int  nthreads = control->nthreads;
+  int  num_thb_intrs = 0;
+  int  TWICE = 2;
+
+#pragma omp parallel default(shared) reduction(+:total_Eang, total_Epen, total_Ecoa, num_thb_intrs)
+  {
+    int i, j, pi, k, pk, t;
+    int type_i, type_j, type_k;
+    int start_j, end_j, start_pk, end_pk;
+    int cnt, my_offset;
+
+    double temp, temp_bo_jt, pBOjt7;
+    double p_val1, p_val2, p_val3, p_val4, p_val5, p_val7;
+    double p_pen1, p_pen2, p_pen3, p_pen4;
+    double p_coa1, p_coa2, p_coa3, p_coa4;
+    double trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
+    double exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
+    double dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO, vlpadj;
+    double CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
+    double CEpen1, CEpen2, CEpen3;
+    double e_ang, e_coa, e_pen;
+    double CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
+    double Cf7ij, Cf7jk, Cf8j, Cf9j;
+    double f7_ij, f7_jk, f8_Dj, f9_Dj;
+    double Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
+    double BOA_ij, BOA_jk;
+    rvec force, ext_press;
+    // rtensor temp_rtensor, total_rtensor;
+
+    // Tallying variables
+    double eng_tmp, fi_tmp[3], fj_tmp[3], fk_tmp[3];
+    double delij[3], delkj[3];
+
+    three_body_header *thbh;
+    three_body_parameters *thbp;
+    three_body_interaction_data *p_ijk, *p_kji;
+    bond_data *pbond_ij, *pbond_jk, *pbond_jt;
+    bond_order_data *bo_ij, *bo_jk, *bo_jt;
+
+#if defined(_OPENMP)
+    int tid = omp_get_thread_num();
+#else
+    int tid = 0;
+#endif
+    long reductionOffset = (system->N * tid);
+    class PairReaxCOMP *pair_reax_ptr;
+    pair_reax_ptr = static_cast<class PairReaxCOMP*>(system->pair_ptr);
+    class ThrData *thr = pair_reax_ptr->getFixOMP()->get_thr(tid);
+
+    pair_reax_ptr->ev_setup_thr_proxy(system->pair_ptr->eflag_either,
+                                      system->pair_ptr->vflag_either,
+                                      system->N, system->pair_ptr->eatom,
+                                      system->pair_ptr->vatom, thr);
+
+
+    // Run through a minimal for(j<N) loop once to precompute offsets with safe number of threads
+
+    const int per_thread = thb_intrs->num_intrs / nthreads;
+
+#pragma omp for schedule(dynamic,50)
+    for (j = 0; j < system->N; ++j) {
+      type_j = system->my_atoms[j].type;
+      _my_offset[j] = 0;
+      if(type_j < 0) continue;
+
+      start_j = Start_Index(j, bonds);
+      end_j = End_Index(j, bonds);
+
+      // Always point to start of workspace to count angles
+      my_offset = tid * per_thread;
+
+      for (pi = start_j; pi < end_j; ++pi) {
+        Set_Start_Index( pi, my_offset, thb_intrs );
+        pbond_ij = &(bonds->select.bond_list[pi]);
+        bo_ij = &(pbond_ij->bo_data);
+        BOA_ij = bo_ij->BO - control->thb_cut;
+
+        if (BOA_ij > 0.0) {
+          i = pbond_ij->nbr;	
+	
+          /* first copy 3-body intrs from previously computed ones where i>k.
+             in the second for-loop below,
+             we compute only new 3-body intrs where i < k */
+          for (pk = start_j; pk < pi; ++pk) {
+            start_pk = Start_Index( pk, thb_intrs );
+            end_pk = End_Index( pk, thb_intrs );
+	
+            for (t = start_pk; t < end_pk; ++t)
+              if (thb_intrs->select.three_body_list[t].thb == i) {
+
+                p_ijk = &(thb_intrs->select.three_body_list[my_offset] );
+                p_ijk->thb = bonds->select.bond_list[pk].nbr;
+	
+                ++my_offset;
+                break;
+              }
+          } // for(pk)
+
+          /* and this is the second for loop mentioned above */
+          for (pk = pi+1; pk < end_j; ++pk) {
+            pbond_jk = &(bonds->select.bond_list[pk]);
+            k        = pbond_jk->nbr;
+
+            if (j >= system->n && i >= system->n && k >= system->n) continue;
+
+            p_ijk    = &( thb_intrs->select.three_body_list[my_offset] );
+            p_ijk->thb = k;
+	
+            ++my_offset; // add this  to the list of 3-body interactions
+          } // for(pk)
+        } // if()
+
+        Set_End_Index(pi, my_offset, thb_intrs );
+      } // for(pi)
+
+      // Confirm that thb_intrs->num_intrs / nthreads is enough to hold all angles from a single atom
+      if(my_offset >= (tid+1)*per_thread) {
+        int me;
+        MPI_Comm_rank(MPI_COMM_WORLD,&me);
+        fprintf( stderr, "step%d-ran out of space on angle_list on proc %i for atom %i:", data->step, me, j);
+        fprintf( stderr, " nthreads= %d, tid=%d, my_offset=%d, per_thread=%d\n", nthreads, tid, my_offset, per_thread);
+        fprintf( stderr, " num_intrs= %i  N= %i\n",thb_intrs->num_intrs , system->N);
+        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
+      }
+
+      // Number of angles owned by this atom
+      _my_offset[j] = my_offset - tid * per_thread;
+    } // for(j)
+
+    // Wait for all threads to finish counting angles
+#pragma omp barrier
+
+    // Master thread uses angle counts to compute offsets
+    // This can be threaded
+#pragma omp master
+    {
+      int current_count = 0;
+      int m = _my_offset[0];
+      _my_offset[0] = current_count;
+      for(j=1; j<system->N; j++) {
+        current_count+= m;
+        m = _my_offset[j];
+        _my_offset[j] = current_count;
+      }
+      _my_offset[system->N] = current_count + m; // Used to test if last particle has any angles
+    }
+
+    // All threads wait till master thread finished computing offsets
+#pragma omp barrier
+
+    // Original loop, but now using precomputed offsets
+    // Safe to use all threads available, regardless of threads tasked above
+    // We also now skip over atoms that have no angles assigned
+#pragma omp for schedule(dynamic,50)//(dynamic,chunksize)//(guided)
+    for (j = 0; j < system->N; ++j) {         // Ray: the first one with system->N
+      type_j = system->my_atoms[j].type;
+      if(type_j < 0) continue;
+
+      // Skip if no angles for this atom
+      if(_my_offset[j] == _my_offset[j+1]) continue;
+
+      start_j = Start_Index(j, bonds);
+      end_j = End_Index(j, bonds);
+
+      type_j = system->my_atoms[j].type;
+
+      my_offset = _my_offset[j];
+
+      p_val3 = system->reax_param.sbp[ type_j ].p_val3;
+      p_val5 = system->reax_param.sbp[ type_j ].p_val5;
+
+      SBOp = 0, prod_SBO = 1;
+      for (t = start_j; t < end_j; ++t) {
+        bo_jt = &(bonds->select.bond_list[t].bo_data);
+        SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
+        temp = SQR( bo_jt->BO );
+        temp *= temp;
+        temp *= temp;
+        prod_SBO *= exp( -temp );
+      }
+
+      // modifications to match Adri's code - 09/01/09
+      if( workspace->vlpex[j] >= 0 ){
+        vlpadj = 0;
+        dSBO2 = prod_SBO - 1;
+      }
+      else{
+        vlpadj = workspace->nlp[j];
+        dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
+      }
+
+      SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
+      dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
+
+      if( SBO <= 0 )
+        SBO2 = 0, CSBO2 = 0;
+      else if( SBO > 0 && SBO <= 1 ) {
+        SBO2 = pow( SBO, p_val9 );
+        CSBO2 = p_val9 * pow( SBO, p_val9 - 1 );
+      }
+      else if( SBO > 1 && SBO < 2 ) {
+        SBO2 = 2 - pow( 2-SBO, p_val9 );
+        CSBO2 = p_val9 * pow( 2 - SBO, p_val9 - 1 );
+      }
+      else
+        SBO2 = 2, CSBO2 = 0;
+
+      expval6 = exp( p_val6 * workspace->Delta_boc[j] );
+
+      for (pi = start_j; pi < end_j; ++pi) {
+        Set_Start_Index( pi, my_offset, thb_intrs );
+        pbond_ij = &(bonds->select.bond_list[pi]);
+        bo_ij = &(pbond_ij->bo_data);
+        BOA_ij = bo_ij->BO - control->thb_cut;
+
+
+        if (BOA_ij > 0.0) {
+          i = pbond_ij->nbr;
+          type_i = system->my_atoms[i].type;
+	
+	
+          /* first copy 3-body intrs from previously computed ones where i>k.
+             in the second for-loop below,
+             we compute only new 3-body intrs where i < k */
+          for (pk = start_j; pk < pi; ++pk) {
+            start_pk = Start_Index( pk, thb_intrs );
+            end_pk = End_Index( pk, thb_intrs );
+	
+            for (t = start_pk; t < end_pk; ++t)
+              if (thb_intrs->select.three_body_list[t].thb == i) {
+                p_ijk = &(thb_intrs->select.three_body_list[my_offset] );
+                p_kji = &(thb_intrs->select.three_body_list[t]);
+	
+                p_ijk->thb = bonds->select.bond_list[pk].nbr;
+                p_ijk->pthb  = pk;
+                p_ijk->theta = p_kji->theta;
+                rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
+                rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
+                rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
+	
+                ++my_offset;
+                ++num_thb_intrs;
+                break;
+              }
+          } // for(pk)
+	
+	
+          /* and this is the second for loop mentioned above */
+          for (pk = pi+1; pk < end_j; ++pk) {
+            pbond_jk = &(bonds->select.bond_list[pk]);
+            bo_jk    = &(pbond_jk->bo_data);
+            BOA_jk   = bo_jk->BO - control->thb_cut;
+            k        = pbond_jk->nbr;
+            type_k   = system->my_atoms[k].type;
+            p_ijk    = &( thb_intrs->select.three_body_list[my_offset] );
+	
+            // Fix by Sudhir
+            // if (BOA_jk <= 0) continue;
+            if (j >= system->n && i >= system->n && k >= system->n) continue;
+	
+            Calculate_Theta( pbond_ij->dvec, pbond_ij->d,
+                             pbond_jk->dvec, pbond_jk->d,
+                             &theta, &cos_theta );
+	
+            Calculate_dCos_ThetaOMP( pbond_ij->dvec, pbond_ij->d,
+                                     pbond_jk->dvec, pbond_jk->d,
+                                     &(p_ijk->dcos_di), &(p_ijk->dcos_dj),
+                                     &(p_ijk->dcos_dk) );
+            p_ijk->thb = k;
+            p_ijk->pthb = pk;
+            p_ijk->theta = theta;
+	
+            sin_theta = sin( theta );
+            if( sin_theta < 1.0e-5 )
+              sin_theta = 1.0e-5;
+	
+            ++my_offset; // add this  to the list of 3-body interactions
+            ++num_thb_intrs;
+	
+            if ((j < system->n) && (BOA_jk > 0.0) &&
+                (bo_ij->BO > control->thb_cut) &&
+                (bo_jk->BO > control->thb_cut) &&
+                (bo_ij->BO * bo_jk->BO > control->thb_cutsq)) {
+              thbh = &( system->reax_param.thbp[ type_i ][ type_j ][ type_k ] );
+	   	
+              for (cnt = 0; cnt < thbh->cnt; ++cnt) {
+	
+                if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) {
+                  thbp = &( thbh->prm[cnt] );
+		
+                  /* ANGLE ENERGY */
+                  p_val1 = thbp->p_val1;
+                  p_val2 = thbp->p_val2;
+                  p_val4 = thbp->p_val4;
+                  p_val7 = thbp->p_val7;
+                  theta_00 = thbp->theta_00;
+		
+                  exp3ij = exp( -p_val3 * pow( BOA_ij, p_val4 ) );
+                  f7_ij = 1.0 - exp3ij;
+                  Cf7ij = p_val3 * p_val4 * pow( BOA_ij, p_val4 - 1.0 ) * exp3ij;
+		
+                  exp3jk = exp( -p_val3 * pow( BOA_jk, p_val4 ) );
+                  f7_jk = 1.0 - exp3jk;
+                  Cf7jk = p_val3 * p_val4 * pow( BOA_jk, p_val4 - 1.0 ) * exp3jk;
+		
+                  expval7 = exp( -p_val7 * workspace->Delta_boc[j] );
+                  trm8 = 1.0 + expval6 + expval7;
+                  f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
+                  Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
+                    ( p_val6 * expval6 * trm8 -
+                      (2.0 + expval6) * ( p_val6*expval6 - p_val7*expval7 ) );
+		
+                  theta_0 = 180.0 - theta_00 * (1.0 -
+                                                exp(-p_val10 * (2.0 - SBO2)));
+                  theta_0 = DEG2RAD( theta_0 );
+		
+                  expval2theta  = exp( -p_val2 * SQR(theta_0 - theta) );
+                  if (p_val1 >= 0)
+                    expval12theta = p_val1 * (1.0 - expval2theta);
+                  else // To avoid linear Me-H-Me angles (6/6/06)
+                    expval12theta = p_val1 * -expval2theta;
+		
+                  CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
+                  CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
+                  CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
+                  CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj *
+                    expval2theta * (theta_0 - theta);
+		
+                  Ctheta_0 = p_val10 * DEG2RAD(theta_00) *
+                    exp( -p_val10 * (2.0 - SBO2) );
+		
+                  CEval5 = -CEval4 * Ctheta_0 * CSBO2;
+                  CEval6 = CEval5 * dSBO1;
+                  CEval7 = CEval5 * dSBO2;
+                  CEval8 = -CEval4 / sin_theta;
+
+                  total_Eang += e_ang =
+                    f7_ij * f7_jk * f8_Dj * expval12theta;
+                  /* END ANGLE ENERGY*/
+		
+		
+                  /* PENALTY ENERGY */
+                  p_pen1 = thbp->p_pen1;
+                  p_pen2 = system->reax_param.gp.l[19];
+                  p_pen3 = system->reax_param.gp.l[20];
+                  p_pen4 = system->reax_param.gp.l[21];
+
+                  exp_pen2ij = exp( -p_pen2 * SQR( BOA_ij - 2.0 ) );
+                  exp_pen2jk = exp( -p_pen2 * SQR( BOA_jk - 2.0 ) );
+                  exp_pen3 = exp( -p_pen3 * workspace->Delta[j] );
+                  exp_pen4 = exp(  p_pen4 * workspace->Delta[j] );
+                  trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
+                  f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
+                  Cf9j = ( -p_pen3 * exp_pen3 * trm_pen34 -
+                           (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 +
+                                                p_pen4 * exp_pen4 ) ) /
+                    SQR( trm_pen34 );
+
+                  total_Epen += e_pen =
+                    p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
+		
+                  CEpen1 = e_pen * Cf9j / f9_Dj;
+                  temp   = -2.0 * p_pen2 * e_pen;
+                  CEpen2 = temp * (BOA_ij - 2.0);
+                  CEpen3 = temp * (BOA_jk - 2.0);
+                  /* END PENALTY ENERGY */
+		
+		
+                  /* COALITION ENERGY */
+                  p_coa1 = thbp->p_coa1;
+                  p_coa2 = system->reax_param.gp.l[2];
+                  p_coa3 = system->reax_param.gp.l[38];
+                  p_coa4 = system->reax_param.gp.l[30];
+		
+                  exp_coa2 = exp( p_coa2 * workspace->Delta_val[j] );
+                  total_Ecoa += e_coa =
+                    p_coa1 / (1. + exp_coa2) *
+                    exp( -p_coa3 * SQR(workspace->total_bond_order[i]-BOA_ij) ) *
+                    exp( -p_coa3 * SQR(workspace->total_bond_order[k]-BOA_jk) ) *
+                    exp( -p_coa4 * SQR(BOA_ij - 1.5) ) *
+                    exp( -p_coa4 * SQR(BOA_jk - 1.5) );
+		
+                  CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
+                  CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
+                  CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1 + exp_coa2);
+                  CEcoa4 = -2 * p_coa3 *
+                    (workspace->total_bond_order[i]-BOA_ij) * e_coa;
+                  CEcoa5 = -2 * p_coa3 *
+                    (workspace->total_bond_order[k]-BOA_jk) * e_coa;
+                  /* END COALITION ENERGY */
+		
+		
+                  /* FORCES */
+                  bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
+                  bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
+                  workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3);
+                  workspace->CdDeltaReduction[reductionOffset+i] += CEcoa4;
+                  workspace->CdDeltaReduction[reductionOffset+k] += CEcoa5;
+		
+                  for (t = start_j; t < end_j; ++t) {
+                    pbond_jt = &( bonds->select.bond_list[t] );
+                    bo_jt = &(pbond_jt->bo_data);
+                    temp_bo_jt = bo_jt->BO;
+                    temp = CUBE( temp_bo_jt );
+                    pBOjt7 = temp * temp * temp_bo_jt;
+
+                    bo_jt->Cdbo += (CEval6 * pBOjt7);
+                    bo_jt->Cdbopi += CEval5;
+                    bo_jt->Cdbopi2 += CEval5;
+                  }
+		
+                  if( control->virial == 0 ) {
+                    rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
+                    rvec_ScaledAdd( workspace->forceReduction[reductionOffset+i],
+                                    CEval8, p_ijk->dcos_di );
+                    rvec_ScaledAdd( workspace->forceReduction[reductionOffset+k],
+                                    CEval8, p_ijk->dcos_dk );
+                  }
+                  else {
+                    /* terms not related to bond order derivatives are
+                       added directly into forces and pressure vector/tensor */
+                    rvec_Scale( force, CEval8, p_ijk->dcos_di );
+                    rvec_Add( workspace->forceReduction[reductionOffset+i], force );
+		
+                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                    rvec_Add( workspace->my_ext_pressReduction[tid], ext_press );
+		
+                    rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
+		
+                    rvec_Scale( force, CEval8, p_ijk->dcos_dk );
+                    rvec_Add( workspace->forceReduction[reductionOffset+k], force );
+
+                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                    rvec_Add( workspace->my_ext_pressReduction[tid], ext_press );
+                  }
+		
+                  /* tally into per-atom virials */
+                  if( system->pair_ptr->vflag_atom || system->pair_ptr->evflag) {
+		
+                    /* Acquire vectors */
+                    rvec_ScaledSum( delij, 1., system->my_atoms[i].x,
+                                    -1., system->my_atoms[j].x );
+                    rvec_ScaledSum( delkj, 1., system->my_atoms[k].x,
+                                    -1., system->my_atoms[j].x );
+		
+                    rvec_Scale( fi_tmp, -CEval8, p_ijk->dcos_di );
+                    rvec_Scale( fj_tmp, -CEval8, p_ijk->dcos_dj );
+                    rvec_Scale( fk_tmp, -CEval8, p_ijk->dcos_dk );
+		
+                    eng_tmp = e_ang + e_pen + e_coa;
+		
+                    if( system->pair_ptr->evflag)
+                      pair_reax_ptr->ev_tally_thr_proxy(system->pair_ptr, j, j, system->N, 1,
+                                                        eng_tmp, 0.0, 0.0, 0.0, 0.0, 0.0, thr);
+                    if( system->pair_ptr->vflag_atom)
+                      // NEED TO MAKE AN OMP VERSION OF THIS CALL!
+                      system->pair_ptr->v_tally3( i, j, k, fi_tmp, fk_tmp, delij, delkj);
+                  }
+
+                } // if(p_val1>0.001)
+              } // for(cnt)
+            } // if(j<n && BOA_jk>0)
+          } // for(pk)
+        } // if(BOA_ij>0)
+
+        Set_End_Index(pi, my_offset, thb_intrs );
+      } // for(pi)
+    } // for(j)
+
+    pair_reax_ptr->reduce_thr_proxy(system->pair_ptr, system->pair_ptr->eflag_either,
+                                    system->pair_ptr->vflag_either, thr);
+  } // end omp parallel
+
+  data->my_en.e_ang = total_Eang;
+  data->my_en.e_pen = total_Epen;
+  data->my_en.e_coa = total_Ecoa;
+
+  if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) {
+    workspace->realloc.num_3body = num_thb_intrs * TWICE;
+    if( num_thb_intrs > thb_intrs->num_intrs ) {
+      fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d",
+               data->step, num_thb_intrs, thb_intrs->num_intrs );
+      MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
+    }
+  }
+
+#ifdef OMP_TIMING
+  endTimeBase = MPI_Wtime();
+  ompTimingData[COMPUTEVALENCEANGLESBOINDEX] += (endTimeBase-startTimeBase);
+#endif
+}
--- a/src/USER-OMP/reaxc_valence_angles_omp.h
+++ b/src/USER-OMP/reaxc_valence_angles_omp.h
@ -0,0 +1,37 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, hmaktulga@lbl.gov
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  Please cite the related publication:
+  H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
+  "Parallel Reactive Molecular Dynamics: Numerical Methods and
+  Algorithmic Techniques", Parallel Computing, in press.
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __VALENCE_ANGLES_OMP_H_
+#define __VALENCE_ANGLES_OMP_H_
+
+#include "reaxc_types.h"
+
+void Valence_AnglesOMP( reax_system*, control_params*, simulation_data*,
+                     storage*, reax_list**, output_controls* );
+
+void Calculate_dCos_ThetaOMP( rvec, double, rvec, double, rvec*, rvec*, rvec* );
+
+#endif
--- a/src/USER-REAXC/compute_spec_atom.cpp
+++ b/src/USER-REAXC/compute_spec_atom.cpp
@ -43,9 +43,7 @@ ComputeSpecAtom::ComputeSpecAtom(LAMMPS *lmp, int narg, char **arg) :
  else size_peratom_cols = nvalues;

  // Initiate reaxc
-  reaxc = (PairReaxC *) force->pair_match("reax/c",1);
-  if (reaxc == NULL)
-    reaxc = (PairReaxC *) force->pair_match("reax/c/kk",1);
+  reaxc = (PairReaxC *) force->pair_match("reax/c",0);

  pack_choice = new FnPtrPack[nvalues];

--- a/src/USER-REAXC/fix_qeq_reax.cpp
+++ b/src/USER-REAXC/fix_qeq_reax.cpp
@ -64,11 +64,11 @@ static const char cite_fix_qeq_reax[] =
 /* ---------------------------------------------------------------------- */

 FixQEqReax::FixQEqReax(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg)
+  Fix(lmp, narg, arg), pertype_option(NULL)
 {
  if (lmp->citeme) lmp->citeme->add(cite_fix_qeq_reax);

-  if (narg != 8) error->all(FLERR,"Illegal fix qeq/reax command");
+  if (narg<8 || narg>9) error->all(FLERR,"Illegal fix qeq/reax command");

  nevery = force->inumeric(FLERR,arg[3]);
  if (nevery <= 0) error->all(FLERR,"Illegal fix qeq/reax command");
@ -76,8 +76,17 @@ FixQEqReax::FixQEqReax(LAMMPS *lmp, int narg, char **arg) :
  swa = force->numeric(FLERR,arg[4]);
  swb = force->numeric(FLERR,arg[5]);
  tolerance = force->numeric(FLERR,arg[6]);
-  pertype_parameters(arg[7]);
+  int len = strlen(arg[7]) + 1;
+  pertype_option = new char[len];
+  strcpy(pertype_option,arg[7]);

+  // dual CG support only available for USER-OMP variant
+  // check for compatibility is in Fix::post_constructor()
+  dual_enabled = 0;
+  if (narg == 9) {
+    if (strcmp(arg[8],"dual") == 0) dual_enabled = 1;
+    else error->all(FLERR,"Illegal fix qeq/reax command");
+  }
  shld = NULL;

  n = n_cap = 0;
@ -106,31 +115,37 @@ FixQEqReax::FixQEqReax(LAMMPS *lmp, int narg, char **arg) :
  H.jlist = NULL;
  H.val = NULL;

-  comm_forward = comm_reverse = 1;
+  // dual CG support
+  // Update comm sizes for this fix
+  if (dual_enabled) comm_forward = comm_reverse = 2;
+  else comm_forward = comm_reverse = 1;

  // perform initial allocation of atom-based arrays
  // register with Atom class

-  s_hist = t_hist = NULL;
-  grow_arrays(atom->nmax);
-  atom->add_callback(0);
-  for( int i = 0; i < atom->nmax; i++ )
-    for (int j = 0; j < nprev; ++j )
-      s_hist[i][j] = t_hist[i][j] = 0;
-
  reaxc = NULL;
-  reaxc = (PairReaxC *) force->pair_match("reax/c",1);
+  reaxc = (PairReaxC *) force->pair_match("reax/c",0);

+  if (reaxc) {
+    s_hist = t_hist = NULL;
+    grow_arrays(atom->nmax);
+    atom->add_callback(0);
+    for (int i = 0; i < atom->nmax; i++)
+      for (int j = 0; j < nprev; ++j)
+        s_hist[i][j] = t_hist[i][j] = 0;
+  }
 }

 /* ---------------------------------------------------------------------- */

 FixQEqReax::~FixQEqReax()
 {
-  // unregister callbacks to this fix from Atom class
-
  if (copymode) return;

+  delete[] pertype_option;
+
+  // unregister callbacks to this fix from Atom class
+
  atom->delete_callback(id,0);

  memory->destroy(s_hist);
@ -150,6 +165,15 @@ FixQEqReax::~FixQEqReax()

 /* ---------------------------------------------------------------------- */

+void FixQEqReax::post_constructor()
+{
+  pertype_parameters(pertype_option);
+  if (dual_enabled)
+    error->all(FLERR,"Dual keyword only supported with fix qeq/reax/omp");
+}
+
+/* ---------------------------------------------------------------------- */
+
 int FixQEqReax::setmask()
 {
  int mask = 0;
@ -165,11 +189,9 @@ void FixQEqReax::pertype_parameters(char *arg)
 {
  if (strcmp(arg,"reax/c") == 0) {
    reaxflag = 1;
-    Pair *pair = force->pair_match("reax/c",1);
-    if (pair == NULL)
-      pair = force->pair_match("reax/c/kk",1);
-
+    Pair *pair = force->pair_match("reax/c",0);
    if (pair == NULL) error->all(FLERR,"No pair reax/c for fix qeq/reax");
+
    int tmp;
    chi = (double *) pair->extract("chi",tmp);
    eta = (double *) pair->extract("eta",tmp);
@ -227,10 +249,14 @@ void FixQEqReax::allocate_storage()
  memory->create(b_prc,nmax,"qeq:b_prc");
  memory->create(b_prm,nmax,"qeq:b_prm");

-  memory->create(p,nmax,"qeq:p");
-  memory->create(q,nmax,"qeq:q");
-  memory->create(r,nmax,"qeq:r");
-  memory->create(d,nmax,"qeq:d");
+  // dual CG support
+  int size = nmax;
+  if (dual_enabled) size*= 2;
+
+  memory->create(p,size,"qeq:p");
+  memory->create(q,size,"qeq:q");
+  memory->create(r,size,"qeq:r");
+  memory->create(d,size,"qeq:d");
 }

 /* ---------------------------------------------------------------------- */
@ -271,7 +297,7 @@ void FixQEqReax::allocate_matrix()
  int mincap;
  double safezone;

-  if( reaxflag ) {
+  if (reaxflag) {
    mincap = reaxc->system->mincap;
    safezone = reaxc->system->safezone;
  } else {
@ -280,7 +306,7 @@ void FixQEqReax::allocate_matrix()
  }

  n = atom->nlocal;
-  n_cap = MAX( (int)(n * safezone), mincap );
+  n_cap = MAX( (int)(n * safezone), mincap);

  // determine the total space for the H matrix

@ -295,11 +321,11 @@ void FixQEqReax::allocate_matrix()
  }

  m = 0;
-  for( ii = 0; ii < inum; ii++ ) {
+  for (ii = 0; ii < inum; ii++) {
    i = ilist[ii];
    m += numneigh[i];
  }
-  m_cap = MAX( (int)(m * safezone), mincap * MIN_NBRS );
+  m_cap = MAX( (int)(m * safezone), mincap * MIN_NBRS);

  H.n = n_cap;
  H.m = m_cap;
@ -331,18 +357,12 @@ void FixQEqReax::reallocate_matrix()

 void FixQEqReax::init()
 {
-  if (!atom->q_flag) error->all(FLERR,"Fix qeq/reax requires atom attribute q");
+  if (!atom->q_flag)
+    error->all(FLERR,"Fix qeq/reax requires atom attribute q");

  ngroup = group->count(igroup);
  if (ngroup == 0) error->all(FLERR,"Fix qeq/reax group has no atoms");

-  /*
-  if (reaxc)
-    if (ngroup != reaxc->ngroup)
-      error->all(FLERR,"Fix qeq/reax group and pair reax/c have "
-		       "different numbers of atoms");
-  */
-
  // need a half neighbor list w/ Newton off and ghost neighbors
  // built whenever re-neighboring occurs

@ -377,9 +397,9 @@ void FixQEqReax::init_shielding()
  if (shld == NULL)
    memory->create(shld,ntypes+1,ntypes+1,"qeq:shielding");

-  for( i = 1; i <= ntypes; ++i )
-    for( j = 1; j <= ntypes; ++j )
-      shld[i][j] = pow( gamma[i] * gamma[j], -1.5 );
+  for (i = 1; i <= ntypes; ++i)
+    for (j = 1; j <= ntypes; ++j)
+      shld[i][j] = pow( gamma[i] * gamma[j], -1.5);
 }

 /* ---------------------------------------------------------------------- */
@ -395,21 +415,21 @@ void FixQEqReax::init_taper()
  else if (swb < 5 && comm->me == 0)
    error->warning(FLERR,"Fix qeq/reax has very low Taper radius cutoff");

-  d7 = pow( swb - swa, 7 );
-  swa2 = SQR( swa );
-  swa3 = CUBE( swa );
-  swb2 = SQR( swb );
-  swb3 = CUBE( swb );
+  d7 = pow( swb - swa, 7);
+  swa2 = SQR( swa);
+  swa3 = CUBE( swa);
+  swb2 = SQR( swb);
+  swb3 = CUBE( swb);

  Tap[7] =  20.0 / d7;
  Tap[6] = -70.0 * (swa + swb) / d7;
  Tap[5] =  84.0 * (swa2 + 3.0*swa*swb + swb2) / d7;
-  Tap[4] = -35.0 * (swa3 + 9.0*swa2*swb + 9.0*swa*swb2 + swb3 ) / d7;
-  Tap[3] = 140.0 * (swa3*swb + 3.0*swa2*swb2 + swa*swb3 ) / d7;
+  Tap[4] = -35.0 * (swa3 + 9.0*swa2*swb + 9.0*swa*swb2 + swb3) / d7;
+  Tap[3] = 140.0 * (swa3*swb + 3.0*swa2*swb2 + swa*swb3) / d7;
  Tap[2] =-210.0 * (swa3*swb2 + swa2*swb3) / d7;
  Tap[1] = 140.0 * swa3 * swb3 / d7;
  Tap[0] = (-35.0*swa3*swb2*swb2 + 21.0*swa2*swb3*swb2 +
-            7.0*swa*swb3*swb3 + swb3*swb3*swb ) / d7;
+            7.0*swa*swb3*swb3 + swb3*swb3*swb) / d7;
 }

 /* ---------------------------------------------------------------------- */
@ -437,6 +457,13 @@ void FixQEqReax::setup_pre_force_respa(int vflag, int ilevel)

 /* ---------------------------------------------------------------------- */

+void FixQEqReax::min_setup_pre_force(int vflag)
+{
+  setup_pre_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
 void FixQEqReax::init_storage()
 {
  int NN;
@ -446,7 +473,7 @@ void FixQEqReax::init_storage()
  else
    NN = list->inum + list->gnum;

-  for( int i = 0; i < NN; i++ ) {
+  for (int i = 0; i < NN; i++) {
    Hdia_inv[i] = 1. / eta[atom->type[i]];
    b_s[i] = -chi[atom->type[i]];
    b_t[i] = -1.0;
@ -463,7 +490,7 @@ void FixQEqReax::pre_force(int vflag)
  double t_start, t_end;

  if (update->ntimestep % nevery) return;
-  if( comm->me == 0 ) t_start = MPI_Wtime();
+  if (comm->me == 0) t_start = MPI_Wtime();

  n = atom->nlocal;
  N = atom->nlocal + atom->nghost;
@ -471,16 +498,19 @@ void FixQEqReax::pre_force(int vflag)
  // grow arrays if necessary
  // need to be atom->nmax in length

-  if( atom->nmax > nmax ) reallocate_storage();
-  if( n > n_cap*DANGER_ZONE || m_fill > m_cap*DANGER_ZONE )
+  if (atom->nmax > nmax) reallocate_storage();
+  if (n > n_cap*DANGER_ZONE || m_fill > m_cap*DANGER_ZONE)
    reallocate_matrix();

  init_matvec();
-  matvecs = CG(b_s, s);    	// CG on s - parallel
-  matvecs += CG(b_t, t); 	// CG on t - parallel
+
+  matvecs_s = CG(b_s, s);    	// CG on s - parallel
+  matvecs_t = CG(b_t, t);       // CG on t - parallel
+  matvecs = matvecs_s + matvecs_t;
+
  calculate_Q();

-  if( comm->me == 0 ) {
+  if (comm->me == 0) {
    t_end = MPI_Wtime();
    qeq_time = t_end - t_start;
  }
@ -518,7 +548,7 @@ void FixQEqReax::init_matvec()
    ilist = list->ilist;
  }

-  for( ii = 0; ii < nn; ++ii ) {
+  for (ii = 0; ii < nn; ++ii) {
    i = ilist[ii];
    if (atom->mask[i] & groupbit) {

@ -533,7 +563,7 @@ void FixQEqReax::init_matvec()

      /* quadratic extrapolation for s & t from previous solutions */
      //s[i] = s_hist[i][2] + 3 * ( s_hist[i][0] - s_hist[i][1] );
-      t[i] = t_hist[i][2] + 3 * ( t_hist[i][0] - t_hist[i][1] );
+      t[i] = t_hist[i][2] + 3 * ( t_hist[i][0] - t_hist[i][1]);

      /* cubic extrapolation for s & t from previous solutions */
      s[i] = 4*(s_hist[i][0]+s_hist[i][2])-(6*s_hist[i][1]+s_hist[i][3]);
@ -553,12 +583,12 @@ void FixQEqReax::compute_H()
 {
  int inum, jnum, *ilist, *jlist, *numneigh, **firstneigh;
  int i, j, ii, jj, flag;
-  double **x, SMALL = 0.0001;
  double dx, dy, dz, r_sqr;
+  const double SMALL = 0.0001;

  int *type = atom->type;
  tagint *tag = atom->tag;
-  x = atom->x;
+  double **x = atom->x;
  int *mask = atom->mask;

  if (reaxc) {
@ -576,14 +606,14 @@ void FixQEqReax::compute_H()
  // fill in the H matrix
  m_fill = 0;
  r_sqr = 0;
-  for( ii = 0; ii < inum; ii++ ) {
+  for (ii = 0; ii < inum; ii++) {
    i = ilist[ii];
    if (mask[i] & groupbit) {
      jlist = firstneigh[i];
      jnum = numneigh[i];
      H.firstnbr[i] = m_fill;

-      for( jj = 0; jj < jnum; jj++ ) {
+      for (jj = 0; jj < jnum; jj++) {
        j = jlist[jj];

        dx = x[j][0] - x[i][0];
@ -605,9 +635,9 @@ void FixQEqReax::compute_H()
 	  }
 	}

-        if( flag ) {
+        if (flag) {
          H.jlist[m_fill] = j;
-          H.val[m_fill] = calculate_H( sqrt(r_sqr), shld[type[i]][type[j]] );
+          H.val[m_fill] = calculate_H( sqrt(r_sqr), shld[type[i]][type[j]]);
          m_fill++;
        }
      }
@ -618,7 +648,7 @@ void FixQEqReax::compute_H()
  if (m_fill >= H.m) {
    char str[128];
    sprintf(str,"H matrix size has been exceeded: m_fill=%d H.m=%d\n",
-             m_fill, H.m );
+             m_fill, H.m);
    error->warning(FLERR,str);
    error->all(FLERR,"Fix qeq/reax has insufficient QEq matrix size");
  }
@ -626,7 +656,7 @@ void FixQEqReax::compute_H()

 /* ---------------------------------------------------------------------- */

-double FixQEqReax::calculate_H( double r, double gamma )
+double FixQEqReax::calculate_H( double r, double gamma)
 {
  double Taper, denom;

@ -646,7 +676,7 @@ double FixQEqReax::calculate_H( double r, double gamma )

 /* ---------------------------------------------------------------------- */

-int FixQEqReax::CG( double *b, double *x )
+int FixQEqReax::CG( double *b, double *x)
 {
  int  i, j, imax;
  double tmp, alpha, beta, b_norm;
@ -665,21 +695,21 @@ int FixQEqReax::CG( double *b, double *x )
  imax = 200;

  pack_flag = 1;
-  sparse_matvec( &H, x, q );
-  comm->reverse_comm_fix( this ); //Coll_Vector( q );
+  sparse_matvec( &H, x, q);
+  comm->reverse_comm_fix(this); //Coll_Vector( q );

-  vector_sum( r , 1.,  b, -1., q, nn );
+  vector_sum( r , 1.,  b, -1., q, nn);

-  for( jj = 0; jj < nn; ++jj ) {
+  for (jj = 0; jj < nn; ++jj) {
    j = ilist[jj];
    if (atom->mask[j] & groupbit)
      d[j] = r[j] * Hdia_inv[j]; //pre-condition
  }

-  b_norm = parallel_norm( b, nn );
+  b_norm = parallel_norm( b, nn);
  sig_new = parallel_dot( r, d, nn);

-  for( i = 1; i < imax && sqrt(sig_new) / b_norm > tolerance; ++i ) {
+  for (i = 1; i < imax && sqrt(sig_new) / b_norm > tolerance; ++i) {
    comm->forward_comm_fix(this); //Dist_vector( d );
    sparse_matvec( &H, d, q );
    comm->reverse_comm_fix(this); //Coll_vector( q );
@ -691,7 +721,7 @@ int FixQEqReax::CG( double *b, double *x )
    vector_add( r, -alpha, q, nn );

    // pre-conditioning
-    for( jj = 0; jj < nn; ++jj ) {
+    for (jj = 0; jj < nn; ++jj) {
      j = ilist[jj];
      if (atom->mask[j] & groupbit)
        p[j] = r[j] * Hdia_inv[j];
@ -702,7 +732,6 @@ int FixQEqReax::CG( double *b, double *x )

    beta = sig_new / sig_old;
    vector_sum( d, 1., p, beta, d, nn );
-
  }

  if (i >= imax && comm->me == 0) {
@ -718,7 +747,7 @@ int FixQEqReax::CG( double *b, double *x )

 /* ---------------------------------------------------------------------- */

-void FixQEqReax::sparse_matvec( sparse_matrix *A, double *x, double *b )
+void FixQEqReax::sparse_matvec( sparse_matrix *A, double *x, double *b)
 {
  int i, j, itr_j;
  int nn, NN, ii;
@ -734,22 +763,22 @@ void FixQEqReax::sparse_matvec( sparse_matrix *A, double *x, double *b )
    ilist = list->ilist;
  }

-  for( ii = 0; ii < nn; ++ii ) {
+  for (ii = 0; ii < nn; ++ii) {
    i = ilist[ii];
    if (atom->mask[i] & groupbit)
      b[i] = eta[ atom->type[i] ] * x[i];
  }

-  for( ii = nn; ii < NN; ++ii ) {
+  for (ii = nn; ii < NN; ++ii) {
    i = ilist[ii];
    if (atom->mask[i] & groupbit)
      b[i] = 0;
  }

-  for( ii = 0; ii < nn; ++ii ) {
+  for (ii = 0; ii < nn; ++ii) {
    i = ilist[ii];
    if (atom->mask[i] & groupbit) {
-      for( itr_j=A->firstnbr[i]; itr_j<A->firstnbr[i]+A->numnbrs[i]; itr_j++) {
+      for (itr_j=A->firstnbr[i]; itr_j<A->firstnbr[i]+A->numnbrs[i]; itr_j++) {
        j = A->jlist[itr_j];
        b[i] += A->val[itr_j] * x[j];
        b[j] += A->val[itr_j] * x[i];
@ -778,17 +807,17 @@ void FixQEqReax::calculate_Q()
    ilist = list->ilist;
  }

-  s_sum = parallel_vector_acc( s, nn );
+  s_sum = parallel_vector_acc( s, nn);
  t_sum = parallel_vector_acc( t, nn);
  u = s_sum / t_sum;

-  for( ii = 0; ii < nn; ++ii ) {
+  for (ii = 0; ii < nn; ++ii) {
    i = ilist[ii];
    if (atom->mask[i] & groupbit) {
      q[i] = s[i] - u * t[i];

      /* backup s & t */
-      for( k = 4; k > 0; --k ) {
+      for (k = 4; k > 0; --k) {
        s_hist[i][k] = s_hist[i][k-1];
        t_hist[i][k] = t_hist[i][k-1];
      }
@ -798,7 +827,7 @@ void FixQEqReax::calculate_Q()
  }

  pack_flag = 4;
-  comm->forward_comm_fix( this ); //Dist_vector( atom->q );
+  comm->forward_comm_fix(this); //Dist_vector( atom->q );
 }

 /* ---------------------------------------------------------------------- */
@ -808,15 +837,23 @@ int FixQEqReax::pack_forward_comm(int n, int *list, double *buf,
 {
  int m;

-  if( pack_flag == 1)
+  if (pack_flag == 1)
    for(m = 0; m < n; m++) buf[m] = d[list[m]];
-  else if( pack_flag == 2 )
+  else if (pack_flag == 2)
    for(m = 0; m < n; m++) buf[m] = s[list[m]];
-  else if( pack_flag == 3 )
+  else if (pack_flag == 3)
    for(m = 0; m < n; m++) buf[m] = t[list[m]];
-  else if( pack_flag == 4 )
+  else if (pack_flag == 4)
    for(m = 0; m < n; m++) buf[m] = atom->q[list[m]];
-
+  else if (pack_flag == 5) {
+    m = 0;
+    for(int i = 0; i < n; i++) {
+      int j = 2 * list[i];
+      buf[m++] = d[j  ];
+      buf[m++] = d[j+1];
+    }
+    return m;
+  }
  return n;
 }

@ -826,14 +863,23 @@ void FixQEqReax::unpack_forward_comm(int n, int first, double *buf)
 {
  int i, m;

-  if( pack_flag == 1)
+  if (pack_flag == 1)
    for(m = 0, i = first; m < n; m++, i++) d[i] = buf[m];
-  else if( pack_flag == 2)
+  else if (pack_flag == 2)
    for(m = 0, i = first; m < n; m++, i++) s[i] = buf[m];
-  else if( pack_flag == 3)
+  else if (pack_flag == 3)
    for(m = 0, i = first; m < n; m++, i++) t[i] = buf[m];
-  else if( pack_flag == 4)
+  else if (pack_flag == 4)
    for(m = 0, i = first; m < n; m++, i++) atom->q[i] = buf[m];
+  else if (pack_flag == 5) {
+    int last = first + n;
+    m = 0;
+    for(i = first; i < last; i++) {
+      int j = 2 * i;
+      d[j  ] = buf[m++];
+      d[j+1] = buf[m++];
+    }
+  }
 }

 /* ---------------------------------------------------------------------- */
@ -841,15 +887,35 @@ void FixQEqReax::unpack_forward_comm(int n, int first, double *buf)
 int FixQEqReax::pack_reverse_comm(int n, int first, double *buf)
 {
  int i, m;
-  for(m = 0, i = first; m < n; m++, i++) buf[m] = q[i];
-  return n;
+  if (pack_flag == 5) {
+    m = 0;
+    int last = first + n;
+    for(i = first; i < last; i++) {
+      int indxI = 2 * i;
+      buf[m++] = q[indxI  ];
+      buf[m++] = q[indxI+1];
+    }
+    return m;
+  } else {
+    for (m = 0, i = first; m < n; m++, i++) buf[m] = q[i];
+    return n;
+  }
 }

 /* ---------------------------------------------------------------------- */

 void FixQEqReax::unpack_reverse_comm(int n, int *list, double *buf)
 {
-  for(int m = 0; m < n; m++) q[list[m]] += buf[m];
+  if (pack_flag == 5) {
+    int m = 0;
+    for(int i = 0; i < n; i++) {
+      int indxI = 2 * list[i];
+      q[indxI  ] += buf[m++];
+      q[indxI+1] += buf[m++];
+    }
+  } else {
+    for (int m = 0; m < n; m++) q[list[m]] += buf[m];
+  }
 }

 /* ----------------------------------------------------------------------
@ -866,6 +932,9 @@ double FixQEqReax::memory_usage()
  bytes += m_cap * sizeof(int);
  bytes += m_cap * sizeof(double);

+  if (dual_enabled)
+    bytes += atom->nmax*4 * sizeof(double); // double size for q, d, r, and p
+
  return bytes;
 }

@ -915,7 +984,7 @@ int FixQEqReax::unpack_exchange(int nlocal, double *buf)

 /* ---------------------------------------------------------------------- */

-double FixQEqReax::parallel_norm( double *v, int n )
+double FixQEqReax::parallel_norm( double *v, int n)
 {
  int  i;
  double my_sum, norm_sqr;
@ -930,15 +999,15 @@ double FixQEqReax::parallel_norm( double *v, int n )

  my_sum = 0.0;
  norm_sqr = 0.0;
-  for( ii = 0; ii < n; ++ii ) {
+  for (ii = 0; ii < n; ++ii) {
    i = ilist[ii];
    if (atom->mask[i] & groupbit)
-      my_sum += SQR( v[i] );
+      my_sum += SQR( v[i]);
  }

-  MPI_Allreduce( &my_sum, &norm_sqr, 1, MPI_DOUBLE, MPI_SUM, world );
+  MPI_Allreduce( &my_sum, &norm_sqr, 1, MPI_DOUBLE, MPI_SUM, world);

-  return sqrt( norm_sqr );
+  return sqrt( norm_sqr);
 }

 /* ---------------------------------------------------------------------- */
@ -958,20 +1027,20 @@ double FixQEqReax::parallel_dot( double *v1, double *v2, int n)

  my_dot = 0.0;
  res = 0.0;
-  for( ii = 0; ii < n; ++ii ) {
+  for (ii = 0; ii < n; ++ii) {
    i = ilist[ii];
    if (atom->mask[i] & groupbit)
      my_dot += v1[i] * v2[i];
  }

-  MPI_Allreduce( &my_dot, &res, 1, MPI_DOUBLE, MPI_SUM, world );
+  MPI_Allreduce( &my_dot, &res, 1, MPI_DOUBLE, MPI_SUM, world);

  return res;
 }

 /* ---------------------------------------------------------------------- */

-double FixQEqReax::parallel_vector_acc( double *v, int n )
+double FixQEqReax::parallel_vector_acc( double *v, int n)
 {
  int  i;
  double my_acc, res;
@ -986,13 +1055,13 @@ double FixQEqReax::parallel_vector_acc( double *v, int n )

  my_acc = 0.0;
  res = 0.0;
-  for( ii = 0; ii < n; ++ii ) {
+  for (ii = 0; ii < n; ++ii) {
    i = ilist[ii];
    if (atom->mask[i] & groupbit)
      my_acc += v[i];
  }

-  MPI_Allreduce( &my_acc, &res, 1, MPI_DOUBLE, MPI_SUM, world );
+  MPI_Allreduce( &my_acc, &res, 1, MPI_DOUBLE, MPI_SUM, world);

  return res;
 }
@ -1000,7 +1069,7 @@ double FixQEqReax::parallel_vector_acc( double *v, int n )
 /* ---------------------------------------------------------------------- */

 void FixQEqReax::vector_sum( double* dest, double c, double* v,
-                                double d, double* y, int k )
+                                double d, double* y, int k)
 {
  int kk;
  int *ilist;
@ -1010,7 +1079,7 @@ void FixQEqReax::vector_sum( double* dest, double c, double* v,
  else
    ilist = list->ilist;

-  for( --k; k>=0; --k ) {
+  for (--k; k>=0; --k) {
    kk = ilist[k];
    if (atom->mask[kk] & groupbit)
      dest[kk] = c * v[kk] + d * y[kk];
@ -1019,7 +1088,7 @@ void FixQEqReax::vector_sum( double* dest, double c, double* v,

 /* ---------------------------------------------------------------------- */

-void FixQEqReax::vector_add( double* dest, double c, double* v, int k )
+void FixQEqReax::vector_add( double* dest, double c, double* v, int k)
 {
  int kk;
  int *ilist;
@ -1029,10 +1098,9 @@ void FixQEqReax::vector_add( double* dest, double c, double* v, int k )
  else
    ilist = list->ilist;

-  for( --k; k>=0; --k ) {
+  for (--k; k>=0; --k) {
    kk = ilist[k];
    if (atom->mask[kk] & groupbit)
      dest[kk] += c * v[kk];
  }
-
 }
--- a/src/USER-REAXC/fix_qeq_reax.h
+++ b/src/USER-REAXC/fix_qeq_reax.h
@ -39,15 +39,17 @@ class FixQEqReax : public Fix {
  FixQEqReax(class LAMMPS *, int, char **);
  ~FixQEqReax();
  int setmask();
-  void init();
+  virtual void post_constructor();
+  virtual void init();
  void init_list(int,class NeighList *);
-  void init_storage();
+  virtual void init_storage();
  void setup_pre_force(int);
-  void pre_force(int);
+  virtual void pre_force(int);

  void setup_pre_force_respa(int, int);
  void pre_force_respa(int, int, int);

+  void min_setup_pre_force(int);
  void min_pre_force(int);

  int matvecs;
@ -99,25 +101,26 @@ class FixQEqReax : public Fix {
  //double **h;
  //double *hc, *hs;

-  void pertype_parameters(char*);
+  char *pertype_option;  // argument to determine how per-type info is obtained
+  virtual void pertype_parameters(char*);
  void init_shielding();
  void init_taper();
-  void allocate_storage();
-  void deallocate_storage();
+  virtual void allocate_storage();
+  virtual void deallocate_storage();
  void reallocate_storage();
-  void allocate_matrix();
+  virtual void allocate_matrix();
  void deallocate_matrix();
  void reallocate_matrix();

-  void init_matvec();
+  virtual void init_matvec();
  void init_H();
-  void compute_H();
+  virtual void compute_H();
  double calculate_H(double,double);
-  void calculate_Q();
+  virtual void calculate_Q();

-  int CG(double*,double*);
+  virtual int CG(double*,double*);
  //int GMRES(double*,double*);
-  void sparse_matvec(sparse_matrix*,double*,double*);
+  virtual void sparse_matvec(sparse_matrix*,double*,double*);

  int pack_forward_comm(int, int *, double *, int, int *);
  void unpack_forward_comm(int, int, double *);
@ -129,12 +132,16 @@ class FixQEqReax : public Fix {
  int pack_exchange(int, double *);
  int unpack_exchange(int, double *);

-  double parallel_norm( double*, int );
-  double parallel_dot( double*, double*, int );
-  double parallel_vector_acc( double*, int );
+  virtual double parallel_norm( double*, int );
+  virtual double parallel_dot( double*, double*, int );
+  virtual double parallel_vector_acc( double*, int );

-  void vector_sum(double*,double,double*,double,double*,int);
-  void vector_add(double*, double, double*,int);
+  virtual void vector_sum(double*,double,double*,double,double*,int);
+  virtual void vector_add(double*, double, double*,int);
+
+  // dual CG support
+  int dual_enabled;  // 0: Original, separate s & t optimization; 1: dual optimization
+  int matvecs_s, matvecs_t; // Iteration count for each system
 };

 }
--- a/src/USER-REAXC/fix_reaxc.h
+++ b/src/USER-REAXC/fix_reaxc.h
@ -36,6 +36,7 @@ namespace LAMMPS_NS {

 class FixReaxC : public Fix {
  friend class PairReaxC;
+  friend class PairReaxCOMP;

 public:
  FixReaxC(class LAMMPS *,int, char **);
--- a/src/USER-REAXC/fix_reaxc_bonds.cpp
+++ b/src/USER-REAXC/fix_reaxc_bonds.cpp
@ -121,12 +121,9 @@ void FixReaxCBonds::setup(int vflag)

 void FixReaxCBonds::init()
 {
-  reaxc = (PairReaxC *) force->pair_match("reax/c",1);
-  if (reaxc == NULL)
-    reaxc = (PairReaxC *) force->pair_match("reax/c/kk",1);
-
+  reaxc = (PairReaxC *) force->pair_match("reax/c",0);
  if (reaxc == NULL) error->all(FLERR,"Cannot use fix reax/c/bonds without "
-                  "pair_style reax/c");
+                                "pair_style reax/c, reax/c/kk, or reax/c/omp");

 }

--- a/src/USER-REAXC/fix_reaxc_species.cpp
+++ b/src/USER-REAXC/fix_reaxc_species.cpp
@ -13,7 +13,7 @@

 /* ----------------------------------------------------------------------
   Contributing authors: Ray Shan (Sandia, tnshan@sandia.gov)
-   			 Oleg Sergeev (VNIIA, sergeev@vniia.ru)
+                         Oleg Sergeev (VNIIA, sergeev@vniia.ru)
 ------------------------------------------------------------------------- */

 #include <stdlib.h>
@ -182,35 +182,35 @@ FixReaxCSpecies::FixReaxCSpecies(LAMMPS *lmp, int narg, char **arg) :
      jtype = atoi(arg[iarg+2]);
      bo_cut = atof(arg[iarg+3]);
      if (itype > ntypes || jtype > ntypes)
-      	error->all(FLERR,"Illegal fix reax/c/species command");
+        error->all(FLERR,"Illegal fix reax/c/species command");
      if (itype <= 0 || jtype <= 0)
-      	error->all(FLERR,"Illegal fix reax/c/species command");
+        error->all(FLERR,"Illegal fix reax/c/species command");
      if (bo_cut > 1.0 || bo_cut < 0.0)
-      	error->all(FLERR,"Illegal fix reax/c/species command");
+        error->all(FLERR,"Illegal fix reax/c/species command");

      BOCut[itype][jtype] = bo_cut;
      BOCut[jtype][itype] = bo_cut;
      iarg += 4;

-    // modify element type names
+      // modify element type names
    } else if (strcmp(arg[iarg],"element") == 0) {
      if (iarg+ntypes+1 > narg) error->all(FLERR,"Illegal fix reax/c/species command");

      eletype = (char**) malloc(ntypes*sizeof(char*));
      for (int i = 0; i < ntypes; i ++) {
        eletype[i] = (char*) malloc(2*sizeof(char));
-      	strcpy(eletype[i],arg[iarg+1+i]);
+        strcpy(eletype[i],arg[iarg+1+i]);
      }
      eleflag = 1;
      iarg += ntypes + 1;

-    // position of molecules
+      // position of molecules
    } else if (strcmp(arg[iarg],"position") == 0) {
      if (iarg+3 > narg) error->all(FLERR,"Illegal fix reax/c/species command");
      posflag = 1;
      posfreq = atoi(arg[iarg+1]);
      if (posfreq < nfreq || (posfreq%nfreq != 0))
-	error->all(FLERR,"Illegal fix reax/c/species command");
+        error->all(FLERR,"Illegal fix reax/c/species command");

      filepos = new char[255];
      strcpy(filepos,arg[iarg+2]);
@ -221,8 +221,8 @@ FixReaxCSpecies::FixReaxCSpecies(LAMMPS *lmp, int narg, char **arg) :
          pos = fopen(filepos, "w");
          if (pos == NULL) error->one(FLERR,"Cannot open fix reax/c/species position file");
        }
-      	singlepos_opened = 1;
-      	multipos = 0;
+        singlepos_opened = 1;
+        multipos = 0;
      }
      iarg += 3;
    } else error->all(FLERR,"Illegal fix reax/c/species command");
@ -298,12 +298,9 @@ void FixReaxCSpecies::init()
  if (atom->tag_enable == 0)
    error->all(FLERR,"Cannot use fix reax/c/species unless atoms have IDs");

-  reaxc = (PairReaxC *) force->pair_match("reax/c",1);
-  if (reaxc == NULL)
-    reaxc = (PairReaxC *) force->pair_match("reax/c/kk",1);
-
+  reaxc = (PairReaxC *) force->pair_match("reax/c",0);
  if (reaxc == NULL) error->all(FLERR,"Cannot use fix reax/c/species without "
-		  "pair_style reax/c");
+                                "pair_style reax/c, reax/c/kk, or reax/c/omp");

  reaxc->fixspecies_flag = 1;

@ -392,14 +389,14 @@ void FixReaxCSpecies::create_fix()
  args[3]  = tmparg[0];
  args[4]  = tmparg[1];
  args[5]  = tmparg[2];
-  args[6]  = (char *) "c_SPECATOM[1]";	 // q, array_atoms[i][0]
-  args[7]  = (char *) "c_SPECATOM[2]";	 // x, 1
-  args[8]  = (char *) "c_SPECATOM[3]";	 // y, 2
-  args[9]  = (char *) "c_SPECATOM[4]";	 // z, 3
-  args[10] = (char *) "c_SPECATOM[5]";	 // vx, 4
-  args[11] = (char *) "c_SPECATOM[6]";	 // vy, 5
-  args[12] = (char *) "c_SPECATOM[7]";	 // vz, 6
-  args[13] = (char *) "c_SPECATOM[8]";	 // abo01, 7
+  args[6]  = (char *) "c_SPECATOM[1]";   // q, array_atoms[i][0]
+  args[7]  = (char *) "c_SPECATOM[2]";   // x, 1
+  args[8]  = (char *) "c_SPECATOM[3]";   // y, 2
+  args[9]  = (char *) "c_SPECATOM[4]";   // z, 3
+  args[10] = (char *) "c_SPECATOM[5]";   // vx, 4
+  args[11] = (char *) "c_SPECATOM[6]";   // vy, 5
+  args[12] = (char *) "c_SPECATOM[7]";   // vz, 6
+  args[13] = (char *) "c_SPECATOM[8]";   // abo01, 7
  args[14] = (char *) "c_SPECATOM[9]";
  args[15] = (char *) "c_SPECATOM[10]";
  args[16] = (char *) "c_SPECATOM[11]";
@ -523,6 +520,8 @@ void FixReaxCSpecies::FindMolecule ()
  int *ilist;
  double bo_tmp,bo_cut;
  double **spec_atom = f_SPECBOND->array_atom;
+  const double * const * const x = atom->x;
+  const int nlocal = atom->nlocal;

  inum = reaxc->list->inum;
  ilist = reaxc->list->ilist;
@ -548,35 +547,47 @@ void FixReaxCSpecies::FindMolecule ()
      done = 1;

      for (ii = 0; ii < inum; ii++) {
-      	i = ilist[ii];
-      	if (!(mask[i] & groupbit)) continue;
+        i = ilist[ii];
+        if (!(mask[i] & groupbit)) continue;

-      	itype = atom->type[i];
+        itype = atom->type[i];

        for (jj = 0; jj < MAXSPECBOND; jj++) {
-      	  j = reaxc->tmpid[i][jj];
+          j = reaxc->tmpid[i][jj];

-      	  if (j < i) continue;
-      	  if (!(mask[j] & groupbit)) continue;
+          if ((j == 0) || (j < i)) continue;
+          if (!(mask[j] & groupbit)) continue;

-      	  if (clusterID[i] == clusterID[j] && PBCconnected[i] == PBCconnected[j]
-	    && x0[i].x == x0[j].x && x0[i].y == x0[j].y && x0[i].z == x0[j].z) continue;
+          if (clusterID[i] == clusterID[j]
+              && PBCconnected[i] == PBCconnected[j]
+              && x0[i].x == x0[j].x
+              && x0[i].y == x0[j].y
+              && x0[i].z == x0[j].z) continue;

          jtype = atom->type[j];
-      	  bo_cut = BOCut[itype][jtype];
-      	  bo_tmp = spec_atom[i][jj+7];
+          bo_cut = BOCut[itype][jtype];
+          bo_tmp = spec_atom[i][jj+7];

-      	  if (bo_tmp > bo_cut) {
+          if (bo_tmp > bo_cut) {
            clusterID[i] = clusterID[j] = MIN(clusterID[i], clusterID[j]);
            PBCconnected[i] = PBCconnected[j] = MAX(PBCconnected[i], PBCconnected[j]);
            x0[i] = x0[j] = chAnchor(x0[i], x0[j]);
-            if ((fabs(spec_atom[i][1] - spec_atom[j][1]) > reaxc->control->bond_cut)
-             || (fabs(spec_atom[i][2] - spec_atom[j][2]) > reaxc->control->bond_cut)
-             || (fabs(spec_atom[i][3] - spec_atom[j][3]) > reaxc->control->bond_cut))
-              PBCconnected[i] = PBCconnected[j] = 1;
-      	    done = 0;
-      	  }
-      	}
+            // spec_atom[][] contains filtered coordinates only for local atoms,
+            // so we have to use unfiltered ones for ghost atoms.
+            if (j < nlocal) {
+              if ((fabs(spec_atom[i][1] - spec_atom[j][1]) > reaxc->control->bond_cut)
+                  || (fabs(spec_atom[i][2] - spec_atom[j][2]) > reaxc->control->bond_cut)
+                  || (fabs(spec_atom[i][3] - spec_atom[j][3]) > reaxc->control->bond_cut))
+                PBCconnected[i] = PBCconnected[j] = 1;
+            } else {
+              if ((fabs(spec_atom[i][1] - x[j][1]) > reaxc->control->bond_cut)
+                  || (fabs(spec_atom[i][2] - x[j][2]) > reaxc->control->bond_cut)
+                  || (fabs(spec_atom[i][3] - x[j][3]) > reaxc->control->bond_cut))
+                PBCconnected[i] = PBCconnected[j] = 1;
+            }
+            done = 0;
+          }
+        }
      }
      if (!done) change = 1;
      if (done) break;
@ -612,13 +623,13 @@ void FixReaxCSpecies::SortMolecule(int &Nmole)
  MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_SUM,world);
  if (flagall && me == 0)
    error->warning(FLERR,"Atom with cluster ID = 0 included in "
-		    "fix reax/c/species group");
+                   "fix reax/c/species group");
  MPI_Allreduce(&lo,&idlo,1,MPI_INT,MPI_MIN,world);
  MPI_Allreduce(&hi,&idhi,1,MPI_INT,MPI_MAX,world);
  if (idlo == ntotal)
    if (me == 0)
      error->warning(FLERR,"Atom with cluster ID = maxmol "
-		    "included in fix reax/c/species group");
+                     "included in fix reax/c/species group");

  int nlen = idhi - idlo + 1;
  memory->create(molmap,nlen,"reax/c/species:molmap");
@ -798,7 +809,7 @@ void FixReaxCSpecies::OpenPos()
  *ptr = '\0';
  if (padflag == 0)
    sprintf(filecurrent,"%s" BIGINT_FORMAT "%s",
-	filepos,ntimestep,ptr+1);
+            filepos,ntimestep,ptr+1);
  else {
    char bif[8],pad[16];
    strcpy(bif,BIGINT_FORMAT);
@ -838,11 +849,11 @@ void FixReaxCSpecies::WritePos(int Nmole, int Nspec)

  if (me == 0) {
    fprintf(pos,"Timestep " BIGINT_FORMAT " NMole %d  NSpec %d  xlo %f  "
-		"xhi %f  ylo %f  yhi %f  zlo %f  zhi %f\n",
-	        update->ntimestep,Nmole, Nspec,
-		domain->boxlo[0],domain->boxhi[0],
-		domain->boxlo[1],domain->boxhi[1],
-		domain->boxlo[2],domain->boxhi[2]);
+            "xhi %f  ylo %f  yhi %f  zlo %f  zhi %f\n",
+            update->ntimestep,Nmole, Nspec,
+            domain->boxlo[0],domain->boxhi[0],
+            domain->boxlo[1],domain->boxhi[1],
+            domain->boxlo[2],domain->boxhi[2]);

    fprintf(pos,"ID\tAtom_Count\tType\tAve_q\t\tCoM_x\t\tCoM_y\t\tCoM_z\n");
  }
@ -865,21 +876,21 @@ void FixReaxCSpecies::WritePos(int Nmole, int Nspec)
      if (cid == m) {
        itype = atom->type[i]-1;
        Name[itype] ++;
-      	count ++;
-      	avq += spec_atom[i][0];
+        count ++;
+        avq += spec_atom[i][0];
        if (PBCconnected[i]) {
          if ((x0[i].x - spec_atom[i][1]) > halfbox[0])
            spec_atom[i][1] += box[0];
          if ((spec_atom[i][1] - x0[i].x) > halfbox[0])
-             spec_atom[i][1] -= box[0];
+            spec_atom[i][1] -= box[0];
          if ((x0[i].y - spec_atom[i][2]) > halfbox[1])
-              spec_atom[i][2] += box[1];
+            spec_atom[i][2] += box[1];
          if ((spec_atom[i][2] - x0[i].y) > halfbox[1])
-             spec_atom[i][2] -= box[1];
+            spec_atom[i][2] -= box[1];
          if ((x0[i].z - spec_atom[i][3]) > halfbox[2])
            spec_atom[i][3] += box[2];
          if ((spec_atom[i][3] - x0[i].z) > halfbox[2])
-           spec_atom[i][3] -= box[2];
+            spec_atom[i][3] -= box[2];
        }
        for (n = 0; n < 3; n++)
          avx[n] += spec_atom[i][n+1];
@ -914,17 +925,17 @@ void FixReaxCSpecies::WritePos(int Nmole, int Nspec)
      if (count > 0) {
        avq /= count;
        for (k = 0; k < 3; k++) {
-      	  avx[k] /= count;
+          avx[k] /= count;
          if (avx[k] >= domain->boxhi[k])
            avx[k] -= box[k];
          if (avx[k] < domain->boxlo[k])
            avx[k] += box[k];

-      	  avx[k] -= domain->boxlo[k];
-      	  avx[k] /= box[k];
+          avx[k] -= domain->boxlo[k];
+          avx[k] /= box[k];
        }
        fprintf(pos,"\t%.8f \t%.8f \t%.8f \t%.8f",
-		    avq,avx[0],avx[1],avx[2]);
+                avq,avx[0],avx[1],avx[2]);
      }
      fprintf(pos,"\n");
    }
--- a/src/USER-REAXC/pair_reaxc.cpp
+++ b/src/USER-REAXC/pair_reaxc.cpp
@ -211,6 +211,9 @@ void PairReaxC::settings(int narg, char **arg)
    control->thb_cutsq = 0.00001;
    control->bg_cut = 0.3;

+    // Initialize for when omp style included
+    control->nthreads = 1;
+
    out_control->write_steps = 0;
    out_control->traj_method = 0;
    strcpy( out_control->traj_title, "default_title" );
@ -227,7 +230,7 @@ void PairReaxC::settings(int narg, char **arg)
  system->mincap = MIN_CAP;
  system->safezone = SAFE_ZONE;
  system->saferzone = SAFER_ZONE;
-  
+
  // process optional keywords

  int iarg = 1;
@ -256,7 +259,7 @@ void PairReaxC::settings(int narg, char **arg)
      system->safezone = force->numeric(FLERR,arg[iarg+1]);
      if (system->safezone < 0.0)
 	error->all(FLERR,"Illegal pair_style reax/c safezone command");
-      system->saferzone = system->safezone*1.2;
+      system->saferzone = system->safezone*1.2 + 0.2;
      iarg += 2;
    } else if (strcmp(arg[iarg],"mincap") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal pair_style reax/c command");
@ -457,6 +460,9 @@ void PairReaxC::setup( )

    ReAllocate( system, control, data, workspace, &lists, mpi_data );
  }
+
+  bigint local_ngroup = list->inum;
+  MPI_Allreduce( &local_ngroup, &ngroup, 1, MPI_LMP_BIGINT, MPI_SUM, world );
 }

 /* ---------------------------------------------------------------------- */
--- a/src/USER-REAXC/pair_reaxc.h
+++ b/src/USER-REAXC/pair_reaxc.h
@ -37,12 +37,18 @@ namespace LAMMPS_NS {

 class PairReaxC : public Pair {
 public:
+  PairReaxC(class LAMMPS *);
+  ~PairReaxC();
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  virtual void init_style();
+  double init_one(int, int);
+  void *extract(const char *, int &);
  int fixbond_flag, fixspecies_flag;
  int **tmpid;
  double **tmpbo,**tmpr;
-  double *chi,*eta,*gamma;

-  int *map;
  control_params *control;
  reax_system *system;
  output_controls *out_control;
@ -51,21 +57,16 @@ class PairReaxC : public Pair {
  reax_list *lists;
  mpi_datatypes *mpi_data;

-  PairReaxC(class LAMMPS *);
-  ~PairReaxC();
-  void compute(int, int);
-  void settings(int, char **);
-  void coeff(int, char **);
-  void init_style();
-  double init_one(int, int);
-  void *extract(const char *, int &);
+  bigint ngroup;

 protected:
  double cutmax;
  int nelements;                // # of unique elements
  char **elements;              // names of unique elements
+  int *map;
  class FixReaxC *fix_reax;

+  double *chi,*eta,*gamma;
  int qeqflag;
  int setup_flag;
  int firstwarn;
--- a/src/USER-REAXC/reaxc_allocate.cpp
+++ b/src/USER-REAXC/reaxc_allocate.cpp
@ -31,6 +31,10 @@
 #include "reaxc_tool_box.h"
 #include "reaxc_vector.h"

+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
 /* allocate space for my_atoms
   important: we cannot know the exact number of atoms that will fall into a
   process's box throughout the whole simulation. therefore
@ -49,6 +53,15 @@ int PreAllocate_Space( reax_system *system, control_params *control,
  system->my_atoms = (reax_atom*)
    scalloc( system->total_cap, sizeof(reax_atom), "my_atoms", comm );

+  // Nullify some arrays only used in omp styles
+  // Should be safe to do here since called in pair->setup();
+#ifdef LMP_USER_OMP
+  workspace->CdDeltaReduction = NULL;
+  workspace->forceReduction = NULL;
+  workspace->valence_angle_atom_myoffset = NULL;
+  workspace->my_ext_pressReduction = NULL;
+#endif
+
  return SUCCESS;
 }

@ -174,13 +187,21 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
  sfree( workspace->q2, "q2" );
  sfree( workspace->p2, "p2" );

-  /* integrator */
+  /* integrator storage */
  sfree( workspace->v_const, "v_const" );

  /* force related storage */
  sfree( workspace->f, "f" );
  sfree( workspace->CdDelta, "CdDelta" );

+  /* reductions */
+#ifdef LMP_USER_OMP
+  if(workspace->CdDeltaReduction) sfree( workspace->CdDeltaReduction, "cddelta_reduce" );
+  if(workspace->forceReduction) sfree( workspace->forceReduction, "f_reduce" );
+  if(workspace->valence_angle_atom_myoffset) sfree( workspace->valence_angle_atom_myoffset, "valence_angle_atom_myoffset");
+
+  if (control->virial && workspace->my_ext_pressReduction) sfree( workspace->my_ext_pressReduction, "ext_press_reduce");
+#endif
 }


@ -272,11 +293,25 @@ int Allocate_Workspace( reax_system *system, control_params *control,
  /* integrator storage */
  workspace->v_const = (rvec*) smalloc( local_rvec, "v_const", comm );

-  // /* force related storage */
+  /* force related storage */
  workspace->f = (rvec*) scalloc( total_cap, sizeof(rvec), "f", comm );
  workspace->CdDelta = (double*)
    scalloc( total_cap, sizeof(double), "CdDelta", comm );

+  // storage for reductions with multiple threads
+#ifdef LMP_USER_OMP
+  workspace->CdDeltaReduction = (double *) scalloc(sizeof(double), total_cap*control->nthreads,
+						 "cddelta_reduce", comm);
+
+  workspace->forceReduction = (rvec *) scalloc(sizeof(rvec), total_cap*control->nthreads,
+					       "forceReduction", comm);
+
+  workspace->valence_angle_atom_myoffset = (int *) scalloc(sizeof(int), total_cap, "valence_angle_atom_myoffset", comm);
+
+  if (control->virial)
+    workspace->my_ext_pressReduction = (rvec *) calloc(sizeof(rvec), control->nthreads);
+#endif
+
  return SUCCESS;
 }

@ -334,12 +369,29 @@ static int Reallocate_Bonds_List( reax_system *system, reax_list *bonds,
  }
  *total_bonds = (int)(MAX( *total_bonds * safezone, mincap*MIN_BONDS ));

+#ifdef LMP_USER_OMP
+  for (i = 0; i < bonds->num_intrs; ++i)
+    sfree(bonds->select.bond_list[i].bo_data.CdboReduction, "CdboReduction");
+#endif
+
  Delete_List( bonds, comm );
  if(!Make_List(system->total_cap, *total_bonds, TYP_BOND, bonds, comm)) {
    fprintf( stderr, "not enough space for bonds list. terminating!\n" );
    MPI_Abort( comm, INSUFFICIENT_MEMORY );
  }

+#ifdef LMP_USER_OMP
+#if defined(_OPENMP)
+  int nthreads = omp_get_num_threads();
+#else
+  int nthreads = 1;
+#endif
+
+  for (i = 0; i < bonds->num_intrs; ++i)
+    bonds->select.bond_list[i].bo_data.CdboReduction =
+      (double*) smalloc(sizeof(double)*nthreads, "CdboReduction", comm);
+#endif
+
  return SUCCESS;
 }

@ -438,7 +490,7 @@ void ReAllocate( reax_system *system, control_params *control,
    Reallocate_Bonds_List( system, (*lists)+BONDS, &num_bonds,
                           &est_3body, comm );
    realloc->bonds = 0;
-    realloc->num_3body = MAX( realloc->num_3body, est_3body );
+    realloc->num_3body = MAX( realloc->num_3body, est_3body ) * 2;
  }

  /* 3-body list */
--- a/src/USER-REAXC/reaxc_control.cpp
+++ b/src/USER-REAXC/reaxc_control.cpp
@ -48,6 +48,7 @@ char Read_Control_File( char *control_file, control_params* control,
  control->nsteps          = 0;
  control->dt              = 0.25;
  control->nprocs          = 1;
+  control->nthreads        = 1;
  control->procs_by_dim[0] = 1;
  control->procs_by_dim[1] = 1;
  control->procs_by_dim[2] = 1;
--- a/src/USER-REAXC/reaxc_types.h
+++ b/src/USER-REAXC/reaxc_types.h
@ -38,6 +38,40 @@
 #include "sys/time.h"
 #include <time.h>

+#if defined LMP_USER_OMP
+#define OMP_TIMING 1
+
+#ifdef OMP_TIMING
+// pkcoff timing fields
+enum {
+	COMPUTEINDEX=0,
+	COMPUTEWLINDEX,
+	COMPUTEBFINDEX,
+	COMPUTEQEQINDEX,
+	COMPUTENBFINDEX,
+	COMPUTEIFINDEX,
+	COMPUTETFINDEX,
+	COMPUTEBOINDEX,
+	COMPUTEBONDSINDEX,
+	COMPUTEATOMENERGYINDEX,
+	COMPUTEVALENCEANGLESBOINDEX,
+	COMPUTETORSIONANGLESBOINDEX,
+	COMPUTEHBONDSINDEX,
+	COMPUTECG1INDEX,
+	COMPUTECG2INDEX,
+	COMPUTECGCOMPUTEINDEX,
+	COMPUTECALCQINDEX,
+	COMPUTEINITMVINDEX,
+	COMPUTEMVCOMPINDEX,
+	LASTTIMINGINDEX
+};
+
+extern double ompTimingData[LASTTIMINGINDEX];
+extern int ompTimingCount[LASTTIMINGINDEX];
+extern int ompTimingCGCount[LASTTIMINGINDEX];
+#endif
+#endif
+
 /************* SOME DEFS - crucial for reax_types.h *********/

 #define LAMMPS_REAX
@ -391,6 +425,7 @@ typedef struct
 {
  char sim_name[REAX_MAX_STR];
  int  nprocs;
+  int  nthreads;
  ivec procs_by_dim;
  /* ensemble values:
     0 : NVE
@ -451,7 +486,7 @@ typedef struct

  int lgflag;
  int enobondsflag;
-  
+
 } control_params;


@ -616,6 +651,7 @@ typedef struct{
  double C1dbopi, C2dbopi, C3dbopi, C4dbopi;
  double C1dbopi2, C2dbopi2, C3dbopi2, C4dbopi2;
  rvec dBOp, dln_BOp_s, dln_BOp_pi, dln_BOp_pi2;
+  double *CdboReduction;
 } bond_order_data;

 typedef struct {
@ -702,6 +738,13 @@ typedef struct
  double *CdDelta;  // coefficient of dDelta
  rvec *f;

+  /* omp */
+  rvec *forceReduction;
+  rvec *my_ext_pressReduction;
+  double *CdDeltaReduction;
+  int *valence_angle_atom_myoffset;
+
+
  reallocate_data realloc;
 } storage;