From 7fcc76f071ef25d6798a9cf43483a058df1b3eb9 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Wed, 2 Jan 2019 16:15:57 -0500
Subject: [PATCH] implement the first three convenience utility functions

---
 src/utils.cpp | 439 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/utils.h   |  76 +++++++++
 2 files changed, 515 insertions(+)
 create mode 100644 src/utils.cpp
 create mode 100644 src/utils.h
diff --git a/src/utils.cpp b/src/utils.cpp
new file mode 100644
index 0000000000..8709aef3eb
--- /dev/null
+++ b/src/utils.cpp
@@ -0,0 +1,439 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include "utils.h"
+#include "error.h"
+
+/*! \file utils.cpp */
+
+/*
+ * Mini regex-module adapted from https://github.com/kokke/tiny-regex-c
+ * which is in the public domain.
+ * 
+ * Supports:
+ * ---------
+ *   '.'        Dot, matches any character
+ *   '^'        Start anchor, matches beginning of string
+ *   '$'        End anchor, matches end of string
+ *   '*'        Asterisk, match zero or more (greedy)
+ *   '+'        Plus, match one or more (greedy)
+ *   '?'        Question, match zero or one (non-greedy)
+ *   '[abc]'    Character class, match if one of {'a', 'b', 'c'}
+ *   '[a-zA-Z]' Character ranges, the character set of the ranges { a-z | A-Z }
+ *   '\s'       Whitespace, \t \f \r \n \v and spaces
+ *   '\S'       Non-whitespace
+ *   '\w'       Alphanumeric, [a-zA-Z0-9_]
+ *   '\W'       Non-alphanumeric
+ *   '\d'       Digits, [0-9]
+ *   '\D'       Non-digits
+ *
+ * *NOT* supported:
+ *   '[^abc]'   Inverted class
+ *   'a|b'      Branches
+ *   '(abc)+'   Groups
+ */
+
+extern "C"
+{
+  /** Match text against a (simplified) regular expression
+   * (regexp will be compiled automatically). */
+  static int  re_match(const char *text, const char *pattern);
+}
+
+using namespace LAMMPS_NS;
+
+/** More flexible and specific matching of a string against a pattern.
+ *  This function is supposed to be a more safe, more specific and
+ *  simple to use API to find pattern matches. The purpose is to replace
+ *  uses of either strncmp() or strstr() in the code base to find
+ *  substrings safely. With strncmp() finding prefixes, the number of
+ *  characters to match must be counted, which can lead to errors,
+ *  while using "^pattern" will do the same with less problems.
+ *  Matching for suffixes using strstr() is not as specific as 'pattern$',
+ *  and complex matches, e.g. "^rigid.*\/small.*", to match all small
+ *  body optimized rigid fixes require only one test.
+ *
+ *  The use of std::string arguments allows for simple concatenation
+ *  even with char * type variables.
+ *  Example: utils::strmatch(text, std::string("^") + charptr)
+ */
+bool utils::strmatch(std::string text, std::string pattern)
+{
+  const int pos = re_match(text.c_str(),pattern.c_str());
+  return (pos >= 0);
+}
+
+/* utility function to avoid code repetition when parsing args */
+int utils::cfvarg(std::string mode, const char *arg, char *&cfv_id)
+{
+  int rv = utils::NONE;
+  cfv_id = NULL;
+
+  if (!arg) return rv;
+
+  if (utils::strmatch(arg,std::string("^[") + mode + "]_")) {
+    if (*arg == 'c') rv = utils::COMPUTE;
+    else if (*arg == 'f') rv = utils::FIX;
+    else if (*arg == 'v') rv = utils::VARIABLE;
+    else return rv;             // should not happen
+
+    arg += 2;
+    int n = strlen(arg)+1;
+    cfv_id = new char[n];
+    strcpy(cfv_id,arg);
+  }
+
+  return rv;
+}
+
+/* like fgets() but aborts with an error or EOF is encountered */
+void utils::sfgets(char* srcname, int srcline, char *s, int size,
+                   FILE *fp, std::string filename, Error *error)
+{
+  char *rv = fgets(s,size,fp);
+  if (rv == NULL) { // something went wrong
+    std::string errmsg;
+
+    if (feof(fp)) {
+      errmsg = "Unexpected end of file while reading file '";
+    } else if (ferror(fp)) {
+      errmsg = "Unexpected error while reading file '";
+    } else {
+      errmsg = "Unexpected short read while reading file '";
+    }
+    errmsg += filename + "'";
+
+    if (error) error->one(srcname,srcline,errmsg.c_str());
+    if (s) *s = '\0'; // truncate string to empty in case error is NULL
+  }
+  return;
+}
+
+/* ------------------------------------------------------------------ */
+
+extern "C" {
+  /* Typedef'd pointer to get abstract datatype. */
+  typedef struct regex_t *re_t;
+
+  /* Compile regex string pattern to a regex_t-array. */
+  static re_t re_compile(const char *pattern);
+
+
+  /* Find matches of the compiled pattern inside text. */
+  static int  re_matchp(const char *text, re_t pattern);
+
+
+/* Definitions: */
+
+#define MAX_REGEXP_OBJECTS 30 /* Max number of regex symbols in expression. */
+#define MAX_CHAR_CLASS_LEN 40 /* Max length of character-class buffer in.   */
+
+
+  enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS,
+         CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT,
+         ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE /*, BRANCH */ };
+
+  typedef struct regex_t {
+    unsigned char  type;   /* CHAR, STAR, etc.                      */
+    union {
+      unsigned char  ch;   /*      the character itself             */
+      unsigned char *ccl;  /*  OR  a pointer to characters in class */
+    };
+  } regex_t;
+
+/* Private function declarations: */
+  static int matchpattern(regex_t *pattern, const char *text);
+  static int matchcharclass(char c, const char *str);
+  static int matchstar(regex_t p, regex_t *pattern, const char *text);
+  static int matchplus(regex_t p, regex_t *pattern, const char *text);
+  static int matchone(regex_t p, char c);
+  static int matchdigit(char c);
+  static int matchalpha(char c);
+  static int matchwhitespace(char c);
+  static int matchmetachar(char c, const char *str);
+  static int matchrange(char c, const char *str);
+  static int ismetachar(char c);
+
+/* Semi-public functions: */
+  int re_match(const char *text, const char *pattern)
+  {
+    return re_matchp(text, re_compile(pattern));
+  }
+
+  int re_matchp(const char *text, re_t pattern)
+  {
+    if (pattern != 0) {
+      if (pattern[0].type == BEGIN) {
+        return ((matchpattern(&pattern[1], text)) ? 0 : -1);
+      } else {
+        int idx = -1;
+
+        do {
+          idx += 1;
+        
+          if (matchpattern(pattern, text)) {
+            if (text[0] == '\0')
+              return -1;
+        
+            return idx;
+          }
+        }
+        while (*text++ != '\0');
+      }
+    }
+    return -1;
+  }
+
+  re_t re_compile(const char *pattern)
+  {
+    /* The sizes of the two static arrays below substantiates the static RAM usage of this module.
+       MAX_REGEXP_OBJECTS is the max number of symbols in the expression.
+       MAX_CHAR_CLASS_LEN determines the size of buffer for chars in all char-classes in the expression. */
+    static regex_t re_compiled[MAX_REGEXP_OBJECTS];
+    static unsigned char ccl_buf[MAX_CHAR_CLASS_LEN];
+    int ccl_bufidx = 1;
+
+    char c;     /* current char in pattern   */
+    int i = 0;  /* index into pattern        */
+    int j = 0;  /* index into re_compiled    */
+
+    while (pattern[i] != '\0' && (j+1 < MAX_REGEXP_OBJECTS)) {
+      c = pattern[i];
+
+      switch (c) {
+        /* Meta-characters: */
+      case '^': {    re_compiled[j].type = BEGIN;           } break;
+      case '$': {    re_compiled[j].type = END;             } break;
+      case '.': {    re_compiled[j].type = DOT;             } break;
+      case '*': {    re_compiled[j].type = STAR;            } break;
+      case '+': {    re_compiled[j].type = PLUS;            } break;
+      case '?': {    re_compiled[j].type = QUESTIONMARK;    } break;
+
+        /* Escaped character-classes (\s \w ...): */
+      case '\\': {
+        if (pattern[i+1] != '\0') {
+          /* Skip the escape-char '\\' */
+          i += 1;
+          /* ... and check the next */
+          switch (pattern[i]) {
+            /* Meta-character: */
+          case 'd': {    re_compiled[j].type = DIGIT;            } break;
+          case 'D': {    re_compiled[j].type = NOT_DIGIT;        } break;
+          case 'w': {    re_compiled[j].type = ALPHA;            } break;
+          case 'W': {    re_compiled[j].type = NOT_ALPHA;        } break;
+          case 's': {    re_compiled[j].type = WHITESPACE;       } break;
+          case 'S': {    re_compiled[j].type = NOT_WHITESPACE;   } break;
+
+            /* Escaped character, e.g. '.' or '$' */ 
+          default: {
+            re_compiled[j].type = CHAR;
+            re_compiled[j].ch = pattern[i];
+          } break;
+          }
+        }
+        /* '\\' as last char in pattern -> invalid regular expression. */
+      } break;
+
+        /* Character class: */
+      case '[': {
+        /* Remember where the char-buffer starts. */
+        int buf_begin = ccl_bufidx;
+
+        /* Look-ahead to determine if negated */
+        if (pattern[i+1] == '^') {
+          re_compiled[j].type = INV_CHAR_CLASS;
+          i += 1; /* Increment i to avoid including '^' in the char-buffer */
+        } else {
+          re_compiled[j].type = CHAR_CLASS;
+        }
+
+        /* Copy characters inside [..] to buffer */
+        while ((pattern[++i] != ']') && (pattern[i] != '\0')) {
+          /* Missing ] */ 
+          if (pattern[i] == '\\') {
+            if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1) {
+              return 0;
+            }
+            ccl_buf[ccl_bufidx++] = pattern[i++];
+          } else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) {
+            return 0;
+          }
+          ccl_buf[ccl_bufidx++] = pattern[i];
+        }
+        if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) {
+          /* Catches cases such as [00000000000000000000000000000000000000][ */
+          return 0;
+        }
+        /* Null-terminate string end */
+        ccl_buf[ccl_bufidx++] = 0;
+        re_compiled[j].ccl = &ccl_buf[buf_begin];
+      } break;
+
+        /* Other characters: */
+      default: {
+        re_compiled[j].type = CHAR;
+        re_compiled[j].ch = c;
+      } break;
+      }
+      i += 1;
+      j += 1;
+    }
+    /* 'UNUSED' is a sentinel used to indicate end-of-pattern */
+    re_compiled[j].type = UNUSED;
+
+    return (re_t) re_compiled;
+  }
+
+
+/* Private functions: */
+  static int matchdigit(char c)
+  {
+    return ((c >= '0') && (c <= '9'));
+  }
+
+  static int matchalpha(char c)
+  {
+    return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z'));
+  }
+
+  static int matchwhitespace(char c)
+  {
+    return ((c == ' ') || (c == '\t') || (c == '\n') || (c == '\r') || (c == '\f') || (c == '\v'));
+  }
+
+  static int matchalphanum(char c)
+  {
+    return ((c == '_') || matchalpha(c) || matchdigit(c));
+  }
+
+  static int matchrange(char c, const char *str)
+  {
+    return ((c != '-') && (str[0] != '\0')
+            && (str[0] != '-') && (str[1] == '-')
+            && (str[1] != '\0') && (str[2] != '\0')
+            && ((c >= str[0]) && (c <= str[2])));
+  }
+
+  static int ismetachar(char c)
+  {
+    return ((c == 's') || (c == 'S')
+            || (c == 'w') || (c == 'W')
+            || (c == 'd') || (c == 'D'));
+  }
+
+  static int matchmetachar(char c, const char *str)
+  {
+    switch (str[0]) {
+    case 'd': return  matchdigit(c);
+    case 'D': return !matchdigit(c);
+    case 'w': return  matchalphanum(c);
+    case 'W': return !matchalphanum(c);
+    case 's': return  matchwhitespace(c);
+    case 'S': return !matchwhitespace(c);
+    default:  return (c == str[0]);
+    }
+  }
+
+  static int matchcharclass(char c, const char *str)
+  {
+    do {
+      if (matchrange(c, str)) {
+        return 1;
+      } else if (str[0] == '\\') {
+        /* Escape-char: increment str-ptr and match on next char */
+        str += 1;
+        if (matchmetachar(c, str)) {
+          return 1;
+        } else if ((c == str[0]) && !ismetachar(c)) {
+          return 1;
+        }
+      } else if (c == str[0]) {
+        if (c == '-') {
+          return ((str[-1] == '\0') || (str[1] == '\0'));
+        } else {
+          return 1;
+        }
+      }
+    }
+    while (*str++ != '\0');
+
+    return 0;
+  }
+
+  static int matchone(regex_t p, char c)
+  {
+    switch (p.type) {
+    case DOT:            return 1;
+    case CHAR_CLASS:     return  matchcharclass(c, (const char *)p.ccl);
+    case INV_CHAR_CLASS: return !matchcharclass(c, (const char *)p.ccl);
+    case DIGIT:          return  matchdigit(c);
+    case NOT_DIGIT:      return !matchdigit(c);
+    case ALPHA:          return  matchalphanum(c);
+    case NOT_ALPHA:      return !matchalphanum(c);
+    case WHITESPACE:     return  matchwhitespace(c);
+    case NOT_WHITESPACE: return !matchwhitespace(c);
+    default:             return  (p.ch == c);
+    }
+  }
+
+  static int matchstar(regex_t p, regex_t *pattern, const char *text)
+  {
+    do {
+      if (matchpattern(pattern, text))
+        return 1;
+    }
+    while ((text[0] != '\0') && matchone(p, *text++));
+
+    return 0;
+  }
+
+  static int matchplus(regex_t p, regex_t *pattern, const char *text)
+  {
+    while ((text[0] != '\0') && matchone(p, *text++)) {
+      if (matchpattern(pattern, text))
+        return 1;
+    }
+    return 0;
+  }
+
+  static int matchquestion(regex_t p, regex_t *pattern, const char *text)
+  {
+    if (p.type == UNUSED)
+      return 1;
+    if (matchpattern(pattern, text))
+      return 1;
+    if (*text && matchone(p, *text++))
+      return matchpattern(pattern, text);
+    return 0;
+  }
+
+/* Iterative matching */
+  static int matchpattern(regex_t *pattern, const char *text)
+  {
+    do {
+      if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK)) {
+        return matchquestion(pattern[0], &pattern[2], text);
+      } else if (pattern[1].type == STAR) {
+        return matchstar(pattern[0], &pattern[2], text);
+      } else if (pattern[1].type == PLUS) {
+        return matchplus(pattern[0], &pattern[2], text);
+      } else if ((pattern[0].type == END) && pattern[1].type == UNUSED) {
+        return (text[0] == '\0');
+      }
+    }
+    while ((text[0] != '\0') && matchone(*pattern++, *text++));
+
+    return 0;
+  }
+}
diff --git a/src/utils.h b/src/utils.h
new file mode 100644
index 0000000000..230cb44cba
--- /dev/null
+++ b/src/utils.h
@@ -0,0 +1,76 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_UTILS_H
+#define LMP_UTILS_H
+
+/*! \file utils.h */
+
+#include <string>
+#include <cstdio>
+
+namespace LAMMPS_NS {
+
+  // forward declarations
+  class Error;
+
+  namespace utils {
+
+    /** \brief Match text against a simplified regex pattern
+     *
+     *  \param text the text to be matched against the pattern
+     *  \param pattern the search pattern, which may contain regexp markers
+     *  \return true if the pattern matches, false if not
+     */
+    bool strmatch(std::string text, std::string pattern);
+
+    /** Categories of special arguments for cfvarg() function
+     *
+     * Enum starts from 100 to avoid conflicts with other local define flags
+     */
+    enum {NONE=100,              /// does not match any category
+          COMPUTE,               /// processed a compute
+          FIX,                   /// processed a fix
+          VARIABLE               /// processed a variable
+    };
+
+    /** \brief Convenience function to process 'c_', 'f_', and 'v_' arguments
+     *
+     *  \param mode types to search for. 1-3 char string from 'c', 'f', or 'v'
+     *  \param arg  argument string to test against the prefixes
+     *  \param cfv_id name or ID of the compute, fix, or variable
+     *  \return utils::COMPUTE, utils::FIX, utils::VARIABLE or utils::NONE
+     */
+    int cfvarg(std::string mode, const char *arg, char *&cfv_id);
+
+    /** \brief safe wrapper around fgets() which aborts on errors
+     *  or EOF and prints a suitable error message to help debugging
+     *
+     *  \param srcname  name of the calling source file (from FLERR macro)
+     *  \param srcline  line in the calling source file (from FLERR macro)
+     *  \param s        buffer for storing the result of fgets()
+     *  \param size     size of buffer s (max number of bytes read by fgets())
+     *  \param fp       file pointer used by fgets()
+     *  \param filename file name associated with fp (for error message)
+     *  \param error    pointer to Error class instance (for abort)
+     */
+    void sfgets(char* srcname, int srcline, char *s, int size,
+                FILE *fp, std::string filename, Error *error);
+  }
+}
+
+#endif
+
+/* ERROR/WARNING messages:
+
+*/