diff --git a/doc/src/Developer_utils.rst b/doc/src/Developer_utils.rst index 2945420b5a..992df6ba63 100644 --- a/doc/src/Developer_utils.rst +++ b/doc/src/Developer_utils.rst @@ -104,6 +104,9 @@ and parsing files or arguments. .. doxygenfunction:: strmatch :project: progguide +.. doxygenfunction:: strfind + :project: progguide + .. doxygenfunction:: is_integer :project: progguide diff --git a/doc/src/Packages_details.rst b/doc/src/Packages_details.rst index 815db12668..be3b17abac 100644 --- a/doc/src/Packages_details.rst +++ b/doc/src/Packages_details.rst @@ -367,7 +367,7 @@ KIM package **Contents:** -This package contains a command with a set of subcommands that serve as a +This package contains a command with a set of sub-commands that serve as a wrapper on the `Open Knowledgebase of Interatomic Models (OpenKIM) `_ repository of interatomic models (IMs) enabling compatible ones to be used in diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt index 982e1fde2a..d1ffc2eaf3 100644 --- a/doc/utils/sphinx-config/false_positives.txt +++ b/doc/utils/sphinx-config/false_positives.txt @@ -2367,6 +2367,7 @@ parmin Parrinello Partay Particuology +Pascuet pastewka Pastewka pathangle diff --git a/src/utils.cpp b/src/utils.cpp index ee2533e725..b7f679a28a 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -69,6 +69,10 @@ extern "C" /** Match text against a (simplified) regular expression * (regexp will be compiled automatically). */ static int re_match(const char *text, const char *pattern); + + /** Find substring that matches a (simplified) regular expression + * (regexp will be compiled automatically). 
*/ + static int re_find(const char *text, const char *pattern, int *matchlen); } //////////////////////////////////////////////////////////////////////// @@ -104,6 +108,21 @@ bool utils::strmatch(const std::string &text, const std::string &pattern) return (pos >= 0); } +/** This function is a companion function to utils::strmatch(). Arguments + * and logic are the same, but instead of a boolean, it returns the + * sub-string that matches the regex pattern. There can be only one match. + * This can be used as a more flexible alternative to strstr(). + */ +std::string utils::strfind(const std::string &text, const std::string &pattern) +{ + int matchlen; + const int pos = re_find(text.c_str(),pattern.c_str(),&matchlen); + if ((pos >=0) && (matchlen > 0)) + return text.substr(pos,matchlen); + else + return ""; +} + /** This function simplifies the repetitive task of outputting some * message to both the screen and/or the log file. In combination * with using fmt::format(), which returns the formatted text @@ -1258,16 +1277,26 @@ static void do_merge(int *idx, int *buf, int llo, int lhi, int rlo, int rhi, /* ------------------------------------------------------------------ */ extern "C" { + /* Typedef'd pointer to get abstract datatype. */ typedef struct regex_t *re_t; /* Compile regex string pattern to a regex_t-array. */ static re_t re_compile(const char *pattern); - /* Find matches of the compiled pattern inside text. 
*/ - static int re_matchp(const char *text, re_t pattern); + static int re_matchp(const char *text, re_t pattern, int *matchlen); + int re_match(const char *text, const char *pattern) + { + int dummy; + return re_matchp(text, re_compile(pattern), &dummy); + } + + int re_find(const char *text, const char *pattern, int *matchlen) + { + return re_matchp(text, re_compile(pattern), matchlen); + } /* Definitions: */ @@ -1285,14 +1314,14 @@ extern "C" { union { unsigned char ch; /* the character itself */ unsigned char *ccl; /* OR a pointer to characters in class */ - }; + } u; } regex_t; /* Private function declarations: */ - static int matchpattern(regex_t *pattern, const char *text); + static int matchpattern(regex_t *pattern, const char *text, int *matchlen); static int matchcharclass(char c, const char *str); - static int matchstar(regex_t p, regex_t *pattern, const char *text); - static int matchplus(regex_t p, regex_t *pattern, const char *text); + static int matchstar(regex_t p, regex_t *pattern, const char *text, int *matchlen); + static int matchplus(regex_t p, regex_t *pattern, const char *text, int *matchlen); static int matchone(regex_t p, char c); static int matchdigit(char c); static int matchint(char c); @@ -1301,26 +1330,23 @@ extern "C" { static int matchwhitespace(char c); static int matchmetachar(char c, const char *str); static int matchrange(char c, const char *str); + static int matchdot(char c); static int ismetachar(char c); /* Semi-public functions: */ - int re_match(const char *text, const char *pattern) - { - return re_matchp(text, re_compile(pattern)); - } - - int re_matchp(const char *text, re_t pattern) + int re_matchp(const char *text, re_t pattern, int *matchlen) { + *matchlen = 0; if (pattern != 0) { if (pattern[0].type == BEGIN) { - return ((matchpattern(&pattern[1], text)) ? 0 : -1); + return ((matchpattern(&pattern[1], text, matchlen)) ? 
0 : -1); } else { int idx = -1; do { idx += 1; - if (matchpattern(pattern, text)) { + if (matchpattern(pattern, text, matchlen)) { if (text[0] == '\0') return -1; @@ -1380,7 +1406,7 @@ extern "C" { /* Escaped character, e.g. '.' or '$' */ default: { re_compiled[j].type = CHAR; - re_compiled[j].ch = pattern[i]; + re_compiled[j].u.ch = pattern[i]; } break; } } @@ -1396,6 +1422,10 @@ extern "C" { if (pattern[i+1] == '^') { re_compiled[j].type = INV_CHAR_CLASS; i += 1; /* Increment i to avoid including '^' in the char-buffer */ + if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '^' */ + { + return 0; + } } else { re_compiled[j].type = CHAR_CLASS; } @@ -1407,6 +1437,10 @@ extern "C" { if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1) { return 0; } + if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '\\' */ + { + return 0; + } ccl_buf[ccl_bufidx++] = pattern[i++]; } else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) { return 0; @@ -1419,15 +1453,22 @@ extern "C" { } /* Null-terminate string end */ ccl_buf[ccl_bufidx++] = 0; - re_compiled[j].ccl = &ccl_buf[buf_begin]; + re_compiled[j].u.ccl = &ccl_buf[buf_begin]; } break; /* Other characters: */ - default: { + default: + { re_compiled[j].type = CHAR; - re_compiled[j].ch = c; + re_compiled[j].u.ch = c; } break; } + /* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */ + if (pattern[i] == 0) + { + return 0; + } + i += 1; j += 1; } @@ -1477,6 +1518,16 @@ extern "C" { && ((c >= str[0]) && (c <= str[2]))); } + static int matchdot(char c) + { +#if defined(RE_DOT_MATCHES_NEWLINE) && (RE_DOT_MATCHES_NEWLINE == 1) + (void)c; + return 1; +#else + return c != '\n' && c != '\r'; +#endif + } + static int ismetachar(char c) { return ((c == 's') || (c == 'S') @@ -1530,9 +1581,9 @@ extern "C" { static int matchone(regex_t p, char c) { switch (p.type) { - case DOT: return 1; - case CHAR_CLASS: return 
matchcharclass(c, (const char *)p.ccl); - case INV_CHAR_CLASS: return !matchcharclass(c, (const char *)p.ccl); + case DOT: return matchdot(c); + case CHAR_CLASS: return matchcharclass(c, (const char *)p.u.ccl); + case INV_CHAR_CLASS: return !matchcharclass(c, (const char *)p.u.ccl); case DIGIT: return matchdigit(c); case NOT_DIGIT: return !matchdigit(c); case INTEGER: return matchint(c); @@ -1543,57 +1594,83 @@ extern "C" { case NOT_ALPHA: return !matchalphanum(c); case WHITESPACE: return matchwhitespace(c); case NOT_WHITESPACE: return !matchwhitespace(c); - default: return (p.ch == c); + default: return (p.u.ch == c); } } - static int matchstar(regex_t p, regex_t *pattern, const char *text) + static int matchstar(regex_t p, regex_t *pattern, const char *text, int *matchlen) { - do { - if (matchpattern(pattern, text)) - return 1; + int prelen = *matchlen; + const char *prepos = text; + while ((text[0] != '\0') && matchone(p, *text)) + { + text++; + (*matchlen)++; + } + while (text >= prepos) + { + if (matchpattern(pattern, text--, matchlen)) + return 1; + (*matchlen)--; } - while ((text[0] != '\0') && matchone(p, *text++)); + *matchlen = prelen; return 0; } - static int matchplus(regex_t p, regex_t *pattern, const char *text) + static int matchplus(regex_t p, regex_t *pattern, const char *text, int *matchlen) { - while ((text[0] != '\0') && matchone(p, *text++)) { - if (matchpattern(pattern, text)) + const char *prepos = text; + while ((text[0] != '\0') && matchone(p, *text)) + { + text++; + (*matchlen)++; + } + while (text > prepos) + { + if (matchpattern(pattern, text--, matchlen)) return 1; + (*matchlen)--; } return 0; } - static int matchquestion(regex_t p, regex_t *pattern, const char *text) + static int matchquestion(regex_t p, regex_t *pattern, const char *text, int *matchlen) { if (p.type == UNUSED) return 1; - if (matchpattern(pattern, text)) + if (matchpattern(pattern, text, matchlen)) return 1; if (*text && matchone(p, *text++)) - return 
matchpattern(pattern, text); + { + if (matchpattern(pattern, text, matchlen)) + { + (*matchlen)++; + return 1; + } + } return 0; } /* Iterative matching */ - static int matchpattern(regex_t *pattern, const char *text) + static int matchpattern(regex_t *pattern, const char *text, int *matchlen) { + int pre = *matchlen; do { if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK)) { - return matchquestion(pattern[0], &pattern[2], text); + return matchquestion(pattern[0], &pattern[2], text, matchlen); } else if (pattern[1].type == STAR) { - return matchstar(pattern[0], &pattern[2], text); + return matchstar(pattern[0], &pattern[2], text, matchlen); } else if (pattern[1].type == PLUS) { - return matchplus(pattern[0], &pattern[2], text); + return matchplus(pattern[0], &pattern[2], text, matchlen); } else if ((pattern[0].type == END) && pattern[1].type == UNUSED) { return (text[0] == '\0'); } + (*matchlen)++; } while ((text[0] != '\0') && matchone(*pattern++, *text++)); + *matchlen = pre; return 0; } diff --git a/src/utils.h b/src/utils.h index eece00f306..70a4c08cd3 100644 --- a/src/utils.h +++ b/src/utils.h @@ -37,6 +37,14 @@ namespace LAMMPS_NS { bool strmatch(const std::string &text, const std::string &pattern); + /** Find sub-string that matches a simplified regex pattern + * + * \param text the text to be matched against the pattern + * \param pattern the search pattern, which may contain regexp markers + * \return the string that matches the pattern or an empty one */ + + std::string strfind(const std::string &text, const std::string &pattern); + /** Send message to screen and logfile, if available * * \param lmp pointer to LAMMPS class instance diff --git a/unittest/utils/test_utils.cpp b/unittest/utils/test_utils.cpp index 73a9449f6e..a0e4022d2c 100644 --- a/unittest/utils/test_utils.cpp +++ b/unittest/utils/test_utils.cpp @@ -35,11 +35,11 @@ TEST(Utils, strdup) std::string original("some_text"); const char *copy = utils::strdup(original); 
ASSERT_THAT(original, StrEq(copy)); - ASSERT_NE(copy,original.c_str()); + ASSERT_NE(copy, original.c_str()); const char *copy2 = utils::strdup(copy); ASSERT_THAT(original, StrEq(copy2)); - ASSERT_NE(copy,copy2); + ASSERT_NE(copy, copy2); delete[] copy; delete[] copy2; @@ -72,7 +72,7 @@ TEST(Utils, trim_comment) TEST(Utils, has_utf8) { const char ascii_string[] = " -2"; - const char utf8_string[] = " −2"; + const char utf8_string[] = " −2"; ASSERT_FALSE(utils::has_utf8(ascii_string)); ASSERT_TRUE(utils::has_utf8(utf8_string)); } @@ -80,9 +80,9 @@ TEST(Utils, has_utf8) TEST(Utils, utf8_subst) { const char ascii_string[] = " -2"; - const char utf8_string[] = " −2"; - auto ascii = utils::utf8_subst(ascii_string); - auto utf8 = utils::utf8_subst(utf8_string); + const char utf8_string[] = " −2"; + auto ascii = utils::utf8_subst(ascii_string); + auto utf8 = utils::utf8_subst(utf8_string); ASSERT_TRUE(ascii == utf8); } @@ -399,6 +399,91 @@ TEST(Utils, strmatch_whitespace_nonwhitespace) ASSERT_TRUE(utils::strmatch(" 5.0 angles\n", "^\\s*\\S+\\s+\\S+\\s")); } +TEST(Utils, strfind_beg) +{ + ASSERT_THAT(utils::strfind("rigid/small/omp", "^rigid"), StrEq("rigid")); +} + +TEST(Utils, strfind_mid1) +{ + ASSERT_THAT(utils::strfind("rigid/small/omp", ".small."), StrEq("/small/")); +} + +TEST(Utils, strfind_mid2) +{ + ASSERT_THAT(utils::strfind("rigid/small/ompXXX", "omp"), StrEq("omp")); +} + +TEST(Utils, strfind_end) +{ + ASSERT_THAT(utils::strfind("rigid/small/omp", "/omp$"), StrEq("/omp")); +} + +TEST(Utils, no_strfind_beg) +{ + ASSERT_THAT(utils::strfind("rigid/small/omp", "^small"), StrEq("")); +} + +TEST(Utils, no_strfind_mid) +{ + ASSERT_THAT(utils::strfind("rigid/small/omp", "none"), StrEq("")); +} + +TEST(Utils, no_strfind_end) +{ + ASSERT_THAT(utils::strfind("rigid/small/omp", "/opt$"), StrEq("")); +} + +TEST(Utils, strfind_whole_line) +{ + ASSERT_THAT(utils::strfind("ITEM: UNITS\n", "^\\s*ITEM: UNITS\\s*$"), StrEq("ITEM: UNITS\n")); +} + +TEST(Utils, 
no_strfind_whole_line) +{ + ASSERT_THAT(utils::strfind("ITEM: UNITS\n", "^\\s*ITEM: UNIT\\s*$"), StrEq("")); +} + +TEST(Utils, strfind_char_range) +{ + ASSERT_THAT(utils::strfind("rigidXXX", "^[ip-s]+gid"), StrEq("rigid")); +} + +TEST(Utils, strfind_notchar_range) +{ + ASSERT_THAT(utils::strfind("rigidYYY", "^[^a-g]+gid"), StrEq("rigid")); +} + +TEST(Utils, strfind_backslash) +{ + ASSERT_THAT(utils::strfind("\\rigidZZZ", "^\\W\\w+gid"), StrEq("\\rigid")); +} + +TEST(Utils, strfind_opt_range) +{ + ASSERT_THAT(utils::strfind("rigidAAA", "^[0-9]*[\\Wp-s]igid"), StrEq("rigid")); +} + +TEST(Utils, strfind_opt_char) +{ + ASSERT_THAT(utils::strfind("rigid111", "^r?igid"), StrEq("rigid")); + ASSERT_THAT(utils::strfind("igid222", "^r?igid"), StrEq("igid")); +} + +TEST(Utils, strfind_dot) +{ + ASSERT_THAT(utils::strfind("AAArigidBBB", ".igid"), StrEq("rigid")); + ASSERT_THAT(utils::strfind("000Rigid111", ".igid"), StrEq("Rigid")); +} + +TEST(Utils, strfind_kim) +{ + ASSERT_THAT(utils::strfind("n3409jfse MO_004835508849_000 aslfjiaf", + "[MS][MO]_\\d\\d\\d+_\\d\\d\\d"), StrEq("MO_004835508849_000")); + ASSERT_THAT(utils::strfind("VanDuinChakraborty_2003_CHNO__SM_107643900657_000", + "[MS][MO]_\\d\\d\\d+_\\d\\d\\d"), StrEq("SM_107643900657_000")); +} + TEST(Utils, bounds_case1) { int nlo, nhi;