/* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: Richard Berger (Temple U)
------------------------------------------------------------------------- */

#include "tokenizer.h"

#include "fmt/format.h"
#include "utils.h"

// NOTE(review): the original had a bare "#include" with no header name
// (angle-bracket content lost). <utility> is required for the std::move
// and std::swap calls used throughout this file.
#include <utility>

using namespace LAMMPS_NS;

/** Build the exception message text.
 *
 * When a non-empty token is given, it is appended to the message in
 * single quotes so the offending piece of input is visible in the error.
 *
 * \param msg    error description
 * \param token  token that triggered the error (may be empty)
 */
TokenizerException::TokenizerException(const std::string &msg, const std::string &token)
{
  if (token.empty()) {
    message = msg;
  } else {
    message = fmt::format("{}: '{}'", msg, token);
  }
}

/** Class for splitting text into words
 *
 * This tokenizer will break down a string into sub-strings (i.e words)
 * separated by the given separator characters. If the string contains
 * certain known UTF-8 characters they will be replaced by their ASCII
 * equivalents processing the string.
* \verbatim embed:rst *See also* :cpp:class:`ValueTokenizer`, :cpp:func:`utils::split_words`, :cpp:func:`utils::utf8_subst` \endverbatim * * \param str string to be processed * \param _separators string with separator characters (default: " \t\r\n\f") */ Tokenizer::Tokenizer(std::string str, std::string _separators) : text(std::move(str)), separators(std::move(_separators)), start(0), ntokens(std::string::npos) { // replace known UTF-8 characters with ASCII equivalents if (utils::has_utf8(text)) text = utils::utf8_subst(text); reset(); } Tokenizer::Tokenizer(const Tokenizer &rhs) : text(rhs.text), separators(rhs.separators), ntokens(rhs.ntokens) { reset(); } Tokenizer::Tokenizer(Tokenizer &&rhs) : text(std::move(rhs.text)), separators(std::move(rhs.separators)), ntokens(rhs.ntokens) { reset(); } Tokenizer &Tokenizer::operator=(const Tokenizer &other) { Tokenizer tmp(other); swap(tmp); return *this; } Tokenizer &Tokenizer::operator=(Tokenizer &&other) { Tokenizer tmp(std::move(other)); swap(tmp); return *this; } void Tokenizer::swap(Tokenizer &other) { std::swap(text, other.text); std::swap(separators, other.separators); std::swap(start, other.start); std::swap(ntokens, other.ntokens); } /*! Re-position the tokenizer state to the first word, * i.e. the first non-separator character */ void Tokenizer::reset() { start = text.find_first_not_of(separators); } /*! Search the text to be processed for a sub-string. * * This method does a generic sub-string match. * * \param str string to be searched for * \return true if string was found, false if not */ bool Tokenizer::contains(const std::string &str) const { return text.find(str) != std::string::npos; } /*! Search the text to be processed for regular expression match. * \verbatim embed:rst This method matches the current string against a regular expression using the :cpp:func:`utils::strmatch() ` function. 
\endverbatim * * \param str regular expression to be matched against * \return true if string was found, false if not */ bool Tokenizer::matches(const std::string &str) const { return utils::strmatch(text, str); } /*! Skip over a given number of tokens * * \param n number of tokens to skip over */ void Tokenizer::skip(int n) { for (int i = 0; i < n; ++i) { if (!has_next()) throw TokenizerException("No more tokens", ""); size_t end = text.find_first_of(separators, start); if (end == std::string::npos) { start = end; } else { start = text.find_first_not_of(separators, end + 1); } } } /*! Indicate whether more tokens are available * * \return true if there are more tokens, false if not */ bool Tokenizer::has_next() const { return start != std::string::npos; } /*! Retrieve next token. * * \return string with the next token */ std::string Tokenizer::next() { if (!has_next()) throw TokenizerException("No more tokens", ""); size_t end = text.find_first_of(separators, start); if (end == std::string::npos) { std::string token = text.substr(start); start = end; return token; } std::string token = text.substr(start, end - start); start = text.find_first_not_of(separators, end + 1); return token; } /*! Count number of tokens in text. * * \return number of counted tokens */ size_t Tokenizer::count() { // lazy evaluation if (ntokens == std::string::npos) { ntokens = utils::count_words(text, separators); } return ntokens; } /*! Retrieve the entire text converted to an STL vector of tokens. * * \return The STL vector */ std::vector Tokenizer::as_vector() { // store current state size_t current = start; reset(); // generate vector std::vector tokens; while (has_next()) { tokens.emplace_back(next()); } // restore state start = current; return tokens; } /*! 
Class for reading text with numbers * \verbatim embed:rst *See also* :cpp:class:`Tokenizer` \endverbatim * * \param str String to be processed * \param separators String with separator characters (default: " \t\r\n\f") * * \see Tokenizer InvalidIntegerException InvalidFloatException */ ValueTokenizer::ValueTokenizer(const std::string &str, const std::string &separators) : tokens(str, separators) { } ValueTokenizer::ValueTokenizer(ValueTokenizer &&rhs) : tokens(std::move(rhs.tokens)) {} ValueTokenizer &ValueTokenizer::operator=(const ValueTokenizer &other) { ValueTokenizer tmp(other); swap(tmp); return *this; } ValueTokenizer &ValueTokenizer::operator=(ValueTokenizer &&other) { ValueTokenizer tmp(std::move(other)); swap(tmp); return *this; } void ValueTokenizer::swap(ValueTokenizer &other) { std::swap(tokens, other.tokens); } /*! Indicate whether more tokens are available * * \return true if there are more tokens, false if not */ bool ValueTokenizer::has_next() const { return tokens.has_next(); } /*! Search the text to be processed for a sub-string. * * This method does a generic sub-string match. * * \param value string with value to be searched for * \return true if string was found, false if not */ bool ValueTokenizer::contains(const std::string &value) const { return tokens.contains(value); } /*! Search the text to be processed for regular expression match. * \verbatim embed:rst This method matches the current string against a regular expression using the :cpp:func:`utils::strmatch() ` function. \endverbatim * * \param str regular expression to be matched against * \return true if string was found, false if not */ bool ValueTokenizer::matches(const std::string &str) const { return tokens.matches(str); } /*! Retrieve next token * * \return string with next token */ std::string ValueTokenizer::next_string() { return tokens.next(); } /*! 
Retrieve next token and convert to int * * \return value of next token */ int ValueTokenizer::next_int() { std::string current = tokens.next(); try { std::size_t end; auto val = std::stoi(current, &end); // only partially converted if (current.size() != end) { throw InvalidIntegerException(current); } return val; // rethrow exceptions from std::stoi() } catch (std::out_of_range const &) { throw InvalidIntegerException(current); } catch (std::invalid_argument const &) { throw InvalidIntegerException(current); } return 0; } /*! Retrieve next token and convert to bigint * * \return value of next token */ bigint ValueTokenizer::next_bigint() { std::string current = tokens.next(); try { std::size_t end; auto val = std::stoll(current, &end, 10); // only partially converted if (current.size() != end) { throw InvalidIntegerException(current); } // out of range if ((val < (-MAXBIGINT - 1) || (val > MAXBIGINT))) { throw InvalidIntegerException(current); }; return (bigint) val; // rethrow exceptions from std::stoll() } catch (std::out_of_range const &) { throw InvalidIntegerException(current); } catch (std::invalid_argument const &) { throw InvalidIntegerException(current); } return 0; } /*! Retrieve next token and convert to tagint * * \return value of next token */ tagint ValueTokenizer::next_tagint() { std::string current = tokens.next(); try { std::size_t end; auto val = std::stoll(current, &end, 10); // only partially converted if (current.size() != end) { throw InvalidIntegerException(current); } // out of range if ((val < (-MAXTAGINT - 1) || (val > MAXTAGINT))) { throw InvalidIntegerException(current); } return (tagint) val; // rethrow exceptions from std::stoll() } catch (std::out_of_range const &) { throw InvalidIntegerException(current); } catch (std::invalid_argument const &) { throw InvalidIntegerException(current); } return 0; } /*! 
Retrieve next token and convert to double * * \return value of next token */ double ValueTokenizer::next_double() { std::string current = tokens.next(); try { std::size_t end; auto val = std::stod(current, &end); // only partially converted if (current.size() != end) { throw InvalidFloatException(current); } return val; // rethrow exceptions from std::stod() } catch (std::out_of_range const &) { throw InvalidFloatException(current); } catch (std::invalid_argument const &) { throw InvalidFloatException(current); } return 0.0; } /*! Skip over a given number of tokens * * \param n number of tokens to skip over */ void ValueTokenizer::skip(int n) { tokens.skip(n); } /*! Count number of tokens in text. * * \return number of counted tokens */ size_t ValueTokenizer::count() { return tokens.count(); }