Performance optimization of Tokenizer
Reduces string allocations and removes std::vector from Tokenizer. Most processing now happens on demand.
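As a quick orientation before the diff, a minimal usage sketch of the reworked on-demand API (not part of the commit; it assumes only the Tokenizer interface shown in the header hunk below, with illustrative main()/printf scaffolding):

    #include "tokenizer.h"   // assumed location within the LAMMPS source tree
    #include <cstdio>
    #include <vector>

    using namespace LAMMPS_NS;

    int main() {
        // Tokens are produced lazily from the stored string; no
        // std::vector<std::string> is materialized up front.
        Tokenizer words("  one two three  ");
        while (words.has_next())
            printf("%s\n", words.next().c_str());

        // count() is computed lazily (and cached), while as_vector()
        // materializes the full token list only for callers that need it.
        Tokenizer fields("a b c", " ");
        printf("%zu tokens\n", fields.count());
        std::vector<std::string> all = fields.as_vector();
        return 0;
    }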
@@ -2448,11 +2448,11 @@ int AtomVec::process_fields(char *str, const char *default_str, Method *method)
   }

   // tokenize words in both strings
-  Tokenizer words(str, " ");
-  Tokenizer def_words(default_str, " ");
+  std::vector<std::string> words = Tokenizer(str, " ").as_vector();
+  std::vector<std::string> def_words = Tokenizer(default_str, " ").as_vector();

-  int nfield = words.count();
-  int ndef = def_words.count();
+  int nfield = words.size();
+  int ndef = def_words.size();

   // process fields one by one, add to index vector

@@ -514,8 +514,8 @@ char *AtomVecHybrid::merge_fields(int inum, char *root,

   // identify unique words in concatenated string

-  Tokenizer words(concat, " ");
-  int nwords = words.count();
+  std::vector<std::string> words = Tokenizer(concat, " ").as_vector();
+  int nwords = words.size();

   int *unique = new int[nwords];

@@ -43,7 +43,7 @@ namespace LAMMPS_NS
   void skip_line();
   char * next_line(int nparams = 0);
   void next_dvector(double * list, int n);
-  ValueTokenizer next_values(int nparams, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS);
+  ValueTokenizer next_values(int nparams, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);

   // convenience functions
   double next_double();
@@ -42,7 +42,7 @@ namespace LAMMPS_NS
   char * next_line(int nparams = 0);

   void next_dvector(double * list, int n);
-  ValueTokenizer next_values(int nparams, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS);
+  ValueTokenizer next_values(int nparams, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);
 };

 class FileReaderException : public std::exception {
@@ -17,77 +17,118 @@

 #include "tokenizer.h"
 #include "utils.h"
+#include "fmt/format.h"

 using namespace LAMMPS_NS;

-Tokenizer::Tokenizer(const std::string & str, const std::string & seperators) {
-  size_t end = -1;
-
-  do {
-    size_t start = str.find_first_not_of(seperators, end + 1);
-    if(start == std::string::npos) break;
-
-    end = str.find_first_of(seperators, start);
-
-    if(end == std::string::npos) {
-      tokens.push_back(str.substr(start));
-    } else {
-      tokens.push_back(str.substr(start, end-start));
-    }
-  } while(end != std::string::npos);
-}
-
-Tokenizer::Tokenizer(const Tokenizer & rhs) : tokens(rhs.tokens) {
-}
-
-Tokenizer::Tokenizer(Tokenizer && rhs) : tokens(std::move(rhs.tokens)) {
-}
-
-Tokenizer::iterator Tokenizer::begin() {
-  return tokens.begin();
-}
-
-Tokenizer::iterator Tokenizer::end() {
-  return tokens.end();
-}
-
-Tokenizer::const_iterator Tokenizer::cbegin() const {
-  return tokens.cbegin();
-}
-
-Tokenizer::const_iterator Tokenizer::cend() const {
-  return tokens.cend();
-}
-
-std::string & Tokenizer::operator[](size_t index) {
-  return tokens[index];
-}
-
-size_t Tokenizer::count() const {
-  return tokens.size();
-}
+TokenizerException::TokenizerException(const std::string & msg, const std::string & token){
+  if(token.empty()) {
+    message = msg;
+  } else {
+    message = fmt::format("{}: '{}'", msg, token);
+  }
+}
+
+Tokenizer::Tokenizer(const std::string & str, const std::string & separators) :
+  text(str), separators(separators), start(0), ntokens(std::string::npos)
+{
+  reset();
+}
+
+Tokenizer::Tokenizer(const Tokenizer & rhs) :
+  text(rhs.text), separators(rhs.separators), ntokens(rhs.ntokens)
+{
+  reset();
+}
+
+Tokenizer::Tokenizer(Tokenizer && rhs) :
+  text(std::move(rhs.text)), separators(std::move(rhs.separators)), ntokens(rhs.ntokens)
+{
+  reset();
+}
+
+void Tokenizer::reset() {
+  start = text.find_first_not_of(separators);
+}
+
+void Tokenizer::skip(int n) {
+  for(int i = 0; i < n; ++i) {
+    if(!has_next()) throw TokenizerException("No more tokens", "");
+
+    size_t end = text.find_first_of(separators, start);
+
+    if(end == std::string::npos) {
+      start = end;
+    } else {
+      start = text.find_first_not_of(separators, end+1);
+    }
+  }
+}
+
+bool Tokenizer::has_next() const {
+  return start != std::string::npos;
+}
+
+std::string Tokenizer::next() {
+  if(!has_next()) throw TokenizerException("No more tokens", "");
+
+  size_t end = text.find_first_of(separators, start);
+
+  if(end == std::string::npos) {
+    std::string token = text.substr(start);
+    start = end;
+    return token;
+  }
+
+  std::string token = text.substr(start, end-start);
+  start = text.find_first_not_of(separators, end+1);
+  return token;
+}
+
+size_t Tokenizer::count() {
+  // lazy evaluation
+  if (ntokens == std::string::npos) {
+    ntokens = utils::count_words(text, separators);
+  }
+  return ntokens;
+}
+
+std::vector<std::string> Tokenizer::as_vector() {
+  // store current state
+  size_t current = start;
+
+  reset();
+
+  // generate vector
+  std::vector<std::string> tokens;
+
+  while(has_next()) {
+    tokens.emplace_back(next());
+  }
+
+  // restore state
+  start = current;
+
+  return tokens;
+}

-ValueTokenizer::ValueTokenizer(const std::string & str, const std::string & seperators) : tokens(str, seperators) {
-  current = tokens.begin();
+ValueTokenizer::ValueTokenizer(const std::string & str, const std::string & separators) : tokens(str, separators) {
 }

 ValueTokenizer::ValueTokenizer(const ValueTokenizer & rhs) : tokens(rhs.tokens) {
-  current = tokens.begin();
 }

 ValueTokenizer::ValueTokenizer(ValueTokenizer && rhs) : tokens(std::move(rhs.tokens)) {
-  current = tokens.begin();
 }

 bool ValueTokenizer::has_next() const {
-  return current != tokens.cend();
+  return tokens.has_next();
 }

 std::string ValueTokenizer::next_string() {
   if (has_next()) {
-    std::string value = *current;
-    ++current;
+    std::string value = tokens.next();
     return value;
   }
   return "";
@@ -95,11 +136,11 @@ std::string ValueTokenizer::next_string() {

 int ValueTokenizer::next_int() {
   if (has_next()) {
-    if(!utils::is_integer(*current)) {
-      throw InvalidIntegerException(*current);
+    std::string current = tokens.next();
+    if(!utils::is_integer(current)) {
+      throw InvalidIntegerException(current);
     }
-    int value = atoi(current->c_str());
-    ++current;
+    int value = atoi(current.c_str());
     return value;
   }
   return 0;
@@ -107,45 +148,44 @@ int ValueTokenizer::next_int() {

 bigint ValueTokenizer::next_bigint() {
   if (has_next()) {
-    if(!utils::is_integer(*current)) {
-      throw InvalidIntegerException(*current);
+    std::string current = tokens.next();
+    if(!utils::is_integer(current)) {
+      throw InvalidIntegerException(current);
     }
-    bigint value = ATOBIGINT(current->c_str());
-    ++current;
+    bigint value = ATOBIGINT(current.c_str());
     return value;
   }
   return 0;
 }

 tagint ValueTokenizer::next_tagint() {
-  if (current != tokens.end()) {
-    if(!utils::is_integer(*current)) {
-      throw InvalidIntegerException(*current);
+  if (has_next()) {
+    std::string current = tokens.next();
+    if(!utils::is_integer(current)) {
+      throw InvalidIntegerException(current);
     }
-    tagint value = ATOTAGINT(current->c_str());
-    ++current;
+    tagint value = ATOTAGINT(current.c_str());
     return value;
   }
   return 0;
 }

 double ValueTokenizer::next_double() {
-  if (current != tokens.end()) {
-    if(!utils::is_double(*current)) {
-      throw InvalidFloatException(*current);
+  if (has_next()) {
+    std::string current = tokens.next();
+    if(!utils::is_double(current)) {
+      throw InvalidFloatException(current);
     }
-    double value = atof(current->c_str());
-    ++current;
+    double value = atof(current.c_str());
     return value;
   }
   return 0.0;
 }

-void ValueTokenizer::skip(int ntokens) {
-  current = std::next(current, ntokens);
+void ValueTokenizer::skip(int n) {
+  tokens.skip(n);
 }

-size_t ValueTokenizer::count() const {
+size_t ValueTokenizer::count() {
   return tokens.count();
 }
@@ -25,34 +25,33 @@

 namespace LAMMPS_NS {

-#define TOKENIZER_DEFAULT_SEPERATORS " \t\r\n\f"
+#define TOKENIZER_DEFAULT_SEPARATORS " \t\r\n\f"

 class Tokenizer {
-  std::vector<std::string> tokens;
+  std::string text;
+  std::string separators;
+  size_t start;
+  size_t ntokens;
 public:
-  typedef std::vector<std::string>::iterator iterator;
-  typedef std::vector<std::string>::const_iterator const_iterator;
-
-  Tokenizer(const std::string & str, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS);
+  Tokenizer(const std::string & str, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);
   Tokenizer(Tokenizer &&);
   Tokenizer(const Tokenizer &);
   Tokenizer& operator=(const Tokenizer&) = default;
   Tokenizer& operator=(Tokenizer&&) = default;

-  iterator begin();
-  iterator end();
-  const_iterator cbegin() const;
-  const_iterator cend() const;
+  void reset();
+  void skip(int n);
+  bool has_next() const;
+  std::string next();

-  std::string & operator[](size_t index);
-  size_t count() const;
+  size_t count();
+  std::vector<std::string> as_vector();
 };

 class TokenizerException : public std::exception {
   std::string message;
 public:
-  TokenizerException(const std::string & msg, const std::string & token) : message(msg + ": '" + token + "'") {
-  }
+  TokenizerException(const std::string & msg, const std::string & token);

   ~TokenizerException() throw() {
   }
@@ -76,9 +75,8 @@ public:

 class ValueTokenizer {
   Tokenizer tokens;
-  Tokenizer::const_iterator current;
 public:
-  ValueTokenizer(const std::string & str, const std::string & seperators = TOKENIZER_DEFAULT_SEPERATORS);
+  ValueTokenizer(const std::string & str, const std::string & separators = TOKENIZER_DEFAULT_SEPARATORS);
   ValueTokenizer(const ValueTokenizer &);
   ValueTokenizer(ValueTokenizer &&);
   ValueTokenizer& operator=(const ValueTokenizer&) = default;
@@ -91,9 +89,9 @@ public:
   double next_double();

   bool has_next() const;
-  void skip(int ntokens);
+  void skip(int n);

-  size_t count() const;
+  size_t count();
 };

@@ -369,8 +369,20 @@ std::string utils::trim_comment(const std::string & line) {
 ------------------------------------------------------------------------- */

 size_t utils::count_words(const std::string & text, const std::string & seperators) {
-  ValueTokenizer words(text, seperators);
-  return words.count();
+  size_t count = 0;
+  size_t start = text.find_first_not_of(seperators);
+
+  while (start != std::string::npos) {
+    size_t end = text.find_first_of(seperators, start);
+    ++count;
+
+    if(end == std::string::npos) {
+      return count;
+    } else {
+      start = text.find_first_not_of(seperators, end + 1);
+    }
+  }
+  return count;
 }

 /* ----------------------------------------------------------------------
@@ -50,25 +50,21 @@ TEST(Tokenizer, postfix_seperators) {

 TEST(Tokenizer, iterate_words) {
   Tokenizer t(" test word ", " ");
-  ASSERT_THAT(t[0], Eq("test"));
-  ASSERT_THAT(t[1], Eq("word"));
+  ASSERT_THAT(t.next(), Eq("test"));
+  ASSERT_THAT(t.next(), Eq("word"));
   ASSERT_EQ(t.count(), 2);
 }

 TEST(Tokenizer, default_seperators) {
   Tokenizer t(" \r\n test \t word \f");
-  ASSERT_THAT(t[0], Eq("test"));
-  ASSERT_THAT(t[1], Eq("word"));
+  ASSERT_THAT(t.next(), Eq("test"));
+  ASSERT_THAT(t.next(), Eq("word"));
   ASSERT_EQ(t.count(), 2);
 }

-TEST(Tokenizer, for_loop) {
+TEST(Tokenizer, as_vector) {
   Tokenizer t(" \r\n test \t word \f");
-  std::vector<std::string> list;
-
-  for(auto word : t) {
-    list.push_back(word);
-  }
+  std::vector<std::string> list = t.as_vector();
   ASSERT_THAT(list[0], Eq("test"));
   ASSERT_THAT(list[1], Eq("word"));
 }
@@ -32,6 +32,10 @@ TEST(Utils, trim_and_count_words) {
   ASSERT_EQ(utils::trim_and_count_words("some text # comment"), 2);
 }

+TEST(Utils, count_words_with_extra_spaces) {
+  ASSERT_EQ(utils::count_words(" some text # comment "), 4);
+}
+
 TEST(Utils, valid_integer1) {
   ASSERT_TRUE(utils::is_integer("10"));
 }
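For completeness, a similarly hedged sketch of how the reworked ValueTokenizer is consumed (the input string is hypothetical; only calls declared in the hunks above are used):

    #include "tokenizer.h"
    #include <cstdio>

    using namespace LAMMPS_NS;

    int main() {
        // The stored iterator pair is gone; ValueTokenizer now pulls
        // each token on demand from the underlying Tokenizer.
        ValueTokenizer values("atom 42 1.5");
        std::string name = values.next_string();  // "atom"
        int id = values.next_int();               // 42; throws InvalidIntegerException on malformed input
        double mass = values.next_double();       // 1.5; throws InvalidFloatException on malformed input
        printf("%s %d %g\n", name.c_str(), id, mass);
        return 0;
    }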