From d41927b056af8db063761ca5da938831d3d250c3 Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Fri, 15 May 2020 15:36:13 -0400
Subject: [PATCH] Add Tokenizer class

---
 src/tokenizer.cpp                 | 53 +++++++++++++++++++++++++++
 src/tokenizer.h                   | 42 +++++++++++++++++++++
 unittest/CMakeLists.txt           |  2 +
 unittest/utils/CMakeLists.txt     |  3 ++
 unittest/utils/test_tokenizer.cpp | 61 +++++++++++++++++++++++++++++++
 5 files changed, 161 insertions(+)
 create mode 100644 src/tokenizer.cpp
 create mode 100644 src/tokenizer.h
 create mode 100644 unittest/utils/CMakeLists.txt
 create mode 100644 unittest/utils/test_tokenizer.cpp

diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
new file mode 100644
index 0000000000..e8099743d0
--- /dev/null
+++ b/src/tokenizer.cpp
@@ -0,0 +1,53 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Richard Berger (Temple U)
+------------------------------------------------------------------------- */
+
+#include "tokenizer.h"
+
+using namespace LAMMPS_NS;
+
+Tokenizer::Tokenizer(const std::string & str, const std::string & separators) {
+  size_t end = -1;
+
+  do {
+    size_t start = str.find_first_not_of(separators, end + 1);
+    if(start == std::string::npos) break;
+
+    end = str.find_first_of(separators, start);
+
+    if(end == std::string::npos) {
+      tokens.push_back(str.substr(start));
+    } else {
+      tokens.push_back(str.substr(start, end-start));
+    }
+  } while(end != std::string::npos);
+}
+
+Tokenizer::iterator Tokenizer::begin() {
+  return tokens.begin();
+}
+
+Tokenizer::iterator Tokenizer::end() {
+  return tokens.end();
+}
+
+const std::string & Tokenizer::operator[](size_t index) {
+  return tokens[index];
+}
+
+size_t Tokenizer::count() const {
+  return tokens.size();
+}
\ No newline at end of file
diff --git a/src/tokenizer.h b/src/tokenizer.h
new file mode 100644
index 0000000000..855b21d7f2
--- /dev/null
+++ b/src/tokenizer.h
@@ -0,0 +1,42 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Richard Berger (Temple U)
+------------------------------------------------------------------------- */
+
+#ifndef LMP_TOKENIZER_H
+#define LMP_TOKENIZER_H
+
+#include <string>
+#include <vector>
+
+namespace LAMMPS_NS {
+
+class Tokenizer {
+  std::vector<std::string> tokens;
+public:
+  typedef std::vector<std::string>::iterator iterator;
+
+  Tokenizer(const std::string & str, const std::string & separators = " \t\r\n\f");
+
+  iterator begin();
+  iterator end();
+
+  const std::string & operator[](size_t index);
+  size_t count() const;
+};
+
+}
+
+#endif
diff --git a/unittest/CMakeLists.txt b/unittest/CMakeLists.txt
index 2fb6500a9f..2d1663272f 100644
--- a/unittest/CMakeLists.txt
+++ b/unittest/CMakeLists.txt
@@ -1,3 +1,5 @@
 include(GTest)
 
 add_subdirectory(force-styles)
+
+add_subdirectory(utils)
diff --git a/unittest/utils/CMakeLists.txt b/unittest/utils/CMakeLists.txt
new file mode 100644
index 0000000000..1185a36345
--- /dev/null
+++ b/unittest/utils/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_executable(test_tokenizer test_tokenizer.cpp)
+target_link_libraries(test_tokenizer PRIVATE lammps GTest::GMockMain GTest::GMock GTest::GTest)
+add_test(Tokenizer test_tokenizer)
diff --git a/unittest/utils/test_tokenizer.cpp b/unittest/utils/test_tokenizer.cpp
new file mode 100644
index 0000000000..04e942c9f2
--- /dev/null
+++ b/unittest/utils/test_tokenizer.cpp
@@ -0,0 +1,61 @@
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+#include "tokenizer.h"
+
+using namespace LAMMPS_NS;
+using ::testing::Eq;
+
+TEST(Tokenizer, empty_string) {
+  Tokenizer t("", " ");
+  ASSERT_EQ(t.count(), 0);
+}
+
+TEST(Tokenizer, whitespace_only) {
+  Tokenizer t(" ", " ");
+  ASSERT_EQ(t.count(), 0);
+}
+
+TEST(Tokenizer, single_word) {
+  Tokenizer t("test", " ");
+  ASSERT_EQ(t.count(), 1);
+}
+
+TEST(Tokenizer, two_words) {
+  Tokenizer t("test word", " ");
+  ASSERT_EQ(t.count(), 2);
+}
+
+TEST(Tokenizer, prefix_separators) {
+  Tokenizer t(" test word", " ");
+  ASSERT_EQ(t.count(), 2);
+}
+
+TEST(Tokenizer, postfix_separators) {
+  Tokenizer t("test word ", " ");
+  ASSERT_EQ(t.count(), 2);
+}
+
+TEST(Tokenizer, iterate_words) {
+  Tokenizer t(" test word ", " ");
+  ASSERT_THAT(t[0], Eq("test"));
+  ASSERT_THAT(t[1], Eq("word"));
+  ASSERT_EQ(t.count(), 2);
+}
+
+TEST(Tokenizer, default_separators) {
+  Tokenizer t(" \r\n test \t word \f");
+  ASSERT_THAT(t[0], Eq("test"));
+  ASSERT_THAT(t[1], Eq("word"));
+  ASSERT_EQ(t.count(), 2);
+}
+
+TEST(Tokenizer, for_loop) {
+  Tokenizer t(" \r\n test \t word \f");
+  std::vector<std::string> list;
+
+  for(auto word : t) {
+    list.push_back(word);
+  }
+  ASSERT_THAT(list[0], Eq("test"));
+  ASSERT_THAT(list[1], Eq("word"));
+}
\ No newline at end of file