Sans Pareil Technologies, Inc.

Key To Your Business

Lab Exercise 6 - String


We will implement a simple string tokeniser as well as a stream tokeniser and run a suite of tests to verify that our implementation works as expected. We will test both positive and negative scenarios to validate our implementation.
Tokeniser.h 

Create a header file that declares the string tokenise function as well as a template stream split function. To help with debugging we will also define a print function that writes the contents of a std::vector to standard output.

#pragma once
// <algorithm> and <iterator> are required for std::copy and
// std::ostream_iterator used by print below; relying on transitive
// includes from <iostream> is non-portable.
#include <algorithm>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>

namespace csc240
{
  /// Convenience alias for the token list returned by tokenise.
  using Strings = std::vector<std::string>;

  /**
   * Split text into tokens separated by any of the characters in
   * delimiters.  Runs of consecutive delimiters are collapsed, so no
   * empty tokens are produced.  Implemented in Tokeniser.cpp.
   */
  Strings tokenise( const std::string& text, const std::string& delimiters = " " );

  /**
   * Split whitespace separated text into values of type T using
   * stream extraction.  Extraction stops at the first token that
   * cannot be parsed as a T, so the result may be shorter than the
   * number of whitespace separated fields in text.
   */
  template <typename T>
  std::vector<T> split( const std::string& text )
  {
    std::vector<T> tokens;
    std::stringstream ss{ text };

    T value;
    while ( ss >> value ) tokens.push_back( value );

    return tokens;
  }

  /// Debug helper - writes each token to stdout followed by ", ".
  template <typename T>
  void print( const std::vector<T>& tokens )
  {
    std::copy( tokens.cbegin(), tokens.cend(),
      std::ostream_iterator<T>( std::cout, ", " ) );
    std::cout << std::endl;  // endl flushes, which is desirable for debug output
  }
}
Tokeniser.cpp 

Create a new source file to implement the tokenise function.

#include "Tokeniser.h"

using csc240::Strings;

// Collect the maximal runs of non-delimiter characters in text.
// Consecutive delimiters are collapsed; leading/trailing delimiters
// never yield empty tokens.
Strings csc240::tokenise( const std::string& text, const std::string& delimiters )
{
  Strings parts;

  // begin marks the start of the current token; npos means no more tokens.
  auto begin = text.find_first_not_of( delimiters );

  while ( std::string::npos != begin )
  {
    // end is the first delimiter after the token (or npos at end of text).
    const auto end = text.find_first_of( delimiters, begin );

    // substr clamps an over-long count, so end == npos copies to the end.
    parts.emplace_back( text.substr( begin, end - begin ) );

    begin = text.find_first_not_of( delimiters, end );
  }

  return parts;
}
StringTokeniser.cpp 

Create a new source file to hold the unit test suite for the tokenise function. Note that we could have added the test code straight into the Tokeniser.cpp file as well. Catch has a preprocessor switch that can be used to suppress the test code from being included in the release build of an application.

#include "catch.hpp"
#include "Tokeniser.h"

using csc240::Strings;
using csc240::tokenise;
using csc240::print;

SCENARIO( "Tokenise strings using default delimiter" )
{
  GIVEN( "Text containing delimiters" )
  {
    const std::string text{ "A random sentence with some words." };

    THEN( "csc240::tokenise splits the text into words" )
    {
      const auto tokens = tokenise( text );
      // Compare against the full expected token list in one shot.
      const Strings expected{ "A", "random", "sentence", "with", "some", "words." };
      REQUIRE( tokens == expected );
      print( tokens );
    }
  }

  GIVEN( "Text without default delimiters" )
  {
    const std::string text{ "A-really-long-sentence-with-some-words." };

    THEN( "csc240::tokenise does not split the text into words" )
    {
      const auto tokens = tokenise( text );
      // No space in the input, so the whole text comes back as one token.
      REQUIRE( tokens == Strings{ text } );
    }
  }
}

SCENARIO( "Tokenise strings using custom delimiter" )
{
  GIVEN( "Text containing a custom delimiter" )
  {
    const std::string text{ "A-really-long-sentence-with-some-words." };

    THEN( "csc240::tokenise splits the text into parts" )
    {
      const auto tokens = tokenise( text, "-" );
      // Splitting on '-' yields one token per hyphen separated word.
      const Strings expected{ "A", "really", "long", "sentence", "with", "some", "words." };
      REQUIRE( tokens == expected );
      print( tokens );
    }
  }

  GIVEN( "Text without custom delimiter" )
  {
    const std::string text{ "A random sentence with some words." };

    THEN( "csc240::tokenise does not split the text into words" )
    {
      const auto tokens = tokenise( text, "-" );
      // No hyphen in the input, so the whole text is a single token.
      REQUIRE( tokens == Strings{ text } );
    }
  }
}

SCENARIO( "Tokenise strings using multiple delimiters" )
{
  GIVEN( "Text containing a variety of delimiters" )
  {
    const std::string text{ "A random:sentence, with|some words." };

    THEN( "csc240::tokenise splits the text into parts at each delimiter" )
    {
      const auto tokens = tokenise( text, " |:,." );
      // Every character from the delimiter set splits the text; the
      // trailing '.' is consumed so "words" has no punctuation.
      const Strings expected{ "A", "random", "sentence", "with", "some", "words" };
      REQUIRE( tokens == expected );
    }
  }

  GIVEN( "Text without custom delimiters" )
  {
    const std::string text{ "A-random-sentence-with-some-words" };

    THEN( "csc240::tokenise does not split the text into words" )
    {
      const auto tokens = tokenise( text, " ,." );
      // None of the delimiters occur, so the input survives intact.
      REQUIRE( tokens == Strings{ text } );
    }
  }
}
StreamTokeniser.cpp 

Create a new source file to hold the test suite for the stream split function. We only test splitting into fundamental types such as int and double. The csc240::tokenise function is much more powerful for splitting strings and hence is not included in this test suite.

#include "catch.hpp"
#include "Tokeniser.h"

using csc240::split;
using csc240::print;

SCENARIO( "Tokenise string as a stream" )
{
  GIVEN( "A string with int values" )
  {
    const std::string text{ "1 2 3 4 5 6 7 8 9 10" };

    THEN( "csc240::split converts string into vector<int>" )
    {
      const auto tokens = split<int>( text );
      const std::vector<int> expected{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
      REQUIRE( tokens == expected );
    }
  }

  GIVEN( "A string with int values separated by comma" )
  {
    const std::string text{ "1,2,3,4,5,6,7,8,9,10" };

    THEN( "csc240::split extracts first value" )
    {
      // There is no whitespace, so operator>> reads "1", then fails on
      // the comma and extraction stops after a single value.
      const auto tokens = split<int>( text );
      REQUIRE( tokens == std::vector<int>{ 1 } );
      print( tokens );
    }
  }

  GIVEN( "A string with double values" )
  {
    const std::string text{ "1.1 2.2 3.3 4.4 5.5 6.6 7.7 8.8 9.9" };

    THEN( "csc240::split converts string into vector<double>")
    {
      // Stream extraction produces the nearest double to each decimal
      // literal, so comparing against the same literals is exact.
      const auto tokens = split<double>( text );
      const std::vector<double> expected{ 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9 };
      REQUIRE( tokens == expected );
      print( tokens );
    }
  }

  GIVEN( "A string with bool values" )
  {
    const std::string text{ "1 1 0 1 0 0" };

    THEN( "csc240::split converts string into vector<bool>" )
    {
      const auto tokens = split<bool>( text );
      print( tokens );
      const std::vector<bool> expected{ true, true, false, true, false, false };
      REQUIRE( tokens == expected );
    }
  }
}
Run the project and verify that all the test scenarios passed. Note that for the test case involving stream tokenising of the input string, we cannot pass in `true/false` values, but have to use the numeric equivalents.