422 lines
12 KiB
C++
422 lines
12 KiB
C++
// string_token.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
#ifndef LEXERTL_STRING_TOKEN_HPP
|
|
#define LEXERTL_STRING_TOKEN_HPP
|
|
|
|
#include "char_traits.hpp"
#include <cstddef>
#include <ios> // Needed by GCC 4.4
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
|
|
|
|
namespace lexertl
|
|
{
|
|
template<typename ch_type>
|
|
struct basic_string_token
|
|
{
|
|
typedef ch_type char_type;
|
|
typedef basic_char_traits<char_type> char_traits;
|
|
typedef typename char_traits::index_type index_type;
|
|
typedef std::pair<index_type, index_type> range;
|
|
typedef std::vector<range> range_vector;
|
|
typedef std::basic_string<char_type> string;
|
|
typedef basic_string_token<char_type> string_token;
|
|
|
|
range_vector _ranges;
|
|
|
|
basic_string_token () :
|
|
_ranges ()
|
|
{
|
|
}
|
|
|
|
basic_string_token (char_type ch_) :
|
|
_ranges ()
|
|
{
|
|
insert (range (ch_, ch_));
|
|
}
|
|
|
|
basic_string_token (char_type first_, char_type second_) :
|
|
_ranges ()
|
|
{
|
|
insert (range (first_, second_));
|
|
}
|
|
|
|
void clear ()
|
|
{
|
|
_ranges.clear ();
|
|
}
|
|
|
|
bool empty () const
|
|
{
|
|
return _ranges.empty ();
|
|
}
|
|
|
|
bool any () const
|
|
{
|
|
return _ranges.size () == 1 && _ranges.front ().first == 0 &&
|
|
_ranges.front ().second == char_traits::max_val ();
|
|
}
|
|
|
|
bool operator < (const basic_string_token &rhs_) const
|
|
{
|
|
return _ranges < rhs_._ranges;
|
|
}
|
|
|
|
bool operator == (const basic_string_token &rhs_) const
|
|
{
|
|
return _ranges == rhs_._ranges;
|
|
}
|
|
|
|
bool negatable () const
|
|
{
|
|
std::size_t size_ = 0;
|
|
typename range_vector::const_iterator iter_ = _ranges.begin ();
|
|
typename range_vector::const_iterator end_ = _ranges.end ();
|
|
|
|
for (; iter_ != end_; ++iter_)
|
|
{
|
|
size_ += static_cast<std::size_t>(iter_->second) + 1 -
|
|
static_cast<std::size_t>(iter_->first);
|
|
}
|
|
|
|
return size_ > static_cast<std::size_t>(char_traits::max_val ()) / 2;
|
|
}
|
|
|
|
void swap (basic_string_token &rhs_)
|
|
{
|
|
_ranges.swap (rhs_._ranges);
|
|
}
|
|
|
|
void insert (const basic_string_token &rhs_)
|
|
{
|
|
typename range_vector::const_iterator iter_ = rhs_._ranges.begin ();
|
|
typename range_vector::const_iterator end_ = rhs_._ranges.end ();
|
|
|
|
for (; iter_ != end_; ++iter_)
|
|
{
|
|
insert (*iter_);
|
|
}
|
|
}
|
|
|
|
// Deliberately pass by value - may modify
|
|
typename range_vector::iterator insert (range rhs_)
|
|
{
|
|
bool insert_ = true;
|
|
typename range_vector::iterator iter_ = _ranges.begin ();
|
|
typename range_vector::const_iterator end_ = _ranges.end ();
|
|
|
|
while (iter_ != end_)
|
|
{
|
|
// follows current item
|
|
if (rhs_.first > iter_->second)
|
|
{
|
|
if (rhs_.first == iter_->second + 1)
|
|
{
|
|
// Auto normalise
|
|
rhs_.first = iter_->first;
|
|
}
|
|
else
|
|
{
|
|
// No intersection, consider next
|
|
++iter_;
|
|
continue;
|
|
}
|
|
}
|
|
// Precedes current item
|
|
else if (rhs_.second < iter_->first)
|
|
{
|
|
if (rhs_.second == iter_->first - 1)
|
|
{
|
|
// Auto normalise
|
|
rhs_.second = iter_->second;
|
|
}
|
|
else
|
|
{
|
|
// insert here
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// overlap (under)
|
|
if (rhs_.first < iter_->first)
|
|
{
|
|
if (rhs_.second < iter_->second)
|
|
{
|
|
rhs_.second = iter_->second;
|
|
}
|
|
}
|
|
// overlap (over)
|
|
else if (rhs_.second > iter_->second)
|
|
{
|
|
if (rhs_.first > iter_->first)
|
|
{
|
|
rhs_.first = iter_->first;
|
|
}
|
|
}
|
|
// subset
|
|
else
|
|
{
|
|
insert_ = false;
|
|
iter_ = _ranges.end ();
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Code minimisation: this always applies unless we have already
|
|
// exited the loop, or "continue" executed.
|
|
iter_ = _ranges.erase (iter_);
|
|
end_ = _ranges.end ();
|
|
}
|
|
|
|
if (insert_)
|
|
{
|
|
iter_ = _ranges.insert(iter_, rhs_);
|
|
}
|
|
|
|
return iter_;
|
|
}
|
|
|
|
void negate ()
|
|
{
|
|
index_type next_ = 0;
|
|
const index_type max_ = char_traits::max_val ();
|
|
string_token temp_;
|
|
typename range_vector::iterator iter_ = _ranges.begin ();
|
|
typename range_vector::const_iterator end_ = _ranges.end ();
|
|
bool finished_ = false;
|
|
|
|
for (; iter_ != end_; ++iter_)
|
|
{
|
|
if (next_ < iter_->first)
|
|
{
|
|
temp_.insert (range (next_, iter_->first - 1));
|
|
}
|
|
|
|
if (iter_->second < max_)
|
|
{
|
|
next_ = iter_->second + 1;
|
|
}
|
|
else
|
|
{
|
|
finished_ = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!finished_)
|
|
{
|
|
temp_.insert (range (next_, max_));
|
|
}
|
|
|
|
swap (temp_);
|
|
}
|
|
|
|
void intersect (basic_string_token &rhs_, basic_string_token &overlap_)
|
|
{
|
|
typename range_vector::iterator lhs_iter_ = _ranges.begin ();
|
|
typename range_vector::const_iterator lhs_end_ = _ranges.end ();
|
|
typename range_vector::iterator rhs_iter_ = rhs_._ranges.begin ();
|
|
typename range_vector::const_iterator rhs_end_ = rhs_._ranges.end ();
|
|
|
|
while (lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_)
|
|
{
|
|
if (rhs_iter_->first > lhs_iter_->second)
|
|
{
|
|
++lhs_iter_;
|
|
}
|
|
else if (rhs_iter_->second < lhs_iter_->first)
|
|
{
|
|
++rhs_iter_;
|
|
}
|
|
else
|
|
{
|
|
range range_;
|
|
|
|
if (rhs_iter_->first > lhs_iter_->first)
|
|
{
|
|
range_.first = rhs_iter_->first;
|
|
}
|
|
else
|
|
{
|
|
range_.first = lhs_iter_->first;
|
|
}
|
|
|
|
if (rhs_iter_->second < lhs_iter_->second)
|
|
{
|
|
range_.second = rhs_iter_->second;
|
|
}
|
|
else
|
|
{
|
|
range_.second = lhs_iter_->second;
|
|
}
|
|
|
|
adjust (range_, *this, lhs_iter_, lhs_end_);
|
|
adjust (range_, rhs_, rhs_iter_, rhs_end_);
|
|
overlap_.insert (range_);
|
|
}
|
|
}
|
|
}
|
|
|
|
void remove (basic_string_token &rhs_)
|
|
{
|
|
typename range_vector::iterator lhs_iter_ = _ranges.begin ();
|
|
typename range_vector::const_iterator lhs_end_ = _ranges.end ();
|
|
typename range_vector::iterator rhs_iter_ = rhs_._ranges.begin ();
|
|
typename range_vector::const_iterator rhs_end_ = rhs_._ranges.end ();
|
|
|
|
while (lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_)
|
|
{
|
|
if (rhs_iter_->first > lhs_iter_->second)
|
|
{
|
|
++lhs_iter_;
|
|
}
|
|
else if (rhs_iter_->second < lhs_iter_->first)
|
|
{
|
|
++rhs_iter_;
|
|
}
|
|
else
|
|
{
|
|
range range_;
|
|
|
|
if (rhs_iter_->first > lhs_iter_->first)
|
|
{
|
|
range_.first = rhs_iter_->first;
|
|
}
|
|
else
|
|
{
|
|
range_.first = lhs_iter_->first;
|
|
}
|
|
|
|
if (rhs_iter_->second < lhs_iter_->second)
|
|
{
|
|
range_.second = rhs_iter_->second;
|
|
}
|
|
else
|
|
{
|
|
range_.second = lhs_iter_->second;
|
|
}
|
|
|
|
adjust (range_, *this, lhs_iter_, lhs_end_);
|
|
}
|
|
}
|
|
}
|
|
|
|
static string escape_char (const typename char_traits::index_type ch_)
|
|
{
|
|
string out_;
|
|
|
|
switch (ch_)
|
|
{
|
|
case '\0':
|
|
out_ += '\\';
|
|
out_ += '0';
|
|
break;
|
|
case '\a':
|
|
out_ += '\\';
|
|
out_ += 'a';
|
|
break;
|
|
case '\b':
|
|
out_ += '\\';
|
|
out_ += 'b';
|
|
break;
|
|
case 27:
|
|
out_ += '\\';
|
|
out_ += 'x';
|
|
out_ += '1';
|
|
out_ += 'b';
|
|
break;
|
|
case '\f':
|
|
out_ += '\\';
|
|
out_ += 'f';
|
|
break;
|
|
case '\n':
|
|
out_ += '\\';
|
|
out_ += 'n';
|
|
break;
|
|
case '\r':
|
|
out_ += '\\';
|
|
out_ += 'r';
|
|
break;
|
|
case '\t':
|
|
out_ += '\\';
|
|
out_ += 't';
|
|
break;
|
|
case '\v':
|
|
out_ += '\\';
|
|
out_ += 'v';
|
|
break;
|
|
case '\\':
|
|
out_ += '\\';
|
|
out_ += '\\';
|
|
break;
|
|
case '"':
|
|
out_ += '\\';
|
|
out_ += '"';
|
|
break;
|
|
case '\'':
|
|
out_ += '\\';
|
|
out_ += '\'';
|
|
break;
|
|
default:
|
|
{
|
|
if (ch_ < 32 || ch_ > 126)
|
|
{
|
|
std::basic_stringstream<char_type> ss_;
|
|
|
|
out_ += '\\';
|
|
out_ += 'x';
|
|
ss_ << std::hex <<
|
|
static_cast<std::size_t> (ch_);
|
|
out_ += ss_.str ();
|
|
}
|
|
else
|
|
{
|
|
out_ += ch_;
|
|
}
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
return out_;
|
|
}
|
|
|
|
private:
|
|
void adjust (const range &range_, basic_string_token &token_,
|
|
typename range_vector::iterator &iter_,
|
|
typename range_vector::const_iterator &end_)
|
|
{
|
|
if (range_.first > iter_->first)
|
|
{
|
|
const index_type second_ = iter_->second;
|
|
|
|
iter_->second = range_.first - 1;
|
|
|
|
if (range_.second < second_)
|
|
{
|
|
range new_range_ (range_.second + 1, second_);
|
|
|
|
iter_ = token_.insert (new_range_);
|
|
end_ = token_._ranges.end ();
|
|
}
|
|
}
|
|
else if (range_.second < iter_->second)
|
|
{
|
|
iter_->first = range_.second + 1;
|
|
}
|
|
else
|
|
{
|
|
iter_ = token_._ranges.erase (iter_);
|
|
end_ = token_._ranges.end ();
|
|
}
|
|
}
|
|
};
|
|
}
|
|
|
|
#endif
|