Fix typo in grammar.
Add lexertl.
This commit is contained in:
45
inc/lexertl/old/fast_filebuf.hpp
Normal file
45
inc/lexertl/old/fast_filebuf.hpp
Normal file
@@ -0,0 +1,45 @@
|
||||
// Quick hack...
|
||||
// If you find this really is faster than using std::ifstream, let me know
|
||||
// as I can always spend some more time to improve it.
|
||||
|
||||
#include <cstdio>
#include <streambuf>

namespace lexertl
{
// Read-only stream buffer that pulls characters straight from a C FILE
// handle with fread ().
//
// Only xsgetn () is overridden, so only bulk reads that go through
// sgetn () (e.g. std::istream::read) reach the file; underflow ()/uflow ()
// keep the std::basic_streambuf defaults.
//
// The file is opened in the constructor in text mode ("r") and closed in
// the destructor. If it cannot be opened, the buffer behaves like an empty
// source: every read returns 0 characters.
//
// NOTE(review): the class owns _fp but does not disable copying; if the
// base class permits a copy, both copies would close the same FILE.
template<typename CharT, class Traits>
class basic_fast_filebuf : public std::basic_streambuf<CharT, Traits>
{
public:
    // filename_: path handed to fopen (). A null pointer is treated like a
    // file that failed to open.
    basic_fast_filebuf (const char *filename_) :
        _fp (filename_ ? std::fopen (filename_, "r") : 0)
    {
    }

    virtual ~basic_fast_filebuf ()
    {
        // fclose () must not be handed a null stream, so only close a file
        // that was actually opened.
        if (_fp)
        {
            std::fclose (_fp);
            _fp = 0;
        }
    }

protected:
    // Underlying C stream; 0 when the open failed.
    std::FILE *_fp;

    // Reads up to count_ elements of sizeof (CharT) bytes into ptr_ and
    // returns the number of complete elements fread () delivered.
    // Returns 0 without touching the file when there is no open file, no
    // destination buffer or a non-positive count.
    virtual std::streamsize xsgetn (CharT *ptr_, std::streamsize count_)
    {
        if (!_fp || !ptr_ || count_ <= 0)
        {
            return 0;
        }

        const std::size_t read_ = std::fread (ptr_, sizeof (CharT),
            static_cast<std::size_t> (count_), _fp);

        return static_cast<std::streamsize> (read_);
    }
};

typedef basic_fast_filebuf<char, std::char_traits<char> > fast_filebuf;
typedef basic_fast_filebuf<wchar_t, std::char_traits<wchar_t> > wfast_filebuf;
}
|
||||
|
||||
// Usage:
|
||||
// lexertl::rules rules_;
|
||||
// lexertl::state_machine state_machine_;
|
||||
// lexertl::fast_filebuf buf ("Unicode/PropList.txt");
|
||||
// std::istream if_(&buf);
|
||||
// lexertl::stream_shared_iterator iter_ (if_);
|
||||
// lexertl::stream_shared_iterator end_;
|
||||
// lexertl::match_results<lexertl::stream_shared_iterator>
|
||||
// results_(iter_, end_);
|
||||
561
inc/lexertl/old/string_token.hpp
Normal file
561
inc/lexertl/old/string_token.hpp
Normal file
@@ -0,0 +1,561 @@
|
||||
// string_token.hpp
|
||||
// Copyright (c) 2005-2010 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_STRING_TOKEN_HPP
|
||||
#define LEXERTL_STRING_TOKEN_HPP
|
||||
|
||||
#include "../char_traits.hpp"
#include <algorithm>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename char_type>
|
||||
struct basic_string_token
|
||||
{
|
||||
typedef std::basic_string<char_type> string;
|
||||
|
||||
bool _negated;
|
||||
string _chars;
|
||||
|
||||
// Default constructor: a non-negated token holding no characters, i.e. one
// for which empty () is true.
basic_string_token () : _negated (false), _chars ()
{
}
|
||||
|
||||
// Builds a token from an explicit negation flag and character list.
// chars_ is copied as given; nothing is sorted or de-duplicated here.
basic_string_token (const bool negated_, const string &chars_) :
    _negated (negated_), _chars (chars_)
{
}
|
||||
|
||||
void remove_duplicates ()
|
||||
{
|
||||
const char_type *start_ = _chars.c_str ();
|
||||
const char_type *end_ = start_ + _chars.size ();
|
||||
|
||||
// Optimisation for very large charsets:
|
||||
// sorting via pointers is much quicker than
|
||||
// via iterators...
|
||||
std::sort (const_cast<char_type *> (start_), const_cast<char_type *>
|
||||
(end_));
|
||||
_chars.erase (std::unique (_chars.begin (), _chars.end ()),
|
||||
_chars.end ());
|
||||
}
|
||||
|
||||
void normalise ()
|
||||
{
|
||||
const std::size_t max_chars_ = sizeof (char_type) == 1 ?
|
||||
num_chars : num_wchar_ts;
|
||||
|
||||
if (_chars.length () == max_chars_)
|
||||
{
|
||||
_negated = !_negated;
|
||||
_chars.clear ();
|
||||
}
|
||||
else if (_chars.length () > max_chars_ / 2)
|
||||
{
|
||||
negate ();
|
||||
}
|
||||
}
|
||||
|
||||
void negate ()
|
||||
{
|
||||
const std::size_t max_chars_ = sizeof (char_type) == 1 ?
|
||||
num_chars : num_wchar_ts;
|
||||
char_type curr_char_ = std::numeric_limits<CharT>::min ();
|
||||
string temp_;
|
||||
const char_type *curr_ = _chars.c_str ();
|
||||
const char_type *chars_end_ = curr_ + _chars.size ();
|
||||
|
||||
_negated = !_negated;
|
||||
temp_.resize (max_chars_ - _chars.size ());
|
||||
|
||||
char_type *ptr_ = const_cast<char_type *> (temp_.c_str ());
|
||||
std::size_t i_ = 0;
|
||||
|
||||
while (curr_ < chars_end_)
|
||||
{
|
||||
while (*curr_ > curr_char_)
|
||||
{
|
||||
*ptr_ = curr_char_;
|
||||
++ptr_;
|
||||
++curr_char_;
|
||||
++i_;
|
||||
}
|
||||
|
||||
++curr_char_;
|
||||
++curr_;
|
||||
++i_;
|
||||
}
|
||||
|
||||
for (; i_ < max_chars_; ++i_)
|
||||
{
|
||||
*ptr_ = curr_char_;
|
||||
++ptr_;
|
||||
++curr_char_;
|
||||
}
|
||||
|
||||
_chars = temp_;
|
||||
}
|
||||
|
||||
bool operator < (const basic_string_token &rhs_) const
|
||||
{
|
||||
return _negated < rhs_._negated ||
|
||||
(_negated == rhs_._negated && _chars < rhs_._chars);
|
||||
}
|
||||
|
||||
bool operator == (const basic_string_token &rhs_) const
|
||||
{
|
||||
return _negated == rhs_._negated && _chars == rhs_._chars;
|
||||
}
|
||||
|
||||
bool empty () const
|
||||
{
|
||||
return _chars.empty () && !_negated;
|
||||
}
|
||||
|
||||
bool any () const
|
||||
{
|
||||
return _chars.empty () && _negated;
|
||||
}
|
||||
|
||||
void clear ()
|
||||
{
|
||||
_negated = false;
|
||||
_chars.clear ();
|
||||
}
|
||||
|
||||
// Entry point for intersecting *this with rhs_; results are written into
// *this, rhs_ and overlap_ by the helper chosen below.
//
// The tokens count as the "same type" when both are any (), or when
// neither is any () and their _negated flags agree; those go to
// intersect_same_types (). Every other combination goes to
// intersect_diff_types ().
void intersect (basic_string_token &rhs_, basic_string_token &overlap_)
{
    const bool lhs_any_ = any ();
    const bool rhs_any_ = rhs_.any ();
    const bool same_type_ = (lhs_any_ && rhs_any_) ||
        (!lhs_any_ && !rhs_any_ && _negated == rhs_._negated);

    if (same_type_)
    {
        intersect_same_types (rhs_, overlap_);
    }
    else
    {
        intersect_diff_types (rhs_, overlap_);
    }
}
|
||||
|
||||
void merge (const basic_string_token &rhs_,
|
||||
basic_string_token &merged_) const
|
||||
{
|
||||
if ((any () && rhs_.any ()) || (_negated == rhs_._negated &&
|
||||
!any () && !rhs_.any ()))
|
||||
{
|
||||
merge_same_types (rhs_, merged_);
|
||||
}
|
||||
else
|
||||
{
|
||||
merge_diff_types (rhs_, merged_);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns a printable, escaped spelling of ch_:
//  - NUL, \a, \b, \f, \n, \r, \t, \v, backslash and both quote characters
//    become a backslash followed by their usual escape letter;
//  - ESC (27) becomes \x1b;
//  - any other value in [0, 32) becomes \x followed by its hexadecimal
//    value;
//  - everything else is returned unchanged.
//
// The range test requires ch_ >= 0: with a signed char_type, negative
// values would otherwise pass "ch_ < 32" and be printed as the
// sign-extended size_t value.
static string escape_char (const char_type ch_)
{
    string out_;
    char_type letter_ = 0;

    switch (ch_)
    {
    case '\0': letter_ = '0'; break;
    case '\a': letter_ = 'a'; break;
    case '\b': letter_ = 'b'; break;
    case '\f': letter_ = 'f'; break;
    case '\n': letter_ = 'n'; break;
    case '\r': letter_ = 'r'; break;
    case '\t': letter_ = 't'; break;
    case '\v': letter_ = 'v'; break;
    case '\\': letter_ = '\\'; break;
    case '"': letter_ = '"'; break;
    case '\'': letter_ = '\''; break;
    case 27:
        // ESC has no single-letter escape.
        out_ += '\\';
        out_ += 'x';
        out_ += '1';
        out_ += 'b';
        return out_;
    default:
        if (ch_ >= 0 && ch_ < 32)
        {
            std::basic_stringstream<char_type> ss_;

            ss_ << std::hex << static_cast<std::size_t> (ch_);
            out_ += '\\';
            out_ += 'x';
            out_ += ss_.str ();
        }
        else
        {
            out_ += ch_;
        }

        return out_;
    }

    // One of the single-letter escapes matched.
    out_ += '\\';
    out_ += letter_;
    return out_;
}
|
||||
|
||||
private:
|
||||
// Intersection for two tokens of the same type (see intersect ()).
//
// Both any (): overlap_ becomes negated and both operands are cleared.
//
// Otherwise the two character lists are walked in ascending order; every
// character present in both is moved out of both lists into
// overlap_._chars, and overlap_ takes this token's _negated flag.
//
// For negated operands the lists then hold the characters each side
// excludes on its own. The leftovers of both sides are merged into
// overlap_._chars, both operands are turned into non-negated tokens, and
// their leftover lists are swapped. All three tokens are normalised.
//
// For non-negated operands the three tokens are normalised only when
// something was actually shared.
void intersect_same_types (basic_string_token &rhs_,
    basic_string_token &overlap_)
{
    if (any ())
    {
        clear ();
        overlap_._negated = true;
        rhs_.clear ();
        return;
    }

    typename string::iterator lhs_pos_ = _chars.begin ();
    typename string::iterator rhs_pos_ = rhs_._chars.begin ();

    overlap_._negated = _negated;

    while (lhs_pos_ != _chars.end () && rhs_pos_ != rhs_._chars.end ())
    {
        if (*lhs_pos_ < *rhs_pos_)
        {
            ++lhs_pos_;
        }
        else if (*lhs_pos_ > *rhs_pos_)
        {
            ++rhs_pos_;
        }
        else
        {
            // Shared character: move it from both lists to the overlap.
            overlap_._chars += *lhs_pos_;
            lhs_pos_ = _chars.erase (lhs_pos_);
            rhs_pos_ = rhs_._chars.erase (rhs_pos_);
        }
    }

    if (_negated)
    {
        // Duplicates were already moved out above, so the plain std lib
        // merge is safe here (arguments are: src, dest).
        merge (_chars, overlap_._chars);
        merge (rhs_._chars, overlap_._chars);
        _negated = false;
        rhs_._negated = false;
        std::swap (_chars, rhs_._chars);
    }
    else if (overlap_._chars.empty ())
    {
        // Nothing shared between two plain charsets: nothing changed.
        return;
    }

    normalise ();
    overlap_.normalise ();
    rhs_.normalise ();
}
|
||||
|
||||
// Intersection for two tokens of different types (see intersect ()):
// picks the helper that matches what *this is - any (), a negated list, or
// a plain (non-negated) list.
void intersect_diff_types (basic_string_token &rhs_,
    basic_string_token &overlap_)
{
    if (any ())
    {
        intersect_any (rhs_, overlap_);
        return;
    }

    if (_negated)
    {
        intersect_negated (rhs_, overlap_);
        return;
    }

    // _negated == false
    intersect_charset (rhs_, overlap_);
}
|
||||
|
||||
// Called when *this is any (): hands the work to rhs_ with the operands
// swapped, using the helper that matches rhs_'s own _negated flag.
void intersect_any (basic_string_token &rhs_, basic_string_token &overlap_)
{
    if (!rhs_._negated)
    {
        // rhs_ is a plain charset.
        rhs_.intersect_charset (*this, overlap_);
        return;
    }

    rhs_.intersect_negated (*this, overlap_);
}
|
||||
|
||||
// Intersection where *this is treated as the negated operand.
//
// rhs_ is any (): overlap_ becomes a negated token with this token's
// characters, rhs_ becomes the non-negated token with those same
// characters, and *this is cleared.
//
// Otherwise (rhs_._negated == false) the work is handed to
// rhs_.intersect_charset () with the operands swapped.
void intersect_negated (basic_string_token &rhs_,
    basic_string_token &overlap_)
{
    if (!rhs_.any ())
    {
        rhs_.intersect_charset (*this, overlap_);
        return;
    }

    overlap_._negated = true;
    overlap_._chars = _chars;
    rhs_._negated = false;
    rhs_._chars = _chars;
    clear ();
}
|
||||
|
||||
// Intersection where *this is treated as the plain (non-negated) charset.
//
// rhs_ is any (): overlap_ receives this token's characters, rhs_ becomes
// the negated token with those same characters, and *this is cleared.
//
// Otherwise (rhs_._negated == true) both lists are walked in ascending
// order. A character of *this that is absent from rhs_'s list is appended
// to overlap_._chars, inserted into rhs_'s list and erased from *this; a
// character present in both lists stays in *this. Characters of *this
// left over once rhs_'s list is exhausted are merged into overlap_ and
// erased from *this. If overlap_ ends up non-empty it is merged into
// rhs_'s list, duplicates are removed, and all three tokens are
// normalised.
void intersect_charset (basic_string_token &rhs_,
    basic_string_token &overlap_)
{
    if (rhs_.any ())
    {
        overlap_._chars = _chars;
        rhs_._negated = true;
        rhs_._chars = _chars;
        clear ();
        return;
    }

    typename string::iterator own_pos_ = _chars.begin ();
    typename string::iterator other_pos_ = rhs_._chars.begin ();

    while (own_pos_ != _chars.end () && other_pos_ != rhs_._chars.end ())
    {
        if (*own_pos_ < *other_pos_)
        {
            overlap_._chars += *own_pos_;
            other_pos_ = rhs_._chars.insert (other_pos_, *own_pos_);
            // Step past the character just inserted.
            ++other_pos_;
            own_pos_ = _chars.erase (own_pos_);
        }
        else if (*own_pos_ > *other_pos_)
        {
            ++other_pos_;
        }
        else
        {
            ++own_pos_;
            ++other_pos_;
        }
    }

    if (own_pos_ != _chars.end ())
    {
        // Nothing bigger in rhs_ than *own_pos_, so it is safe to merge
        // using the std lib (arguments are: src, dest).
        string tail_ (own_pos_, _chars.end ());

        merge (tail_, overlap_._chars);
        _chars.erase (own_pos_, _chars.end ());
    }

    if (overlap_._chars.empty ())
    {
        return;
    }

    merge (overlap_._chars, rhs_._chars);
    // Possible duplicates, so check for any and erase.
    rhs_._chars.erase (std::unique (rhs_._chars.begin (),
        rhs_._chars.end ()), rhs_._chars.end ());
    normalise ();
    overlap_.normalise ();
    rhs_.normalise ();
}
|
||||
|
||||
// Merges the sorted sequence src_ into the sorted sequence dest_ with
// std::merge and stores the combined sequence back in dest_. Equal
// elements are all kept; src_ is not modified.
void merge (string &src_, string &dest_)
{
    string combined_ (src_.size () + dest_.size (), 0);

    std::merge (src_.begin (), src_.end (), dest_.begin (), dest_.end (),
        combined_.begin ());
    dest_ = combined_;
}
|
||||
|
||||
void merge_same_types (const basic_string_token &rhs_,
|
||||
basic_string_token &merged_) const
|
||||
{
|
||||
if (any ())
|
||||
{
|
||||
merged_._negated = true;
|
||||
}
|
||||
else if (_negated)
|
||||
{
|
||||
typename string::const_iterator iter_ = _chars.begin ();
|
||||
typename string::const_iterator end_ = _chars.end ();
|
||||
typename string::const_iterator rhs_iter_ = rhs_._chars.begin ();
|
||||
typename string::const_iterator rhs_end_ = rhs_._chars.end ();
|
||||
|
||||
merged_._negated = _negated;
|
||||
|
||||
while (iter_ != end_ && rhs_iter_ != rhs_end_)
|
||||
{
|
||||
if (*iter_ < *rhs_iter_)
|
||||
{
|
||||
++iter_;
|
||||
}
|
||||
else if (*iter_ > *rhs_iter_)
|
||||
{
|
||||
++rhs_iter_;
|
||||
}
|
||||
else
|
||||
{
|
||||
merged_._chars += *iter_;
|
||||
++iter_;
|
||||
++rhs_iter_;
|
||||
}
|
||||
}
|
||||
|
||||
merged_.normalise ();
|
||||
}
|
||||
else
|
||||
{
|
||||
typename string::const_iterator iter_ = _chars.begin ();
|
||||
typename string::const_iterator end_ = _chars.end ();
|
||||
typename string::const_iterator rhs_iter_ = rhs_._chars.begin ();
|
||||
typename string::const_iterator rhs_end_ = rhs_._chars.end ();
|
||||
|
||||
while (iter_ != end_ && rhs_iter_ != rhs_end_)
|
||||
{
|
||||
if (*iter_ < *rhs_iter_)
|
||||
{
|
||||
merged_._chars += *iter_;
|
||||
++iter_;
|
||||
}
|
||||
else if (*iter_ > *rhs_iter_)
|
||||
{
|
||||
merged_._chars += *rhs_iter_;
|
||||
++rhs_iter_;
|
||||
}
|
||||
else
|
||||
{
|
||||
merged_._chars += *iter_;
|
||||
++iter_;
|
||||
++rhs_iter_;
|
||||
}
|
||||
}
|
||||
|
||||
// Include any trailing chars
|
||||
if (iter_ != end_)
|
||||
{
|
||||
string temp_ (iter_, end_);
|
||||
|
||||
merged_._chars += temp_;
|
||||
}
|
||||
else if (rhs_iter_ != rhs_end_)
|
||||
{
|
||||
string temp_ (rhs_iter_, rhs_end_);
|
||||
|
||||
merged_._chars += temp_;
|
||||
}
|
||||
|
||||
merged_.normalise ();
|
||||
}
|
||||
}
|
||||
|
||||
void merge_diff_types (const basic_string_token &rhs_,
|
||||
basic_string_token &merged_) const
|
||||
{
|
||||
if (_negated)
|
||||
{
|
||||
merge_negated (*this, rhs_, merged_);
|
||||
}
|
||||
else
|
||||
{
|
||||
merge_negated (rhs_, *this, merged_);
|
||||
}
|
||||
|
||||
merged_.normalise ();
|
||||
}
|
||||
|
||||
void merge_negated (const basic_string_token &lhs_,
|
||||
const basic_string_token &rhs_, basic_string_token &merged_) const
|
||||
{
|
||||
typename string::const_iterator lhs_iter_ = lhs_._chars.begin ();
|
||||
typename string::const_iterator lhs_end_ = lhs_._chars.end ();
|
||||
typename string::const_iterator rhs_iter_ = rhs_._chars.begin ();
|
||||
typename string::const_iterator rhs_end_ = rhs_._chars.end ();
|
||||
|
||||
merged_._negated = true;
|
||||
|
||||
while (lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_)
|
||||
{
|
||||
if (*lhs_iter_ < *rhs_iter_)
|
||||
{
|
||||
merged_._chars += *lhs_iter_;
|
||||
++lhs_iter_;
|
||||
}
|
||||
else if (*lhs_iter_ > *rhs_iter_)
|
||||
{
|
||||
++rhs_iter_;
|
||||
}
|
||||
else
|
||||
{
|
||||
++lhs_iter_;
|
||||
++rhs_iter_;
|
||||
}
|
||||
}
|
||||
|
||||
// Only interested in any remaining 'negated' chars
|
||||
if (lhs_iter_ != lhs_end_)
|
||||
{
|
||||
string temp_ (lhs_iter_, lhs_end_);
|
||||
|
||||
merged_._chars += temp_;
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user