Fix typo in grammar.
Add lexertl.
This commit is contained in:
22
inc/lexertl/bool.hpp
Normal file
22
inc/lexertl/bool.hpp
Normal file
@@ -0,0 +1,22 @@
|
||||
// bool.hpp
|
||||
// Copyright (c) 2010-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_BOOL_H
|
||||
#define LEXERTL_BOOL_H
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// Compile-time boolean tag: bool_<true> and bool_<false> are two distinct,
// empty types.
// The template parameter is named for compiler compatibility.
template<bool b>
struct bool_ {};

// Short names for the two possible tags.
typedef bool_<false> false_;
typedef bool_<true> true_;
|
||||
}
|
||||
|
||||
#endif
|
||||
50
inc/lexertl/char_traits.hpp
Normal file
50
inc/lexertl/char_traits.hpp
Normal file
@@ -0,0 +1,50 @@
|
||||
// char_traits.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_CHAR_TRAITS_H
|
||||
#define LEXERTL_CHAR_TRAITS_H
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// Describes how a character type maps onto an index type.
// The generic version uses the character type itself as the index type.
template<typename ch_type>
struct basic_char_traits
{
    typedef ch_type char_type;
    typedef ch_type index_type;

    // Identity mapping: a character is its own index.
    static index_type index (const char_type ch)
    {
        return ch;
    }

    // Largest index value: 0x10ffff when the character type is wider than
    // two bytes, otherwise a value with every bit set.
    static index_type max_val ()
    {
        if (sizeof(char_type) > 2)
        {
            return static_cast<index_type>(0x10ffff);
        }

        return static_cast<index_type>(~0);
    }
};

// Specialization for plain char: indexes are unsigned char, so a char with
// a negative value still yields a non-negative index.
template<>
struct basic_char_traits<char>
{
    typedef char char_type;
    typedef unsigned char index_type;

    static index_type index (const char ch)
    {
        const index_type idx_ = static_cast<index_type>(ch);

        return idx_;
    }

    // All bits of unsigned char set.
    static index_type max_val ()
    {
        const index_type all_bits_ = static_cast<index_type>(~0);

        return all_bits_;
    }
};
|
||||
}
|
||||
|
||||
#endif
|
||||
24
inc/lexertl/compile_assert.hpp
Normal file
24
inc/lexertl/compile_assert.hpp
Normal file
@@ -0,0 +1,24 @@
|
||||
// compile_assert.hpp
|
||||
// Copyright (c) 2010-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_COMPILE_ASSERT_H
|
||||
#define LEXERTL_COMPILE_ASSERT_H
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// Compile-time assertion helper.  Only the <true> case is defined, so using
// compile_assert<false>::value is a compile error.
// The template parameter is named for compiler compatibility.
template<bool b>
struct compile_assert;

// The constant is an enum for compiler compatibility.
template<>
struct compile_assert<true>
{
    enum { value = 1 };
};
|
||||
}
|
||||
|
||||
#endif
|
||||
228
inc/lexertl/containers/bitvector.hpp
Normal file
228
inc/lexertl/containers/bitvector.hpp
Normal file
@@ -0,0 +1,228 @@
|
||||
// bitvector.hpp
|
||||
// Copyright (c) 2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_BITVECTOR_HPP
|
||||
#define LEXERTL_BITVECTOR_HPP
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// Bit set of run-time size, packed into a std::vector of blocks of type T
// (sizeof(T) * 8 bits per block).
//
// Notes:
// - T is expected to be an unsigned integer type no wider than std::size_t
//   (the reference proxy stores its mask as std::size_t).
// - Only whole blocks are stored, not the requested bit count.  negate ()
//   therefore also flips the unused bits of the last block, and any () /
//   find_next () can then report indexes past the size given to the
//   constructor.
// - No bounds checking is done by operator [].
template<typename T>
class basic_bitvector
{
public:
    // Proxy returned by the non-const operator [] so that a single bit can
    // be read and written: it refers to one block plus a one-bit mask.
    template<typename Ty>
    class reference
    {
    public:
        reference (Ty &block_, const std::size_t mask_) :
            _block (block_),
            _mask (mask_)
        {
        }

        // True if the referenced bit is set.
        operator bool () const
        {
            return (_block & _mask) != 0;
        }

        // Set or clear the referenced bit.
        reference<Ty> &operator = (const bool bit_)
        {
            if (bit_)
            {
                _block |= _mask;
            }
            else
            {
                _block &= ~_mask;
            }

            return *this;
        }

        // Copy the VALUE of another bit into the referenced bit (the
        // proxy is not re-seated).  Takes a const reference so that
        // temporaries such as a[i] = b[j] bind here, and returns *this
        // (the original version fell off the end without returning).
        reference<Ty> &operator = (const reference<Ty> &rhs_)
        {
            return *this = static_cast<bool>(rhs_);
        }

    private:
        Ty &_block;
        const std::size_t _mask;
    };

    // Create a vector able to hold size_ bits, all cleared.  The block
    // count is rounded up when size_ is not a multiple of the block width.
    basic_bitvector (const std::size_t size_) :
        _vec (block (size_) + (bit (size_) ? 1 : 0), 0)
    {
    }

    basic_bitvector (const basic_bitvector &rhs_) :
        _vec (rhs_._vec)
    {
    }

    basic_bitvector &operator = (const basic_bitvector &rhs_)
    {
        if (&rhs_ != this)
        {
            _vec = rhs_._vec;
        }

        return *this;
    }

    // Read a bit.  The mask is built in type T so that blocks wider than
    // int work (1 << n is an int shift and overflows for n >= 31).
    bool operator [] (const std::size_t index_) const
    {
        const T mask_ = static_cast<T>(static_cast<T>(1) << bit (index_));

        return (_vec[block (index_)] & mask_) != 0;
    }

    // Writable access to a bit through a proxy object.
    reference<T> operator [] (const std::size_t index_)
    {
        return reference<T> (_vec[block (index_)],
            static_cast<std::size_t>(1) << bit (index_));
    }

    // Bitwise OR with rhs_ over the blocks both vectors have in common.
    basic_bitvector<T> &operator |= (const basic_bitvector<T> &rhs_)
    {
        const std::size_t blocks_ = shared_blocks (rhs_);

        for (std::size_t i_ = 0; i_ < blocks_; ++i_)
        {
            _vec[i_] |= rhs_._vec[i_];
        }

        return *this;
    }

    // Bitwise AND with rhs_ over the blocks both vectors have in common;
    // blocks beyond the end of a shorter rhs_ are left untouched.
    basic_bitvector<T> &operator &= (const basic_bitvector<T> &rhs_)
    {
        const std::size_t blocks_ = shared_blocks (rhs_);

        for (std::size_t i_ = 0; i_ < blocks_; ++i_)
        {
            _vec[i_] &= rhs_._vec[i_];
        }

        return *this;
    }

    // Clear every bit (the number of blocks is unchanged).
    void clear ()
    {
        for (std::size_t i_ = 0, blocks_ = _vec.size (); i_ < blocks_; ++i_)
        {
            _vec[i_] = 0;
        }
    }

    // True if at least one stored bit is set.
    bool any () const
    {
        for (std::size_t i_ = 0, blocks_ = _vec.size (); i_ < blocks_; ++i_)
        {
            if (_vec[i_]) return true;
        }

        return false;
    }

    // Flip every stored bit, including unused bits in the last block.
    void negate ()
    {
        for (std::size_t i_ = 0, blocks_ = _vec.size (); i_ < blocks_; ++i_)
        {
            _vec[i_] = static_cast<T>(~_vec[i_]);
        }
    }

    // Index of the first set bit, or npos () if there is none.
    std::size_t find_first () const
    {
        return find_next (npos ());
    }

    // Index of the first set bit strictly after index_, or npos () if
    // there is none.  Passing npos () starts the search at bit 0.
    // An index_ at or beyond the last stored bit simply yields npos ().
    std::size_t find_next (const std::size_t index_) const
    {
        const std::size_t bits_per_block_ = sizeof(T) * 8;
        const std::size_t start_ = index_ == npos () ? 0 : index_ + 1;
        const std::size_t blocks_ = _vec.size ();
        // Only the first block examined is entered part-way through.
        std::size_t bit_ = bit (start_);

        for (std::size_t i_ = block (start_); i_ < blocks_; ++i_, bit_ = 0)
        {
            const T value_ = _vec[i_];

            // Cheap skip of blocks with nothing set.
            if (!value_) continue;

            for (std::size_t j_ = bit_; j_ < bits_per_block_; ++j_)
            {
                if (value_ & static_cast<T>(static_cast<T>(1) << j_))
                {
                    return i_ * bits_per_block_ + j_;
                }
            }
        }

        return npos ();
    }

    // Sentinel "no such bit" value: the largest std::size_t.
    std::size_t npos () const
    {
        return static_cast<std::size_t>(~0);
    }

private:
    typedef std::vector<T> t_vector;

    t_vector _vec;

    // Number of blocks present in both this vector and rhs_.
    std::size_t shared_blocks (const basic_bitvector<T> &rhs_) const
    {
        const std::size_t lhs_size_ = _vec.size ();
        const std::size_t rhs_size_ = rhs_._vec.size ();

        return lhs_size_ < rhs_size_ ? lhs_size_ : rhs_size_;
    }

    // Index of the block that holds bit index_.
    std::size_t block (const std::size_t index_) const
    {
        return index_ / (sizeof(T) * 8);
    }

    // Position of bit index_ inside its block.
    std::size_t bit (const std::size_t index_) const
    {
        return index_ % (sizeof(T) * 8);
    }
};
|
||||
}
|
||||
|
||||
#endif
|
||||
69
inc/lexertl/containers/ptr_list.hpp
Normal file
69
inc/lexertl/containers/ptr_list.hpp
Normal file
@@ -0,0 +1,69 @@
|
||||
// ptr_list.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_PTR_LIST_HPP
|
||||
#define LEXERTL_PTR_LIST_HPP
|
||||
|
||||
#include <list>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
// std::list of raw pointers whose pointees are deleted by clear () and by
// the destructor.  The underlying list is reached through operator -> and
// operator *.  Copying is disabled.
template<typename ptr_type>
class ptr_list
{
public:
    typedef std::list<ptr_type *> list;

    ptr_list () :
        _list ()
    {
    }

    // Deletes every pointee still held.
    ~ptr_list ()
    {
        clear ();
    }

    // Pointer-style access to the wrapped list.
    list *operator -> ()
    {
        return &_list;
    }

    const list *operator -> () const
    {
        return &_list;
    }

    // Reference-style access to the wrapped list.
    list &operator * ()
    {
        return _list;
    }

    const list &operator * () const
    {
        return _list;
    }

    // Delete the front pointee, then drop its entry, until the list is
    // empty.
    void clear ()
    {
        for (; !_list.empty (); _list.pop_front ())
        {
            delete _list.front ();
        }
    }

private:
    list _list;

    // Declared but not defined: the class is non-copyable.
    ptr_list (const ptr_list &); // No copy construction.
    ptr_list &operator = (const ptr_list &); // No assignment.
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
72
inc/lexertl/containers/ptr_map.hpp
Normal file
72
inc/lexertl/containers/ptr_map.hpp
Normal file
@@ -0,0 +1,72 @@
|
||||
// ptr_map.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_PTR_MAP_HPP
|
||||
#define LEXERTL_PTR_MAP_HPP
|
||||
|
||||
#include <map>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
// std::map from key_type to raw pointers whose pointees are deleted by
// clear () and by the destructor.  The underlying map is reached through
// operator -> and operator *.  Copying is disabled.
template<typename key_type, typename ptr_type>
class ptr_map
{
public:
    typedef std::map<key_type, ptr_type *> map;
    typedef std::pair<key_type, ptr_type *> pair;
    typedef std::pair<typename map::iterator, bool> iter_pair;

    ptr_map ()
    {
    }

    // Deletes every pointee still held.
    ~ptr_map ()
    {
        clear ();
    }

    // Pointer-style access to the wrapped map.
    map *operator -> ()
    {
        return &_map;
    }

    const map *operator -> () const
    {
        return &_map;
    }

    // Reference-style access to the wrapped map.
    map &operator * ()
    {
        return _map;
    }

    const map &operator * () const
    {
        return _map;
    }

    // Delete every mapped pointee, then remove all entries.
    void clear ()
    {
        typename map::iterator cursor_ = _map.begin ();
        const typename map::iterator stop_ = _map.end ();

        while (cursor_ != stop_)
        {
            delete cursor_->second;
            ++cursor_;
        }

        _map.clear ();
    }

private:
    map _map;

    // Declared but not defined: the class is non-copyable.
    ptr_map (const ptr_map &); // No copy construction.
    ptr_map &operator = (const ptr_map &); // No assignment.
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
69
inc/lexertl/containers/ptr_stack.hpp
Normal file
69
inc/lexertl/containers/ptr_stack.hpp
Normal file
@@ -0,0 +1,69 @@
|
||||
// ptr_stack.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_PTR_STACK_HPP
|
||||
#define LEXERTL_PTR_STACK_HPP
|
||||
|
||||
#include <stack>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
// std::stack of raw pointers whose pointees are deleted by clear () and by
// the destructor.  The underlying stack is reached through operator -> and
// operator *.  Copying is disabled.
template<typename ptr_type>
class ptr_stack
{
public:
    typedef std::stack<ptr_type *> stack;

    ptr_stack () :
        _stack ()
    {
    }

    // Deletes every pointee still held.
    ~ptr_stack ()
    {
        clear ();
    }

    // Pointer-style access to the wrapped stack.
    stack *operator -> ()
    {
        return &_stack;
    }

    const stack *operator -> () const
    {
        return &_stack;
    }

    // Reference-style access to the wrapped stack.
    stack &operator * ()
    {
        return _stack;
    }

    const stack &operator * () const
    {
        return _stack;
    }

    // Delete the top pointee, then pop its entry, until the stack is
    // empty.
    void clear ()
    {
        for (; !_stack.empty (); _stack.pop ())
        {
            delete _stack.top ();
        }
    }

private:
    stack _stack;

    // Declared but not defined: the class is non-copyable.
    ptr_stack (const ptr_stack &); // No copy construction.
    ptr_stack &operator = (const ptr_stack &); // No assignment.
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
106
inc/lexertl/containers/ptr_vector.hpp
Normal file
106
inc/lexertl/containers/ptr_vector.hpp
Normal file
@@ -0,0 +1,106 @@
|
||||
// ptr_vector.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_PTR_VECTOR_HPP
|
||||
#define LEXERTL_PTR_VECTOR_HPP
|
||||
|
||||
#include "../size_t.hpp"
|
||||
#include <vector>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
// std::vector of raw pointers whose pointees are deleted by clear () and by
// the destructor.  The underlying vector is reached through operator -> and
// operator *; operator [] indexes it directly.  Copying is disabled.
template<typename ptr_type>
class ptr_vector
{
public:
    typedef std::vector<ptr_type *> vector;

    ptr_vector () :
        _vector ()
    {
    }

    // Deletes every pointee still held.
    ~ptr_vector ()
    {
        clear ();
    }

    // Pointer-style access to the wrapped vector.
    vector *operator -> ()
    {
        return &_vector;
    }

    const vector *operator -> () const
    {
        return &_vector;
    }

    // Reference-style access to the wrapped vector.
    vector &operator * ()
    {
        return _vector;
    }

    const vector &operator * () const
    {
        return _vector;
    }

    // Unchecked access to the stored pointer at index_.
    ptr_type * &operator [] (const std::size_t index_)
    {
        return _vector[index_];
    }

    ptr_type * const &operator [] (const std::size_t index_) const
    {
        return _vector[index_];
    }

    // Deep comparison: equal when both hold the same number of pointers
    // and every pair of POINTEES compares equal.  Every stored pointer is
    // dereferenced, so none may be null.
    bool operator == (const ptr_vector &rhs_) const
    {
        const std::size_t count_ = _vector.size ();

        if (count_ != rhs_._vector.size ())
        {
            return false;
        }

        for (std::size_t i_ = 0; i_ < count_; ++i_)
        {
            const bool same_ = *_vector[i_] == *rhs_._vector[i_];

            if (!same_) return false;
        }

        return true;
    }

    // Delete every pointee in order, then remove all entries.
    void clear ()
    {
        for (typename vector::iterator iter_ = _vector.begin (),
            end_ = _vector.end (); iter_ != end_; ++iter_)
        {
            delete *iter_;
        }

        _vector.clear ();
    }

private:
    vector _vector;

    // Declared but not defined: the class is non-copyable.
    ptr_vector (const ptr_vector &); // No copy construction.
    ptr_vector &operator = (const ptr_vector &); // No assignment.
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
353
inc/lexertl/debug.hpp
Normal file
353
inc/lexertl/debug.hpp
Normal file
@@ -0,0 +1,353 @@
|
||||
// debug.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_DEBUG_HPP
|
||||
#define LEXERTL_DEBUG_HPP
|
||||
|
||||
#include <map>
|
||||
#include <ostream>
|
||||
#include "rules.hpp"
|
||||
#include "size_t.hpp"
|
||||
#include "state_machine.hpp"
|
||||
#include "string_token.hpp"
|
||||
#include <vector>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename sm, typename char_type, typename id_type = std::size_t,
|
||||
bool is_dfa = true>
|
||||
class basic_debug
|
||||
{
|
||||
public:
|
||||
typedef lexertl::basic_char_state_machine<char_type, id_type, is_dfa>
|
||||
char_state_machine;
|
||||
typedef std::basic_ostream<char_type> ostream;
|
||||
typedef lexertl::basic_rules<char_type, id_type> rules;
|
||||
typedef std::basic_string<char_type> string;
|
||||
|
||||
static void dump (const sm &sm_, rules &rules_, ostream &stream_)
|
||||
{
|
||||
char_state_machine csm_;
|
||||
|
||||
sm_to_csm (sm_, csm_);
|
||||
dump (csm_, rules_, stream_);
|
||||
}
|
||||
|
||||
static void dump (const sm &sm_, ostream &stream_)
|
||||
{
|
||||
char_state_machine csm_;
|
||||
|
||||
sm_to_csm (sm_, csm_);
|
||||
dump (csm_, stream_);
|
||||
}
|
||||
|
||||
static void dump (const char_state_machine &csm_, rules &rules_,
|
||||
ostream &stream_)
|
||||
{
|
||||
for (std::size_t dfa_ = 0, dfas_ = csm_.size (); dfa_ < dfas_; ++dfa_)
|
||||
{
|
||||
lexer_state (stream_);
|
||||
stream_ << rules_.state (dfa_) << std::endl << std::endl;
|
||||
|
||||
dump_ex (csm_._sm_deque[dfa_], stream_);
|
||||
}
|
||||
}
|
||||
|
||||
static void dump (const char_state_machine &csm_, ostream &stream_)
|
||||
{
|
||||
for (std::size_t dfa_ = 0, dfas_ = csm_.size (); dfa_ < dfas_; ++dfa_)
|
||||
{
|
||||
lexer_state (stream_);
|
||||
stream_ << dfa_ << std::endl << std::endl;
|
||||
|
||||
dump_ex (csm_._sm_deque[dfa_], stream_);
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
typedef typename char_state_machine::state dfa_state;
|
||||
typedef typename dfa_state::string_token string_token;
|
||||
typedef std::basic_stringstream<char_type> stringstream;
|
||||
|
||||
static void sm_to_csm (const sm &sm_, char_state_machine &csm_)
|
||||
{
|
||||
const detail::basic_internals<id_type> &internals_ = sm_.data ();
|
||||
const std::size_t dfas_ = internals_._dfa->size ();
|
||||
|
||||
for (id_type i_ = 0; i_ < dfas_; ++i_)
|
||||
{
|
||||
if (internals_._dfa_alphabet[i_] == 0) continue;
|
||||
|
||||
const std::size_t alphabet_ = internals_._dfa_alphabet[i_] -
|
||||
transitions_index;
|
||||
typename char_state_machine::string_token_vector token_vector_
|
||||
(alphabet_, string_token ());
|
||||
id_type *ptr_ = &internals_._lookup[i_]->front ();
|
||||
|
||||
for (std::size_t c_ = 0; c_ < 256; ++c_, ++ptr_)
|
||||
{
|
||||
if (*ptr_ >= transitions_index)
|
||||
{
|
||||
string_token &token_ = token_vector_
|
||||
[*ptr_ - transitions_index];
|
||||
|
||||
token_.insert (typename string_token::range
|
||||
(typename string_token::index_type (c_),
|
||||
typename string_token::index_type (c_)));
|
||||
}
|
||||
}
|
||||
|
||||
csm_.append (token_vector_, internals_, i_);
|
||||
}
|
||||
}
|
||||
|
||||
static void dump_ex (const typename char_state_machine::dfa &dfa_,
|
||||
ostream &stream_)
|
||||
{
|
||||
const std::size_t states_ = dfa_._states.size ();
|
||||
const id_type bol_index_ = dfa_._bol_index;
|
||||
typename dfa_state::id_type_string_token_map::const_iterator iter_;
|
||||
typename dfa_state::id_type_string_token_map::const_iterator end_;
|
||||
|
||||
for (std::size_t i_ = 0; i_ < states_; ++i_)
|
||||
{
|
||||
const dfa_state &state_ = dfa_._states[i_];
|
||||
|
||||
state (stream_);
|
||||
stream_ << i_ << std::endl;
|
||||
|
||||
if (state_._end_state)
|
||||
{
|
||||
end_state (stream_);
|
||||
|
||||
if (state_._push_pop_dfa == dfa_state::push_dfa)
|
||||
{
|
||||
push (stream_);
|
||||
stream_ << state_._push_dfa;
|
||||
}
|
||||
else if (state_._push_pop_dfa == dfa_state::pop_dfa)
|
||||
{
|
||||
pop (stream_);
|
||||
}
|
||||
|
||||
id (stream_);
|
||||
stream_ << static_cast<std::size_t>(state_._id);
|
||||
user_id (stream_);
|
||||
stream_ << static_cast<std::size_t>(state_._user_id);
|
||||
dfa (stream_);
|
||||
stream_ << static_cast<std::size_t>(state_._next_dfa);
|
||||
stream_ << std::endl;
|
||||
}
|
||||
|
||||
if (i_ == 0 && bol_index_ != char_state_machine::npos ())
|
||||
{
|
||||
bol (stream_);
|
||||
stream_ << static_cast<std::size_t>(bol_index_) << std::endl;
|
||||
}
|
||||
|
||||
if (state_._eol_index != char_state_machine::npos ())
|
||||
{
|
||||
eol (stream_);
|
||||
stream_ << static_cast<std::size_t>(state_._eol_index) <<
|
||||
std::endl;
|
||||
}
|
||||
|
||||
iter_ = state_._transitions.begin ();
|
||||
end_ = state_._transitions.end ();
|
||||
|
||||
for (; iter_ != end_; ++iter_)
|
||||
{
|
||||
string_token token_ = iter_->second;
|
||||
|
||||
open_bracket (stream_);
|
||||
|
||||
if (!iter_->second.any () && iter_->second.negatable ())
|
||||
{
|
||||
token_.negate ();
|
||||
negated (stream_);
|
||||
}
|
||||
|
||||
string chars_;
|
||||
typename string_token::range_vector::const_iterator
|
||||
ranges_iter_ = token_._ranges.begin ();
|
||||
typename string_token::range_vector::const_iterator
|
||||
ranges_end_ = token_._ranges.end ();
|
||||
|
||||
for (; ranges_iter_ != ranges_end_; ++ranges_iter_)
|
||||
{
|
||||
if (ranges_iter_->first == '^' ||
|
||||
ranges_iter_->first == ']')
|
||||
{
|
||||
stream_ << '\\';
|
||||
}
|
||||
|
||||
chars_ = string_token::escape_char
|
||||
(ranges_iter_->first);
|
||||
|
||||
if (ranges_iter_->first != ranges_iter_->second)
|
||||
{
|
||||
if (ranges_iter_->first + 1 < ranges_iter_->second)
|
||||
{
|
||||
chars_ += '-';
|
||||
}
|
||||
|
||||
if (ranges_iter_->second == '^' ||
|
||||
ranges_iter_->second == ']')
|
||||
{
|
||||
stream_ << '\\';
|
||||
}
|
||||
|
||||
chars_ += string_token::escape_char
|
||||
(ranges_iter_->second);
|
||||
}
|
||||
|
||||
stream_ << chars_;
|
||||
}
|
||||
|
||||
close_bracket (stream_);
|
||||
stream_ << static_cast<std::size_t>(iter_->first) <<
|
||||
std::endl;
|
||||
}
|
||||
|
||||
stream_ << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
static void lexer_state (std::ostream &stream_)
|
||||
{
|
||||
stream_ << "Lexer state: ";
|
||||
}
|
||||
|
||||
static void lexer_state (std::wostream &stream_)
|
||||
{
|
||||
stream_ << L"Lexer state: ";
|
||||
}
|
||||
|
||||
static void state (std::ostream &stream_)
|
||||
{
|
||||
stream_ << "State: ";
|
||||
}
|
||||
|
||||
static void state (std::wostream &stream_)
|
||||
{
|
||||
stream_ << L"State: ";
|
||||
}
|
||||
|
||||
static void bol (std::ostream &stream_)
|
||||
{
|
||||
stream_ << " BOL -> ";
|
||||
}
|
||||
|
||||
static void bol (std::wostream &stream_)
|
||||
{
|
||||
stream_ << L" BOL -> ";
|
||||
}
|
||||
|
||||
static void eol (std::ostream &stream_)
|
||||
{
|
||||
stream_ << " EOL -> ";
|
||||
}
|
||||
|
||||
static void eol (std::wostream &stream_)
|
||||
{
|
||||
stream_ << L" EOL -> ";
|
||||
}
|
||||
|
||||
static void end_state (std::ostream &stream_)
|
||||
{
|
||||
stream_ << " END STATE";
|
||||
}
|
||||
|
||||
static void end_state (std::wostream &stream_)
|
||||
{
|
||||
stream_ << L" END STATE";
|
||||
}
|
||||
|
||||
static void id (std::ostream &stream_)
|
||||
{
|
||||
stream_ << ", Id = ";
|
||||
}
|
||||
|
||||
static void id (std::wostream &stream_)
|
||||
{
|
||||
stream_ << L", Id = ";
|
||||
}
|
||||
|
||||
static void push (std::ostream &stream_)
|
||||
{
|
||||
stream_ << ", PUSH ";
|
||||
}
|
||||
|
||||
static void push (std::wostream &stream_)
|
||||
{
|
||||
stream_ << L", PUSH ";
|
||||
}
|
||||
|
||||
static void pop (std::ostream &stream_)
|
||||
{
|
||||
stream_ << ", POP";
|
||||
}
|
||||
|
||||
static void pop (std::wostream &stream_)
|
||||
{
|
||||
stream_ << L", POP";
|
||||
}
|
||||
|
||||
static void user_id (std::ostream &stream_)
|
||||
{
|
||||
stream_ << ", User Id = ";
|
||||
}
|
||||
|
||||
static void user_id (std::wostream &stream_)
|
||||
{
|
||||
stream_ << L", User Id = ";
|
||||
}
|
||||
|
||||
static void open_bracket (std::ostream &stream_)
|
||||
{
|
||||
stream_ << " [";
|
||||
}
|
||||
|
||||
static void open_bracket (std::wostream &stream_)
|
||||
{
|
||||
stream_ << L" [";
|
||||
}
|
||||
|
||||
static void negated (std::ostream &stream_)
|
||||
{
|
||||
stream_ << "^";
|
||||
}
|
||||
|
||||
static void negated (std::wostream &stream_)
|
||||
{
|
||||
stream_ << L"^";
|
||||
}
|
||||
|
||||
static void close_bracket (std::ostream &stream_)
|
||||
{
|
||||
stream_ << "] -> ";
|
||||
}
|
||||
|
||||
static void close_bracket (std::wostream &stream_)
|
||||
{
|
||||
stream_ << L"] -> ";
|
||||
}
|
||||
|
||||
static void dfa (std::ostream &stream_)
|
||||
{
|
||||
stream_ << ", dfa = ";
|
||||
}
|
||||
|
||||
static void dfa (std::wostream &stream_)
|
||||
{
|
||||
stream_ << L", dfa = ";
|
||||
}
|
||||
};
|
||||
|
||||
typedef basic_debug<basic_state_machine<char>, char> debug;
|
||||
typedef basic_debug<basic_state_machine<wchar_t>, wchar_t> wdebug;
|
||||
}
|
||||
|
||||
#endif
|
||||
25
inc/lexertl/enums.hpp
Normal file
25
inc/lexertl/enums.hpp
Normal file
@@ -0,0 +1,25 @@
|
||||
// enums.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_ENUMS_H
|
||||
#define LEXERTL_ENUMS_H
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// Regex flags; each enumerator is a distinct bit.
enum regex_flags {icase = 1, dot_not_newline = 2, skip_ws = 4,
    match_zero_len = 8};

// Index constants:
// 0 = end state, 1 = id, 2 = user id, 3 = push dfa, 4 = next dfa,
// 5 = eol, 6 = dead state, 7 = first transition (transitions_index).
// The values are spelled out because they must match the list above.
enum {end_state_index = 0, id_index = 1, user_id_index = 2,
    push_dfa_index = 3, next_dfa_index = 4, eol_index = 5,
    dead_state_index = 6, transitions_index = 7};

// Rule flags; each enumerator is a distinct bit.
enum feature_flags {bol_bit = 1, eol_bit = 2, skip_bit = 4, again_bit = 8,
    multi_state_bit = 16, recursive_bit = 32, advance_bit = 64};

// End state flags; each enumerator is a distinct bit.
enum {end_state_bit = 1, pop_dfa_bit = 2};
|
||||
}
|
||||
|
||||
#endif
|
||||
1122
inc/lexertl/generate_cpp.hpp
Normal file
1122
inc/lexertl/generate_cpp.hpp
Normal file
File diff suppressed because it is too large
Load Diff
829
inc/lexertl/generator.hpp
Normal file
829
inc/lexertl/generator.hpp
Normal file
@@ -0,0 +1,829 @@
|
||||
// generator.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_GENERATOR_HPP
|
||||
#define LEXERTL_GENERATOR_HPP
|
||||
|
||||
#include <algorithm>
|
||||
#include "bool.hpp"
|
||||
#include "partition/charset.hpp"
|
||||
#include "char_traits.hpp"
|
||||
#include "partition/equivset.hpp"
|
||||
#include <memory>
|
||||
#include "parser/parser.hpp"
|
||||
#include "containers/ptr_list.hpp"
|
||||
#include "rules.hpp"
|
||||
#include "size_t.hpp"
|
||||
#include "state_machine.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename rules, typename sm, typename char_traits = basic_char_traits
|
||||
<typename sm::traits::input_char_type> >
|
||||
class basic_generator
|
||||
{
|
||||
public:
|
||||
typedef typename rules::id_type id_type;
|
||||
typedef typename rules::char_type rules_char_type;
|
||||
typedef typename sm::traits sm_traits;
|
||||
typedef detail::basic_parser<rules_char_type, sm_traits> parser;
|
||||
typedef typename parser::charset_map charset_map;
|
||||
typedef typename parser::node node;
|
||||
typedef typename parser::node_ptr_vector node_ptr_vector;
|
||||
|
||||
static void build (const rules &rules_, sm &sm_)
|
||||
{
|
||||
const std::size_t size_ = rules_.statemap ().size ();
|
||||
// Strong exception guarantee
|
||||
// http://www.boost.org/community/exception_safety.html
|
||||
internals internals_;
|
||||
sm temp_sm_;
|
||||
node_ptr_vector node_ptr_vector_;
|
||||
|
||||
internals_._eoi = rules_.eoi ();
|
||||
internals_.add_states (size_);
|
||||
|
||||
for (id_type index_ = 0; index_ < size_; ++index_)
|
||||
{
|
||||
if (rules_.regexes ()[index_].empty ())
|
||||
{
|
||||
std::ostringstream ss_;
|
||||
|
||||
ss_ << "Lexer states with no rules are not allowed "
|
||||
"(lexer state " << index_ << ".)";
|
||||
throw runtime_error (ss_.str ());
|
||||
}
|
||||
else
|
||||
{
|
||||
// Note that the following variables are per DFA.
|
||||
// Map of regex charset tokens (strings) to index
|
||||
charset_map charset_map_;
|
||||
// Used to fix up $ and \n clashes.
|
||||
id_type nl_id_ = sm_traits::npos ();
|
||||
// Regex syntax tree
|
||||
node *root_ = build_tree (rules_, index_, node_ptr_vector_,
|
||||
charset_map_, nl_id_);
|
||||
|
||||
build_dfa (charset_map_, root_, internals_, temp_sm_, index_,
|
||||
nl_id_);
|
||||
|
||||
if (internals_._dfa[index_]->size () /
|
||||
internals_._dfa_alphabet[index_] >= sm_traits::npos ())
|
||||
{
|
||||
// Overflow
|
||||
throw runtime_error ("The data type you have chosen "
|
||||
"cannot hold this many DFA rows.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If you get a compile error here the id_type from rules and
|
||||
// state machine do no match.
|
||||
create (internals_, temp_sm_, rules_.features (), lookup ());
|
||||
sm_.swap (temp_sm_);
|
||||
}
|
||||
|
||||
// Parse every regex of lexer state dfa_ and combine the resulting syntax
// trees into one tree.
//
// The first regex becomes the initial root; each later regex is joined to
// the current root with a new selection_node, which becomes the new root.
// The id, user id, next dfa, push dfa and pop values are read in step with
// the regexes.  Nodes are owned by node_ptr_vector_; charset_map_ and
// nl_id_ are filled in by the macro builder and the parser.
//
// Returns the root of the combined tree, or 0 if the state has no regexes.
static node *build_tree (const rules &rules_, const std::size_t dfa_,
    node_ptr_vector &node_ptr_vector_, charset_map &charset_map_,
    id_type &nl_id_)
{
    typename parser::macro_map macro_map_;
    parser parser_ (rules_.locale (), node_ptr_vector_, macro_map_,
        charset_map_, rules_.eoi ());
    const typename rules::string_deque_deque &regexes_ =
        rules_.regexes ();
    typename rules::string_deque::const_iterator regex_iter_ =
        regexes_[dfa_].begin ();
    typename rules::string_deque::const_iterator regex_iter_end_ =
        regexes_[dfa_].end ();
    const typename rules::id_vector_deque &ids_ = rules_.ids ();
    const typename rules::id_vector_deque &user_ids_ =
        rules_.user_ids ();
    typename rules::id_vector::const_iterator id_iter_ =
        ids_[dfa_].begin ();
    typename rules::id_vector::const_iterator user_id_iter_ =
        user_ids_[dfa_].begin ();
    const typename rules::id_vector_deque &next_dfas_ =
        rules_.next_dfas ();
    const typename rules::id_vector_deque &pushes_ = rules_.pushes ();
    const typename rules::bool_vector_deque &pops_ = rules_.pops ();
    typename rules::id_vector::const_iterator next_dfa_iter_ =
        next_dfas_[dfa_].begin ();
    typename rules::id_vector::const_iterator push_dfa_iter_ =
        pushes_[dfa_].begin ();
    typename rules::bool_vector::const_iterator pop_dfa_iter_ =
        pops_[dfa_].begin ();
    const bool seen_bol_ = (rules_.features ()[dfa_] & bol_bit) != 0;
    node *root_ = 0;
    bool first_ = true;

    // Macros have a different context per lexer state
    // as equivsets (generally) differ.
    build_macros (rules_, macro_map_, node_ptr_vector_, charset_map_,
        nl_id_);

    // Build syntax trees
    for (; regex_iter_ != regex_iter_end_; ++regex_iter_, ++id_iter_,
        ++user_id_iter_, ++next_dfa_iter_, ++push_dfa_iter_,
        ++pop_dfa_iter_)
    {
        // Declared inside the loop: a reference cannot be re-bound, so
        // assigning to one declared outside would copy the string.
        const typename rules::string &regex_ = *regex_iter_;
        node *parsed_ = parser_.parse (regex_.c_str (),
            regex_.c_str () + regex_.size (), *id_iter_, *user_id_iter_,
            *next_dfa_iter_, *push_dfa_iter_, *pop_dfa_iter_,
            rules_.flags (), nl_id_, seen_bol_, false);

        if (first_)
        {
            root_ = parsed_;
            first_ = false;
        }
        else
        {
            // Reserve the slot before allocating, so that the new node
            // is owned by the vector as soon as it exists.
            node_ptr_vector_->push_back
                (static_cast<selection_node *>(0));
            node_ptr_vector_->back () = new selection_node (root_, parsed_);
            root_ = node_ptr_vector_->back ();
        }
    }

    return root_;
}
|
||||
|
||||
protected:
|
||||
typedef bool_<sm_traits::compressed> compressed;
|
||||
typedef detail::basic_equivset<id_type> equivset;
|
||||
typedef detail::ptr_list<equivset> equivset_list;
|
||||
typedef std::auto_ptr<equivset> equivset_ptr;
|
||||
typedef typename sm_traits::char_type sm_char_type;
|
||||
typedef detail::basic_charset<sm_char_type, id_type> charset;
|
||||
typedef std::auto_ptr<charset> charset_ptr;
|
||||
typedef detail::ptr_list<charset> charset_list;
|
||||
typedef detail::basic_internals<id_type> internals;
|
||||
typedef typename std::set<id_type> id_type_set;
|
||||
typedef typename internals::id_type_vector id_type_vector;
|
||||
typedef typename charset::index_set index_set;
|
||||
typedef std::vector<index_set> index_set_vector;
|
||||
typedef bool_<sm_traits::is_dfa> is_dfa;
|
||||
typedef bool_<sm_traits::lookup> lookup;
|
||||
typedef typename parser::macro_map macro_map;
|
||||
typedef typename macro_map::iterator macro_iter;
|
||||
typedef std::pair<macro_iter, bool> macro_iter_pair;
|
||||
typedef std::set<const node *> node_set;
|
||||
typedef detail::ptr_vector<node_set> node_set_vector;
|
||||
typedef typename node::node_vector node_vector;
|
||||
typedef detail::ptr_vector<node_vector> node_vector_vector;
|
||||
typedef std::pair<typename rules::string, const node *> macro_pair;
|
||||
typedef typename parser::selection_node selection_node;
|
||||
typedef typename std::vector<std::size_t> size_t_vector;
|
||||
typedef typename parser::string_token string_token;
|
||||
|
||||
static void build_macros (const rules &rules_,
|
||||
macro_map ¯o_map_, node_ptr_vector &node_ptr_vector_,
|
||||
charset_map &charset_map_, id_type &nl_id_)
|
||||
{
|
||||
const typename rules::string_pair_deque ¯odeque_ =
|
||||
rules_.macrodeque ();
|
||||
|
||||
for (typename rules::string_pair_deque::const_iterator iter_ =
|
||||
macrodeque_.begin (), end_ = macrodeque_.end ();
|
||||
iter_ != end_; ++iter_)
|
||||
{
|
||||
const typename rules::string &name_ = iter_->first;
|
||||
const typename rules::string ®ex_ = iter_->second;
|
||||
parser parser_ (rules_.locale (), node_ptr_vector_, macro_map_,
|
||||
charset_map_, rules_.eoi ());
|
||||
node *node_ = parser_.parse (regex_.c_str (),
|
||||
regex_.c_str () + regex_.size (), 0, 0, 0, false, false,
|
||||
rules_.flags (), nl_id_, false, true);
|
||||
macro_iter_pair map_iter_ = macro_map_.insert (macro_pair (name_,
|
||||
static_cast<const node *>(0)));
|
||||
|
||||
map_iter_.first->second = node_;
|
||||
}
|
||||
}
|
||||
|
||||
static void build_dfa (const charset_map &charset_map_, const node *root_,
|
||||
internals &internals_, sm &sm_, const id_type dfa_index_,
|
||||
id_type &nl_id_)
|
||||
{
|
||||
// partitioned charset list
|
||||
charset_list charset_list_;
|
||||
// vector mapping token indexes to partitioned token index sets
|
||||
index_set_vector set_mapping_;
|
||||
typename internals::id_type_vector &dfa_ =
|
||||
*internals_._dfa[dfa_index_];
|
||||
std::size_t dfa_alphabet_ = 0;
|
||||
const node_vector *followpos_ = &root_->firstpos ();
|
||||
node_set_vector seen_sets_;
|
||||
node_vector_vector seen_vectors_;
|
||||
size_t_vector hash_vector_;
|
||||
id_type zero_id_ = sm_traits::npos ();
|
||||
id_type_set eol_set_;
|
||||
|
||||
set_mapping_.resize (charset_map_.size ());
|
||||
partition_charsets (charset_map_, charset_list_, is_dfa ());
|
||||
build_set_mapping (charset_list_, internals_, dfa_index_,
|
||||
set_mapping_);
|
||||
|
||||
if (nl_id_ != sm_traits::npos ())
|
||||
{
|
||||
nl_id_ = *set_mapping_[nl_id_].begin ();
|
||||
zero_id_ = sm_traits::compressed ?
|
||||
*set_mapping_[charset_map_.find (string_token (0, 0))->
|
||||
second].begin () : sm_traits::npos ();
|
||||
}
|
||||
|
||||
dfa_alphabet_ = charset_list_->size () + transitions_index +
|
||||
(nl_id_ == sm_traits::npos () ? 0 : 1);
|
||||
|
||||
if (dfa_alphabet_ > sm_traits::npos ())
|
||||
{
|
||||
// Overflow
|
||||
throw runtime_error ("The data type you have chosen cannot hold "
|
||||
"the dfa alphabet.");
|
||||
}
|
||||
|
||||
internals_._dfa_alphabet[dfa_index_] = dfa_alphabet_;
|
||||
// 'jam' state
|
||||
dfa_.resize (dfa_alphabet_, 0);
|
||||
closure (followpos_, seen_sets_, seen_vectors_, hash_vector_,
|
||||
dfa_alphabet_, dfa_);
|
||||
|
||||
for (id_type index_ = 0; index_ < static_cast<id_type>
|
||||
(seen_vectors_->size ()); ++index_)
|
||||
{
|
||||
equivset_list equiv_list_;
|
||||
|
||||
build_equiv_list (seen_vectors_[index_], set_mapping_,
|
||||
equiv_list_, is_dfa ());
|
||||
|
||||
for (typename equivset_list::list::const_iterator iter_ =
|
||||
equiv_list_->begin (), end_ = equiv_list_->end ();
|
||||
iter_ != end_; ++iter_)
|
||||
{
|
||||
equivset *equivset_ = *iter_;
|
||||
const id_type transition_ = closure
|
||||
(&equivset_->_followpos, seen_sets_, seen_vectors_,
|
||||
hash_vector_, dfa_alphabet_, dfa_);
|
||||
|
||||
if (transition_ != sm_traits::npos ())
|
||||
{
|
||||
id_type *ptr_ = &dfa_.front () + ((index_ + 1) *
|
||||
dfa_alphabet_);
|
||||
|
||||
// Prune abstemious transitions from end states.
|
||||
if (*ptr_ && !equivset_->_greedy) continue;
|
||||
|
||||
for (typename equivset::index_vector::const_iterator
|
||||
equiv_iter_ = equivset_->_index_vector.begin (),
|
||||
equiv_end_ = equivset_->_index_vector.end ();
|
||||
equiv_iter_ != equiv_end_; ++equiv_iter_)
|
||||
{
|
||||
const id_type i_ = *equiv_iter_;
|
||||
|
||||
if (i_ == parser::bol_token ())
|
||||
{
|
||||
dfa_.front () = transition_;
|
||||
}
|
||||
else if (i_ == parser:: eol_token ())
|
||||
{
|
||||
ptr_[eol_index] = transition_;
|
||||
eol_set_.insert (index_ + 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
ptr_[i_ + transitions_index] = transition_;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fix_clashes (eol_set_, nl_id_, zero_id_, dfa_, dfa_alphabet_,
|
||||
compressed ());
|
||||
append_dfa (charset_list_, internals_, sm_, dfa_index_, lookup ());
|
||||
}
|
||||
|
||||
// Uncompressed
|
||||
static void fix_clashes (const id_type_set &eol_set_,
|
||||
const id_type nl_id_, const id_type /*zero_id_*/,
|
||||
typename internals::id_type_vector &dfa_,
|
||||
const std::size_t dfa_alphabet_, const false_ &)
|
||||
{
|
||||
typename id_type_set::const_iterator eol_iter_ =
|
||||
eol_set_.begin ();
|
||||
typename id_type_set::const_iterator eol_end_ =
|
||||
eol_set_.end ();
|
||||
|
||||
for (; eol_iter_ != eol_end_; ++eol_iter_)
|
||||
{
|
||||
id_type *ptr_ = &dfa_.front () + *eol_iter_ * dfa_alphabet_;
|
||||
const id_type eol_state_ = ptr_[eol_index];
|
||||
const id_type nl_state_ = ptr_[nl_id_ + transitions_index];
|
||||
|
||||
if (nl_state_)
|
||||
{
|
||||
ptr_[transitions_index + nl_id_] = 0;
|
||||
ptr_ = &dfa_.front () + eol_state_ * dfa_alphabet_;
|
||||
|
||||
if (ptr_[transitions_index + nl_id_] == 0)
|
||||
{
|
||||
ptr_[transitions_index + nl_id_] = nl_state_;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Compressed
|
||||
static void fix_clashes (const id_type_set &eol_set_,
|
||||
const id_type nl_id_, const id_type zero_id_,
|
||||
typename internals::id_type_vector &dfa_,
|
||||
const std::size_t dfa_alphabet_, const true_ &)
|
||||
{
|
||||
typename id_type_set::const_iterator eol_iter_ =
|
||||
eol_set_.begin ();
|
||||
typename id_type_set::const_iterator eol_end_ =
|
||||
eol_set_.end ();
|
||||
std::size_t i_ = 0;
|
||||
|
||||
for (; eol_iter_ != eol_end_; ++eol_iter_)
|
||||
{
|
||||
id_type *ptr_ = &dfa_.front () + *eol_iter_ * dfa_alphabet_;
|
||||
const id_type eol_state_ = ptr_[eol_index];
|
||||
id_type nl_state_ = 0;
|
||||
|
||||
for (; i_ < (sm_traits::char_24_bit ? 2 : 1); ++i_)
|
||||
{
|
||||
ptr_ = &dfa_.front () + ptr_[transitions_index + zero_id_] *
|
||||
dfa_alphabet_;
|
||||
}
|
||||
|
||||
nl_state_ = ptr_[transitions_index + nl_id_];
|
||||
|
||||
if (nl_state_)
|
||||
{
|
||||
ptr_ = &dfa_.front () + eol_state_ * dfa_alphabet_;
|
||||
|
||||
if (ptr_[transitions_index + zero_id_] != 0) continue;
|
||||
|
||||
ptr_[transitions_index + zero_id_] = dfa_.size () /
|
||||
dfa_alphabet_;
|
||||
dfa_.resize (dfa_.size () + dfa_alphabet_, 0);
|
||||
|
||||
for (i_ = 0; i_ < (sm_traits::char_24_bit ? 1 : 0); ++i_)
|
||||
{
|
||||
ptr_ = &dfa_.front () + dfa_.size () - dfa_alphabet_;
|
||||
ptr_[transitions_index + zero_id_] = dfa_.size () /
|
||||
dfa_alphabet_;
|
||||
dfa_.resize (dfa_.size () + dfa_alphabet_, 0);
|
||||
}
|
||||
|
||||
ptr_ = &dfa_.front () + dfa_.size () - dfa_alphabet_;
|
||||
ptr_[transitions_index + nl_id_] = nl_state_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// char_state_machine version
|
||||
static void append_dfa (const charset_list &charset_list_,
|
||||
const internals &internals_, sm &sm_, const id_type dfa_index_,
|
||||
const false_ &)
|
||||
{
|
||||
typename charset_list::list::const_iterator list_iter_ =
|
||||
charset_list_->begin ();
|
||||
std::size_t size_ = charset_list_->size ();
|
||||
typename sm::string_token_vector token_vector_;
|
||||
|
||||
token_vector_.reserve (size_);
|
||||
|
||||
for (std::size_t i_ = 0; i_ < size_; ++i_, ++list_iter_)
|
||||
{
|
||||
const charset *charset_ = *list_iter_;
|
||||
|
||||
token_vector_.push_back (charset_->_token);
|
||||
}
|
||||
|
||||
sm_.append (token_vector_, internals_, dfa_index_);
|
||||
}
|
||||
|
||||
// state_machine version
|
||||
static void append_dfa (const charset_list &,
|
||||
const internals &, sm &, const id_type, const true_ &)
|
||||
{
|
||||
// Nothing to do - will use create() instead
|
||||
}
|
||||
|
||||
// char_state_machine version
|
||||
// char_state_machine overload (selected by false_): deliberately empty,
// because that machine is populated through append_dfa () instead.
static void create (internals &, sm &, const id_type_vector &,
    const false_ &)
{
}
|
||||
|
||||
// state_machine version
|
||||
// state_machine overload (selected by true_).
//
// ORs the per-state feature bits from features_ into internals_._features,
// adds multi_state_bit when more than one DFA exists, and then swaps
// internals_ with the data held by sm_.
static void create (internals &internals_, sm &sm_,
    const id_type_vector &features_, const true_ &)
{
    const std::size_t dfa_count_ = internals_._dfa->size ();

    for (std::size_t state_ = 0; state_ != dfa_count_; ++state_)
    {
        internals_._features |= features_[state_];
    }

    if (dfa_count_ > 1)
    {
        internals_._features |= multi_state_bit;
    }

    sm_.data ().swap (internals_);
}
|
||||
|
||||
// NFA version
|
||||
static void partition_charsets (const charset_map &map_,
|
||||
charset_list &lhs_, const false_ &)
|
||||
{
|
||||
fill_rhs_list (map_, lhs_);
|
||||
}
|
||||
|
||||
// DFA version
|
||||
static void partition_charsets (const charset_map &map_,
|
||||
charset_list &lhs_, const true_ &)
|
||||
{
|
||||
charset_list rhs_;
|
||||
|
||||
fill_rhs_list (map_, rhs_);
|
||||
|
||||
if (!rhs_->empty ())
|
||||
{
|
||||
typename charset_list::list::iterator iter_;
|
||||
typename charset_list::list::iterator end_;
|
||||
charset_ptr overlap_ (new charset);
|
||||
|
||||
lhs_->push_back (static_cast<charset *>(0));
|
||||
lhs_->back () = rhs_->front ();
|
||||
rhs_->pop_front ();
|
||||
|
||||
while (!rhs_->empty ())
|
||||
{
|
||||
charset_ptr r_ (rhs_->front ());
|
||||
|
||||
rhs_->pop_front ();
|
||||
iter_ = lhs_->begin ();
|
||||
end_ = lhs_->end ();
|
||||
|
||||
while (!r_->empty () && iter_ != end_)
|
||||
{
|
||||
typename charset_list::list::iterator l_iter_ = iter_;
|
||||
|
||||
(*l_iter_)->intersect (*r_.get (), *overlap_.get ());
|
||||
|
||||
if (overlap_->empty ())
|
||||
{
|
||||
++iter_;
|
||||
}
|
||||
else if ((*l_iter_)->empty ())
|
||||
{
|
||||
delete *l_iter_;
|
||||
*l_iter_ = overlap_.release ();
|
||||
overlap_.reset (new charset);
|
||||
++iter_;
|
||||
}
|
||||
else if (r_->empty ())
|
||||
{
|
||||
delete r_.release ();
|
||||
r_ = overlap_;
|
||||
overlap_.reset (new charset);
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
iter_ = lhs_->insert (++iter_,
|
||||
static_cast<charset *>(0));
|
||||
*iter_ = overlap_.release ();
|
||||
overlap_.reset (new charset);
|
||||
++iter_;
|
||||
end_ = lhs_->end ();
|
||||
}
|
||||
}
|
||||
|
||||
if (!r_->empty ())
|
||||
{
|
||||
lhs_->push_back (static_cast<charset *>(0));
|
||||
lhs_->back () = r_.release ();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void fill_rhs_list (const charset_map &map_,
|
||||
charset_list &list_)
|
||||
{
|
||||
typename charset_map::const_iterator iter_ = map_.begin ();
|
||||
typename charset_map::const_iterator end_ = map_.end ();
|
||||
|
||||
for (; iter_ != end_; ++iter_)
|
||||
{
|
||||
list_->push_back (static_cast<charset *>(0));
|
||||
list_->back () = new charset (iter_->first, iter_->second);
|
||||
}
|
||||
}
|
||||
|
||||
static void build_set_mapping (const charset_list &charset_list_,
|
||||
internals &internals_, const id_type dfa_index_,
|
||||
index_set_vector &set_mapping_)
|
||||
{
|
||||
typename charset_list::list::const_iterator iter_ =
|
||||
charset_list_->begin ();
|
||||
typename charset_list::list::const_iterator end_ =
|
||||
charset_list_->end ();
|
||||
typename index_set::const_iterator set_iter_;
|
||||
typename index_set::const_iterator set_end_;
|
||||
|
||||
for (id_type index_ = 0; iter_ != end_; ++iter_, ++index_)
|
||||
{
|
||||
const charset *cs_ = *iter_;
|
||||
|
||||
set_iter_ = cs_->_index_set.begin ();
|
||||
set_end_ = cs_->_index_set.end ();
|
||||
fill_lookup (cs_->_token, internals_._lookup[dfa_index_],
|
||||
index_, lookup ());
|
||||
|
||||
for (; set_iter_ != set_end_; ++set_iter_)
|
||||
{
|
||||
set_mapping_[*set_iter_].insert (index_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// char_state_machine version
|
||||
static void fill_lookup (const string_token &, id_type_vector *,
|
||||
const id_type, const false_ &)
|
||||
{
|
||||
// Do nothing (lookup not used)
|
||||
}
|
||||
|
||||
// state_machine version
|
||||
static void fill_lookup (const string_token &charset_,
|
||||
id_type_vector *lookup_, const id_type index_, const true_ &)
|
||||
{
|
||||
typename string_token::range_vector::const_iterator iter_ =
|
||||
charset_._ranges.begin ();
|
||||
typename string_token::range_vector::const_iterator end_ =
|
||||
charset_._ranges.end ();
|
||||
id_type *ptr_ = &lookup_->front ();
|
||||
|
||||
for (; iter_ != end_; ++iter_)
|
||||
{
|
||||
for (typename char_traits::index_type char_ = iter_->first;
|
||||
char_ < iter_->second; ++char_)
|
||||
{
|
||||
// Note char_ must be unsigned
|
||||
ptr_[char_] = index_ + transitions_index;
|
||||
}
|
||||
|
||||
// Note iter_->second must be unsigned
|
||||
ptr_[iter_->second] = index_ + transitions_index;
|
||||
}
|
||||
}
|
||||
|
||||
// Maps a followpos set onto a DFA state number.
//
// Returns sm_traits::npos () when followpos_ is empty. Otherwise the nodes
// are gathered into a set (via closure_ex ()); if an equal set was seen
// before, its 1-based position is returned. If not, the set is recorded, a
// zero-filled row of size_ entries is appended to dfa_, and the new 1-based
// position is returned. When any gathered node is an end state, the new row
// receives end_state_bit (plus pop_dfa_bit if requested) and the id,
// user id, push dfa and next dfa of the first end node encountered.
//
// seen_sets_ / seen_vectors_ / hash_vector_ are parallel: entry k describes
// state k + 1 (state 0 is the jam state).
static id_type closure (const node_vector *followpos_,
    node_set_vector &seen_sets_, node_vector_vector &seen_vectors_,
    size_t_vector &hash_vector_, const id_type size_, id_type_vector &dfa_)
{
    if (followpos_->empty ()) return sm_traits::npos ();

    bool is_end_ = false;
    id_type id_ = 0;
    id_type user_id_ = sm_traits::npos ();
    id_type next_dfa_ = 0;
    id_type push_dfa_ = sm_traits::npos ();
    bool pop_dfa_ = false;
    std::size_t hash_ = 0;
    std::auto_ptr<node_set> nodes_ (new node_set);
    std::auto_ptr<node_vector> ordered_ (new node_vector);

    for (typename node_vector::const_iterator node_iter_ =
        followpos_->begin (), node_end_ = followpos_->end ();
        node_iter_ != node_end_; ++node_iter_)
    {
        closure_ex (*node_iter_, is_end_, id_, user_id_, next_dfa_,
            push_dfa_, pop_dfa_, nodes_.get (), ordered_.get (), hash_);
    }

    // Look for an identical set; the hash is a cheap pre-filter.
    typename size_t_vector::const_iterator hash_iter_ =
        hash_vector_.begin ();
    typename size_t_vector::const_iterator hash_end_ =
        hash_vector_.end ();
    typename node_set_vector::vector::const_iterator set_iter_ =
        seen_sets_->begin ();
    id_type position_ = 0;

    while (hash_iter_ != hash_end_)
    {
        ++position_;

        if (*hash_iter_ == hash_ && *(*set_iter_) == *nodes_)
        {
            return position_;
        }

        ++hash_iter_;
        ++set_iter_;
    }

    // Not seen before: record the set and append a row for it.
    seen_sets_->push_back (static_cast<node_set *>(0));
    seen_sets_->back () = nodes_.release ();
    seen_vectors_->push_back (static_cast<node_vector *>(0));
    seen_vectors_->back () = ordered_.release ();
    hash_vector_.push_back (hash_);
    // State 0 is the jam state...
    position_ = static_cast<id_type>(seen_sets_->size ());

    const std::size_t row_start_ = dfa_.size ();

    dfa_.resize (row_start_ + size_, 0);

    if (is_end_)
    {
        dfa_[row_start_] |= end_state_bit;

        if (pop_dfa_)
        {
            dfa_[row_start_] |= pop_dfa_bit;
        }

        dfa_[row_start_ + id_index] = id_;
        dfa_[row_start_ + user_id_index] = user_id_;
        dfa_[row_start_ + push_dfa_index] = push_dfa_;
        dfa_[row_start_ + next_dfa_index] = next_dfa_;
    }

    return position_;
}
|
||||
|
||||
// Adds one node to the set being gathered by closure ().
//
// If node_ is an end state and no end state has been recorded yet, the
// out-parameters (end_state_, id_, user_id_, next_dfa_, push_dfa_,
// pop_dfa_) are filled from it; later end nodes do not overwrite them.
// If node_ was not already in *set_ptr_, it is also appended to
// *vector_ptr_ and its address is added into hash_.
static void closure_ex (node *node_, bool &end_state_,
    id_type &id_, id_type &user_id_, id_type &next_dfa_,
    id_type &push_dfa_, bool &pop_dfa_, node_set *set_ptr_,
    node_vector *vector_ptr_, std::size_t &hash_)
{
    if (node_->end_state () && !end_state_)
    {
        end_state_ = true;
        id_ = node_->id ();
        user_id_ = node_->user_id ();
        next_dfa_ = node_->next_dfa ();
        push_dfa_ = node_->push_dfa ();
        pop_dfa_ = node_->pop_dfa ();
    }

    // Duplicates contribute nothing further.
    if (!set_ptr_->insert (node_).second) return;

    vector_ptr_->push_back (node_);
    hash_ += reinterpret_cast<std::size_t> (node_);
}
|
||||
|
||||
// NFA version
|
||||
static void build_equiv_list (const node_vector *vector_,
|
||||
const index_set_vector &set_mapping_, equivset_list &lhs_,
|
||||
const false_ &)
|
||||
{
|
||||
fill_rhs_list (vector_, set_mapping_, lhs_);
|
||||
}
|
||||
|
||||
// DFA version
|
||||
static void build_equiv_list (const node_vector *vector_,
|
||||
const index_set_vector &set_mapping_, equivset_list &lhs_,
|
||||
const true_ &)
|
||||
{
|
||||
equivset_list rhs_;
|
||||
|
||||
fill_rhs_list (vector_, set_mapping_, rhs_);
|
||||
|
||||
if (!rhs_->empty ())
|
||||
{
|
||||
typename equivset_list::list::iterator iter_;
|
||||
typename equivset_list::list::iterator end_;
|
||||
equivset_ptr overlap_ (new equivset);
|
||||
|
||||
lhs_->push_back (static_cast<equivset *>(0));
|
||||
lhs_->back () = rhs_->front ();
|
||||
rhs_->pop_front ();
|
||||
|
||||
while (!rhs_->empty ())
|
||||
{
|
||||
equivset_ptr r_ (rhs_->front ());
|
||||
|
||||
rhs_->pop_front ();
|
||||
iter_ = lhs_->begin ();
|
||||
end_ = lhs_->end ();
|
||||
|
||||
while (!r_->empty () && iter_ != end_)
|
||||
{
|
||||
typename equivset_list::list::iterator l_iter_ = iter_;
|
||||
|
||||
(*l_iter_)->intersect (*r_.get (), *overlap_.get ());
|
||||
|
||||
if (overlap_->empty ())
|
||||
{
|
||||
++iter_;
|
||||
}
|
||||
else if ((*l_iter_)->empty ())
|
||||
{
|
||||
delete *l_iter_;
|
||||
*l_iter_ = overlap_.release ();
|
||||
overlap_.reset (new equivset);
|
||||
++iter_;
|
||||
}
|
||||
else if (r_->empty ())
|
||||
{
|
||||
delete r_.release ();
|
||||
r_ = overlap_;
|
||||
overlap_.reset (new equivset);
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
iter_ = lhs_->insert (++iter_,
|
||||
static_cast<equivset *>(0));
|
||||
*iter_ = overlap_.release ();
|
||||
overlap_.reset (new equivset);
|
||||
++iter_;
|
||||
end_ = lhs_->end ();
|
||||
}
|
||||
}
|
||||
|
||||
if (!r_->empty ())
|
||||
{
|
||||
lhs_->push_back (static_cast<equivset *>(0));
|
||||
lhs_->back () = r_.release ();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void fill_rhs_list (const node_vector *vector_,
|
||||
const index_set_vector &set_mapping_, equivset_list &list_)
|
||||
{
|
||||
typename node_vector::const_iterator iter_ =
|
||||
vector_->begin ();
|
||||
typename node_vector::const_iterator end_ =
|
||||
vector_->end ();
|
||||
|
||||
for (; iter_ != end_; ++iter_)
|
||||
{
|
||||
const node *node_ = *iter_;
|
||||
|
||||
if (!node_->end_state ())
|
||||
{
|
||||
const id_type token_ = node_->token ();
|
||||
|
||||
if (token_ != node::null_token ())
|
||||
{
|
||||
list_->push_back (static_cast<equivset *>(0));
|
||||
|
||||
if (token_ == parser::bol_token () ||
|
||||
token_ == parser::eol_token ())
|
||||
{
|
||||
std::set<id_type> index_set_;
|
||||
|
||||
index_set_.insert (token_);
|
||||
list_->back () = new equivset (index_set_,
|
||||
token_, node_->greedy (), node_->followpos ());
|
||||
}
|
||||
else
|
||||
{
|
||||
list_->back () = new equivset (set_mapping_[token_],
|
||||
token_, node_->greedy (), node_->followpos ());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
typedef basic_generator<rules, state_machine> generator;
|
||||
typedef basic_generator<wrules, wstate_machine> wgenerator;
|
||||
typedef basic_generator<rules, char_state_machine> char_generator;
|
||||
typedef basic_generator<wrules, wchar_state_machine> wchar_generator;
|
||||
}
|
||||
|
||||
#endif
|
||||
80
inc/lexertl/internals.hpp
Normal file
80
inc/lexertl/internals.hpp
Normal file
@@ -0,0 +1,80 @@
|
||||
// internals.hpp
|
||||
// Copyright (c) 2009-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_INTERNALS_HPP
|
||||
#define LEXERTL_INTERNALS_HPP
|
||||
|
||||
#include "enums.hpp"
|
||||
#include "containers/ptr_vector.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
struct basic_internals
|
||||
{
|
||||
typedef std::vector<id_type> id_type_vector;
|
||||
typedef ptr_vector<id_type_vector> id_type_vector_vector;
|
||||
|
||||
id_type _eoi;
|
||||
id_type_vector_vector _lookup;
|
||||
id_type_vector _dfa_alphabet;
|
||||
id_type _features;
|
||||
id_type_vector_vector _dfa;
|
||||
|
||||
basic_internals () :
|
||||
_eoi (0),
|
||||
_lookup (),
|
||||
_dfa_alphabet (),
|
||||
_features (0),
|
||||
_dfa ()
|
||||
{
|
||||
}
|
||||
|
||||
void clear ()
|
||||
{
|
||||
_eoi = 0;
|
||||
_lookup.clear ();
|
||||
_dfa_alphabet.clear ();
|
||||
_features = 0;
|
||||
_dfa.clear ();
|
||||
}
|
||||
|
||||
bool empty () const
|
||||
{
|
||||
return _dfa->empty ();
|
||||
}
|
||||
|
||||
void add_states (const std::size_t num_)
|
||||
{
|
||||
for (std::size_t index_ = 0; index_ < num_; ++index_)
|
||||
{
|
||||
_lookup->push_back (static_cast<id_type_vector *>(0));
|
||||
// lookup *always* has a size 256 now.
|
||||
_lookup->back () = new id_type_vector (256, dead_state_index);
|
||||
_dfa_alphabet.push_back (0);
|
||||
_dfa->push_back (static_cast<id_type_vector *>(0));
|
||||
_dfa->back () = new id_type_vector;
|
||||
}
|
||||
}
|
||||
|
||||
void swap (basic_internals &internals_)
|
||||
{
|
||||
std::swap (_eoi, internals_._eoi);
|
||||
_lookup->swap (*internals_._lookup);
|
||||
_dfa_alphabet.swap (internals_._dfa_alphabet);
|
||||
std::swap (_features, internals_._features);
|
||||
_dfa->swap (*internals_._dfa);
|
||||
}
|
||||
|
||||
private:
|
||||
basic_internals (const basic_internals &); // No copy construction.
|
||||
basic_internals &operator = (const basic_internals &); // No assignment.
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
29
inc/lexertl/is_same.hpp
Normal file
29
inc/lexertl/is_same.hpp
Normal file
@@ -0,0 +1,29 @@
|
||||
// is_same.hpp
|
||||
// Copyright (c) 2010-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_IS_SAME_HPP
|
||||
#define LEXERTL_IS_SAME_HPP
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
// Compile-time type equality: is_same<a, b>::same is true only when a and b
// are the same type.

// General case: two different types.
template<typename lhs_type, typename rhs_type>
struct is_same
{
    enum {same = false};
};

// Specialisation chosen when both arguments are the same type.
template<typename type>
struct is_same<type, type>
{
    enum {same = true};
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
24
inc/lexertl/licence_1_0.txt
Normal file
24
inc/lexertl/licence_1_0.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
Boost Software License - Version 1.0 - August 17th, 2003
|
||||
|
||||
Permission is hereby granted, free of charge, to any person or organization
|
||||
obtaining a copy of the software and accompanying documentation covered by
|
||||
this license (the "Software") to use, reproduce, display, distribute,
|
||||
execute, and transmit the Software, and to prepare derivative works of the
|
||||
Software, and to permit third-parties to whom the Software is furnished to
|
||||
do so, all subject to the following:
|
||||
|
||||
The copyright notices in the Software and this entire statement, including
|
||||
the above license grant, this restriction and the following disclaimer,
|
||||
must be included in all copies of the Software, in whole or in part, and
|
||||
all derivative works of the Software, unless such copies or derivative
|
||||
works are solely in the form of machine-executable object code generated by
|
||||
a source language processor.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
477
inc/lexertl/lookup.hpp
Normal file
477
inc/lexertl/lookup.hpp
Normal file
@@ -0,0 +1,477 @@
|
||||
// lookup.hpp
|
||||
// Copyright (c) 2009-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_LOOKUP_HPP
|
||||
#define LEXERTL_LOOKUP_HPP
|
||||
|
||||
#include <assert.h>
|
||||
#include "bool.hpp"
|
||||
#include "match_results.hpp"
|
||||
#include "state_machine.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
// Beginning-of-line bookkeeping, selected by a compile-time flag.

// Flag off: carries no data; the constructor argument is ignored.
template<bool>
struct bol_state
{
    bol_state (const bool)
    {
    }
};

// Flag on: both flags start out equal to the constructor argument.
template<>
struct bol_state<true>
{
    bool _bol;
    bool _end_bol;

    bol_state (const bool bol_) :
        _bol (bol_),
        _end_bol (bol_)
    {
    }
};
|
||||
|
||||
// End-of-line bookkeeping, selected by a compile-time flag.

// Flag off: carries no data.
template<typename id_type, bool>
struct eol_state
{
};

// Flag on: holds one id, initialised to 0.
template<typename id_type>
struct eol_state<id_type, true>
{
    id_type _EOL_state;

    eol_state () :
        _EOL_state (0)
    {
    }
};
|
||||
|
||||
// Start-state bookkeeping, selected by a compile-time flag.

// Flag off: carries no data; the constructor argument is ignored.
template<typename id_type, bool>
struct multi_state_state
{
    multi_state_state (const id_type)
    {
    }
};

// Flag on: remembers the start state passed to the constructor.
template<typename id_type>
struct multi_state_state<id_type, true>
{
    id_type _start_state;

    multi_state_state (const id_type state_) :
        _start_state (state_)
    {
    }
};
|
||||
|
||||
template<typename id_type, bool>
|
||||
struct recursive_state
|
||||
{
|
||||
recursive_state (const id_type *)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template<typename id_type>
|
||||
struct recursive_state<id_type, true>
|
||||
{
|
||||
bool _pop;
|
||||
id_type _push_dfa;
|
||||
|
||||
recursive_state (const id_type *ptr_) :
|
||||
_pop ((*ptr_ & pop_dfa_bit) != 0),
|
||||
_push_dfa (*(ptr_ + push_dfa_index))
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template<typename id_type, typename index_type, std::size_t flags>
|
||||
struct lookup_state
|
||||
{
|
||||
typedef basic_internals<id_type> internals;
|
||||
|
||||
const id_type *_lookup;
|
||||
id_type _dfa_alphabet;
|
||||
const id_type *_dfa;
|
||||
const id_type *_ptr;
|
||||
bool _end_state;
|
||||
id_type _id;
|
||||
id_type _uid;
|
||||
bol_state<(flags & bol_bit) != 0> _bol_state;
|
||||
eol_state<id_type, (flags & eol_bit) != 0> _eol_state;
|
||||
multi_state_state<id_type, (flags & multi_state_bit) != 0>
|
||||
_multi_state_state;
|
||||
recursive_state<id_type, (flags & recursive_bit) != 0> _recursive_state;
|
||||
|
||||
lookup_state (const internals &internals_, const bool bol_,
|
||||
const id_type state_) :
|
||||
_lookup (&internals_._lookup[state_]->front ()),
|
||||
_dfa_alphabet (internals_._dfa_alphabet[state_]),
|
||||
_dfa (&internals_._dfa[state_]->front ()),
|
||||
_ptr (_dfa + _dfa_alphabet),
|
||||
_end_state (*_ptr != 0),
|
||||
_id (*(_ptr + id_index)),
|
||||
_uid (*(_ptr + user_id_index)),
|
||||
_bol_state (bol_),
|
||||
_eol_state (),
|
||||
_multi_state_state (state_),
|
||||
_recursive_state (_ptr)
|
||||
{
|
||||
}
|
||||
|
||||
void reset_recursive (const false_ &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
void reset_recursive (const true_ &)
|
||||
{
|
||||
_recursive_state._pop = (*_ptr & pop_dfa_bit) != 0;
|
||||
_recursive_state._push_dfa = *(_ptr + push_dfa_index);
|
||||
}
|
||||
|
||||
void bol_start_state (const false_ &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
void bol_start_state (const true_ &)
|
||||
{
|
||||
if (_bol_state._bol)
|
||||
{
|
||||
const id_type state_ = *_dfa;
|
||||
|
||||
if (state_)
|
||||
{
|
||||
_ptr = &_dfa[state_ * _dfa_alphabet];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename char_type>
|
||||
bool eol (const char_type, const false_ &)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
template<typename char_type>
|
||||
bool eol (const char_type curr_, const true_ &)
|
||||
{
|
||||
bool ret_ = false;
|
||||
|
||||
_eol_state._EOL_state = _ptr[eol_index];
|
||||
ret_ = _eol_state._EOL_state && curr_ == '\n';
|
||||
|
||||
if (ret_)
|
||||
{
|
||||
_ptr = &_dfa[_eol_state._EOL_state * _dfa_alphabet];
|
||||
}
|
||||
|
||||
return ret_;
|
||||
}
|
||||
|
||||
template<typename char_type>
|
||||
id_type next_char (const char_type prev_char_, const false_ &)
|
||||
{
|
||||
const id_type state_= _ptr[_lookup
|
||||
[static_cast<index_type>(prev_char_)]];
|
||||
|
||||
if (state_ != 0)
|
||||
{
|
||||
_ptr = &_dfa[state_ * _dfa_alphabet];
|
||||
}
|
||||
|
||||
return state_;
|
||||
}
|
||||
|
||||
template<typename char_type>
|
||||
id_type next_char (const char_type prev_char_, const true_ &)
|
||||
{
|
||||
const std::size_t bytes_ = sizeof (char_type) < 3 ?
|
||||
sizeof (char_type) : 3;
|
||||
const std::size_t shift_[] = {0, 8, 16};
|
||||
id_type state_= 0;
|
||||
|
||||
for (std::size_t i_ = 0; i_ < bytes_; ++i_)
|
||||
{
|
||||
state_ = _ptr[_lookup[static_cast<unsigned char>((prev_char_ >>
|
||||
shift_[bytes_ - 1 - i_]) & 0xff)]];
|
||||
|
||||
if (state_ == 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
_ptr = &_dfa[state_ * _dfa_alphabet];
|
||||
}
|
||||
|
||||
return state_;
|
||||
}
|
||||
|
||||
template<typename char_type>
|
||||
void bol (const char_type, const false_ &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
template<typename char_type>
|
||||
void bol (const char_type prev_char_, const true_ &)
|
||||
{
|
||||
_bol_state._bol = prev_char_ == '\n';
|
||||
}
|
||||
|
||||
void eol (const id_type, const false_ &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
void eol (const id_type err_val_, const true_ &)
|
||||
{
|
||||
_eol_state._EOL_state = err_val_;
|
||||
}
|
||||
|
||||
void reset_start_state (const false_ &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
void reset_start_state (const true_ &)
|
||||
{
|
||||
_multi_state_state._start_state = *(_ptr + next_dfa_index);
|
||||
}
|
||||
|
||||
void reset_end_bol (const false_ &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
void reset_end_bol (const true_ &)
|
||||
{
|
||||
_bol_state._end_bol = _bol_state._bol;
|
||||
}
|
||||
|
||||
template<typename iter_type>
|
||||
void end_state (iter_type &end_token_, iter_type &curr_)
|
||||
{
|
||||
if (*_ptr)
|
||||
{
|
||||
_end_state = true;
|
||||
reset_end_bol (bool_<(flags & bol_bit) != 0> ());
|
||||
_id = *(_ptr + id_index);
|
||||
_uid = *(_ptr + user_id_index);
|
||||
reset_recursive (bool_<(flags & recursive_bit) != 0> ());
|
||||
reset_start_state (bool_<(flags & multi_state_bit) != 0> ());
|
||||
end_token_ = curr_;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename iter_type, typename char_type>
|
||||
void check_eol (iter_type &, iter_type &, const id_type,
|
||||
const char_type, const false_ &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
template<typename iter_type, typename char_type>
|
||||
void check_eol (iter_type &end_token_, iter_type &curr_,
|
||||
const id_type npos, const char_type eoi_, const true_ &)
|
||||
{
|
||||
if (_eol_state._EOL_state != npos && curr_ == eoi_)
|
||||
{
|
||||
_eol_state._EOL_state = _ptr[eol_index];
|
||||
|
||||
if (_eol_state._EOL_state)
|
||||
{
|
||||
_ptr = &_dfa[_eol_state._EOL_state * _dfa_alphabet];
|
||||
end_state (end_token_, curr_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename results>
|
||||
void pop (results &, const false_ &)
|
||||
{
|
||||
// Nothing to do
|
||||
}
|
||||
|
||||
template<typename results>
|
||||
void pop (results &results_, const true_ &)
|
||||
{
|
||||
if (_recursive_state._pop)
|
||||
{
|
||||
_multi_state_state._start_state = results_.stack.top ().first;
|
||||
results_.stack.pop ();
|
||||
}
|
||||
else if (_recursive_state._push_dfa != results::npos ())
|
||||
{
|
||||
results_.stack.push (typename results::id_type_pair
|
||||
(_recursive_state._push_dfa, _id));
|
||||
}
|
||||
}
|
||||
|
||||
template<typename results>
|
||||
bool id_eoi (const id_type eoi_, const results &, const false_ &)
|
||||
{
|
||||
return _id == eoi_;
|
||||
}
|
||||
|
||||
template<typename results>
|
||||
bool id_eoi (const id_type eoi_, const results &results_, const true_ &)
|
||||
{
|
||||
return _id == eoi_ || (_recursive_state._pop &&
|
||||
!results_.stack.empty () && results_.stack.top ().second == eoi_);
|
||||
}
|
||||
|
||||
void start_state (id_type &, const false_ &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
void start_state (id_type &start_state_, const true_ &)
|
||||
{
|
||||
start_state_ = _multi_state_state._start_state;
|
||||
}
|
||||
|
||||
void bol (bool &, const false_ &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
void bol (bool &end_bol_, const true_ &)
|
||||
{
|
||||
end_bol_ = _bol_state._end_bol;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename results>
|
||||
void inc_end (results &, const false_ &)
|
||||
{
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
template<typename results>
|
||||
void inc_end (results &results_, const true_ &)
|
||||
{
|
||||
++results_.end;
|
||||
}
|
||||
|
||||
template<typename iter_type, std::size_t flags, typename id_type,
|
||||
typename results, bool compressed, bool recursive>
|
||||
void next (const basic_state_machine<typename std::iterator_traits
|
||||
<iter_type>::value_type, id_type> &sm_,
|
||||
results &results_, const bool_<compressed> &compressed_,
|
||||
const bool_<recursive> &recursive_)
|
||||
{
|
||||
const basic_internals<id_type> &internals_ = sm_.data ();
|
||||
typename results::iter_type end_token_ = results_.end;
|
||||
|
||||
skip:
|
||||
typename results::iter_type curr_ = results_.end;
|
||||
|
||||
results_.start = curr_;
|
||||
|
||||
again:
|
||||
if (curr_ == results_.eoi)
|
||||
{
|
||||
results_.id = internals_._eoi;
|
||||
results_.user_id = results::npos ();
|
||||
return;
|
||||
}
|
||||
|
||||
lookup_state<id_type, typename results::index_type, flags> lu_state_
|
||||
(internals_, results_.bol, results_.state);
|
||||
lu_state_.bol_start_state (bool_<(flags & bol_bit) != 0> ());
|
||||
|
||||
while (curr_ != results_.eoi)
|
||||
{
|
||||
if (!lu_state_.eol (*curr_, bool_<(flags & eol_bit) != 0> ()))
|
||||
{
|
||||
const typename results::char_type prev_char_ = *curr_++;
|
||||
const id_type state_ = lu_state_.next_char (prev_char_,
|
||||
compressed_);
|
||||
|
||||
lu_state_.bol (prev_char_, bool_<(flags & bol_bit) != 0> ());
|
||||
|
||||
if (state_ == 0)
|
||||
{
|
||||
lu_state_.eol (results::npos (),
|
||||
bool_<(flags & eol_bit) != 0> ());
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
lu_state_.end_state (end_token_, curr_);
|
||||
}
|
||||
|
||||
lu_state_.check_eol (end_token_, curr_, results::npos (), results_.eoi,
|
||||
bool_<(flags & eol_bit) != 0> ());
|
||||
|
||||
if (lu_state_._end_state)
|
||||
{
|
||||
// Return longest match
|
||||
lu_state_.pop (results_, recursive_);
|
||||
|
||||
lu_state_.start_state (results_.state,
|
||||
bool_<(flags & multi_state_bit) != 0> ());
|
||||
lu_state_.bol (results_.bol, bool_<(flags & bol_bit) != 0> ());
|
||||
results_.end = end_token_;
|
||||
|
||||
if (lu_state_._id == sm_.skip ()) goto skip;
|
||||
|
||||
if (lu_state_.id_eoi (internals_._eoi, results_, recursive_))
|
||||
{
|
||||
curr_ = end_token_;
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
results_.end = end_token_;
|
||||
results_.bol = *results_.end == '\n';
|
||||
results_.start = results_.end;
|
||||
// No match causes char to be skipped
|
||||
inc_end (results_, bool_<(flags & advance_bit) != 0> ());
|
||||
lu_state_._id = results::npos ();
|
||||
lu_state_._uid = results::npos ();
|
||||
}
|
||||
|
||||
results_.id = lu_state_._id;
|
||||
results_.user_id = lu_state_._uid;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename iter_type, typename id_type, std::size_t flags>
|
||||
void lookup (const basic_state_machine<typename std::iterator_traits
|
||||
<iter_type>::value_type, id_type> &sm_,
|
||||
match_results<iter_type, id_type, flags> &results_)
|
||||
{
|
||||
// If this asserts, you have either not defined all the correct
|
||||
// flags, or you should be using recursive_match_results instead
|
||||
// of match_results.
|
||||
assert ((sm_.data ()._features & flags) == sm_.data ()._features);
|
||||
detail::next<iter_type, flags, id_type> (sm_, results_, bool_<(sizeof
|
||||
(typename std::iterator_traits<iter_type>::value_type) > 1)> (),
|
||||
false_ ());
|
||||
}
|
||||
|
||||
template<typename iter_type, typename id_type, std::size_t flags>
|
||||
void lookup (const basic_state_machine<typename std::iterator_traits
|
||||
<iter_type>::value_type, id_type> &sm_,
|
||||
recursive_match_results<iter_type, id_type, flags> &results_)
|
||||
{
|
||||
// If this asserts, you have not defined all the correct flags
|
||||
assert ((sm_.data ()._features & flags) == sm_.data ()._features);
|
||||
detail::next<iter_type, flags | recursive_bit, id_type> (sm_, results_,
|
||||
bool_<(sizeof(typename std::iterator_traits<iter_type>::
|
||||
value_type) > 1)> (), true_ ());
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
150
inc/lexertl/match_results.hpp
Normal file
150
inc/lexertl/match_results.hpp
Normal file
@@ -0,0 +1,150 @@
|
||||
// match_results.hpp
|
||||
// Copyright (c) 2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_MATCH_RESULTS_HPP
|
||||
#define LEXERTL_MATCH_RESULTS_HPP
|
||||
|
||||
#include "char_traits.hpp"
|
||||
#include "enums.hpp"
|
||||
#include <iterator>
|
||||
#include <stack>
|
||||
#include <string>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename iter, typename id_type = std::size_t,
|
||||
std::size_t flags = bol_bit | eol_bit | skip_bit | again_bit |
|
||||
multi_state_bit | advance_bit>
|
||||
struct match_results
|
||||
{
|
||||
typedef iter iter_type;
|
||||
typedef typename std::iterator_traits<iter_type>::value_type char_type;
|
||||
typedef typename basic_char_traits<char_type>::index_type index_type;
|
||||
typedef std::basic_string<char_type> string;
|
||||
|
||||
id_type id;
|
||||
id_type user_id;
|
||||
iter_type start;
|
||||
iter_type end;
|
||||
iter_type eoi;
|
||||
bool bol;
|
||||
id_type state;
|
||||
|
||||
match_results () :
|
||||
id (0),
|
||||
user_id (npos ()),
|
||||
start (iter_type ()),
|
||||
end (iter_type ()),
|
||||
eoi (iter_type ()),
|
||||
bol (true),
|
||||
state (0)
|
||||
{
|
||||
}
|
||||
|
||||
match_results (const iter_type &start_, const iter_type &end_) :
|
||||
id (0),
|
||||
user_id (npos ()),
|
||||
start (start_),
|
||||
end (start_),
|
||||
eoi (end_),
|
||||
bol (true),
|
||||
state (0)
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~match_results ()
|
||||
{
|
||||
}
|
||||
|
||||
string str () const
|
||||
{
|
||||
return string (start, end);
|
||||
}
|
||||
|
||||
virtual void clear ()
|
||||
{
|
||||
id = 0;
|
||||
user_id = npos ();
|
||||
start = eoi;
|
||||
end = eoi;
|
||||
bol = true;
|
||||
state = 0;
|
||||
}
|
||||
|
||||
virtual void reset (const iter_type &start_, const iter_type &end_)
|
||||
{
|
||||
id = 0;
|
||||
user_id = npos ();
|
||||
start = start_;
|
||||
end = start_;
|
||||
eoi = end_;
|
||||
bol = true;
|
||||
state = 0;
|
||||
}
|
||||
|
||||
static id_type npos ()
|
||||
{
|
||||
return static_cast<id_type>(~0);
|
||||
}
|
||||
|
||||
static id_type skip ()
|
||||
{
|
||||
return static_cast<id_type>(~1);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename iter, typename id_type = std::size_t,
|
||||
std::size_t flags = bol_bit | eol_bit | skip_bit | again_bit |
|
||||
multi_state_bit | recursive_bit | advance_bit>
|
||||
struct recursive_match_results : public match_results<iter, id_type, flags>
|
||||
{
|
||||
typedef std::pair<id_type, id_type> id_type_pair;
|
||||
std::stack<id_type_pair> stack;
|
||||
|
||||
recursive_match_results () :
|
||||
match_results<iter, id_type, flags> (),
|
||||
stack ()
|
||||
{
|
||||
}
|
||||
|
||||
recursive_match_results (const iter &start_, const iter &end_) :
|
||||
match_results<iter, id_type, flags> (start_, end_),
|
||||
stack ()
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~recursive_match_results ()
|
||||
{
|
||||
}
|
||||
|
||||
virtual void clear ()
|
||||
{
|
||||
match_results<iter, id_type, flags>::clear ();
|
||||
|
||||
while (!stack.empty()) stack.pop ();
|
||||
}
|
||||
|
||||
virtual void reset (const iter &start_, const iter &end_)
|
||||
{
|
||||
match_results<iter, id_type, flags>::reset (start_, end_);
|
||||
|
||||
while (!stack.empty()) stack.pop ();
|
||||
}
|
||||
};
|
||||
|
||||
typedef match_results<std::string::const_iterator> smatch;
|
||||
typedef match_results<const char *> cmatch;
|
||||
typedef match_results<std::wstring::const_iterator> wsmatch;
|
||||
typedef match_results<const wchar_t *> wcmatch;
|
||||
|
||||
typedef recursive_match_results<std::string::const_iterator>
|
||||
srmatch;
|
||||
typedef recursive_match_results<const char *> crmatch;
|
||||
typedef recursive_match_results<std::wstring::const_iterator>
|
||||
wsrmatch;
|
||||
typedef recursive_match_results<const wchar_t *> wcrmatch;
|
||||
}
|
||||
|
||||
#endif
|
||||
112
inc/lexertl/memory_file.hpp
Normal file
112
inc/lexertl/memory_file.hpp
Normal file
@@ -0,0 +1,112 @@
|
||||
// memory_file.hpp
|
||||
// Copyright (c) 2012 Ben Hanson (http://www.benhanson.net/)
|
||||
// Inspired by http://en.wikibooks.org/wiki/Optimizing_C%2B%2B/General_optimization_techniques/Input/Output#Memory-mapped_file
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_MEMORY_FILE_H
|
||||
#define LEXERTL_MEMORY_FILE_H
|
||||
|
||||
#ifdef __unix__
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#elif defined _WIN32
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
// Only files small enough to fit into memory are supported.
|
||||
namespace lexertl
|
||||
{
|
||||
// Read-only, memory-mapped view of a whole file.
//
// The constructor opens and maps the file; the destructor unmaps and
// closes it. If anything fails, data () returns 0 and size () returns 0.
// Objects are not copyable: a copy would unmap and close the same handles
// a second time.
//
// NOTE(review): size () is the file size in bytes, not in CharT units;
// confirm what wide-character callers expect.
template<typename CharT>
class basic_memory_file
{
public:
    // Opens pathname_ read-only and maps its entire contents.
    basic_memory_file (const char *pathname_) :
        _data (0),
        _size (0)
    {
#ifdef __unix__
        _fh = ::open (pathname_, O_RDONLY);

        if (_fh > -1)
        {
            struct stat sbuf_;

            if (::fstat (_fh, &sbuf_) > -1)
            {
                void *map_ = ::mmap (0, sbuf_.st_size, PROT_READ,
                    MAP_SHARED, _fh, 0);

                // Only publish the pointer and size for a valid mapping.
                if (map_ != MAP_FAILED)
                {
                    _data = static_cast<const CharT *>(map_);
                    _size = static_cast<std::size_t>(sbuf_.st_size);
                }
            }
        }
#elif defined _WIN32
        _fh = ::CreateFileA (pathname_, GENERIC_READ, FILE_SHARE_READ, 0,
            OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
        _fmh = 0;

        if (_fh != INVALID_HANDLE_VALUE)
        {
            _fmh = ::CreateFileMapping (_fh, 0, PAGE_READONLY, 0, 0, 0);

            if (_fmh != 0)
            {
                _data = static_cast<CharT *>(::MapViewOfFile
                    (_fmh, FILE_MAP_READ, 0, 0, 0));

                if (_data) _size = ::GetFileSize(_fh, 0);
            }
        }
#endif
    }

    // Releases only what the constructor actually acquired.
    ~basic_memory_file ()
    {
#if defined(__unix__)
        if (_data != 0)
        {
            ::munmap(const_cast<CharT *>(_data), _size);
        }

        if (_fh > -1)
        {
            ::close(_fh);
        }
#elif defined(_WIN32)
        if (_data != 0)
        {
            ::UnmapViewOfFile(_data);
        }

        if (_fmh != 0)
        {
            ::CloseHandle(_fmh);
        }

        if (_fh != INVALID_HANDLE_VALUE)
        {
            ::CloseHandle(_fh);
        }
#endif
    }

    // Start of the mapped contents, or 0 if the file could not be mapped.
    const CharT *data () const
    {
        return _data;
    }

    // Size of the mapping in bytes, or 0 if the file could not be mapped.
    std::size_t size () const
    {
        return _size;
    }

private:
    const CharT *_data;
    std::size_t _size;
#ifdef __unix__
    int _fh;
#elif defined _WIN32
    HANDLE _fh;
    HANDLE _fmh;
#else
    #error Only Posix or Windows are supported.
#endif

    // No copy construction (declared, never defined).
    basic_memory_file (const basic_memory_file &);
    // No assignment (declared, never defined).
    basic_memory_file &operator = (const basic_memory_file &);
};
|
||||
|
||||
// Narrow and wide character instantiations.
typedef basic_memory_file<char> memory_file;
typedef basic_memory_file<wchar_t> wmemory_file;
|
||||
}
|
||||
|
||||
#endif
|
||||
45
inc/lexertl/old/fast_filebuf.hpp
Normal file
45
inc/lexertl/old/fast_filebuf.hpp
Normal file
@@ -0,0 +1,45 @@
|
||||
// Quick hack...
|
||||
// If you find this really is faster than using std::ifstream, let me know
|
||||
// as I can always spend some more time to improve it.
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// Minimal stream buffer that reads straight from a C FILE with fread ().
//
// If the file cannot be opened the buffer simply yields no characters.
// Objects are not copyable: a copy would fclose () the same FILE twice.
template<typename CharT, class Traits>
class basic_fast_filebuf : public std::basic_streambuf<CharT, Traits>
{
public:
    // Opens filename_ for reading; _fp stays 0 when that fails.
    basic_fast_filebuf (const char *filename_) :
        _fp (0)
    {
        _fp = ::fopen(filename_, "r");
    }

    virtual ~basic_fast_filebuf()
    {
        // fclose () on a null pointer is undefined, so only close a file
        // that was actually opened.
        if (_fp != 0)
        {
            ::fclose(_fp);
            _fp = 0;
        }
    }

protected:
    FILE *_fp;

    // Reads up to count_ characters into ptr_ and returns how many were
    // read. Returns 0 when no file is open or count_ is not positive.
    virtual std::streamsize xsgetn (CharT *ptr_, std::streamsize count_)
    {
        if (_fp == 0 || count_ <= 0)
        {
            return 0;
        }

        return static_cast<std::streamsize>(::fread (ptr_, sizeof(CharT),
            static_cast<std::size_t>(count_), _fp));
    }

private:
    // No copy construction (declared, never defined).
    basic_fast_filebuf (const basic_fast_filebuf &);
    // No assignment (declared, never defined).
    basic_fast_filebuf &operator = (const basic_fast_filebuf &);
};
|
||||
|
||||
// Narrow and wide character instantiations using the standard traits.
typedef basic_fast_filebuf<char, std::char_traits<char> > fast_filebuf;
typedef basic_fast_filebuf<wchar_t, std::char_traits<wchar_t> >
    wfast_filebuf;
|
||||
}
|
||||
|
||||
// Usage:
|
||||
// lexertl::rules rules_;
|
||||
// lexertl::state_machine state_machine_;
|
||||
// fast_filebuf buf ("Unicode/PropList.txt");
|
||||
// std::istream if_(&buf);
|
||||
// lexertl::stream_shared_iterator iter_ (if_);
|
||||
// lexertl::stream_shared_iterator end_;
|
||||
// lexertl::match_results<lexertl::stream_shared_iterator>
|
||||
// results_(iter_, end_);
|
||||
561
inc/lexertl/old/string_token.hpp
Normal file
561
inc/lexertl/old/string_token.hpp
Normal file
@@ -0,0 +1,561 @@
|
||||
// string_token.hpp
|
||||
// Copyright (c) 2005-2010 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_STRING_TOKEN_HPP
|
||||
#define LEXERTL_STRING_TOKEN_HPP
|
||||
|
||||
#include "../char_traits.hpp"
#include <algorithm>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename char_type>
|
||||
struct basic_string_token
|
||||
{
|
||||
typedef std::basic_string<char_type> string;
|
||||
|
||||
bool _negated;
|
||||
string _chars;
|
||||
|
||||
basic_string_token () :
|
||||
_negated (false)
|
||||
{
|
||||
}
|
||||
|
||||
basic_string_token (const bool negated_, const string &chars_) :
|
||||
_negated (negated_),
|
||||
_chars (chars_)
|
||||
{
|
||||
}
|
||||
|
||||
void remove_duplicates ()
|
||||
{
|
||||
const char_type *start_ = _chars.c_str ();
|
||||
const char_type *end_ = start_ + _chars.size ();
|
||||
|
||||
// Optimisation for very large charsets:
|
||||
// sorting via pointers is much quicker than
|
||||
// via iterators...
|
||||
std::sort (const_cast<char_type *> (start_), const_cast<char_type *>
|
||||
(end_));
|
||||
_chars.erase (std::unique (_chars.begin (), _chars.end ()),
|
||||
_chars.end ());
|
||||
}
|
||||
|
||||
void normalise ()
|
||||
{
|
||||
const std::size_t max_chars_ = sizeof (char_type) == 1 ?
|
||||
num_chars : num_wchar_ts;
|
||||
|
||||
if (_chars.length () == max_chars_)
|
||||
{
|
||||
_negated = !_negated;
|
||||
_chars.clear ();
|
||||
}
|
||||
else if (_chars.length () > max_chars_ / 2)
|
||||
{
|
||||
negate ();
|
||||
}
|
||||
}
|
||||
|
||||
void negate ()
|
||||
{
|
||||
const std::size_t max_chars_ = sizeof (char_type) == 1 ?
|
||||
num_chars : num_wchar_ts;
|
||||
char_type curr_char_ = std::numeric_limits<CharT>::min ();
|
||||
string temp_;
|
||||
const char_type *curr_ = _chars.c_str ();
|
||||
const char_type *chars_end_ = curr_ + _chars.size ();
|
||||
|
||||
_negated = !_negated;
|
||||
temp_.resize (max_chars_ - _chars.size ());
|
||||
|
||||
char_type *ptr_ = const_cast<char_type *> (temp_.c_str ());
|
||||
std::size_t i_ = 0;
|
||||
|
||||
while (curr_ < chars_end_)
|
||||
{
|
||||
while (*curr_ > curr_char_)
|
||||
{
|
||||
*ptr_ = curr_char_;
|
||||
++ptr_;
|
||||
++curr_char_;
|
||||
++i_;
|
||||
}
|
||||
|
||||
++curr_char_;
|
||||
++curr_;
|
||||
++i_;
|
||||
}
|
||||
|
||||
for (; i_ < max_chars_; ++i_)
|
||||
{
|
||||
*ptr_ = curr_char_;
|
||||
++ptr_;
|
||||
++curr_char_;
|
||||
}
|
||||
|
||||
_chars = temp_;
|
||||
}
|
||||
|
||||
bool operator < (const basic_string_token &rhs_) const
|
||||
{
|
||||
return _negated < rhs_._negated ||
|
||||
(_negated == rhs_._negated && _chars < rhs_._chars);
|
||||
}
|
||||
|
||||
bool operator == (const basic_string_token &rhs_) const
|
||||
{
|
||||
return _negated == rhs_._negated && _chars == rhs_._chars;
|
||||
}
|
||||
|
||||
bool empty () const
|
||||
{
|
||||
return _chars.empty () && !_negated;
|
||||
}
|
||||
|
||||
bool any () const
|
||||
{
|
||||
return _chars.empty () && _negated;
|
||||
}
|
||||
|
||||
void clear ()
|
||||
{
|
||||
_negated = false;
|
||||
_chars.clear ();
|
||||
}
|
||||
|
||||
void intersect (basic_string_token &rhs_, basic_string_token &overlap_)
|
||||
{
|
||||
if ((any () && rhs_.any ()) || (_negated == rhs_._negated &&
|
||||
!any () && !rhs_.any ()))
|
||||
{
|
||||
intersect_same_types (rhs_, overlap_);
|
||||
}
|
||||
else
|
||||
{
|
||||
intersect_diff_types (rhs_, overlap_);
|
||||
}
|
||||
}
|
||||
|
||||
void merge (const basic_string_token &rhs_,
|
||||
basic_string_token &merged_) const
|
||||
{
|
||||
if ((any () && rhs_.any ()) || (_negated == rhs_._negated &&
|
||||
!any () && !rhs_.any ()))
|
||||
{
|
||||
merge_same_types (rhs_, merged_);
|
||||
}
|
||||
else
|
||||
{
|
||||
merge_diff_types (rhs_, merged_);
|
||||
}
|
||||
}
|
||||
|
||||
static string escape_char (const char_type ch_)
|
||||
{
|
||||
string out_;
|
||||
|
||||
switch (ch_)
|
||||
{
|
||||
case '\0':
|
||||
out_ += '\\';
|
||||
out_ += '0';
|
||||
break;
|
||||
case '\a':
|
||||
out_ += '\\';
|
||||
out_ += 'a';
|
||||
break;
|
||||
case '\b':
|
||||
out_ += '\\';
|
||||
out_ += 'b';
|
||||
break;
|
||||
case 27:
|
||||
out_ += '\\';
|
||||
out_ += 'x';
|
||||
out_ += '1';
|
||||
out_ += 'b';
|
||||
break;
|
||||
case '\f':
|
||||
out_ += '\\';
|
||||
out_ += 'f';
|
||||
break;
|
||||
case '\n':
|
||||
out_ += '\\';
|
||||
out_ += 'n';
|
||||
break;
|
||||
case '\r':
|
||||
out_ += '\\';
|
||||
out_ += 'r';
|
||||
break;
|
||||
case '\t':
|
||||
out_ += '\\';
|
||||
out_ += 't';
|
||||
break;
|
||||
case '\v':
|
||||
out_ += '\\';
|
||||
out_ += 'v';
|
||||
break;
|
||||
case '\\':
|
||||
out_ += '\\';
|
||||
out_ += '\\';
|
||||
break;
|
||||
case '"':
|
||||
out_ += '\\';
|
||||
out_ += '"';
|
||||
break;
|
||||
case '\'':
|
||||
out_ += '\\';
|
||||
out_ += '\'';
|
||||
break;
|
||||
default:
|
||||
{
|
||||
if (ch_ < 32)
|
||||
{
|
||||
std::basic_stringstream<char_type> ss_;
|
||||
|
||||
out_ += '\\';
|
||||
out_ += 'x';
|
||||
ss_ << std::hex <<
|
||||
static_cast<std::size_t> (ch_);
|
||||
out_ += ss_.str ();
|
||||
}
|
||||
else
|
||||
{
|
||||
out_ += ch_;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return out_;
|
||||
}
|
||||
|
||||
private:
|
||||
void intersect_same_types (basic_string_token &rhs_,
|
||||
basic_string_token &overlap_)
|
||||
{
|
||||
if (any ())
|
||||
{
|
||||
clear ();
|
||||
overlap_._negated = true;
|
||||
rhs_.clear ();
|
||||
}
|
||||
else
|
||||
{
|
||||
typename string::iterator iter_ = _chars.begin ();
|
||||
typename string::iterator end_ = _chars.end ();
|
||||
typename string::iterator rhs_iter_ = rhs_._chars.begin ();
|
||||
typename string::iterator rhs_end_ = rhs_._chars.end ();
|
||||
|
||||
overlap_._negated = _negated;
|
||||
|
||||
while (iter_ != end_ && rhs_iter_ != rhs_end_)
|
||||
{
|
||||
if (*iter_ < *rhs_iter_)
|
||||
{
|
||||
++iter_;
|
||||
}
|
||||
else if (*iter_ > *rhs_iter_)
|
||||
{
|
||||
++rhs_iter_;
|
||||
}
|
||||
else
|
||||
{
|
||||
overlap_._chars += *iter_;
|
||||
iter_ = _chars.erase (iter_);
|
||||
end_ = _chars.end ();
|
||||
rhs_iter_ = rhs_._chars.erase (rhs_iter_);
|
||||
rhs_end_ = rhs_._chars.end ();
|
||||
}
|
||||
}
|
||||
|
||||
if (_negated)
|
||||
{
|
||||
// duplicates already merged, so safe to merge
|
||||
// using std lib.
|
||||
|
||||
// src, dest
|
||||
merge (_chars, overlap_._chars);
|
||||
// duplicates already merged, so safe to merge
|
||||
// using std lib.
|
||||
|
||||
// src, dest
|
||||
merge (rhs_._chars, overlap_._chars);
|
||||
_negated = false;
|
||||
rhs_._negated = false;
|
||||
std::swap (_chars, rhs_._chars);
|
||||
normalise ();
|
||||
overlap_.normalise ();
|
||||
rhs_.normalise ();
|
||||
}
|
||||
else if (!overlap_._chars.empty ())
|
||||
{
|
||||
normalise ();
|
||||
overlap_.normalise ();
|
||||
rhs_.normalise ();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void intersect_diff_types (basic_string_token &rhs_,
|
||||
basic_string_token &overlap_)
|
||||
{
|
||||
if (any ())
|
||||
{
|
||||
intersect_any (rhs_, overlap_);
|
||||
}
|
||||
else if (_negated)
|
||||
{
|
||||
intersect_negated (rhs_, overlap_);
|
||||
}
|
||||
else // _negated == false
|
||||
{
|
||||
intersect_charset (rhs_, overlap_);
|
||||
}
|
||||
}
|
||||
|
||||
void intersect_any (basic_string_token &rhs_, basic_string_token &overlap_)
|
||||
{
|
||||
if (rhs_._negated)
|
||||
{
|
||||
rhs_.intersect_negated (*this, overlap_);
|
||||
}
|
||||
else // rhs._negated == false
|
||||
{
|
||||
rhs_.intersect_charset (*this, overlap_);
|
||||
}
|
||||
}
|
||||
|
||||
void intersect_negated (basic_string_token &rhs_,
|
||||
basic_string_token &overlap_)
|
||||
{
|
||||
if (rhs_.any ())
|
||||
{
|
||||
overlap_._negated = true;
|
||||
overlap_._chars = _chars;
|
||||
rhs_._negated = false;
|
||||
rhs_._chars = _chars;
|
||||
clear ();
|
||||
}
|
||||
else // rhs._negated == false
|
||||
{
|
||||
rhs_.intersect_charset (*this, overlap_);
|
||||
}
|
||||
}
|
||||
|
||||
void intersect_charset (basic_string_token &rhs_,
|
||||
basic_string_token &overlap_)
|
||||
{
|
||||
if (rhs_.any ())
|
||||
{
|
||||
overlap_._chars = _chars;
|
||||
rhs_._negated = true;
|
||||
rhs_._chars = _chars;
|
||||
clear ();
|
||||
}
|
||||
else // rhs_._negated == true
|
||||
{
|
||||
typename string::iterator iter_ = _chars.begin ();
|
||||
typename string::iterator end_ = _chars.end ();
|
||||
typename string::iterator rhs_iter_ = rhs_._chars.begin ();
|
||||
typename string::iterator rhs_end_ = rhs_._chars.end ();
|
||||
|
||||
while (iter_ != end_ && rhs_iter_ != rhs_end_)
|
||||
{
|
||||
if (*iter_ < *rhs_iter_)
|
||||
{
|
||||
overlap_._chars += *iter_;
|
||||
rhs_iter_ = rhs_._chars.insert (rhs_iter_, *iter_);
|
||||
++rhs_iter_;
|
||||
rhs_end_ = rhs_._chars.end ();
|
||||
iter_ = _chars.erase (iter_);
|
||||
end_ = _chars.end ();
|
||||
}
|
||||
else if (*iter_ > *rhs_iter_)
|
||||
{
|
||||
++rhs_iter_;
|
||||
}
|
||||
else
|
||||
{
|
||||
++iter_;
|
||||
++rhs_iter_;
|
||||
}
|
||||
}
|
||||
|
||||
if (iter_ != end_)
|
||||
{
|
||||
// nothing bigger in rhs_ than iter_,
|
||||
// so safe to merge using std lib.
|
||||
string temp_ (iter_, end_);
|
||||
|
||||
// src, dest
|
||||
merge (temp_, overlap_._chars);
|
||||
_chars.erase (iter_, end_);
|
||||
}
|
||||
|
||||
if (!overlap_._chars.empty ())
|
||||
{
|
||||
merge (overlap_._chars, rhs_._chars);
|
||||
// possible duplicates, so check for any and erase.
|
||||
rhs_._chars.erase (std::unique (rhs_._chars.begin (),
|
||||
rhs_._chars.end ()), rhs_._chars.end ());
|
||||
normalise ();
|
||||
overlap_.normalise ();
|
||||
rhs_.normalise ();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void merge (string &src_, string &dest_)
|
||||
{
|
||||
string tmp_ (src_.size () + dest_.size (), 0);
|
||||
|
||||
std::merge (src_.begin (), src_.end (), dest_.begin (), dest_.end (),
|
||||
tmp_.begin ());
|
||||
dest_ = tmp_;
|
||||
}
|
||||
|
||||
void merge_same_types (const basic_string_token &rhs_,
|
||||
basic_string_token &merged_) const
|
||||
{
|
||||
if (any ())
|
||||
{
|
||||
merged_._negated = true;
|
||||
}
|
||||
else if (_negated)
|
||||
{
|
||||
typename string::const_iterator iter_ = _chars.begin ();
|
||||
typename string::const_iterator end_ = _chars.end ();
|
||||
typename string::const_iterator rhs_iter_ = rhs_._chars.begin ();
|
||||
typename string::const_iterator rhs_end_ = rhs_._chars.end ();
|
||||
|
||||
merged_._negated = _negated;
|
||||
|
||||
while (iter_ != end_ && rhs_iter_ != rhs_end_)
|
||||
{
|
||||
if (*iter_ < *rhs_iter_)
|
||||
{
|
||||
++iter_;
|
||||
}
|
||||
else if (*iter_ > *rhs_iter_)
|
||||
{
|
||||
++rhs_iter_;
|
||||
}
|
||||
else
|
||||
{
|
||||
merged_._chars += *iter_;
|
||||
++iter_;
|
||||
++rhs_iter_;
|
||||
}
|
||||
}
|
||||
|
||||
merged_.normalise ();
|
||||
}
|
||||
else
|
||||
{
|
||||
typename string::const_iterator iter_ = _chars.begin ();
|
||||
typename string::const_iterator end_ = _chars.end ();
|
||||
typename string::const_iterator rhs_iter_ = rhs_._chars.begin ();
|
||||
typename string::const_iterator rhs_end_ = rhs_._chars.end ();
|
||||
|
||||
while (iter_ != end_ && rhs_iter_ != rhs_end_)
|
||||
{
|
||||
if (*iter_ < *rhs_iter_)
|
||||
{
|
||||
merged_._chars += *iter_;
|
||||
++iter_;
|
||||
}
|
||||
else if (*iter_ > *rhs_iter_)
|
||||
{
|
||||
merged_._chars += *rhs_iter_;
|
||||
++rhs_iter_;
|
||||
}
|
||||
else
|
||||
{
|
||||
merged_._chars += *iter_;
|
||||
++iter_;
|
||||
++rhs_iter_;
|
||||
}
|
||||
}
|
||||
|
||||
// Include any trailing chars
|
||||
if (iter_ != end_)
|
||||
{
|
||||
string temp_ (iter_, end_);
|
||||
|
||||
merged_._chars += temp_;
|
||||
}
|
||||
else if (rhs_iter_ != rhs_end_)
|
||||
{
|
||||
string temp_ (rhs_iter_, rhs_end_);
|
||||
|
||||
merged_._chars += temp_;
|
||||
}
|
||||
|
||||
merged_.normalise ();
|
||||
}
|
||||
}
|
||||
|
||||
void merge_diff_types (const basic_string_token &rhs_,
|
||||
basic_string_token &merged_) const
|
||||
{
|
||||
if (_negated)
|
||||
{
|
||||
merge_negated (*this, rhs_, merged_);
|
||||
}
|
||||
else
|
||||
{
|
||||
merge_negated (rhs_, *this, merged_);
|
||||
}
|
||||
|
||||
merged_.normalise ();
|
||||
}
|
||||
|
||||
void merge_negated (const basic_string_token &lhs_,
|
||||
const basic_string_token &rhs_, basic_string_token &merged_) const
|
||||
{
|
||||
typename string::const_iterator lhs_iter_ = lhs_._chars.begin ();
|
||||
typename string::const_iterator lhs_end_ = lhs_._chars.end ();
|
||||
typename string::const_iterator rhs_iter_ = rhs_._chars.begin ();
|
||||
typename string::const_iterator rhs_end_ = rhs_._chars.end ();
|
||||
|
||||
merged_._negated = true;
|
||||
|
||||
while (lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_)
|
||||
{
|
||||
if (*lhs_iter_ < *rhs_iter_)
|
||||
{
|
||||
merged_._chars += *lhs_iter_;
|
||||
++lhs_iter_;
|
||||
}
|
||||
else if (*lhs_iter_ > *rhs_iter_)
|
||||
{
|
||||
++rhs_iter_;
|
||||
}
|
||||
else
|
||||
{
|
||||
++lhs_iter_;
|
||||
++rhs_iter_;
|
||||
}
|
||||
}
|
||||
|
||||
// Only interested in any remaining 'negated' chars
|
||||
if (lhs_iter_ != lhs_end_)
|
||||
{
|
||||
string temp_ (lhs_iter_, lhs_end_);
|
||||
|
||||
merged_._chars += temp_;
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
1076
inc/lexertl/parser/parser.hpp
Normal file
1076
inc/lexertl/parser/parser.hpp
Normal file
File diff suppressed because it is too large
Load Diff
100
inc/lexertl/parser/tokeniser/re_token.hpp
Normal file
100
inc/lexertl/parser/tokeniser/re_token.hpp
Normal file
@@ -0,0 +1,100 @@
|
||||
// re_token.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_RE_TOKEN_HPP
|
||||
#define LEXERTL_RE_TOKEN_HPP
|
||||
|
||||
#include "../../string_token.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
// Token kinds for the regex parser. The values are consecutive from
// BEGIN (0) to END and are used as indices into the precedence table in
// basic_re_token, so the order must not change.
enum token_type
{
    BEGIN,
    REGEX,
    OREXP,
    SEQUENCE,
    SUB,
    EXPRESSION,
    REPEAT,
    DUP,
    OR,
    CHARSET,
    BOL,
    EOL,
    MACRO,
    OPENPAREN,
    CLOSEPAREN,
    OPT,
    AOPT,
    ZEROORMORE,
    AZEROORMORE,
    ONEORMORE,
    AONEORMORE,
    REPEATN,
    AREPEATN,
    END
};
|
||||
|
||||
template<typename input_char_type, typename char_type>
|
||||
struct basic_re_token
|
||||
{
|
||||
typedef basic_string_token<char_type> string_token;
|
||||
typedef std::basic_string<input_char_type> string;
|
||||
|
||||
token_type _type;
|
||||
string _extra;
|
||||
string_token _str;
|
||||
|
||||
basic_re_token (const token_type type_ = BEGIN) :
|
||||
_type (type_),
|
||||
_extra (),
|
||||
_str ()
|
||||
{
|
||||
}
|
||||
|
||||
void clear ()
|
||||
{
|
||||
_type = BEGIN;
|
||||
_extra.clear ();
|
||||
_str.clear ();
|
||||
}
|
||||
|
||||
basic_re_token &operator = (const basic_re_token &rhs_)
|
||||
{
|
||||
_type = rhs_._type;
|
||||
_extra = rhs_._extra;
|
||||
_str = rhs_._str;
|
||||
return *this;
|
||||
}
|
||||
|
||||
char precedence (const token_type type_) const
|
||||
{
|
||||
// Moved in here for Solaris compiler.
|
||||
static const char precedence_table_[END + 1][END + 1] = {
|
||||
// BEG, REG, ORE, SEQ, SUB, EXP, RPT, DUP, | , CHR, BOL, EOL, MCR, ( , ) , ? , ?? , * , *? , + , +?, {n}?, {n}, END
|
||||
/*BEGIN*/{' ', '<', '<', '<', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
|
||||
/*REGEX*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
|
||||
/*OREXP*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', '>', '>', '>', '>', ' ', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
|
||||
/* SEQ */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', ' ', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
|
||||
/* SUB */{' ', ' ', ' ', ' ', ' ', '=', '<', ' ', '>', '<', '<', '<', '<', '<', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
|
||||
/*EXPRE*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
|
||||
/* RPT */{' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', '>', '>', '>', '>', '>', '>', '>', '<', '<', '<', '<', '<', '<', '<', '<', '>'},
|
||||
/*DUPLI*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
|
||||
/* | */{' ', ' ', ' ', '=', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '},
|
||||
/*CHARA*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
|
||||
/* BOL */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
|
||||
/* EOL */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
|
||||
/*MACRO*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
|
||||
/* ( */{' ', '=', '<', '<', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '},
|
||||
/* ) */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
|
||||
/* ? */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
|
||||
/* ?? */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
|
||||
/* * */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
|
||||
/* *? */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
|
||||
/* + */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
|
||||
/* +? */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
|
||||
/*{n,m}*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
|
||||
/*{nm}?*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
|
||||
/* END */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '}
|
||||
};
|
||||
|
||||
return precedence_table_[_type][type_];
|
||||
}
|
||||
|
||||
const char *precedence_string () const
|
||||
{
|
||||
// Moved in here for Solaris compiler.
|
||||
static const char *precedence_strings_[END + 1] =
|
||||
{"BEGIN", "REGEX", "OREXP", "SEQUENCE", "SUB", "EXPRESSION",
|
||||
"REPEAT", "DUPLICATE", "|", "CHARSET", "^", "$", "MACRO", "(", ")",
|
||||
"?", "??", "*", "*?", "+", "+?", "{n[,[m]]}", "{n[,[m]]}?", "END"};
|
||||
|
||||
return precedence_strings_[_type];
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
829
inc/lexertl/parser/tokeniser/re_tokeniser.hpp
Normal file
829
inc/lexertl/parser/tokeniser/re_tokeniser.hpp
Normal file
@@ -0,0 +1,829 @@
|
||||
// re_tokeniser.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_RE_TOKENISER_HPP
|
||||
#define LEXERTL_RE_TOKENISER_HPP
|
||||
|
||||
#include <cstring>
|
||||
#include "re_token.hpp"
|
||||
#include "../../runtime_error.hpp"
|
||||
#include "../../size_t.hpp"
|
||||
#include <sstream>
|
||||
#include "../../string_token.hpp"
|
||||
#include "re_tokeniser_helper.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename rules_char_type, typename char_type, typename id_type>
|
||||
class basic_re_tokeniser
|
||||
{
|
||||
public:
|
||||
typedef basic_re_token<rules_char_type, char_type> re_token;
|
||||
typedef basic_re_tokeniser_helper<rules_char_type, char_type, id_type>
|
||||
tokeniser_helper;
|
||||
typedef typename tokeniser_helper::char_state char_state;
|
||||
typedef typename tokeniser_helper::state state;
|
||||
typedef basic_string_token<char_type> string_token;
|
||||
|
||||
// Reads the next regex token from state_ into *token_.
//
// token_ is cleared first. Leading '"' characters toggle state_._in_string,
// and anything consumed by comment () or skip () is discarded before the
// token itself is read. lhs_ is only forwarded to open_curly ().
//
// Throws runtime_error when the regex ends inside a string or with open
// parentheses, on an unmatched ')' and on '/' (lookahead is unsupported).
static void next (re_token *lhs_, state &state_, re_token *token_)
{
    rules_char_type curr_ = 0;
    bool at_end_ = state_.next (curr_);
    bool discarded_ = false;

    token_->clear ();

    do
    {
        // Every '"' flips string mode and is itself consumed.
        while (!at_end_ && curr_ == '"')
        {
            state_._in_string ^= 1;
            at_end_ = state_.next (curr_);
        }

        // (?# ...) comments first, then whatever the skip_ws flag allows.
        // Both helpers always run; repeat until neither consumed anything.
        discarded_ = comment (at_end_, curr_, state_);
        discarded_ |= skip (at_end_, curr_, state_);
    } while (discarded_);

    if (at_end_)
    {
        // Pointless returning index if at end of string
        if (state_._in_string)
        {
            std::ostringstream ss_;

            ss_ << "Unexpected end of regex (missing '\"') in rule id " <<
                state_._id << '.';
            throw runtime_error (ss_.str ());
        }

        if (state_._paren_count)
        {
            std::ostringstream ss_;

            ss_ << "Unexpected end of regex (missing ')') in rule id " <<
                state_._id << '.';
            throw runtime_error (ss_.str ());
        }

        token_->_type = END;
        return;
    }

    if (curr_ == '\\')
    {
        // Escape sequences are honoured even inside a string.
        token_->_type = CHARSET;
        escape (state_, token_->_str);
        return;
    }

    if (state_._in_string)
    {
        // Inside a string every other meta character is a plain character.
        token_->_type = CHARSET;
        token_->_str.insert (typename string_token::range (curr_, curr_));
        return;
    }

    // Neither escaped nor quoted: look for meta characters.
    switch (curr_)
    {
    case '(':
        token_->_type = OPENPAREN;
        ++state_._paren_count;
        read_options (state_);
        break;
    case ')':
        --state_._paren_count;

        if (state_._paren_count < 0)
        {
            std::ostringstream ss_;

            ss_ << "Number of open parenthesis < 0 "
                "at index " << state_.index () - 1 <<
                " in rule id " << state_._id << '.';
            throw runtime_error (ss_.str ());
        }

        token_->_type = CLOSEPAREN;

        // Restore the flags that were saved for this group, if any.
        if (!state_._flags_stack.empty ())
        {
            state_._flags = state_._flags_stack.top ();
            state_._flags_stack.pop ();
        }

        break;
    case '?':
    {
        // A trailing '?' selects the alternative ("??") form.
        const bool alt_ = !state_.eos () && *state_._curr == '?';

        token_->_type = alt_ ? AOPT : OPT;

        if (alt_)
        {
            state_.increment ();
        }

        break;
    }
    case '*':
    {
        const bool alt_ = !state_.eos () && *state_._curr == '?';

        token_->_type = alt_ ? AZEROORMORE : ZEROORMORE;

        if (alt_)
        {
            state_.increment ();
        }

        break;
    }
    case '+':
    {
        const bool alt_ = !state_.eos () && *state_._curr == '?';

        token_->_type = alt_ ? AONEORMORE : ONEORMORE;

        if (alt_)
        {
            state_.increment ();
        }

        break;
    }
    case '{':
        open_curly (lhs_, state_, token_);
        break;
    case '|':
        token_->_type = OR;
        break;
    case '^':
        // Only an anchor as the very first character of a non-macro regex.
        if (state_._macro || state_._curr - 1 != state_._start)
        {
            token_->_type = CHARSET;
            token_->_str.insert (typename string_token::range
                (curr_, curr_));
        }
        else
        {
            token_->_type = BOL;
        }

        break;
    case '$':
        // Only an anchor as the very last character of a non-macro regex.
        if (state_._macro || state_._curr != state_._end)
        {
            token_->_type = CHARSET;
            token_->_str.insert (typename string_token::range
                (curr_, curr_));
        }
        else
        {
            token_->_type = EOL;
        }

        break;
    case '.':
        token_->_type = CHARSET;

        // Build the excluded set (optionally just '\n') and negate it.
        if (state_._flags & dot_not_newline)
        {
            token_->_str.insert (typename string_token::range
                ('\n', '\n'));
        }

        token_->_str.negate ();
        break;
    case '[':
        token_->_type = CHARSET;
        tokeniser_helper::charset (state_, token_->_str);
        break;
    case '/':
    {
        std::ostringstream ss_;

        ss_ << "Lookahead ('/') is not supported yet in " <<
            "rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }
    default:
        token_->_type = CHARSET;

        if ((state_._flags & icase) &&
            (std::isupper (curr_, state_._locale) ||
            std::islower (curr_, state_._locale)))
        {
            // Case insensitive: match both cases of a cased character.
            const char_type upper_ = std::toupper (curr_, state_._locale);
            const char_type lower_ = std::tolower (curr_, state_._locale);

            token_->_str.insert (typename string_token::range
                (upper_, upper_));
            token_->_str.insert (typename string_token::range
                (lower_, lower_));
        }
        else
        {
            token_->_str.insert (typename string_token::range
                (curr_, curr_));
        }

        break;
    }
}
|
||||
|
||||
private:
|
||||
static bool comment (bool &eos_, rules_char_type &ch_, state &state_)
|
||||
{
|
||||
bool skipped_ = false;
|
||||
|
||||
if (!eos_ && !state_._in_string && ch_ == '(' &&
|
||||
!state_.eos () && *state_._curr == '?' &&
|
||||
state_._curr + 1 < state_._end && *(state_._curr + 1) == '#')
|
||||
{
|
||||
std::size_t paren_count_ = 1;
|
||||
|
||||
state_.increment ();
|
||||
state_.increment ();
|
||||
|
||||
do
|
||||
{
|
||||
eos_ = state_.next (ch_);
|
||||
|
||||
if (ch_ == '(')
|
||||
{
|
||||
++paren_count_;
|
||||
}
|
||||
else if (ch_ == ')')
|
||||
{
|
||||
--paren_count_;
|
||||
}
|
||||
} while (!eos_ && !(ch_ == ')' && paren_count_ == 0));
|
||||
|
||||
if (eos_)
|
||||
{
|
||||
std::ostringstream ss_;
|
||||
|
||||
// Pointless returning index if at end of string
|
||||
ss_ << "Unexpected end of regex (unterminated comment) " <<
|
||||
"in rule id " << state_._id << '.';
|
||||
throw runtime_error (ss_.str ());
|
||||
}
|
||||
else
|
||||
{
|
||||
eos_ = state_.next (ch_);
|
||||
}
|
||||
|
||||
skipped_ = true;
|
||||
}
|
||||
|
||||
return skipped_;
|
||||
}
|
||||
|
||||
static bool skip (bool &eos_, rules_char_type &ch_, state &state_)
|
||||
{
|
||||
bool skipped_ = false;
|
||||
|
||||
if (!eos_ && (state_._flags & skip_ws) && !state_._in_string)
|
||||
{
|
||||
bool c_comment_ = false;
|
||||
bool skip_ws_ = false;
|
||||
|
||||
do
|
||||
{
|
||||
c_comment_ = ch_ == '/' && !state_.eos () &&
|
||||
*state_._curr == '*';
|
||||
skip_ws_ = !c_comment_ && (ch_ == ' ' || ch_ == '\t' ||
|
||||
ch_ == '\n' || ch_ == '\r' || ch_ == '\f' || ch_ == '\v');
|
||||
|
||||
if (c_comment_)
|
||||
{
|
||||
state_.increment ();
|
||||
eos_ = state_.next (ch_);
|
||||
|
||||
while (!eos_ && !(ch_ == '*' && !state_.eos () &&
|
||||
*state_._curr == '/'))
|
||||
{
|
||||
eos_ = state_.next (ch_);
|
||||
}
|
||||
|
||||
if (eos_)
|
||||
{
|
||||
std::ostringstream ss_;
|
||||
|
||||
// Pointless returning index if at end of string
|
||||
ss_ << "Unexpected end of regex (unterminated " <<
|
||||
"C style comment) in rule id " <<
|
||||
state_._id << '.';
|
||||
throw runtime_error (ss_.str ());
|
||||
}
|
||||
else
|
||||
{
|
||||
state_.increment ();
|
||||
eos_ = state_.next (ch_);
|
||||
}
|
||||
|
||||
skipped_ = true;
|
||||
}
|
||||
else if (skip_ws_)
|
||||
{
|
||||
eos_ = state_.next (ch_);
|
||||
skipped_ = true;
|
||||
}
|
||||
} while (c_comment_ || skip_ws_);
|
||||
}
|
||||
|
||||
return skipped_;
|
||||
}
|
||||
|
||||
// Called just after a '(' has been read.
//
// If the group starts with '?', the current flags are saved on
// state_._flags_stack and the option letters up to ':' are applied:
//   i - icase, s - clears dot_not_newline, x - skip_ws.
// A '-' toggles negation for the next letter, which then has the opposite
// effect. Any other character throws runtime_error.
//
// For a plain group the flags are saved only when the stack is already
// non-empty (i.e. we are nested inside a group that saved flags).
static void read_options (state &state_)
{
    if (state_.eos () || *state_._curr != '?')
    {
        if (!state_._flags_stack.empty ())
        {
            state_._flags_stack.push (state_._flags);
        }

        return;
    }

    rules_char_type opt_ = 0;
    bool eos_ = false;
    bool negate_ = false;

    // Step over the '?' and save the flags before they are modified.
    state_.increment ();
    state_._flags_stack.push (state_._flags);

    for (eos_ = state_.next (opt_); !eos_ && opt_ != ':';
        eos_ = state_.next (opt_))
    {
        switch (opt_)
        {
        case '-':
            negate_ ^= 1;
            break;
        case 'i':
            if (negate_)
            {
                state_._flags &= ~icase;
            }
            else
            {
                state_._flags |= icase;
            }

            negate_ = false;
            break;
        case 's':
            // Note the inverted sense: 's' lets '.' match newline.
            if (negate_)
            {
                state_._flags |= dot_not_newline;
            }
            else
            {
                state_._flags &= ~dot_not_newline;
            }

            negate_ = false;
            break;
        case 'x':
            if (negate_)
            {
                state_._flags &= ~skip_ws;
            }
            else
            {
                state_._flags |= skip_ws;
            }

            negate_ = false;
            break;
        default:
        {
            std::ostringstream ss_;

            ss_ << "Unknown option at index " <<
                state_.index () - 1 << " in rule id " <<
                state_._id << '.';
            throw runtime_error (ss_.str ());
        }
        }
    }

    // End of string handler will handle early termination
}
|
||||
|
||||
// Decodes the escape sequence at the cursor (the '\' has already been read)
// and adds the characters it stands for to token_.
//
// tokeniser_helper::escape_sequence () yields either a single character or
// a non-null string. In the latter case the string, minus its first
// character, is parsed as a charset (presumably the string is a bracket
// expression and the first character is its '[' - confirm against the
// helper).
static void escape (state &state_, string_token &token_)
{
    char_type single_ = 0;
    std::size_t len_ = 0;
    const char *set_ = tokeniser_helper::escape_sequence (state_,
        single_, len_);

    if (!set_)
    {
        token_.insert (typename string_token::range (single_, single_));
        return;
    }

    char_state set_state_ (set_ + 1, set_ + len_, state_._id,
        state_._flags, state_._locale, false);

    tokeniser_helper::charset (set_state_, token_);
}
|
||||
|
||||
// Dispatches on the character that follows a '{':
//   '-'   -> charset difference  {-}
//   '+'   -> charset union       {+}
//   digit -> repeat count        {n[,[m]]}
//   other -> macro reference     {NAME}
// Throws runtime_error if the regex ends straight after the '{'.
static void open_curly (re_token *lhs_, state &state_,
    re_token *token_)
{
    if (state_.eos ())
    {
        std::ostringstream ss_;

        // Pointless returning index if at end of string
        ss_ << "Unexpected end of regex (missing '}') in rule id " <<
            state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    // Peek only; the handlers consume the character themselves.
    const rules_char_type peek_ = *state_._curr;

    if (peek_ == '-')
    {
        charset_difference (lhs_, state_, token_);
        return;
    }

    if (peek_ == '+')
    {
        charset_union (lhs_, state_, token_);
        return;
    }

    if (peek_ >= '0' && peek_ <= '9')
    {
        repeat_n (state_, token_);
        return;
    }

    macro (state_, token_);
}
|
||||
|
||||
// Handles the {-} operator: removes the charset that follows from the
// charset in *lhs_, then reads the token after it into *token_.
// Throws runtime_error if either operand is not a CHARSET, if the closing
// '}' is missing, or if the resulting charset is empty.
static void charset_difference (re_token *lhs_, state &state_,
    re_token *token_)
{
    if (lhs_->_type != CHARSET)
    {
        std::ostringstream ss_;

        ss_ << "CHARSET must precede {-} at index " <<
            state_.index () - 1 << " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    rules_char_type curr_ = 0;

    // Consume the '-' (open_curly () only peeked at it).
    state_.next (curr_);

    const bool eos_ = state_.next (curr_);

    if (eos_)
    {
        std::ostringstream ss_;

        // Pointless returning index if at end of string
        ss_ << "Unexpected end of regex (missing '}') in rule id " <<
            state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    if (curr_ != '}')
    {
        std::ostringstream ss_;

        ss_ << "Missing '}' at index " << state_.index () - 1 <<
            " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    // The right-hand operand is simply the next token.
    re_token subtrahend_;

    next (lhs_, state_, &subtrahend_);

    if (subtrahend_._type != CHARSET)
    {
        std::ostringstream ss_;

        ss_ << "CHARSET must follow {-} at index " <<
            state_.index () - 1 << " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    lhs_->_str.remove (subtrahend_._str);

    if (lhs_->_str.empty ())
    {
        std::ostringstream ss_;

        ss_ << "Empty charset created by {-} at index " <<
            state_.index () - 1 << " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    // Both operands are now folded into *lhs_, so fetch the real next token.
    next (lhs_, state_, token_);
}
|
||||
|
||||
// Handles the {+} operator: adds the charset that follows to the charset
// in *lhs_, then reads the token after it into *token_.
// Throws runtime_error if either operand is not a CHARSET or if the
// closing '}' is missing.
static void charset_union (re_token *lhs_, state &state_,
    re_token *token_)
{
    if (lhs_->_type != CHARSET)
    {
        std::ostringstream ss_;

        ss_ << "CHARSET must precede {+} at index " <<
            state_.index () - 1 << " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    rules_char_type curr_ = 0;

    // Consume the '+' (open_curly () only peeked at it).
    state_.next (curr_);

    const bool eos_ = state_.next (curr_);

    if (eos_)
    {
        std::ostringstream ss_;

        // Pointless returning index if at end of string
        ss_ << "Unexpected end of regex (missing '}') in rule id " <<
            state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    if (curr_ != '}')
    {
        std::ostringstream ss_;

        ss_ << "Missing '}' at index " << state_.index () - 1 <<
            " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    // The right-hand operand is simply the next token.
    re_token addend_;

    next (lhs_, state_, &addend_);

    if (addend_._type != CHARSET)
    {
        std::ostringstream ss_;

        ss_ << "CHARSET must follow {+} at index " <<
            state_.index () - 1 << " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    lhs_->_str.insert (addend_._str);

    // Both operands are now folded into *lhs_, so fetch the real next token.
    next (lhs_, state_, token_);
}
|
||||
|
||||
// SYNTAX:
|
||||
// {n[,[n]]}
|
||||
// SEMANTIC RULES:
|
||||
// {0} - INVALID (throw exception)
|
||||
// {0,} = *
|
||||
// {0,0} - INVALID (throw exception)
|
||||
// {0,1} = ?
|
||||
// {1,} = +
|
||||
// {min,max} where min == max - {min}
|
||||
// {min,max} where max < min - INVALID (throw exception)
|
||||
static void repeat_n (state &state_, re_token *token_)
|
||||
{
|
||||
rules_char_type ch_ = 0;
|
||||
bool eos_ = state_.next (ch_);
|
||||
std::size_t min_ = 0;
|
||||
std::size_t max_ = 0;
|
||||
|
||||
while (!eos_ && ch_ >= '0' && ch_ <= '9')
|
||||
{
|
||||
min_ *= 10;
|
||||
min_ += ch_ - '0';
|
||||
token_->_extra += ch_;
|
||||
eos_ = state_.next (ch_);
|
||||
}
|
||||
|
||||
if (eos_)
|
||||
{
|
||||
std::ostringstream ss_;
|
||||
|
||||
// Pointless returning index if at end of string
|
||||
ss_ << "Unexpected end of regex (missing '}') in rule id " <<
|
||||
state_._id << '.';
|
||||
throw runtime_error (ss_.str ());
|
||||
}
|
||||
|
||||
bool min_max_ = false;
|
||||
bool repeatn_ = true;
|
||||
|
||||
if (ch_ == ',')
|
||||
{
|
||||
token_->_extra += ch_;
|
||||
eos_ = state_.next (ch_);
|
||||
|
||||
if (eos_)
|
||||
{
|
||||
std::ostringstream ss_;
|
||||
|
||||
// Pointless returning index if at end of string
|
||||
ss_ << "Unexpected end of regex (missing '}') in rule id " <<
|
||||
state_._id << '.';
|
||||
throw runtime_error (ss_.str ());
|
||||
}
|
||||
|
||||
if (ch_ == '}')
|
||||
{
|
||||
// Small optimisation: Check for '*' equivalency.
|
||||
if (min_ == 0)
|
||||
{
|
||||
token_->_type = ZEROORMORE;
|
||||
repeatn_ = false;
|
||||
}
|
||||
// Small optimisation: Check for '+' equivalency.
|
||||
else if (min_ == 1)
|
||||
{
|
||||
token_->_type = ONEORMORE;
|
||||
repeatn_ = false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (ch_ < '0' || ch_ > '9')
|
||||
{
|
||||
std::ostringstream ss_;
|
||||
|
||||
ss_ << "Missing '}' at index " << state_.index () - 1 <<
|
||||
" in rule id " << state_._id << '.';
|
||||
throw runtime_error (ss_.str ());
|
||||
}
|
||||
|
||||
min_max_ = true;
|
||||
|
||||
do
|
||||
{
|
||||
max_ *= 10;
|
||||
max_ += ch_ - '0';
|
||||
token_->_extra += ch_;
|
||||
eos_ = state_.next (ch_);
|
||||
} while (!eos_ && ch_ >= '0' && ch_ <= '9');
|
||||
|
||||
if (eos_)
|
||||
{
|
||||
std::ostringstream ss_;
|
||||
|
||||
// Pointless returning index if at end of string
|
||||
ss_ << "Unexpected end of regex (missing '}') "
|
||||
"in rule id " << state_._id << '.';
|
||||
throw runtime_error (ss_.str ());
|
||||
}
|
||||
|
||||
// Small optimisation: Check for '?' equivalency.
|
||||
if (min_ == 0 && max_ == 1)
|
||||
{
|
||||
token_->_type = OPT;
|
||||
repeatn_ = false;
|
||||
}
|
||||
// Small optimisation: if min == max, then min.
|
||||
else if (min_ == max_)
|
||||
{
|
||||
token_->_extra.erase (token_->_extra.find (','));
|
||||
min_max_ = false;
|
||||
max_ = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (ch_ != '}')
|
||||
{
|
||||
std::ostringstream ss_;
|
||||
|
||||
ss_ << "Missing '}' at index " << state_.index () - 1 <<
|
||||
" in rule id " << state_._id << '.';
|
||||
throw runtime_error (ss_.str ());
|
||||
}
|
||||
|
||||
if (repeatn_)
|
||||
{
|
||||
// SEMANTIC VALIDATION follows:
|
||||
// NOTE: {0,} has already become *
|
||||
// therefore we don't check for a comma.
|
||||
if (min_ == 0 && max_ == 0)
|
||||
{
|
||||
std::ostringstream ss_;
|
||||
|
||||
ss_ << "Cannot have exactly zero repeats preceding index " <<
|
||||
state_.index () << " in rule id " << state_._id << '.';
|
||||
throw runtime_error (ss_.str ());
|
||||
}
|
||||
|
||||
if (min_max_ && max_ < min_)
|
||||
{
|
||||
std::ostringstream ss_;
|
||||
|
||||
ss_ << "Max less than min preceding index " <<
|
||||
state_.index () << " in rule id " << state_._id << '.';
|
||||
throw runtime_error (ss_.str ());
|
||||
}
|
||||
|
||||
if (!state_.eos () && *state_._curr == '?')
|
||||
{
|
||||
token_->_type = AREPEATN;
|
||||
state_.increment ();
|
||||
}
|
||||
else
|
||||
{
|
||||
token_->_type = REPEATN;
|
||||
}
|
||||
}
|
||||
else if (token_->_type == ZEROORMORE)
|
||||
{
|
||||
if (!state_.eos () && *state_._curr == '?')
|
||||
{
|
||||
token_->_type = AZEROORMORE;
|
||||
state_.increment ();
|
||||
}
|
||||
}
|
||||
else if (token_->_type == ONEORMORE)
|
||||
{
|
||||
if (!state_.eos () && *state_._curr == '?')
|
||||
{
|
||||
token_->_type = AONEORMORE;
|
||||
state_.increment ();
|
||||
}
|
||||
}
|
||||
else if (token_->_type == OPT)
|
||||
{
|
||||
if (!state_.eos () && *state_._curr == '?')
|
||||
{
|
||||
token_->_type = AOPT;
|
||||
state_.increment ();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reads a macro reference of the form {NAME}. Entered with the cursor on
// the first character after '{'.
// NAME must start with '_' or an ASCII letter and may continue with '_',
// '-', ASCII letters and digits. The name is appended to token_->_extra
// and token_->_type is set to MACRO.
// Throws runtime_error on an invalid first character, on end of regex or
// when the name is not followed by '}'.
static void macro (state &state_, re_token *token_)
{
    rules_char_type ch_ = 0;

    state_.next (ch_);

    const bool valid_start_ = ch_ == '_' || (ch_ >= 'A' && ch_ <= 'Z') ||
        (ch_ >= 'a' && ch_ <= 'z');

    if (!valid_start_)
    {
        std::ostringstream ss_;

        ss_ << "Invalid MACRO name at index " << state_.index () - 1 <<
            " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    for (;;)
    {
        token_->_extra += ch_;

        if (state_.next (ch_))
        {
            std::ostringstream ss_;

            // Pointless returning index if at end of string
            ss_ << "Unexpected end of regex " <<
                "(missing '}') in rule id " << state_._id << '.';
            throw runtime_error (ss_.str ());
        }

        const bool name_char_ = ch_ == '_' || ch_ == '-' ||
            (ch_ >= 'A' && ch_ <= 'Z') || (ch_ >= 'a' && ch_ <= 'z') ||
            (ch_ >= '0' && ch_ <= '9');

        if (!name_char_)
        {
            break;
        }
    }

    if (ch_ != '}')
    {
        std::ostringstream ss_;

        ss_ << "Missing '}' at index " << state_.index () - 1 <<
            " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    token_->_type = MACRO;
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
2351
inc/lexertl/parser/tokeniser/re_tokeniser_helper.hpp
Normal file
2351
inc/lexertl/parser/tokeniser/re_tokeniser_helper.hpp
Normal file
File diff suppressed because it is too large
Load Diff
115
inc/lexertl/parser/tokeniser/re_tokeniser_state.hpp
Normal file
115
inc/lexertl/parser/tokeniser/re_tokeniser_state.hpp
Normal file
@@ -0,0 +1,115 @@
|
||||
// re_tokeniser_state.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_RE_TOKENISER_STATE_HPP
|
||||
#define LEXERTL_RE_TOKENISER_STATE_HPP
|
||||
|
||||
#include "../../char_traits.hpp"
|
||||
#include "../../enums.hpp"
|
||||
#include <locale>
|
||||
#include "../../size_t.hpp"
|
||||
#include <stack>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename ch_type, typename id_type>
|
||||
struct basic_re_tokeniser_state
|
||||
{
|
||||
typedef ch_type char_type;
|
||||
typedef typename basic_char_traits<char_type>::index_type index_type;
|
||||
|
||||
const char_type * const _start;
|
||||
const char_type * const _end;
|
||||
const char_type *_curr;
|
||||
id_type _id;
|
||||
std::size_t _flags;
|
||||
std::stack<std::size_t> _flags_stack;
|
||||
std::locale _locale;
|
||||
bool _macro;
|
||||
long _paren_count;
|
||||
bool _in_string;
|
||||
id_type _nl_id;
|
||||
|
||||
basic_re_tokeniser_state (const char_type *start_,
|
||||
const char_type * const end_, id_type id_, const std::size_t flags_,
|
||||
const std::locale locale_, const bool macro_) :
|
||||
_start (start_),
|
||||
_end (end_),
|
||||
_curr (start_),
|
||||
_id (id_),
|
||||
_flags (flags_),
|
||||
_flags_stack (),
|
||||
_locale (locale_),
|
||||
_macro (macro_),
|
||||
_paren_count (0),
|
||||
_in_string (false),
|
||||
_nl_id (static_cast<id_type>(~0))
|
||||
{
|
||||
}
|
||||
|
||||
basic_re_tokeniser_state (const basic_re_tokeniser_state &rhs_)
|
||||
{
|
||||
assign (rhs_);
|
||||
}
|
||||
|
||||
// prevent VC++ 7.1 warning:
|
||||
const basic_re_tokeniser_state &operator =
|
||||
(const basic_re_tokeniser_state &rhs_)
|
||||
{
|
||||
assign (rhs_);
|
||||
}
|
||||
|
||||
void assign (const basic_re_tokeniser_state &rhs_)
|
||||
{
|
||||
_start = rhs_._start;
|
||||
_end = rhs_._end;
|
||||
_curr = rhs_._curr;
|
||||
_id = rhs_._id;
|
||||
_flags = rhs_._flags;
|
||||
_flags_stack = rhs_._flags_stack;
|
||||
_locale = rhs_._locale;
|
||||
_macro = rhs_._macro;
|
||||
_paren_count = rhs_._paren_count;
|
||||
_in_string = rhs_._in_string;
|
||||
_nl_id = rhs_._nl_id;
|
||||
return this;
|
||||
}
|
||||
|
||||
inline bool next (char_type &ch_)
|
||||
{
|
||||
if (_curr >= _end)
|
||||
{
|
||||
ch_ = 0;
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
ch_ = *_curr;
|
||||
increment ();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
inline void increment ()
|
||||
{
|
||||
++_curr;
|
||||
}
|
||||
|
||||
inline std::size_t index ()
|
||||
{
|
||||
return _curr - _start;
|
||||
}
|
||||
|
||||
inline bool eos ()
|
||||
{
|
||||
return _curr >= _end;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
112
inc/lexertl/parser/tree/end_node.hpp
Normal file
112
inc/lexertl/parser/tree/end_node.hpp
Normal file
@@ -0,0 +1,112 @@
|
||||
// end_node.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_END_NODE_HPP
|
||||
#define LEXERTL_END_NODE_HPP
|
||||
|
||||
#include "node.hpp"
|
||||
#include "../../size_t.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
class basic_end_node : public basic_node<id_type>
|
||||
{
|
||||
public:
|
||||
typedef basic_node<id_type> node;
|
||||
typedef typename node::bool_stack bool_stack;
|
||||
typedef typename node::const_node_stack const_node_stack;
|
||||
typedef typename node::node_ptr_vector node_ptr_vector;
|
||||
typedef typename node::node_stack node_stack;
|
||||
typedef typename node::node_type node_type;
|
||||
typedef typename node::node_vector node_vector;
|
||||
|
||||
basic_end_node (const id_type id_, const id_type user_id_,
|
||||
const id_type next_dfa_, const id_type push_dfa_,
|
||||
const bool pop_dfa_) :
|
||||
basic_node<id_type> (false),
|
||||
_id (id_),
|
||||
_user_id (user_id_),
|
||||
_next_dfa (next_dfa_),
|
||||
_push_dfa (push_dfa_),
|
||||
_pop_dfa (pop_dfa_),
|
||||
_followpos ()
|
||||
{
|
||||
basic_node<id_type>::_firstpos.push_back (this);
|
||||
basic_node<id_type>::_lastpos.push_back (this);
|
||||
}
|
||||
|
||||
virtual ~basic_end_node ()
|
||||
{
|
||||
}
|
||||
|
||||
virtual node_type what_type () const
|
||||
{
|
||||
return node::END;
|
||||
}
|
||||
|
||||
virtual bool traverse (const_node_stack &/*node_stack_*/,
|
||||
bool_stack &/*perform_op_stack_*/) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual const node_vector &followpos () const
|
||||
{
|
||||
// _followpos is always empty..!
|
||||
return _followpos;
|
||||
}
|
||||
|
||||
virtual bool end_state () const
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual id_type id () const
|
||||
{
|
||||
return _id;
|
||||
}
|
||||
|
||||
virtual id_type user_id () const
|
||||
{
|
||||
return _user_id;
|
||||
}
|
||||
|
||||
virtual id_type next_dfa () const
|
||||
{
|
||||
return _next_dfa;
|
||||
}
|
||||
|
||||
virtual id_type push_dfa () const
|
||||
{
|
||||
return _push_dfa;
|
||||
}
|
||||
|
||||
virtual bool pop_dfa () const
|
||||
{
|
||||
return _pop_dfa;
|
||||
}
|
||||
|
||||
private:
|
||||
id_type _id;
|
||||
id_type _user_id;
|
||||
id_type _next_dfa;
|
||||
id_type _push_dfa;
|
||||
bool _pop_dfa;
|
||||
node_vector _followpos;
|
||||
|
||||
virtual void copy_node (node_ptr_vector &/*node_ptr_vector_*/,
|
||||
node_stack &/*new_node_stack_*/, bool_stack &/*perform_op_stack_*/,
|
||||
bool &/*down_*/) const
|
||||
{
|
||||
// Nothing to do, as end_nodes are not copied.
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
103
inc/lexertl/parser/tree/iteration_node.hpp
Normal file
103
inc/lexertl/parser/tree/iteration_node.hpp
Normal file
@@ -0,0 +1,103 @@
|
||||
// iteration_node.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_ITERATION_NODE_HPP
|
||||
#define LEXERTL_ITERATION_NODE_HPP
|
||||
|
||||
#include "node.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
class basic_iteration_node : public basic_node<id_type>
|
||||
{
|
||||
public:
|
||||
typedef basic_node<id_type> node;
|
||||
typedef typename node::bool_stack bool_stack;
|
||||
typedef typename node::const_node_stack const_node_stack;
|
||||
typedef typename node::node_ptr_vector node_ptr_vector;
|
||||
typedef typename node::node_stack node_stack;
|
||||
typedef typename node::node_type node_type;
|
||||
typedef typename node::node_vector node_vector;
|
||||
|
||||
basic_iteration_node (basic_node<id_type> *next_, const bool greedy_) :
|
||||
basic_node<id_type> (true),
|
||||
_next (next_),
|
||||
_greedy (greedy_)
|
||||
{
|
||||
typename node_vector::iterator iter_;
|
||||
typename node_vector::iterator end_;
|
||||
|
||||
_next->append_firstpos (node::_firstpos);
|
||||
_next->append_lastpos (node::_lastpos);
|
||||
|
||||
for (iter_ = node::_lastpos.begin (), end_ = node::_lastpos.end ();
|
||||
iter_ != end_; ++iter_)
|
||||
{
|
||||
(*iter_)->append_followpos (node::_firstpos);
|
||||
}
|
||||
|
||||
for (iter_ = node::_firstpos.begin (), end_ = node::_firstpos.end ();
|
||||
iter_ != end_; ++iter_)
|
||||
{
|
||||
(*iter_)->greedy (greedy_);
|
||||
}
|
||||
}
|
||||
|
||||
virtual ~basic_iteration_node ()
|
||||
{
|
||||
}
|
||||
|
||||
virtual node_type what_type () const
|
||||
{
|
||||
return node::ITERATION;
|
||||
}
|
||||
|
||||
virtual bool traverse (const_node_stack &node_stack_,
|
||||
bool_stack &perform_op_stack_) const
|
||||
{
|
||||
perform_op_stack_.push (true);
|
||||
node_stack_.push (_next);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
// Not owner of this pointer...
|
||||
basic_node<id_type> *_next;
|
||||
bool _greedy;
|
||||
|
||||
virtual void copy_node (node_ptr_vector &node_ptr_vector_,
|
||||
node_stack &new_node_stack_, bool_stack &perform_op_stack_,
|
||||
bool &down_) const
|
||||
{
|
||||
if (perform_op_stack_.top ())
|
||||
{
|
||||
basic_node<id_type> *ptr_ = new_node_stack_.top ();
|
||||
|
||||
node_ptr_vector_->push_back
|
||||
(static_cast<basic_iteration_node<id_type> *>(0));
|
||||
node_ptr_vector_->back () = new basic_iteration_node
|
||||
(ptr_, _greedy);
|
||||
new_node_stack_.top () = node_ptr_vector_->back ();
|
||||
}
|
||||
else
|
||||
{
|
||||
down_ = true;
|
||||
}
|
||||
|
||||
perform_op_stack_.pop ();
|
||||
}
|
||||
|
||||
// No copy construction.
|
||||
basic_iteration_node (const basic_iteration_node &);
|
||||
// No assignment.
|
||||
const basic_iteration_node &operator = (const basic_iteration_node &);
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
114
inc/lexertl/parser/tree/leaf_node.hpp
Normal file
114
inc/lexertl/parser/tree/leaf_node.hpp
Normal file
@@ -0,0 +1,114 @@
|
||||
// leaf_node.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_LEAF_NODE_HPP
|
||||
#define LEXERTL_LEAF_NODE_HPP
|
||||
|
||||
#include "../../enums.hpp" // null_token
|
||||
#include "node.hpp"
|
||||
#include "../../size_t.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
class basic_leaf_node : public basic_node<id_type>
|
||||
{
|
||||
public:
|
||||
typedef basic_node<id_type> node;
|
||||
typedef typename node::bool_stack bool_stack;
|
||||
typedef typename node::const_node_stack const_node_stack;
|
||||
typedef typename node::node_ptr_vector node_ptr_vector;
|
||||
typedef typename node::node_stack node_stack;
|
||||
typedef typename node::node_type node_type;
|
||||
typedef typename node::node_vector node_vector;
|
||||
|
||||
basic_leaf_node (const id_type token_, const bool greedy_) :
|
||||
basic_node<id_type> (token_ == node::null_token ()),
|
||||
_token (token_),
|
||||
_set_greedy (!greedy_),
|
||||
_greedy (greedy_),
|
||||
_followpos ()
|
||||
{
|
||||
if (!node::_nullable)
|
||||
{
|
||||
node::_firstpos.push_back (this);
|
||||
node::_lastpos.push_back (this);
|
||||
}
|
||||
}
|
||||
|
||||
virtual ~basic_leaf_node ()
|
||||
{
|
||||
}
|
||||
|
||||
virtual void append_followpos (const node_vector &followpos_)
|
||||
{
|
||||
for (typename node_vector::const_iterator iter_ = followpos_.begin (),
|
||||
end_ = followpos_.end (); iter_ != end_; ++iter_)
|
||||
{
|
||||
_followpos.push_back (*iter_);
|
||||
}
|
||||
}
|
||||
|
||||
virtual node_type what_type () const
|
||||
{
|
||||
return node::LEAF;
|
||||
}
|
||||
|
||||
virtual bool traverse (const_node_stack &/*node_stack_*/,
|
||||
bool_stack &/*perform_op_stack_*/) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual id_type token () const
|
||||
{
|
||||
return _token;
|
||||
}
|
||||
|
||||
virtual void greedy (const bool greedy_)
|
||||
{
|
||||
if (!_set_greedy)
|
||||
{
|
||||
_greedy = greedy_;
|
||||
_set_greedy = true;
|
||||
}
|
||||
}
|
||||
|
||||
virtual bool greedy () const
|
||||
{
|
||||
return _greedy;
|
||||
}
|
||||
|
||||
virtual const node_vector &followpos () const
|
||||
{
|
||||
return _followpos;
|
||||
}
|
||||
|
||||
virtual node_vector &followpos ()
|
||||
{
|
||||
return _followpos;
|
||||
}
|
||||
|
||||
private:
|
||||
id_type _token;
|
||||
bool _set_greedy;
|
||||
bool _greedy;
|
||||
node_vector _followpos;
|
||||
|
||||
virtual void copy_node (node_ptr_vector &node_ptr_vector_,
|
||||
node_stack &new_node_stack_, bool_stack &/*perform_op_stack_*/,
|
||||
bool &/*down_*/) const
|
||||
{
|
||||
node_ptr_vector_->push_back (static_cast<basic_leaf_node *>(0));
|
||||
node_ptr_vector_->back () = new basic_leaf_node (_token, _greedy);
|
||||
new_node_stack_.push (node_ptr_vector_->back ());
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
241
inc/lexertl/parser/tree/node.hpp
Normal file
241
inc/lexertl/parser/tree/node.hpp
Normal file
@@ -0,0 +1,241 @@
|
||||
// node.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_NODE_HPP
|
||||
#define LEXERTL_NODE_HPP
|
||||
|
||||
#include <assert.h>
|
||||
#include "../../containers/ptr_vector.hpp"
|
||||
#include "../../runtime_error.hpp"
|
||||
#include "../../size_t.hpp"
|
||||
#include <stack>
|
||||
#include <vector>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
class basic_node
|
||||
{
|
||||
public:
|
||||
enum node_type {LEAF, SEQUENCE, SELECTION, ITERATION, END};
|
||||
|
||||
typedef std::stack<bool> bool_stack;
|
||||
typedef std::stack<basic_node<id_type> *> node_stack;
|
||||
// stack and vector not owner of node pointers
|
||||
typedef std::stack<const basic_node<id_type> *> const_node_stack;
|
||||
typedef std::vector<basic_node<id_type> *> node_vector;
|
||||
typedef ptr_vector<basic_node<id_type> > node_ptr_vector;
|
||||
|
||||
basic_node () :
|
||||
_nullable (false),
|
||||
_firstpos (),
|
||||
_lastpos ()
|
||||
{
|
||||
}
|
||||
|
||||
basic_node (const bool nullable_) :
|
||||
_nullable (nullable_),
|
||||
_firstpos (),
|
||||
_lastpos ()
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~basic_node ()
|
||||
{
|
||||
}
|
||||
|
||||
static id_type null_token ()
|
||||
{
|
||||
return static_cast<id_type>(~0);
|
||||
}
|
||||
|
||||
bool nullable () const
|
||||
{
|
||||
return _nullable;
|
||||
}
|
||||
|
||||
void append_firstpos (node_vector &firstpos_) const
|
||||
{
|
||||
firstpos_.insert (firstpos_.end (),
|
||||
_firstpos.begin (), _firstpos.end ());
|
||||
}
|
||||
|
||||
void append_lastpos (node_vector &lastpos_) const
|
||||
{
|
||||
lastpos_.insert (lastpos_.end (),
|
||||
_lastpos.begin (), _lastpos.end ());
|
||||
}
|
||||
|
||||
virtual void append_followpos (const node_vector &/*followpos_*/)
|
||||
{
|
||||
throw runtime_error ("Internal error node::append_followpos().");
|
||||
}
|
||||
|
||||
basic_node *copy (node_ptr_vector &node_ptr_vector_) const
|
||||
{
|
||||
basic_node *new_root_ = 0;
|
||||
const_node_stack node_stack_;
|
||||
bool_stack perform_op_stack_;
|
||||
bool down_ = true;
|
||||
node_stack new_node_stack_;
|
||||
|
||||
node_stack_.push (this);
|
||||
|
||||
while (!node_stack_.empty ())
|
||||
{
|
||||
while (down_)
|
||||
{
|
||||
down_ = node_stack_.top ()->traverse (node_stack_,
|
||||
perform_op_stack_);
|
||||
}
|
||||
|
||||
while (!down_ && !node_stack_.empty ())
|
||||
{
|
||||
const basic_node *top_ = node_stack_.top ();
|
||||
|
||||
top_->copy_node (node_ptr_vector_, new_node_stack_,
|
||||
perform_op_stack_, down_);
|
||||
|
||||
if (!down_) node_stack_.pop ();
|
||||
}
|
||||
}
|
||||
|
||||
assert (new_node_stack_.size () == 1);
|
||||
new_root_ = new_node_stack_.top ();
|
||||
new_node_stack_.pop ();
|
||||
return new_root_;
|
||||
}
|
||||
|
||||
virtual node_type what_type () const = 0;
|
||||
|
||||
virtual bool traverse (const_node_stack &node_stack_,
|
||||
bool_stack &perform_op_stack_) const = 0;
|
||||
|
||||
node_vector &firstpos ()
|
||||
{
|
||||
return _firstpos;
|
||||
}
|
||||
|
||||
const node_vector &firstpos () const
|
||||
{
|
||||
return _firstpos;
|
||||
}
|
||||
|
||||
// _lastpos modified externally, so not const &
|
||||
node_vector &lastpos ()
|
||||
{
|
||||
return _lastpos;
|
||||
}
|
||||
|
||||
virtual bool end_state () const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual id_type id () const
|
||||
{
|
||||
throw runtime_error ("Internal error node::id().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return id_type ();
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual id_type user_id () const
|
||||
{
|
||||
throw runtime_error ("Internal error node::user_id().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return id_type ();
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual id_type next_dfa () const
|
||||
{
|
||||
throw runtime_error ("Internal error node::next_dfa().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return id_type ();
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual id_type push_dfa () const
|
||||
{
|
||||
throw runtime_error ("Internal error node::push_dfa().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return id_type ();
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual bool pop_dfa () const
|
||||
{
|
||||
throw runtime_error ("Internal error node::pop_dfa().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual id_type token () const
|
||||
{
|
||||
throw runtime_error ("Internal error node::token().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return id_type ();
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual void greedy (const bool /*greedy_*/)
|
||||
{
|
||||
throw runtime_error ("Internal error node::greedy(bool).");
|
||||
}
|
||||
|
||||
virtual bool greedy () const
|
||||
{
|
||||
throw runtime_error ("Internal error node::greedy().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual const node_vector &followpos () const
|
||||
{
|
||||
throw runtime_error ("Internal error node::followpos().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return firstpos;
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual node_vector &followpos ()
|
||||
{
|
||||
throw runtime_error ("Internal error node::followpos().");
|
||||
#ifdef __SUNPRO_CC
|
||||
// Stop bogus Solaris compiler warning
|
||||
return firstpos;
|
||||
#endif
|
||||
}
|
||||
|
||||
protected:
|
||||
const bool _nullable;
|
||||
node_vector _firstpos;
|
||||
node_vector _lastpos;
|
||||
|
||||
virtual void copy_node (node_ptr_vector &node_ptr_vector_,
|
||||
node_stack &new_node_stack_, bool_stack &perform_op_stack_,
|
||||
bool &down_) const = 0;
|
||||
|
||||
private:
|
||||
basic_node (const basic_node &); // No copy construction.
|
||||
const basic_node &operator = (const basic_node &); // No assignment.
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
106
inc/lexertl/parser/tree/selection_node.hpp
Normal file
106
inc/lexertl/parser/tree/selection_node.hpp
Normal file
@@ -0,0 +1,106 @@
|
||||
// selection_node.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_SELECTION_NODE_HPP
|
||||
#define LEXERTL_SELECTION_NODE_HPP
|
||||
|
||||
#include "node.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
class basic_selection_node : public basic_node<id_type>
|
||||
{
|
||||
public:
|
||||
typedef basic_node<id_type> node;
|
||||
typedef typename node::bool_stack bool_stack;
|
||||
typedef typename node::const_node_stack const_node_stack;
|
||||
typedef typename node::node_ptr_vector node_ptr_vector;
|
||||
typedef typename node::node_stack node_stack;
|
||||
typedef typename node::node_type node_type;
|
||||
|
||||
basic_selection_node (basic_node<id_type> *left_,
|
||||
basic_node<id_type> *right_) :
|
||||
basic_node<id_type> (left_->nullable () || right_->nullable ()),
|
||||
_left (left_),
|
||||
_right (right_)
|
||||
{
|
||||
_left->append_firstpos (node::_firstpos);
|
||||
_right->append_firstpos (node::_firstpos);
|
||||
_left->append_lastpos (node::_lastpos);
|
||||
_right->append_lastpos (node::_lastpos);
|
||||
}
|
||||
|
||||
virtual ~basic_selection_node ()
|
||||
{
|
||||
}
|
||||
|
||||
virtual node_type what_type () const
|
||||
{
|
||||
return node::SELECTION;
|
||||
}
|
||||
|
||||
virtual bool traverse (const_node_stack &node_stack_,
|
||||
bool_stack &perform_op_stack_) const
|
||||
{
|
||||
perform_op_stack_.push (true);
|
||||
|
||||
switch (_right->what_type ())
|
||||
{
|
||||
case node::SEQUENCE:
|
||||
case node::SELECTION:
|
||||
case node::ITERATION:
|
||||
perform_op_stack_.push (false);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
node_stack_.push (_right);
|
||||
node_stack_.push (_left);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
// Not owner of these pointers...
|
||||
basic_node<id_type> *_left;
|
||||
basic_node<id_type> *_right;
|
||||
|
||||
virtual void copy_node (node_ptr_vector &node_ptr_vector_,
|
||||
node_stack &new_node_stack_, bool_stack &perform_op_stack_,
|
||||
bool &down_) const
|
||||
{
|
||||
if (perform_op_stack_.top ())
|
||||
{
|
||||
basic_node<id_type> *rhs_ = new_node_stack_.top ();
|
||||
|
||||
new_node_stack_.pop ();
|
||||
|
||||
basic_node<id_type> *lhs_ = new_node_stack_.top ();
|
||||
|
||||
node_ptr_vector_->push_back
|
||||
(static_cast<basic_selection_node *>(0));
|
||||
node_ptr_vector_->back () = new basic_selection_node (lhs_, rhs_);
|
||||
new_node_stack_.top () = node_ptr_vector_->back ();
|
||||
}
|
||||
else
|
||||
{
|
||||
down_ = true;
|
||||
}
|
||||
|
||||
perform_op_stack_.pop ();
|
||||
}
|
||||
|
||||
// No copy construction.
|
||||
basic_selection_node (const basic_selection_node &);
|
||||
// No assignment.
|
||||
const basic_selection_node &operator = (const basic_selection_node &);
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
126
inc/lexertl/parser/tree/sequence_node.hpp
Normal file
126
inc/lexertl/parser/tree/sequence_node.hpp
Normal file
@@ -0,0 +1,126 @@
|
||||
// sequence_node.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_SEQUENCE_NODE_HPP
|
||||
#define LEXERTL_SEQUENCE_NODE_HPP
|
||||
|
||||
#include "node.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
class basic_sequence_node : public basic_node<id_type>
|
||||
{
|
||||
public:
|
||||
typedef basic_node<id_type> node;
|
||||
typedef typename node::bool_stack bool_stack;
|
||||
typedef typename node::const_node_stack const_node_stack;
|
||||
typedef typename node::node_ptr_vector node_ptr_vector;
|
||||
typedef typename node::node_stack node_stack;
|
||||
typedef typename node::node_type node_type;
|
||||
typedef typename node::node_vector node_vector;
|
||||
|
||||
basic_sequence_node (basic_node<id_type> *left_,
|
||||
basic_node<id_type> *right_) :
|
||||
basic_node<id_type> (left_->nullable () && right_->nullable ()),
|
||||
_left (left_),
|
||||
_right (right_)
|
||||
{
|
||||
_left->append_firstpos (node::_firstpos);
|
||||
|
||||
if (_left->nullable ())
|
||||
{
|
||||
_right->append_firstpos (node::_firstpos);
|
||||
}
|
||||
|
||||
if (_right->nullable ())
|
||||
{
|
||||
_left->append_lastpos (node::_lastpos);
|
||||
}
|
||||
|
||||
_right->append_lastpos (node::_lastpos);
|
||||
|
||||
node_vector &lastpos_ = _left->lastpos ();
|
||||
const node_vector &firstpos_ = _right->firstpos ();
|
||||
|
||||
for (typename node_vector::iterator iter_ = lastpos_.begin (),
|
||||
end_ = lastpos_.end (); iter_ != end_; ++iter_)
|
||||
{
|
||||
(*iter_)->append_followpos (firstpos_);
|
||||
}
|
||||
}
|
||||
|
||||
virtual ~basic_sequence_node ()
|
||||
{
|
||||
}
|
||||
|
||||
virtual node_type what_type () const
|
||||
{
|
||||
return node::SEQUENCE;
|
||||
}
|
||||
|
||||
virtual bool traverse (const_node_stack &node_stack_,
|
||||
bool_stack &perform_op_stack_) const
|
||||
{
|
||||
perform_op_stack_.push (true);
|
||||
|
||||
switch (_right->what_type ())
|
||||
{
|
||||
case node::SEQUENCE:
|
||||
case node::SELECTION:
|
||||
case node::ITERATION:
|
||||
perform_op_stack_.push (false);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
node_stack_.push (_right);
|
||||
node_stack_.push (_left);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
// Not owner of these pointers...
|
||||
basic_node<id_type> *_left;
|
||||
basic_node<id_type> *_right;
|
||||
|
||||
virtual void copy_node (node_ptr_vector &node_ptr_vector_,
|
||||
node_stack &new_node_stack_, bool_stack &perform_op_stack_,
|
||||
bool &down_) const
|
||||
{
|
||||
if (perform_op_stack_.top ())
|
||||
{
|
||||
basic_node<id_type> *rhs_ = new_node_stack_.top ();
|
||||
|
||||
new_node_stack_.pop ();
|
||||
|
||||
basic_node<id_type> *lhs_ = new_node_stack_.top ();
|
||||
|
||||
node_ptr_vector_->push_back
|
||||
(static_cast<basic_sequence_node<id_type> *>(0));
|
||||
node_ptr_vector_->back () = new basic_sequence_node<id_type>
|
||||
(lhs_, rhs_);
|
||||
new_node_stack_.top () = node_ptr_vector_->back ();
|
||||
}
|
||||
else
|
||||
{
|
||||
down_ = true;
|
||||
}
|
||||
|
||||
perform_op_stack_.pop ();
|
||||
}
|
||||
|
||||
// No copy construction.
|
||||
basic_sequence_node (const basic_sequence_node &);
|
||||
// No assignment.
|
||||
const basic_sequence_node &operator = (const basic_sequence_node &);
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
73
inc/lexertl/partition/charset.hpp
Normal file
73
inc/lexertl/partition/charset.hpp
Normal file
@@ -0,0 +1,73 @@
|
||||
// charset.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_CHARSET_HPP
|
||||
#define LEXERTL_CHARSET_HPP
|
||||
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
#include <set>
|
||||
#include "../size_t.hpp"
|
||||
#include "../string_token.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename char_type, typename id_type>
|
||||
struct basic_charset
|
||||
{
|
||||
typedef basic_string_token<char_type> token;
|
||||
typedef std::set<id_type> index_set;
|
||||
|
||||
token _token;
|
||||
index_set _index_set;
|
||||
|
||||
basic_charset () :
|
||||
_token (),
|
||||
_index_set ()
|
||||
{
|
||||
}
|
||||
|
||||
basic_charset (const token &token_, const std::size_t index_) :
|
||||
_token (token_),
|
||||
_index_set ()
|
||||
{
|
||||
_index_set.insert (index_);
|
||||
}
|
||||
|
||||
bool empty () const
|
||||
{
|
||||
return _token.empty () && _index_set.empty ();
|
||||
}
|
||||
|
||||
void intersect (basic_charset &rhs_, basic_charset &overlap_)
|
||||
{
|
||||
_token.intersect (rhs_._token, overlap_._token);
|
||||
|
||||
if (!overlap_._token.empty ())
|
||||
{
|
||||
std::merge (_index_set.begin (), _index_set.end (),
|
||||
rhs_._index_set.begin (), rhs_._index_set.end (),
|
||||
std::inserter (overlap_._index_set,
|
||||
overlap_._index_set.end ()));
|
||||
|
||||
if (_token.empty ())
|
||||
{
|
||||
_index_set.clear ();
|
||||
}
|
||||
|
||||
if (rhs_._token.empty ())
|
||||
{
|
||||
rhs_._index_set.clear ();
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
134
inc/lexertl/partition/equivset.hpp
Normal file
134
inc/lexertl/partition/equivset.hpp
Normal file
@@ -0,0 +1,134 @@
|
||||
// equivset.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_EQUIVSET_HPP
|
||||
#define LEXERTL_EQUIVSET_HPP
|
||||
|
||||
#include <algorithm>
|
||||
#include "../parser/tree/node.hpp"
|
||||
#include <set>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template<typename id_type>
|
||||
struct basic_equivset
|
||||
{
|
||||
typedef std::set<id_type> index_set;
|
||||
typedef std::vector<id_type> index_vector;
|
||||
// Not owner of nodes:
|
||||
typedef basic_node<id_type> node;
|
||||
typedef std::vector<node *> node_vector;
|
||||
|
||||
index_vector _index_vector;
|
||||
id_type _id;
|
||||
bool _greedy;
|
||||
node_vector _followpos;
|
||||
|
||||
basic_equivset () :
|
||||
_index_vector (),
|
||||
_id (0),
|
||||
_greedy (true),
|
||||
_followpos ()
|
||||
{
|
||||
}
|
||||
|
||||
basic_equivset (const index_set &index_set_, const id_type id_,
|
||||
const bool greedy_, const node_vector &followpos_) :
|
||||
_index_vector (index_set_.begin (), index_set_.end ()),
|
||||
_id (id_),
|
||||
_greedy (greedy_),
|
||||
_followpos (followpos_)
|
||||
{
|
||||
}
|
||||
|
||||
bool empty () const
|
||||
{
|
||||
return _index_vector.empty () && _followpos.empty ();
|
||||
}
|
||||
|
||||
void intersect (basic_equivset &rhs_, basic_equivset &overlap_)
|
||||
{
|
||||
intersect_indexes (rhs_._index_vector, overlap_._index_vector);
|
||||
|
||||
if (!overlap_._index_vector.empty ())
|
||||
{
|
||||
// Note that the LHS takes priority in order to
|
||||
// respect rule ordering priority in the lex spec.
|
||||
overlap_._id = _id;
|
||||
overlap_._greedy = _greedy;
|
||||
overlap_._followpos = _followpos;
|
||||
|
||||
typename node_vector::const_iterator overlap_begin_ =
|
||||
overlap_._followpos.begin ();
|
||||
typename node_vector::const_iterator overlap_end_ =
|
||||
overlap_._followpos.end ();
|
||||
typename node_vector::const_iterator rhs_iter_ =
|
||||
rhs_._followpos.begin ();
|
||||
typename node_vector::const_iterator rhs_end_ =
|
||||
rhs_._followpos.end ();
|
||||
|
||||
for (; rhs_iter_ != rhs_end_; ++rhs_iter_)
|
||||
{
|
||||
node *node_ = *rhs_iter_;
|
||||
|
||||
if (std::find (overlap_begin_, overlap_end_, node_) ==
|
||||
overlap_end_)
|
||||
{
|
||||
overlap_._followpos.push_back (node_);
|
||||
overlap_begin_ = overlap_._followpos.begin ();
|
||||
overlap_end_ = overlap_._followpos.end ();
|
||||
}
|
||||
}
|
||||
|
||||
if (_index_vector.empty ())
|
||||
{
|
||||
_followpos.clear ();
|
||||
}
|
||||
|
||||
if (rhs_._index_vector.empty ())
|
||||
{
|
||||
rhs_._followpos.clear ();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void intersect_indexes (index_vector &rhs_, index_vector &overlap_)
|
||||
{
|
||||
typename index_vector::iterator iter_ = _index_vector.begin ();
|
||||
typename index_vector::iterator end_ = _index_vector.end ();
|
||||
typename index_vector::iterator rhs_iter_ = rhs_.begin ();
|
||||
typename index_vector::iterator rhs_end_ = rhs_.end ();
|
||||
|
||||
while (iter_ != end_ && rhs_iter_ != rhs_end_)
|
||||
{
|
||||
const id_type index_ = *iter_;
|
||||
const id_type rhs_index_ = *rhs_iter_;
|
||||
|
||||
if (index_ < rhs_index_)
|
||||
{
|
||||
++iter_;
|
||||
}
|
||||
else if (index_ > rhs_index_)
|
||||
{
|
||||
++rhs_iter_;
|
||||
}
|
||||
else
|
||||
{
|
||||
overlap_.push_back (index_);
|
||||
iter_ = _index_vector.erase (iter_);
|
||||
end_ = _index_vector.end ();
|
||||
rhs_iter_ = rhs_.erase (rhs_iter_);
|
||||
rhs_end_ = rhs_.end ();
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
743
inc/lexertl/rules.hpp
Normal file
743
inc/lexertl/rules.hpp
Normal file
@@ -0,0 +1,743 @@
|
||||
// rules.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_RULES_HPP
|
||||
#define LEXERTL_RULES_HPP
|
||||
|
||||
#include "compile_assert.hpp"
|
||||
#include <deque>
|
||||
#include "enums.hpp"
|
||||
#include "internals.hpp"
|
||||
#include <locale>
|
||||
#include <map>
|
||||
#include "runtime_error.hpp"
|
||||
#include <set>
|
||||
#include "size_t.hpp"
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename ch_type, typename id_ty = std::size_t>
|
||||
class basic_rules
|
||||
{
|
||||
public:
|
||||
typedef std::vector<bool> bool_vector;
|
||||
typedef std::deque<bool_vector> bool_vector_deque;
|
||||
typedef ch_type char_type;
|
||||
typedef id_ty id_type;
|
||||
typedef std::vector<id_type> id_vector;
|
||||
typedef std::deque<id_vector> id_vector_deque;
|
||||
typedef std::basic_string<char_type> string;
|
||||
typedef std::deque<string> string_deque;
|
||||
typedef std::deque<string_deque> string_deque_deque;
|
||||
typedef std::set<string> string_set;
|
||||
typedef std::pair<string, string> string_pair;
|
||||
typedef std::deque<string_pair> string_pair_deque;
|
||||
typedef std::map<string, id_type> string_id_type_map;
|
||||
typedef std::pair<string, id_type> string_id_type_pair;
|
||||
|
||||
// If you get a compile error here you have
|
||||
// failed to define an unsigned id type.
|
||||
compile_assert<(static_cast<id_type>(~0) > 0)>
|
||||
_valid_id_type;
|
||||
|
||||
basic_rules (const std::size_t flags_ = dot_not_newline) :
|
||||
_valid_id_type (),
|
||||
_statemap (),
|
||||
_macrodeque (),
|
||||
_macroset (),
|
||||
_regexes (),
|
||||
_features (),
|
||||
_ids (),
|
||||
_user_ids (),
|
||||
_next_dfas (),
|
||||
_pushes (),
|
||||
_pops (),
|
||||
_flags (flags_),
|
||||
_locale (),
|
||||
_lexer_state_names (),
|
||||
_eoi (0)
|
||||
{
|
||||
add_state (initial ());
|
||||
}
|
||||
|
||||
void clear ()
|
||||
{
|
||||
_statemap.clear ();
|
||||
_macrodeque.clear ();
|
||||
_macroset.clear ();
|
||||
_regexes.clear ();
|
||||
_features.clear ();
|
||||
_ids.clear ();
|
||||
_user_ids.clear ();
|
||||
_next_dfas.clear ();
|
||||
_pushes.clear ();
|
||||
_pops.clear ();
|
||||
_flags = dot_not_newline;
|
||||
_locale = std::locale ();
|
||||
_lexer_state_names.clear ();
|
||||
_eoi = 0;
|
||||
add_state (initial ());
|
||||
}
|
||||
|
||||
void clear (const id_type dfa_)
|
||||
{
|
||||
if (_regexes.size () > dfa_)
|
||||
{
|
||||
_regexes[dfa_].clear ();
|
||||
_features[dfa_] = 0;
|
||||
_ids[dfa_].clear ();
|
||||
_user_ids[dfa_].clear ();
|
||||
_next_dfas[dfa_].clear ();
|
||||
_pushes[dfa_].clear ();
|
||||
_pops[dfa_].clear ();
|
||||
}
|
||||
}
|
||||
|
||||
void flags (const std::size_t flags_)
|
||||
{
|
||||
_flags = flags_;
|
||||
}
|
||||
|
||||
std::size_t flags () const
|
||||
{
|
||||
return _flags;
|
||||
}
|
||||
|
||||
// Returns ~1 converted to id_type.
static id_type skip ()
{
    const id_type skip_id_ = static_cast<id_type>(~1);

    return skip_id_;
}
|
||||
|
||||
void eoi (const id_type eoi_)
|
||||
{
|
||||
_eoi = eoi_;
|
||||
}
|
||||
|
||||
// Returns the stored eoi id.
id_type eoi () const
{
    return this->_eoi;
}
|
||||
|
||||
std::locale imbue (const std::locale &locale_)
|
||||
{
|
||||
std::locale loc_ = _locale;
|
||||
|
||||
_locale = locale_;
|
||||
return loc_;
|
||||
}
|
||||
|
||||
const std::locale &locale () const
|
||||
{
|
||||
return _locale;
|
||||
}
|
||||
|
||||
// Maps a state index to its name. Index 0 yields initial (); index n > 0
// yields the (n - 1)th entry of _lexer_state_names. Returns 0 (null) when
// the index is out of range.
const char_type *state (const id_type index_) const
{
    if (index_ == 0) return initial ();

    // The initial state is not stored, hence the offset of one.
    const id_type slot_ = index_ - 1;

    return slot_ < _lexer_state_names.size () ?
        _lexer_state_names[slot_].c_str () : 0;
}
|
||||
|
||||
// Maps a state name to its id, or returns npos () when the name is unknown.
id_type state (const char_type *name_) const
{
    typename string_id_type_map::const_iterator found_ =
        _statemap.find (name_);

    if (found_ != _statemap.end ())
    {
        return found_->second;
    }

    return npos ();
}
|
||||
|
||||
// Registers a lexer state called name_ and returns its id. If the name is
// already registered, the existing id is returned and nothing changes.
//
// name_ is first passed to validate (). Throws runtime_error when id_type
// cannot represent one more state; the capacity check runs before anything
// is modified, so a failed call leaves the rules unchanged.
id_type add_state (const char_type *name_)
{
    validate (name_);

    typename string_id_type_map::const_iterator existing_ =
        _statemap.find (name_);

    if (existing_ != _statemap.end ())
    {
        return existing_->second;
    }

    // Adding one more entry would make the count exceed npos ().
    if (_next_dfas.size () >= npos ())
    {
        // Overflow
        throw runtime_error ("The data type you have chosen cannot hold "
            "this many lexer start states.");
    }

    _statemap.insert (string_id_type_pair (name_,
        static_cast<id_type>(_statemap.size ())));
    _regexes.push_back (string_deque ());
    _features.push_back (0);
    _ids.push_back (id_vector ());
    _user_ids.push_back (id_vector ());
    _next_dfas.push_back (id_vector ());
    _pushes.push_back (id_vector ());
    _pops.push_back (bool_vector ());

    if (string (name_) != initial ())
    {
        _lexer_state_names.push_back (name_);
    }

    // Initial is not stored, so no need to - 1.
    return static_cast<id_type>(_lexer_state_names.size ());
}
|
||||
|
||||
void add_macro (const char_type *name_, const char_type *regex_)
|
||||
{
|
||||
add_macro (name_, string (regex_));
|
||||
}
|
||||
|
||||
void add_macro (const char_type *name_, const char_type *regex_start_,
|
||||
const char_type *regex_end_)
|
||||
{
|
||||
add_macro (name_, string (regex_start_, regex_end_));
|
||||
}
|
||||
|
||||
void add_macro (const char_type *name_, const string ®ex_)
|
||||
{
|
||||
validate (name_);
|
||||
|
||||
typename string_set::const_iterator iter_ = _macroset.find (name_);
|
||||
|
||||
if (iter_ == _macroset.end ())
|
||||
{
|
||||
_macrodeque.push_back (string_pair (name_, regex_));
|
||||
_macroset.insert (name_);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::basic_stringstream<char_type> ss_;
|
||||
std::ostringstream os_;
|
||||
|
||||
os_ << "Attempt to redefine MACRO '";
|
||||
|
||||
while (*name_)
|
||||
{
|
||||
os_ << ss_.narrow (*name_++, static_cast<char_type> (' '));
|
||||
}
|
||||
|
||||
os_ << "'.";
|
||||
throw runtime_error (os_.str ());
|
||||
}
|
||||
}
|
||||
|
||||
void add_macros (const basic_rules &rules_)
|
||||
{
|
||||
const string_pair_deque ¯os_ = rules_.macrodeque ();
|
||||
typename string_pair_deque::const_iterator macro_iter_ =
|
||||
macros_.begin ();
|
||||
typename string_pair_deque::const_iterator macro_end_ =
|
||||
macros_.end ();
|
||||
|
||||
for (; macro_iter_ != macro_end_; ++macro_iter_)
|
||||
{
|
||||
add_macro (macro_iter_->first.c_str (),
|
||||
macro_iter_->second.c_str ());
|
||||
}
|
||||
}
|
||||
|
||||
void merge_macros (const basic_rules &rules_)
|
||||
{
|
||||
const string_pair_deque ¯os_ = rules_.macrodeque ();
|
||||
typename string_pair_deque::const_iterator macro_iter_ =
|
||||
macros_.begin ();
|
||||
typename string_pair_deque::const_iterator macro_end_ =
|
||||
macros_.end ();
|
||||
typename string_set::const_iterator macro_dest_iter_;
|
||||
typename string_set::const_iterator macro_dest_end_ = _macroset.end ();
|
||||
|
||||
for (; macro_iter_ != macro_end_; ++macro_iter_)
|
||||
{
|
||||
macro_dest_iter_ = _macroset.find (macro_iter_->first);
|
||||
|
||||
if (macro_dest_iter_ == macro_dest_end_)
|
||||
{
|
||||
add_macro (macro_iter_->first.c_str (),
|
||||
macro_iter_->second.c_str ());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add rule to INITIAL
|
||||
void add (const char_type *regex_, const id_type id_,
|
||||
const id_type user_id_ = npos ())
|
||||
{
|
||||
add (string (regex_), id_, user_id_);
|
||||
}
|
||||
|
||||
void add (const char_type *regex_start_, const char_type *regex_end_,
|
||||
const id_type id_, const id_type user_id_ = npos ())
|
||||
{
|
||||
add (string (regex_start_, regex_end_), id_, user_id_);
|
||||
}
|
||||
|
||||
void add (const string ®ex_, const id_type id_,
|
||||
const id_type user_id_ = npos ())
|
||||
{
|
||||
check_for_invalid_id (id_);
|
||||
_regexes.front ().push_back (regex_);
|
||||
|
||||
if (regex_[0] == '^')
|
||||
{
|
||||
_features.front () |= bol_bit;
|
||||
}
|
||||
|
||||
if (regex_.size () > 0 && regex_[regex_.size () - 1] == '$')
|
||||
{
|
||||
_features.front () |= eol_bit;
|
||||
}
|
||||
|
||||
if (id_ == skip ())
|
||||
{
|
||||
_features.front () |= skip_bit;
|
||||
}
|
||||
else if (id_ == eoi ())
|
||||
{
|
||||
_features.front () |= again_bit;
|
||||
}
|
||||
|
||||
_ids.front ().push_back (id_);
|
||||
_user_ids.front ().push_back (user_id_);
|
||||
_next_dfas.front ().push_back (0);
|
||||
_pushes.front ().push_back (npos ());
|
||||
_pops.front ().push_back (false);
|
||||
}
|
||||
|
||||
// Add rule with no id
|
||||
void add (const char_type *curr_dfa_,
|
||||
const char_type *regex_, const char_type *new_dfa_)
|
||||
{
|
||||
add (curr_dfa_, string (regex_), new_dfa_);
|
||||
}
|
||||
|
||||
void add (const char_type *curr_dfa_,
|
||||
const char_type *regex_start_, const char_type *regex_end_,
|
||||
const char_type *new_dfa_)
|
||||
{
|
||||
add (curr_dfa_, string (regex_start_, regex_end_), new_dfa_);
|
||||
}
|
||||
|
||||
void add (const char_type *curr_dfa_, const string ®ex_,
|
||||
const char_type *new_dfa_)
|
||||
{
|
||||
add (curr_dfa_, regex_, _eoi, new_dfa_, false);
|
||||
}
|
||||
|
||||
// Add rule with id
|
||||
void add (const char_type *curr_dfa_,
|
||||
const char_type *regex_, const id_type id_,
|
||||
const char_type *new_dfa_, const id_type user_id_ = npos ())
|
||||
{
|
||||
add (curr_dfa_, string (regex_), id_, new_dfa_, user_id_);
|
||||
}
|
||||
|
||||
void add (const char_type *curr_dfa_, const char_type *regex_start_,
|
||||
const char_type *regex_end_, const id_type id_,
|
||||
const char_type *new_dfa_, const id_type user_id_ = npos ())
|
||||
{
|
||||
add (curr_dfa_, string (regex_start_, regex_end_),
|
||||
id_, new_dfa_, user_id_);
|
||||
}
|
||||
|
||||
void add (const char_type *curr_dfa_, const string ®ex_,
|
||||
const id_type id_, const char_type *new_dfa_,
|
||||
const id_type user_id_ = npos ())
|
||||
{
|
||||
add (curr_dfa_, regex_, id_, new_dfa_, true, user_id_);
|
||||
}
|
||||
|
||||
// Read-only access to the state name -> state id map.
const string_id_type_map &statemap () const
{
    return _statemap;
}
|
||||
|
||||
// Read-only access to the (name, regex) macro pairs.
const string_pair_deque &macrodeque () const
{
    return _macrodeque;
}
|
||||
|
||||
// Read-only access to the per-state regex lists.
const string_deque_deque &regexes () const
{
    return _regexes;
}
|
||||
|
||||
// Read-only access to the per-state feature bits.
const id_vector &features () const
{
    return _features;
}
|
||||
|
||||
// Read-only access to the per-state rule ids.
const id_vector_deque &ids () const
{
    return _ids;
}
|
||||
|
||||
// Read-only access to the per-state rule user ids.
const id_vector_deque &user_ids () const
{
    return _user_ids;
}
|
||||
|
||||
// Read-only access to the per-state "next state" entries of each rule.
const id_vector_deque &next_dfas () const
{
    return _next_dfas;
}
|
||||
|
||||
// Read-only access to the per-state "state to push" entries of each rule.
const id_vector_deque &pushes () const
{
    return _pushes;
}
|
||||
|
||||
// Read-only access to the per-state "rule pops a state" flags.
const bool_vector_deque &pops () const
{
    return _pops;
}
|
||||
|
||||
bool empty () const
|
||||
{
|
||||
typename string_deque_deque::const_iterator iter_ = _regexes.begin ();
|
||||
typename string_deque_deque::const_iterator end_ = _regexes.end ();
|
||||
bool empty_ = true;
|
||||
|
||||
for (; iter_ != end_; ++iter_)
|
||||
{
|
||||
if (!iter_->empty ())
|
||||
{
|
||||
empty_ = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return empty_;
|
||||
}
|
||||
|
||||
// The name "INITIAL" as a null-terminated char_type string.
static const char_type *initial ()
{
    // Spelled out per character so it works for any char_type.
    static const char_type name_[] =
    {
        'I', 'N', 'I', 'T', 'I', 'A', 'L', 0
    };

    return name_;
}
|
||||
|
||||
// The string "." as a null-terminated char_type string.
static const char_type *dot ()
{
    static const char_type text_[] =
    {
        '.', 0
    };

    return text_;
}
|
||||
|
||||
// The string "*" as a null-terminated char_type string.
static const char_type *all_states ()
{
    static const char_type text_[] =
    {
        '*', 0
    };

    return text_;
}
|
||||
|
||||
// All bits set, converted to id_type.
static id_type npos ()
{
    const id_type all_bits_ = static_cast<id_type>(~0);

    return all_bits_;
}
|
||||
|
||||
private:
|
||||
// State name -> state id.
string_id_type_map _statemap;
|
||||
// Macro (name, regex) pairs in the order they were added.
string_pair_deque _macrodeque;
|
||||
// Names of all defined macros; used to detect redefinition.
string_set _macroset;
|
||||
// The containers below are parallel and indexed by state id.
// One list of regexes per state.
string_deque_deque _regexes;
|
||||
// Feature bits (bol_bit, eol_bit, skip_bit, ...) per state.
id_vector _features;
|
||||
// Id of each rule, per state.
id_vector_deque _ids;
|
||||
// User id of each rule, per state.
id_vector_deque _user_ids;
|
||||
// State each rule switches to (npos is stored for a pop), per state.
id_vector_deque _next_dfas;
|
||||
// State each rule pushes, or npos when it pushes nothing, per state.
id_vector_deque _pushes;
|
||||
// Whether each rule pops a state, per state.
bool_vector_deque _pops;
|
||||
// NOTE(review): not used in this part of the file - presumably the
// regex flags; confirm against the constructor.
std::size_t _flags;
|
||||
// NOTE(review): not used in this part of the file - presumably the
// locale applied when processing regexes; confirm.
std::locale _locale;
|
||||
// Names of every state except INITIAL, in creation order.
string_deque _lexer_state_names;
|
||||
// Id reserved for end of input; rules may not reuse it.
id_type _eoi;
|
||||
|
||||
void add (const char_type *curr_dfa_, const string ®ex_,
|
||||
const id_type id_, const char_type *new_dfa_,
|
||||
const bool check_, const id_type user_id_ = npos ())
|
||||
{
|
||||
const bool star_ = *curr_dfa_ == '*' && *(curr_dfa_ + 1) == 0;
|
||||
const bool dot_ = *new_dfa_ == '.' && *(new_dfa_ + 1) == 0;
|
||||
const bool push_ = *new_dfa_ == '>';
|
||||
const char_type *push_dfa_ = 0;
|
||||
const bool pop_ = *new_dfa_ == '<';
|
||||
|
||||
if (push_ || pop_)
|
||||
{
|
||||
++new_dfa_;
|
||||
}
|
||||
|
||||
if (check_)
|
||||
{
|
||||
check_for_invalid_id (id_);
|
||||
}
|
||||
|
||||
if (!dot_ && !pop_)
|
||||
{
|
||||
const char_type *temp_ = new_dfa_;
|
||||
|
||||
while (*temp_ && *temp_ != ':')
|
||||
{
|
||||
++temp_;
|
||||
}
|
||||
|
||||
if (*temp_) push_dfa_ = temp_ + 1;
|
||||
|
||||
validate (new_dfa_, *temp_ ? temp_ : 0);
|
||||
|
||||
if (push_dfa_)
|
||||
{
|
||||
validate (push_dfa_);
|
||||
}
|
||||
}
|
||||
|
||||
// npos means pop here
|
||||
id_type new_dfa_id_ = npos ();
|
||||
id_type push_dfa_id_ = npos ();
|
||||
typename string_id_type_map::const_iterator iter_;
|
||||
typename string_id_type_map::const_iterator end_ = _statemap.end ();
|
||||
id_vector next_dfas_;
|
||||
|
||||
if (!dot_ && !pop_)
|
||||
{
|
||||
if (push_dfa_)
|
||||
{
|
||||
iter_ = _statemap.find (string (new_dfa_, push_dfa_ - 1));
|
||||
}
|
||||
else
|
||||
{
|
||||
iter_ = _statemap.find (new_dfa_);
|
||||
}
|
||||
|
||||
if (iter_ == end_)
|
||||
{
|
||||
std::basic_stringstream<char_type> ss_;
|
||||
std::ostringstream os_;
|
||||
|
||||
os_ << "Unknown state name '";
|
||||
|
||||
while (*new_dfa_)
|
||||
{
|
||||
os_ << ss_.narrow (*new_dfa_++, ' ');
|
||||
}
|
||||
|
||||
os_ << "'.";
|
||||
throw runtime_error (os_.str ());
|
||||
}
|
||||
|
||||
new_dfa_id_ = iter_->second;
|
||||
|
||||
if (push_dfa_)
|
||||
{
|
||||
iter_ = _statemap.find (push_dfa_);
|
||||
|
||||
if (iter_ == end_)
|
||||
{
|
||||
std::basic_stringstream<char_type> ss_;
|
||||
std::ostringstream os_;
|
||||
|
||||
os_ << "Unknown state name '";
|
||||
|
||||
while (*push_dfa_)
|
||||
{
|
||||
os_ << ss_.narrow (*push_dfa_++, ' ');
|
||||
}
|
||||
|
||||
os_ << "'.";
|
||||
throw runtime_error (os_.str ());
|
||||
}
|
||||
|
||||
push_dfa_id_ = iter_->second;
|
||||
}
|
||||
}
|
||||
|
||||
if (star_)
|
||||
{
|
||||
const std::size_t size_ = _statemap.size ();
|
||||
|
||||
for (id_type i_ = 0; i_ < size_; ++i_)
|
||||
{
|
||||
next_dfas_.push_back (i_);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const char_type *start_ = curr_dfa_;
|
||||
string next_dfa_;
|
||||
|
||||
while (*curr_dfa_)
|
||||
{
|
||||
while (*curr_dfa_ && *curr_dfa_ != ',')
|
||||
{
|
||||
++curr_dfa_;
|
||||
}
|
||||
|
||||
next_dfa_.assign (start_, curr_dfa_);
|
||||
|
||||
if (*curr_dfa_)
|
||||
{
|
||||
++curr_dfa_;
|
||||
start_ = curr_dfa_;
|
||||
}
|
||||
|
||||
validate (next_dfa_.c_str ());
|
||||
iter_ = _statemap.find (next_dfa_.c_str ());
|
||||
|
||||
if (iter_ == end_)
|
||||
{
|
||||
std::basic_stringstream<char_type> ss_;
|
||||
std::ostringstream os_;
|
||||
|
||||
os_ << "Unknown state name '";
|
||||
curr_dfa_ = next_dfa_.c_str ();
|
||||
|
||||
while (*curr_dfa_)
|
||||
{
|
||||
os_ << ss_.narrow (*curr_dfa_++, ' ');
|
||||
}
|
||||
|
||||
os_ << "'.";
|
||||
throw runtime_error (os_.str ());
|
||||
}
|
||||
|
||||
next_dfas_.push_back (iter_->second);
|
||||
}
|
||||
}
|
||||
|
||||
for (std::size_t i_ = 0, size_ = next_dfas_.size ();
|
||||
i_ < size_; ++i_)
|
||||
{
|
||||
const id_type curr_ = next_dfas_[i_];
|
||||
|
||||
_regexes[curr_].push_back (regex_);
|
||||
|
||||
if (regex_[0] == '^')
|
||||
{
|
||||
_features[curr_] |= bol_bit;
|
||||
}
|
||||
|
||||
if (regex_[regex_.size () - 1] == '$')
|
||||
{
|
||||
_features[curr_] |= eol_bit;
|
||||
}
|
||||
|
||||
if (id_ == skip ())
|
||||
{
|
||||
_features[curr_] |= skip_bit;
|
||||
}
|
||||
else if (id_ == eoi ())
|
||||
{
|
||||
_features[curr_] |= again_bit;
|
||||
}
|
||||
|
||||
if (push_ || pop_)
|
||||
{
|
||||
_features[curr_] |= recursive_bit;
|
||||
}
|
||||
|
||||
_ids[curr_].push_back (id_);
|
||||
_user_ids[curr_].push_back (user_id_);
|
||||
_next_dfas[curr_].push_back (dot_ ? curr_ : new_dfa_id_);
|
||||
_pushes[curr_].push_back (push_ ? (push_dfa_ ?
|
||||
push_dfa_id_ : curr_) : npos ());
|
||||
_pops[curr_].push_back (pop_);
|
||||
}
|
||||
}
|
||||
|
||||
void validate (const char_type *name_, const char_type *end_ = 0) const
|
||||
{
|
||||
const char_type *start_ = name_;
|
||||
|
||||
if (*name_ != '_' && !(*name_ >= 'A' && *name_ <= 'Z') &&
|
||||
!(*name_ >= 'a' && *name_ <= 'z'))
|
||||
{
|
||||
std::basic_stringstream<char_type> ss_;
|
||||
std::ostringstream os_;
|
||||
|
||||
os_ << "Invalid name '";
|
||||
|
||||
while (*name_)
|
||||
{
|
||||
os_ << ss_.narrow (*name_++, ' ');
|
||||
}
|
||||
|
||||
os_ << "'.";
|
||||
throw runtime_error (os_.str ());
|
||||
}
|
||||
else if (*name_)
|
||||
{
|
||||
++name_;
|
||||
}
|
||||
|
||||
while (*name_ && name_ != end_)
|
||||
{
|
||||
if (*name_ != '_' && *name_ != '-' &&
|
||||
!(*name_ >= 'A' && *name_ <= 'Z') &&
|
||||
!(*name_ >= 'a' && *name_ <= 'z') &&
|
||||
!(*name_ >= '0' && *name_ <= '9'))
|
||||
{
|
||||
std::basic_stringstream<char_type> ss_;
|
||||
std::ostringstream os_;
|
||||
|
||||
os_ << "Invalid name '";
|
||||
name_ = start_;
|
||||
|
||||
while (*name_)
|
||||
{
|
||||
os_ << ss_.narrow (*name_++, ' ');
|
||||
}
|
||||
|
||||
os_ << "'.";
|
||||
throw runtime_error (os_.str ());
|
||||
}
|
||||
|
||||
++name_;
|
||||
}
|
||||
}
|
||||
|
||||
void check_for_invalid_id (const id_type id_) const
|
||||
{
|
||||
if (id_ == _eoi)
|
||||
{
|
||||
throw runtime_error ("Cannot resuse the id for eoi.");
|
||||
}
|
||||
|
||||
if (id_ == npos ())
|
||||
{
|
||||
throw runtime_error ("id npos is reserved for the "
|
||||
"UNKNOWN token.");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Convenience names for rules over char and wchar_t input.
typedef basic_rules<char> rules;
|
||||
typedef basic_rules<wchar_t> wrules;
|
||||
}
|
||||
|
||||
#endif
|
||||
23
inc/lexertl/runtime_error.hpp
Normal file
23
inc/lexertl/runtime_error.hpp
Normal file
@@ -0,0 +1,23 @@
|
||||
// runtime_error.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_RUNTIME_ERROR_HPP
|
||||
#define LEXERTL_RUNTIME_ERROR_HPP
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// Exception type thrown by lexertl; a std::runtime_error that simply
// carries the supplied message.
class runtime_error : public std::runtime_error
{
public:
    // what_arg_ becomes the text returned by what ().
    runtime_error (const std::string &what_arg_)
        : std::runtime_error (what_arg_)
    {
    }
};
|
||||
}
|
||||
|
||||
#endif
|
||||
28
inc/lexertl/serialise.hpp
Normal file
28
inc/lexertl/serialise.hpp
Normal file
@@ -0,0 +1,28 @@
|
||||
// serialise.hpp
|
||||
// Copyright (c) 2007-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_SERIALISE_HPP
|
||||
#define LEXERTL_SERIALISE_HPP
|
||||
|
||||
#include "state_machine.hpp"
|
||||
#include <boost/serialization/vector.hpp>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// IMPORTANT! This won't work if you don't enable RTTI!
|
||||
template<typename CharT, typename id_type, class Archive>
|
||||
void serialise (basic_state_machine<CharT, id_type> &sm_, Archive &ar_)
|
||||
{
|
||||
detail::basic_internals<id_type> &internals_ = sm_.data ();
|
||||
|
||||
ar_ & internals_._eoi;
|
||||
ar_ & *internals_._lookup;
|
||||
ar_ & internals_._dfa_alphabet;
|
||||
ar_ & internals_._features;
|
||||
ar_ & *internals_._dfa;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
12
inc/lexertl/size_t.hpp
Normal file
12
inc/lexertl/size_t.hpp
Normal file
@@ -0,0 +1,12 @@
|
||||
// size_t.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_SIZE_T_H
|
||||
#define LEXERTL_SIZE_T_H
|
||||
|
||||
#include <stddef.h> // ptrdiff_t
|
||||
#include <cstring>
|
||||
|
||||
#endif
|
||||
44
inc/lexertl/sm_traits.hpp
Normal file
44
inc/lexertl/sm_traits.hpp
Normal file
@@ -0,0 +1,44 @@
|
||||
// sm_traits.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_SM_TRAITS_H
|
||||
#define LEXERTL_SM_TRAITS_H
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// Compile-time description of a state machine flavour.
//   ch_type - character type of the input
//   sm_type - unsigned type used for ids
//   comp    - value of the 'compressed' constant
//   look    - value of the 'lookup' constant
//   dfa_nfa - value of the 'is_dfa' constant
template<typename ch_type, typename sm_type, bool comp, bool look,
    bool dfa_nfa>
struct basic_sm_traits
{
    typedef ch_type input_char_type;
    typedef ch_type char_type;
    typedef sm_type id_type;

    enum
    {
        // True for character types wider than two bytes.
        char_24_bit = sizeof(ch_type) > 2,
        compressed = comp,
        lookup = look,
        is_dfa = dfa_nfa
    };

    // All bits set, converted to id_type.
    static id_type npos ()
    {
        return static_cast<id_type>(~0);
    }
};

// Specialisation for comp == true: identical, except that char_type
// is unsigned char while input_char_type remains ch_type.
template<typename ch_type, typename sm_type, bool look, bool dfa_nfa>
struct basic_sm_traits<ch_type, sm_type, true, look, dfa_nfa>
{
    typedef ch_type input_char_type;
    typedef unsigned char char_type;
    typedef sm_type id_type;

    enum
    {
        // True for character types wider than two bytes.
        char_24_bit = sizeof(ch_type) > 2,
        compressed = true,
        lookup = look,
        is_dfa = dfa_nfa
    };

    // All bits set, converted to id_type.
    static id_type npos ()
    {
        return static_cast<id_type>(~0);
    }
};
|
||||
}
|
||||
|
||||
#endif
|
||||
525
inc/lexertl/state_machine.hpp
Normal file
525
inc/lexertl/state_machine.hpp
Normal file
@@ -0,0 +1,525 @@
|
||||
// state_machine.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_STATE_MACHINE_HPP
|
||||
#define LEXERTL_STATE_MACHINE_HPP
|
||||
|
||||
#include "compile_assert.hpp"
|
||||
// memcmp()
|
||||
#include <cstring>
|
||||
#include <deque>
|
||||
#include "internals.hpp"
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include "sm_traits.hpp"
|
||||
#include "string_token.hpp"
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename char_type, typename id_type = std::size_t>
|
||||
class basic_state_machine
|
||||
{
|
||||
public:
|
||||
typedef basic_sm_traits<char_type, id_type,
|
||||
(sizeof (char_type) > 1), true, true> traits;
|
||||
typedef detail::basic_internals<id_type> internals;
|
||||
|
||||
// If you get a compile error here you have
|
||||
// failed to define an unsigned id type.
|
||||
compile_assert<(static_cast<id_type>(~0) > 0)>
|
||||
_valid_id_type;
|
||||
|
||||
basic_state_machine () :
|
||||
_valid_id_type (),
|
||||
_internals ()
|
||||
{
|
||||
}
|
||||
|
||||
void clear ()
|
||||
{
|
||||
_internals.clear ();
|
||||
}
|
||||
|
||||
internals &data ()
|
||||
{
|
||||
return _internals;
|
||||
}
|
||||
|
||||
const internals &data () const
|
||||
{
|
||||
return _internals;
|
||||
}
|
||||
|
||||
bool empty () const
|
||||
{
|
||||
return _internals.empty ();
|
||||
}
|
||||
|
||||
id_type eoi () const
|
||||
{
|
||||
return _internals._eoi;
|
||||
}
|
||||
|
||||
void minimise ()
|
||||
{
|
||||
const id_type dfas_ = static_cast<id_type>(_internals.
|
||||
_dfa->size ());
|
||||
|
||||
for (id_type i_ = 0; i_ < dfas_; ++i_)
|
||||
{
|
||||
const id_type dfa_alphabet_ = _internals._dfa_alphabet[i_];
|
||||
id_type_vector *dfa_ = _internals._dfa[i_];
|
||||
|
||||
if (dfa_alphabet_ != 0)
|
||||
{
|
||||
std::size_t size_ = 0;
|
||||
|
||||
do
|
||||
{
|
||||
size_ = dfa_->size ();
|
||||
minimise_dfa (dfa_alphabet_, *dfa_, size_);
|
||||
} while (dfa_->size () != size_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static id_type npos ()
|
||||
{
|
||||
return static_cast<id_type>(~0);
|
||||
}
|
||||
|
||||
static id_type skip ()
|
||||
{
|
||||
return static_cast<id_type>(~1);
|
||||
}
|
||||
|
||||
void swap (basic_state_machine &rhs_)
|
||||
{
|
||||
_internals.swap (rhs_._internals);
|
||||
}
|
||||
|
||||
private:
|
||||
typedef typename internals::id_type_vector id_type_vector;
|
||||
typedef std::set<id_type> index_set;
|
||||
internals _internals;
|
||||
|
||||
void minimise_dfa (const id_type dfa_alphabet_,
|
||||
id_type_vector &dfa_, std::size_t size_)
|
||||
{
|
||||
const id_type *first_ = &dfa_.front ();
|
||||
const id_type *end_ = first_ + size_;
|
||||
id_type index_ = 1;
|
||||
id_type new_index_ = 1;
|
||||
id_type_vector lookup_ (size_ / dfa_alphabet_, npos ());
|
||||
id_type *lookup_ptr_ = &lookup_.front ();
|
||||
index_set index_set_;
|
||||
const id_type bol_index_ = dfa_.front ();
|
||||
|
||||
*lookup_ptr_ = 0;
|
||||
// Only one 'jam' state, so skip it.
|
||||
first_ += dfa_alphabet_;
|
||||
|
||||
for (; first_ < end_; first_ += dfa_alphabet_, ++index_)
|
||||
{
|
||||
const id_type *second_ = first_ + dfa_alphabet_;
|
||||
|
||||
for (id_type curr_index_ = index_ + 1; second_ < end_;
|
||||
++curr_index_, second_ += dfa_alphabet_)
|
||||
{
|
||||
if (index_set_.find (curr_index_) != index_set_.end ())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// Some systems have memcmp in namespace std.
|
||||
using namespace std;
|
||||
|
||||
if (memcmp (first_, second_, sizeof (id_type) *
|
||||
dfa_alphabet_) == 0)
|
||||
{
|
||||
index_set_.insert (curr_index_);
|
||||
lookup_ptr_[curr_index_] = new_index_;
|
||||
}
|
||||
}
|
||||
|
||||
if (lookup_ptr_[index_] == npos ())
|
||||
{
|
||||
lookup_ptr_[index_] = new_index_;
|
||||
++new_index_;
|
||||
}
|
||||
}
|
||||
|
||||
if (!index_set_.empty ())
|
||||
{
|
||||
const id_type *front_ = &dfa_.front ();
|
||||
id_type_vector new_dfa_ (front_, front_ + dfa_alphabet_);
|
||||
typename index_set::const_iterator set_end_ = index_set_.end ();
|
||||
const id_type *ptr_ = front_ + dfa_alphabet_;
|
||||
id_type *new_ptr_ = 0;
|
||||
|
||||
new_dfa_.resize (size_ - index_set_.size () * dfa_alphabet_, 0);
|
||||
new_ptr_ = &new_dfa_.front () + dfa_alphabet_;
|
||||
size_ /= dfa_alphabet_;
|
||||
|
||||
if (bol_index_)
|
||||
{
|
||||
new_dfa_.front () = lookup_ptr_[bol_index_];
|
||||
}
|
||||
|
||||
for (index_ = 1; index_ < size_; ++index_)
|
||||
{
|
||||
if (index_set_.find (index_) != set_end_)
|
||||
{
|
||||
ptr_ += dfa_alphabet_;
|
||||
continue;
|
||||
}
|
||||
|
||||
new_ptr_[end_state_index] = ptr_[end_state_index];
|
||||
new_ptr_[id_index] = ptr_[id_index];
|
||||
new_ptr_[user_id_index] = ptr_[user_id_index];
|
||||
new_ptr_[push_dfa_index] = ptr_[push_dfa_index];
|
||||
new_ptr_[next_dfa_index] = ptr_[next_dfa_index];
|
||||
new_ptr_[eol_index] = lookup_ptr_[ptr_[eol_index]];
|
||||
new_ptr_ += transitions_index;
|
||||
ptr_ += transitions_index;
|
||||
|
||||
for (id_type i_ = transitions_index; i_ < dfa_alphabet_; ++i_)
|
||||
{
|
||||
*new_ptr_++ = lookup_ptr_[*ptr_++];
|
||||
}
|
||||
}
|
||||
|
||||
dfa_.swap (new_dfa_);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Convenience names for state machines over char and wchar_t input.
typedef basic_state_machine<char> state_machine;
|
||||
typedef basic_state_machine<wchar_t> wstate_machine;
|
||||
|
||||
template<typename char_type, typename id_type = std::size_t,
|
||||
bool is_dfa = true>
|
||||
struct basic_char_state_machine
|
||||
{
|
||||
typedef basic_sm_traits<char_type, id_type, false, false, is_dfa> traits;
|
||||
typedef detail::basic_internals<id_type> internals;
|
||||
typedef typename internals::id_type_vector id_type_vector;
|
||||
|
||||
struct state
|
||||
{
|
||||
typedef basic_string_token<char_type> string_token;
|
||||
typedef std::map<id_type, string_token> id_type_string_token_map;
|
||||
typedef std::pair<id_type, string_token> id_type_string_token_pair;
|
||||
enum push_pop_dfa {neither, push_dfa, pop_dfa};
|
||||
|
||||
bool _end_state;
|
||||
push_pop_dfa _push_pop_dfa;
|
||||
id_type _id;
|
||||
id_type _user_id;
|
||||
id_type _push_dfa;
|
||||
id_type _next_dfa;
|
||||
id_type _eol_index;
|
||||
id_type_string_token_map _transitions;
|
||||
|
||||
state () :
|
||||
_end_state (false),
|
||||
_push_pop_dfa (neither),
|
||||
_id (0),
|
||||
_user_id (traits::npos ()),
|
||||
_push_dfa (traits::npos ()),
|
||||
_next_dfa (0),
|
||||
_eol_index (traits::npos ()),
|
||||
_transitions ()
|
||||
{
|
||||
}
|
||||
|
||||
bool operator == (const state rhs_) const
|
||||
{
|
||||
return _end_state == rhs_._end_state &&
|
||||
_push_pop_dfa == rhs_._push_pop_dfa &&
|
||||
_id == rhs_._id &&
|
||||
_user_id == rhs_._user_id &&
|
||||
_push_dfa == rhs_._push_dfa &&
|
||||
_next_dfa == rhs_._next_dfa &&
|
||||
_eol_index == rhs_._eol_index &&
|
||||
_transitions == rhs_._transitions;
|
||||
}
|
||||
};
|
||||
|
||||
typedef typename state::string_token string_token;
|
||||
typedef std::vector<state> state_vector;
|
||||
typedef std::vector<string_token> string_token_vector;
|
||||
typedef typename state::id_type_string_token_pair
|
||||
id_type_string_token_pair;
|
||||
|
||||
struct dfa
|
||||
{
|
||||
id_type _bol_index;
|
||||
state_vector _states;
|
||||
|
||||
dfa (const std::size_t size_) :
|
||||
_bol_index (traits::npos ()),
|
||||
_states (state_vector (size_))
|
||||
{
|
||||
}
|
||||
|
||||
std::size_t size () const
|
||||
{
|
||||
return _states.size ();
|
||||
}
|
||||
|
||||
void swap (dfa &rhs_)
|
||||
{
|
||||
std::swap (_bol_index, rhs_._bol_index);
|
||||
_states.swap (rhs_._states);
|
||||
}
|
||||
};
|
||||
|
||||
typedef std::deque<dfa> dfa_deque;
|
||||
|
||||
dfa_deque _sm_deque;
|
||||
|
||||
// If you get a compile error here you have
|
||||
// failed to define an unsigned id type.
|
||||
compile_assert<(static_cast<id_type>(~0) > 0)>
|
||||
_valid_id_type;
|
||||
|
||||
basic_char_state_machine () :
|
||||
_sm_deque (),
|
||||
_valid_id_type ()
|
||||
{
|
||||
}
|
||||
|
||||
void append (const string_token_vector &token_vector_,
|
||||
const internals &internals_, const id_type dfa_index_)
|
||||
{
|
||||
const std::size_t dfa_alphabet_ = internals_._dfa_alphabet[dfa_index_];
|
||||
const std::size_t alphabet_ = dfa_alphabet_ - transitions_index;
|
||||
const id_type_vector &source_dfa_ = *internals_._dfa[dfa_index_];
|
||||
const id_type *ptr_ = &source_dfa_.front ();
|
||||
const std::size_t size_ = (source_dfa_.size () - dfa_alphabet_) /
|
||||
dfa_alphabet_;
|
||||
typename state::id_type_string_token_map::iterator trans_iter_;
|
||||
|
||||
_sm_deque.push_back (dfa (size_));
|
||||
|
||||
dfa &dest_dfa_ = _sm_deque.back ();
|
||||
|
||||
if (*ptr_)
|
||||
{
|
||||
dest_dfa_._bol_index = *ptr_ - 1;
|
||||
}
|
||||
|
||||
ptr_ += dfa_alphabet_;
|
||||
|
||||
for (id_type i_ = 0; i_ < size_; ++i_)
|
||||
{
|
||||
state &state_ = dest_dfa_._states[i_];
|
||||
|
||||
state_._end_state = ptr_[end_state_index] != 0;
|
||||
|
||||
if (ptr_[push_dfa_index] != npos ())
|
||||
{
|
||||
state_._push_pop_dfa = state::push_dfa;
|
||||
}
|
||||
else if (ptr_[end_state_index] & pop_dfa_bit)
|
||||
{
|
||||
state_._push_pop_dfa = state::pop_dfa;
|
||||
}
|
||||
|
||||
state_._id = ptr_[id_index];
|
||||
state_._user_id = ptr_[user_id_index];
|
||||
state_._push_dfa = ptr_[push_dfa_index];
|
||||
state_._next_dfa = ptr_[next_dfa_index];
|
||||
|
||||
if (ptr_[eol_index])
|
||||
{
|
||||
state_._eol_index = ptr_[eol_index] - 1;
|
||||
}
|
||||
|
||||
ptr_ += transitions_index;
|
||||
|
||||
for (id_type col_index_ = 0; col_index_ < alphabet_;
|
||||
++col_index_, ++ptr_)
|
||||
{
|
||||
const id_type next_ = *ptr_;
|
||||
|
||||
if (next_ > 0)
|
||||
{
|
||||
trans_iter_ = state_._transitions.find (next_ - 1);
|
||||
|
||||
if (trans_iter_ == state_._transitions.end ())
|
||||
{
|
||||
trans_iter_ = state_._transitions.insert
|
||||
(id_type_string_token_pair (next_ - 1,
|
||||
token_vector_[col_index_])).first;
|
||||
}
|
||||
else
|
||||
{
|
||||
trans_iter_->second.insert (token_vector_[col_index_]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void clear ()
|
||||
{
|
||||
_sm_deque.clear ();
|
||||
}
|
||||
|
||||
bool empty () const
|
||||
{
|
||||
return _sm_deque.empty ();
|
||||
}
|
||||
|
||||
void minimise ()
|
||||
{
|
||||
const id_type dfas_ = static_cast<id_type>(_sm_deque.size ());
|
||||
|
||||
for (id_type i_ = 0; i_ < dfas_; ++i_)
|
||||
{
|
||||
dfa *dfa_ = &_sm_deque[i_];
|
||||
|
||||
if (dfa_->size () > 0)
|
||||
{
|
||||
std::size_t size_ = 0;
|
||||
|
||||
do
|
||||
{
|
||||
size_ = dfa_->size ();
|
||||
minimise_dfa (*dfa_, size_);
|
||||
} while (dfa_->size () != size_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static id_type npos ()
|
||||
{
|
||||
return traits::npos ();
|
||||
}
|
||||
|
||||
id_type size () const
|
||||
{
|
||||
return static_cast<id_type>(_sm_deque.size ());
|
||||
}
|
||||
|
||||
static id_type skip ()
|
||||
{
|
||||
return static_cast<id_type>(~1);
|
||||
}
|
||||
|
||||
void swap (basic_char_state_machine &csm_)
|
||||
{
|
||||
_sm_deque.swap (csm_._sm_deque);
|
||||
}
|
||||
|
||||
private:
|
||||
typedef std::set<id_type> index_set;
|
||||
|
||||
void minimise_dfa (dfa &dfa_, std::size_t size_)
|
||||
{
|
||||
const state *first_ = &dfa_._states.front ();
|
||||
const state *end_ = first_ + size_;
|
||||
id_type index_ = 0;
|
||||
id_type new_index_ = 0;
|
||||
id_type_vector lookup_ (size_, npos ());
|
||||
id_type *lookup_ptr_ = &lookup_.front ();
|
||||
index_set index_set_;
|
||||
|
||||
for (; first_ != end_; ++first_, ++index_)
|
||||
{
|
||||
const state *second_ = first_ + 1;
|
||||
|
||||
for (id_type curr_index_ = index_ + 1; second_ != end_;
|
||||
++curr_index_, ++second_)
|
||||
{
|
||||
if (index_set_.find (curr_index_) != index_set_.end ())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (*first_ == *second_)
|
||||
{
|
||||
index_set_.insert (curr_index_);
|
||||
lookup_ptr_[curr_index_] = new_index_;
|
||||
}
|
||||
}
|
||||
|
||||
if (lookup_ptr_[index_] == npos ())
|
||||
{
|
||||
lookup_ptr_[index_] = new_index_;
|
||||
++new_index_;
|
||||
}
|
||||
}
|
||||
|
||||
if (!index_set_.empty ())
|
||||
{
|
||||
const state *front_ = &dfa_._states.front ();
|
||||
dfa new_dfa_ (new_index_);
|
||||
typename index_set::const_iterator set_end_ = index_set_.end ();
|
||||
const state *ptr_ = front_;
|
||||
state *new_ptr_ = &new_dfa_._states.front ();
|
||||
|
||||
if (dfa_._bol_index != npos ())
|
||||
{
|
||||
new_dfa_._bol_index = lookup_ptr_[dfa_._bol_index];
|
||||
}
|
||||
|
||||
for (index_ = 0; index_ < size_; ++index_)
|
||||
{
|
||||
if (index_set_.find (index_) != set_end_)
|
||||
{
|
||||
++ptr_;
|
||||
continue;
|
||||
}
|
||||
|
||||
new_ptr_->_end_state = ptr_->_end_state;
|
||||
new_ptr_->_id = ptr_->_end_state;
|
||||
new_ptr_->_user_id = ptr_->_user_id;
|
||||
new_ptr_->_next_dfa = ptr_->_next_dfa;
|
||||
|
||||
if (ptr_->_eol_index != npos ())
|
||||
{
|
||||
new_ptr_->_eol_index = lookup_ptr_[ptr_->_eol_index];
|
||||
}
|
||||
|
||||
typename state::id_type_string_token_map::const_iterator
|
||||
iter_ = ptr_->_transitions.begin ();
|
||||
typename state::id_type_string_token_map::const_iterator end_ =
|
||||
ptr_->_transitions.end ();
|
||||
typename state::id_type_string_token_map::iterator find_;
|
||||
|
||||
for (; iter_ != end_; ++iter_)
|
||||
{
|
||||
find_ = new_ptr_->_transitions.find
|
||||
(lookup_ptr_[iter_->first]);
|
||||
|
||||
if (find_ == new_ptr_->_transitions.end ())
|
||||
{
|
||||
new_ptr_->_transitions.insert
|
||||
(id_type_string_token_pair
|
||||
(lookup_ptr_[iter_->first], iter_->second));
|
||||
}
|
||||
else
|
||||
{
|
||||
find_->second.insert (iter_->second);
|
||||
}
|
||||
}
|
||||
|
||||
++ptr_;
|
||||
++new_ptr_;
|
||||
}
|
||||
|
||||
dfa_.swap (new_dfa_);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Convenience names for char state machines over char and wchar_t input.
typedef basic_char_state_machine<char> char_state_machine;
|
||||
typedef basic_char_state_machine<wchar_t> wchar_state_machine;
|
||||
}
|
||||
|
||||
#endif
|
||||
350
inc/lexertl/stream_shared_iterator.hpp
Normal file
350
inc/lexertl/stream_shared_iterator.hpp
Normal file
@@ -0,0 +1,350 @@
|
||||
// stream_shared_iterator.hpp
|
||||
// Copyright (c) 2010-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef LEXERTL_STREAM_SHARED_ITERATOR_H
|
||||
#define LEXERTL_STREAM_SHARED_ITERATOR_H
|
||||
|
||||
#include <algorithm>
|
||||
// memcpy
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <math.h>
|
||||
#include "runtime_error.hpp"
|
||||
#include "size_t.hpp"
|
||||
#include <vector>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// Forward iterator over a std::basic_istream.
//
// All iterators created from the same "master" share one buffer (class
// shared). Each live iterator is registered in the shared client list so
// that, when the buffer is exhausted, everything below the lowest live
// index can be discarded and the remainder shuffled to the front before
// more characters are read.
//
// Usage as visible from this class:
//  - Construct a master from a stream, then COPY it to obtain a working
//    iterator (a copy starts at the lowest live index and is live).
//  - A default constructed iterator acts as the end iterator.
//  - Dereferencing or incrementing an end (null) iterator throws
//    runtime_error.
template<typename char_type>
class basic_stream_shared_iterator
{
public:
    typedef std::basic_istream<char_type> istream;
    typedef std::forward_iterator_tag iterator_category;
    typedef std::size_t difference_type;
    typedef char_type value_type;
    typedef char_type *pointer;
    typedef char_type &reference;

    // Creates an end iterator (no shared state, index == npos).
    basic_stream_shared_iterator () :
        _master (false),
        _live (false),
        _index (shared::npos ()),
        _shared (0),
        _iter ()
    {
    }

    // Creates the master iterator for stream_.
    // buff_size_ is the initial buffer size in characters and increment_
    // the number of characters the buffer grows by when it cannot be
    // compacted. The first buff_size_ characters are read immediately.
    basic_stream_shared_iterator (istream &stream_,
        const std::size_t buff_size_ = 1024,
        const std::size_t increment_ = 1024) :
        _master (true),
        _live (false),
        _index (shared::npos ()),
        // For exception safety don't call new yet
        _shared (0),
        _iter ()
    {
        // Safe to call potentially throwing new now.
        _shared = new shared (stream_, buff_size_, increment_);

        try
        {
            _iter = _shared->_clients.insert (_shared->_clients.end (), this);
        }
        catch (...)
        {
            // The destructor will not run for a half constructed object,
            // so release the shared state here.
            delete _shared;
            _shared = 0;
            throw;
        }

        ++_shared->_ref_count;
    }

    // Copies rhs_. A copy of the master starts at the lowest live index;
    // any other copy starts at rhs_'s index.
    basic_stream_shared_iterator (const basic_stream_shared_iterator &rhs_) :
        _master (false),
        _live (false),
        _index (rhs_._master ? rhs_._shared->lowest () : rhs_._index),
        _shared (rhs_._shared),
        _iter ()
    {
        if (_shared)
        {
            // New copy of an iterator.
            // The assumption is that any copy must be live
            // even if the rhs is not (otherwise we will never
            // have a record of the start of the current range!)
            ++_shared->_ref_count;
            _iter = _shared->_clients.insert (_shared->_clients.end (), this);
            _live = true;
        }
    }

    // Unregisters this iterator and frees the shared state when the last
    // reference goes away.
    ~basic_stream_shared_iterator ()
    {
        if (_shared)
        {
            --_shared->_ref_count;
            _shared->erase (this);

            if (_shared->_ref_count == 0)
            {
                delete _shared;
                _shared = 0;
            }
        }
    }

    basic_stream_shared_iterator &operator =
        (const basic_stream_shared_iterator &rhs_)
    {
        if (this != &rhs_)
        {
            _master = false;
            _index = rhs_._master ? rhs_._shared->lowest () : rhs_._index;

            if (_live && !rhs_._live)
            {
                // Becoming non-live: drop out of the client list.
                _shared->erase (this);

                if (!rhs_._shared)
                {
                    --_shared->_ref_count;
                }
            }
            else if (!_live && rhs_._live)
            {
                // NOTE(review): this registers `this` as a client but
                // stores the list position in rhs_._iter (which is
                // mutable) rather than in this->_iter. Behaviour is
                // preserved as found - confirm whether `_iter` was
                // intended.
                rhs_._iter = rhs_._shared->_clients.insert (rhs_._shared->
                    _clients.end (), this);

                if (!_shared)
                {
                    ++rhs_._shared->_ref_count;
                }
            }

            _live = rhs_._live;
            _shared = rhs_._shared;
        }

        return *this;
    }

    // Two iterators are equal when their indexes match and either they
    // share the same state, or one side is at npos and at least one side
    // has no shared state (i.e. comparison against an end iterator).
    bool operator == (const basic_stream_shared_iterator &rhs_) const
    {
        return _index == rhs_._index &&
            (_shared == rhs_._shared ||
            ((_index == shared::npos () || rhs_._index == shared::npos ()) &&
            (!_shared || !rhs_._shared)));
    }

    bool operator != (const basic_stream_shared_iterator &rhs_) const
    {
        return !(*this == rhs_);
    }

    // Returns the buffered character at the current index.
    // Throws runtime_error for an end (null) iterator.
    const char_type &operator * ()
    {
        check_master ();
        return _shared->_buffer[_index];
    }

    // Pre-increment. May refill the shared buffer, which renumbers the
    // index of every live iterator sharing it.
    basic_stream_shared_iterator &operator ++ ()
    {
        check_master ();
        ++_index;
        update_state ();
        return *this;
    }

    // Post-increment. The returned copy is registered as a live client.
    basic_stream_shared_iterator operator ++ (int)
    {
        basic_stream_shared_iterator iter_ = *this;

        check_master ();
        ++_index;
        update_state ();
        return iter_;
    }

private:
    // Buffer, stream reference and client list shared by a master
    // iterator and all of its copies. Lifetime is controlled by
    // _ref_count, maintained by the owning iterators.
    class shared
    {
    public:
        std::size_t _ref_count;
        typedef std::vector<char_type> char_vector;
        typedef std::list<basic_stream_shared_iterator *> iter_list;
        istream &_stream;
        std::size_t _increment;
        // Number of valid characters currently held in _buffer.
        std::size_t _len;
        char_vector _buffer;
        iter_list _clients;

        // Allocates buff_size_ characters and performs the first read.
        shared (istream &stream_, const std::size_t buff_size_,
            const std::size_t increment_) :
            // Initialisers are listed in declaration order.
            _ref_count (0),
            _stream (stream_),
            _increment (increment_),
            _len (0)
        {
            _buffer.resize (buff_size_);
            _stream.read (&_buffer.front (),
                static_cast<std::streamsize>(_buffer.size ()));
            _len = static_cast<std::size_t>(_stream.gcount ());
        }

        // Makes room in the buffer and reads more characters.
        // Returns true if at least one character was read.
        bool reload_buffer ()
        {
            const std::size_t lowest_ = lowest ();
            std::size_t read_ = 0;

            if (lowest_ == 0)
            {
                // Some client still needs the start of the buffer,
                // so nothing can be discarded: grow instead.
                const std::size_t old_size_ = _buffer.size ();
                const std::size_t new_size_ = old_size_ + _increment;

                _buffer.resize (new_size_);
                _stream.read (&_buffer.front () + old_size_,
                    static_cast<std::streamsize>(_increment));
                read_ = static_cast<std::size_t>(_stream.gcount ());

                if (read_)
                {
                    read_ += old_size_;
                    _len = read_;
                }
            }
            else
            {
                // Some systems have memmove in namespace std
                using namespace std;
                // Characters kept (everything from lowest_ upwards).
                const size_t start_ = _buffer.size () - lowest_;
                // Space freed at the end of the buffer (== lowest_).
                const size_t len_ = _buffer.size () - start_;
                // Pointer arithmetic rather than &_buffer[lowest_]:
                // lowest_ may equal size (), which must not be indexed.
                char_type *base_ = &_buffer.front ();

                // Source and destination overlap whenever lowest_ is
                // less than half the buffer, so memmove is required.
                memmove (base_, base_ + lowest_, start_ *
                    sizeof (char_type));
                _stream.read (base_ + start_,
                    static_cast<std::streamsize>(len_));
                read_ = static_cast<size_t>(_stream.gcount ());
                subtract (lowest_);

                if (read_)
                {
                    read_ += start_;
                    _len = read_;
                }
                else
                {
                    _len = highest ();
                }
            }

            return read_ != 0;
        }

        // Removes ptr_ from the client list (no-op if already removed).
        void erase (basic_stream_shared_iterator *ptr_)
        {
            if (ptr_->_iter != _clients.end ())
            {
                _clients.erase (ptr_->_iter);
                ptr_->_iter = _clients.end ();
            }
        }

        // Smallest index held by any client, or 0 if no client has a
        // valid (non npos) index.
        std::size_t lowest () const
        {
            std::size_t lowest_ = npos ();
            typename iter_list::const_iterator iter_ = _clients.begin ();
            typename iter_list::const_iterator end_ = _clients.end ();

            for (; iter_ != end_; ++iter_)
            {
                const basic_stream_shared_iterator *ptr_ = *iter_;

                if (ptr_->_index < lowest_)
                {
                    lowest_ = ptr_->_index;
                }
            }

            if (lowest_ == npos ())
            {
                lowest_ = 0;
            }

            return lowest_;
        }

        // Largest valid (non npos) index held by any client, or 0.
        std::size_t highest () const
        {
            std::size_t highest_ = 0;
            typename iter_list::const_iterator iter_ = _clients.begin ();
            typename iter_list::const_iterator end_ = _clients.end ();

            for (; iter_ != end_; ++iter_)
            {
                const basic_stream_shared_iterator *ptr_ = *iter_;

                if (ptr_->_index != npos () && ptr_->_index > highest_)
                {
                    highest_ = ptr_->_index;
                }
            }

            return highest_;
        }

        // Rebases every valid client index after the buffer contents
        // have been shifted down by lowest_ characters.
        void subtract (const std::size_t lowest_)
        {
            typename iter_list::iterator iter_ = _clients.begin ();
            typename iter_list::iterator end_ = _clients.end ();

            for (; iter_ != end_; ++iter_)
            {
                basic_stream_shared_iterator *ptr_ = *iter_;

                if (ptr_->_index != npos ())
                {
                    ptr_->_index -= lowest_;
                }
            }
        }

        // Sentinel index meaning "no position" (all bits set).
        static std::size_t npos ()
        {
            return static_cast<std::size_t>(~0);
        }

    private:
        // Not assignable (holds a reference member).
        shared &operator = (const shared &rhs_);
    };

    // True only for an iterator constructed directly from a stream that
    // has not yet been dereferenced or incremented.
    bool _master;
    bool _live;
    // Position in _shared->_buffer, or shared::npos ().
    std::size_t _index;
    shared *_shared;
    // Position of this iterator in _shared->_clients.
    mutable typename shared::iter_list::iterator _iter;

    // Throws for end iterators; converts a master that is being used
    // directly into an ordinary live iterator at the lowest live index.
    void check_master ()
    {
        if (!_shared)
        {
            throw runtime_error ("Cannot manipulate null (end) "
                "stream_shared_iterators.");
        }

        if (_master)
        {
            _master = false;
            _live = true;
            _index = _shared->lowest ();
        }
    }

    // Called after _index has been advanced: refills the buffer when the
    // valid data has been consumed and turns this iterator into an end
    // iterator when the stream has nothing more to give.
    void update_state ()
    {
        if (_index >= _shared->_len)
        {
            if (!_shared->reload_buffer ())
            {
                _shared->erase (this);
                _index = shared::npos ();
                _live = false;
            }
        }
    }
};
|
||||
|
||||
// Convenience alias: basic_stream_shared_iterator over char streams.
typedef basic_stream_shared_iterator<char> stream_shared_iterator;
|
||||
// Convenience alias: basic_stream_shared_iterator over wchar_t streams.
typedef basic_stream_shared_iterator<wchar_t> wstream_shared_iterator;
|
||||
}
|
||||
|
||||
#endif
|
||||
421
inc/lexertl/string_token.hpp
Normal file
421
inc/lexertl/string_token.hpp
Normal file
@@ -0,0 +1,421 @@
|
||||
// string_token.hpp
|
||||
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_STRING_TOKEN_HPP
|
||||
#define LEXERTL_STRING_TOKEN_HPP
|
||||
|
||||
#include "char_traits.hpp"
#include <ios> // Needed by GCC 4.4
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
template<typename ch_type>
|
||||
struct basic_string_token
|
||||
{
|
||||
typedef ch_type char_type;
|
||||
typedef basic_char_traits<char_type> char_traits;
|
||||
typedef typename char_traits::index_type index_type;
|
||||
typedef std::pair<index_type, index_type> range;
|
||||
typedef std::vector<range> range_vector;
|
||||
typedef std::basic_string<char_type> string;
|
||||
typedef basic_string_token<char_type> string_token;
|
||||
|
||||
range_vector _ranges;
|
||||
|
||||
basic_string_token () :
|
||||
_ranges ()
|
||||
{
|
||||
}
|
||||
|
||||
basic_string_token (char_type ch_) :
|
||||
_ranges ()
|
||||
{
|
||||
insert (range (ch_, ch_));
|
||||
}
|
||||
|
||||
basic_string_token (char_type first_, char_type second_) :
|
||||
_ranges ()
|
||||
{
|
||||
insert (range (first_, second_));
|
||||
}
|
||||
|
||||
void clear ()
|
||||
{
|
||||
_ranges.clear ();
|
||||
}
|
||||
|
||||
bool empty () const
|
||||
{
|
||||
return _ranges.empty ();
|
||||
}
|
||||
|
||||
bool any () const
|
||||
{
|
||||
return _ranges.size () == 1 && _ranges.front ().first == 0 &&
|
||||
_ranges.front ().second == char_traits::max_val ();
|
||||
}
|
||||
|
||||
bool operator < (const basic_string_token &rhs_) const
|
||||
{
|
||||
return _ranges < rhs_._ranges;
|
||||
}
|
||||
|
||||
bool operator == (const basic_string_token &rhs_) const
|
||||
{
|
||||
return _ranges == rhs_._ranges;
|
||||
}
|
||||
|
||||
bool negatable () const
|
||||
{
|
||||
std::size_t size_ = 0;
|
||||
typename range_vector::const_iterator iter_ = _ranges.begin ();
|
||||
typename range_vector::const_iterator end_ = _ranges.end ();
|
||||
|
||||
for (; iter_ != end_; ++iter_)
|
||||
{
|
||||
size_ += static_cast<std::size_t>(iter_->second) + 1 -
|
||||
static_cast<std::size_t>(iter_->first);
|
||||
}
|
||||
|
||||
return size_ > static_cast<std::size_t>(char_traits::max_val ()) / 2;
|
||||
}
|
||||
|
||||
void swap (basic_string_token &rhs_)
|
||||
{
|
||||
_ranges.swap (rhs_._ranges);
|
||||
}
|
||||
|
||||
void insert (const basic_string_token &rhs_)
|
||||
{
|
||||
typename range_vector::const_iterator iter_ = rhs_._ranges.begin ();
|
||||
typename range_vector::const_iterator end_ = rhs_._ranges.end ();
|
||||
|
||||
for (; iter_ != end_; ++iter_)
|
||||
{
|
||||
insert (*iter_);
|
||||
}
|
||||
}
|
||||
|
||||
// Deliberately pass by value - may modify
|
||||
typename range_vector::iterator insert (range rhs_)
|
||||
{
|
||||
bool insert_ = true;
|
||||
typename range_vector::iterator iter_ = _ranges.begin ();
|
||||
typename range_vector::const_iterator end_ = _ranges.end ();
|
||||
|
||||
while (iter_ != end_)
|
||||
{
|
||||
// follows current item
|
||||
if (rhs_.first > iter_->second)
|
||||
{
|
||||
if (rhs_.first == iter_->second + 1)
|
||||
{
|
||||
// Auto normalise
|
||||
rhs_.first = iter_->first;
|
||||
}
|
||||
else
|
||||
{
|
||||
// No intersection, consider next
|
||||
++iter_;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// Precedes current item
|
||||
else if (rhs_.second < iter_->first)
|
||||
{
|
||||
if (rhs_.second == iter_->first - 1)
|
||||
{
|
||||
// Auto normalise
|
||||
rhs_.second = iter_->second;
|
||||
}
|
||||
else
|
||||
{
|
||||
// insert here
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// overlap (under)
|
||||
if (rhs_.first < iter_->first)
|
||||
{
|
||||
if (rhs_.second < iter_->second)
|
||||
{
|
||||
rhs_.second = iter_->second;
|
||||
}
|
||||
}
|
||||
// overlap (over)
|
||||
else if (rhs_.second > iter_->second)
|
||||
{
|
||||
if (rhs_.first > iter_->first)
|
||||
{
|
||||
rhs_.first = iter_->first;
|
||||
}
|
||||
}
|
||||
// subset
|
||||
else
|
||||
{
|
||||
insert_ = false;
|
||||
iter_ = _ranges.end ();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Code minimisation: this always applies unless we have already
|
||||
// exited the loop, or "continue" executed.
|
||||
iter_ = _ranges.erase (iter_);
|
||||
end_ = _ranges.end ();
|
||||
}
|
||||
|
||||
if (insert_)
|
||||
{
|
||||
iter_ = _ranges.insert(iter_, rhs_);
|
||||
}
|
||||
|
||||
return iter_;
|
||||
}
|
||||
|
||||
void negate ()
|
||||
{
|
||||
index_type next_ = 0;
|
||||
const index_type max_ = char_traits::max_val ();
|
||||
string_token temp_;
|
||||
typename range_vector::iterator iter_ = _ranges.begin ();
|
||||
typename range_vector::const_iterator end_ = _ranges.end ();
|
||||
bool finished_ = false;
|
||||
|
||||
for (; iter_ != end_; ++iter_)
|
||||
{
|
||||
if (next_ < iter_->first)
|
||||
{
|
||||
temp_.insert (range (next_, iter_->first - 1));
|
||||
}
|
||||
|
||||
if (iter_->second < max_)
|
||||
{
|
||||
next_ = iter_->second + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
finished_ = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!finished_)
|
||||
{
|
||||
temp_.insert (range (next_, max_));
|
||||
}
|
||||
|
||||
swap (temp_);
|
||||
}
|
||||
|
||||
void intersect (basic_string_token &rhs_, basic_string_token &overlap_)
|
||||
{
|
||||
typename range_vector::iterator lhs_iter_ = _ranges.begin ();
|
||||
typename range_vector::const_iterator lhs_end_ = _ranges.end ();
|
||||
typename range_vector::iterator rhs_iter_ = rhs_._ranges.begin ();
|
||||
typename range_vector::const_iterator rhs_end_ = rhs_._ranges.end ();
|
||||
|
||||
while (lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_)
|
||||
{
|
||||
if (rhs_iter_->first > lhs_iter_->second)
|
||||
{
|
||||
++lhs_iter_;
|
||||
}
|
||||
else if (rhs_iter_->second < lhs_iter_->first)
|
||||
{
|
||||
++rhs_iter_;
|
||||
}
|
||||
else
|
||||
{
|
||||
range range_;
|
||||
|
||||
if (rhs_iter_->first > lhs_iter_->first)
|
||||
{
|
||||
range_.first = rhs_iter_->first;
|
||||
}
|
||||
else
|
||||
{
|
||||
range_.first = lhs_iter_->first;
|
||||
}
|
||||
|
||||
if (rhs_iter_->second < lhs_iter_->second)
|
||||
{
|
||||
range_.second = rhs_iter_->second;
|
||||
}
|
||||
else
|
||||
{
|
||||
range_.second = lhs_iter_->second;
|
||||
}
|
||||
|
||||
adjust (range_, *this, lhs_iter_, lhs_end_);
|
||||
adjust (range_, rhs_, rhs_iter_, rhs_end_);
|
||||
overlap_.insert (range_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void remove (basic_string_token &rhs_)
|
||||
{
|
||||
typename range_vector::iterator lhs_iter_ = _ranges.begin ();
|
||||
typename range_vector::const_iterator lhs_end_ = _ranges.end ();
|
||||
typename range_vector::iterator rhs_iter_ = rhs_._ranges.begin ();
|
||||
typename range_vector::const_iterator rhs_end_ = rhs_._ranges.end ();
|
||||
|
||||
while (lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_)
|
||||
{
|
||||
if (rhs_iter_->first > lhs_iter_->second)
|
||||
{
|
||||
++lhs_iter_;
|
||||
}
|
||||
else if (rhs_iter_->second < lhs_iter_->first)
|
||||
{
|
||||
++rhs_iter_;
|
||||
}
|
||||
else
|
||||
{
|
||||
range range_;
|
||||
|
||||
if (rhs_iter_->first > lhs_iter_->first)
|
||||
{
|
||||
range_.first = rhs_iter_->first;
|
||||
}
|
||||
else
|
||||
{
|
||||
range_.first = lhs_iter_->first;
|
||||
}
|
||||
|
||||
if (rhs_iter_->second < lhs_iter_->second)
|
||||
{
|
||||
range_.second = rhs_iter_->second;
|
||||
}
|
||||
else
|
||||
{
|
||||
range_.second = lhs_iter_->second;
|
||||
}
|
||||
|
||||
adjust (range_, *this, lhs_iter_, lhs_end_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static string escape_char (const typename char_traits::index_type ch_)
|
||||
{
|
||||
string out_;
|
||||
|
||||
switch (ch_)
|
||||
{
|
||||
case '\0':
|
||||
out_ += '\\';
|
||||
out_ += '0';
|
||||
break;
|
||||
case '\a':
|
||||
out_ += '\\';
|
||||
out_ += 'a';
|
||||
break;
|
||||
case '\b':
|
||||
out_ += '\\';
|
||||
out_ += 'b';
|
||||
break;
|
||||
case 27:
|
||||
out_ += '\\';
|
||||
out_ += 'x';
|
||||
out_ += '1';
|
||||
out_ += 'b';
|
||||
break;
|
||||
case '\f':
|
||||
out_ += '\\';
|
||||
out_ += 'f';
|
||||
break;
|
||||
case '\n':
|
||||
out_ += '\\';
|
||||
out_ += 'n';
|
||||
break;
|
||||
case '\r':
|
||||
out_ += '\\';
|
||||
out_ += 'r';
|
||||
break;
|
||||
case '\t':
|
||||
out_ += '\\';
|
||||
out_ += 't';
|
||||
break;
|
||||
case '\v':
|
||||
out_ += '\\';
|
||||
out_ += 'v';
|
||||
break;
|
||||
case '\\':
|
||||
out_ += '\\';
|
||||
out_ += '\\';
|
||||
break;
|
||||
case '"':
|
||||
out_ += '\\';
|
||||
out_ += '"';
|
||||
break;
|
||||
case '\'':
|
||||
out_ += '\\';
|
||||
out_ += '\'';
|
||||
break;
|
||||
default:
|
||||
{
|
||||
if (ch_ < 32 || ch_ > 126)
|
||||
{
|
||||
std::basic_stringstream<char_type> ss_;
|
||||
|
||||
out_ += '\\';
|
||||
out_ += 'x';
|
||||
ss_ << std::hex <<
|
||||
static_cast<std::size_t> (ch_);
|
||||
out_ += ss_.str ();
|
||||
}
|
||||
else
|
||||
{
|
||||
out_ += ch_;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return out_;
|
||||
}
|
||||
|
||||
private:
|
||||
void adjust (const range &range_, basic_string_token &token_,
|
||||
typename range_vector::iterator &iter_,
|
||||
typename range_vector::const_iterator &end_)
|
||||
{
|
||||
if (range_.first > iter_->first)
|
||||
{
|
||||
const index_type second_ = iter_->second;
|
||||
|
||||
iter_->second = range_.first - 1;
|
||||
|
||||
if (range_.second < second_)
|
||||
{
|
||||
range new_range_ (range_.second + 1, second_);
|
||||
|
||||
iter_ = token_.insert (new_range_);
|
||||
end_ = token_._ranges.end ();
|
||||
}
|
||||
}
|
||||
else if (range_.second < iter_->second)
|
||||
{
|
||||
iter_->first = range_.second + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
iter_ = token_._ranges.erase (iter_);
|
||||
end_ = token_._ranges.end ();
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
380
inc/lexertl/utf_iterators.hpp
Normal file
380
inc/lexertl/utf_iterators.hpp
Normal file
@@ -0,0 +1,380 @@
|
||||
// utf_iterators.hpp
|
||||
// Copyright (c) 2012 Ben Hanson (http://www.benhanson.net/)
|
||||
// Inspired by http://utfcpp.sourceforge.net/
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
||||
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
#ifndef LEXERTL_UTF_ITERATORS_HPP
|
||||
#define LEXERTL_UTF_ITERATORS_HPP
|
||||
|
||||
#include <iterator>
|
||||
|
||||
namespace lexertl
|
||||
{
|
||||
// Input iterator adaptor that reads UTF-8 code units from char_iterator
// and yields one decoded code point (as char_type) per step.
//
// The iterator decodes ahead: construction and every increment read the
// next code point immediately and leave _it positioned after it. The
// position passed to the constructor must therefore be dereferenceable,
// and equality compares the underlying (already advanced) positions.
// No validation of continuation bytes is performed.
template<typename char_iterator, typename char_type>
class basic_utf8_in_iterator :
    public std::iterator<std::input_iterator_tag, char_type>
{
public:
    // Value-initialises the wrapped iterator so that default
    // constructed objects have a determinate state.
    basic_utf8_in_iterator () :
        _it (),
        _char (0)
    {
    }

    // Decodes the code point starting at it_.
    explicit basic_utf8_in_iterator (const char_iterator& it_) :
        _it (it_),
        _char (0)
    {
        next ();
    }

    // Returns the most recently decoded code point.
    char_type operator * () const
    {
        return _char;
    }

    bool operator == (const basic_utf8_in_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator != (const basic_utf8_in_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    basic_utf8_in_iterator &operator ++ ()
    {
        next ();
        return *this;
    }

    basic_utf8_in_iterator operator ++ (int)
    {
        basic_utf8_in_iterator temp_ = *this;

        next ();
        return temp_;
    }

private:
    // Note: no private difference_type typedef here. One would hide the
    // public typedef inherited from std::iterator and stop
    // std::iterator_traits from working with this class.
    char_iterator _it;
    char_type _char;

    // Decodes the sequence at _it into _char and steps _it past it.
    void next ()
    {
        const char len_ = len (_it);
        char_type ch_ = *_it & 0xff;

        switch (len_)
        {
        case 1:
            break;
        case 2:
            // 110xxxxx 10xxxxxx
            ++_it;
            ch_ = (ch_ << 6 & 0x7ff) | (*_it & 0x3f);
            break;
        case 3:
            // 1110xxxx 10xxxxxx 10xxxxxx
            ++_it;
            ch_ = (ch_ << 12 & 0xffff) | ((*_it & 0xff) << 6 & 0xfff);
            ++_it;
            ch_ |= *_it & 0x3f;
            break;
        case 4:
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            ++_it;
            ch_ = (ch_ << 18 & 0x1fffff) | ((*_it & 0xff) << 12 & 0x3ffff);
            ++_it;
            ch_ |= (*_it & 0xff) << 6 & 0xfff;
            ++_it;
            ch_ |= *_it & 0x3f;
            break;
        // len_ == 0 (not a valid lead byte): the raw byte value is
        // passed through and a single unit is consumed.
        }

        ++_it;
        _char = ch_;
    }

    // Length in bytes of the sequence introduced by the lead byte at
    // it_, or 0 if the byte is not a valid lead byte.
    char len (const char_iterator &it_) const
    {
        const unsigned char ch_ = *it_;

        return ch_ < 0x80 ? 1 :
            ch_ >> 5 == 0x06 ? 2 :
            ch_ >> 4 == 0x0e ? 3 :
            ch_ >> 3 == 0x1e ? 4 : 0;
    }
};
|
||||
|
||||
// Input iterator adaptor that reads code points from char_iterator and
// yields their UTF-8 encoding one byte (char) at a time.
//
// The iterator encodes ahead: construction reads and encodes the first
// code point immediately and leaves _it positioned after it, so the
// position passed to the constructor must be dereferenceable. Equality
// compares only the underlying (already advanced) positions.
template<typename char_iterator>
class basic_utf8_out_iterator :
    public std::iterator<std::input_iterator_tag, char>
{
public:
    // Value-initialises every member so that default constructed
    // objects can be compared and copied safely.
    basic_utf8_out_iterator () :
        _it (),
        _bytes (),
        _count (0),
        _index (0)
    {
    }

    // Encodes the code point found at it_.
    explicit basic_utf8_out_iterator (const char_iterator& it_) :
        _it (it_),
        _bytes (),
        _count (0),
        _index (0)
    {
        next ();
    }

    // Returns the current byte of the current encoded sequence.
    char operator * () const
    {
        return _bytes[_index];
    }

    bool operator == (const basic_utf8_out_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator != (const basic_utf8_out_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    // Steps to the next byte, encoding the next code point once the
    // current sequence has been consumed.
    basic_utf8_out_iterator &operator ++ ()
    {
        ++_index;

        if (_index >= _count)
        {
            next ();
        }

        return *this;
    }

    basic_utf8_out_iterator operator ++ (int)
    {
        basic_utf8_out_iterator temp_ = *this;

        ++_index;

        if (_index >= _count)
        {
            next ();
        }

        return temp_;
    }

private:
    char_iterator _it;
    // Encoded form of the current code point; _count bytes are valid.
    char _bytes[4];
    unsigned char _count;
    unsigned char _index;

    // Encodes the code point at _it into _bytes and steps _it past it.
    void next ()
    {
        const std::size_t ch_ = *_it;

        _count = len (ch_);
        _index = 0;

        switch (_count)
        {
        case 1:
            _bytes[0] = static_cast<char>(ch_);
            break;
        case 2:
            _bytes[0] = static_cast<char>((ch_ >> 6) | 0xc0);
            _bytes[1] = static_cast<char>((ch_ & 0x3f) | 0x80);
            break;
        case 3:
            _bytes[0] = static_cast<char>((ch_ >> 12) | 0xe0);
            _bytes[1] = static_cast<char>(((ch_ >> 6) & 0x3f) | 0x80);
            _bytes[2] = static_cast<char>((ch_ & 0x3f) | 0x80);
            break;
        case 4:
            _bytes[0] = static_cast<char>((ch_ >> 18) | 0xf0);
            _bytes[1] = static_cast<char>(((ch_ >> 12) & 0x3f) | 0x80);
            _bytes[2] = static_cast<char>(((ch_ >> 6) & 0x3f) | 0x80);
            _bytes[3] = static_cast<char>((ch_ & 0x3f) | 0x80);
            break;
        }

        ++_it;
    }

    // Number of UTF-8 bytes needed for ch_ (anything from 0x10000
    // upwards is treated as a four byte sequence).
    char len (const std::size_t ch_) const
    {
        return ch_ < 0x80 ? 1 :
            ch_ < 0x800 ? 2 :
            ch_ < 0x10000 ? 3 :
            4;
    }
};
|
||||
|
||||
// Input iterator adaptor that reads UTF-16 code units from char_iterator
// and yields one decoded code point (as char_type) per step.
//
// The iterator decodes ahead: construction and every increment read the
// next code point immediately and leave _it positioned after it. The
// position passed to the constructor must therefore be dereferenceable,
// and equality compares the underlying (already advanced) positions.
// The unit following a high surrogate is not validated.
template<typename char_iterator, typename char_type>
class basic_utf16_in_iterator :
    public std::iterator<std::input_iterator_tag, char_type>
{
public:
    // Value-initialises the wrapped iterator so that default
    // constructed objects have a determinate state.
    basic_utf16_in_iterator () :
        _it (),
        _char (0)
    {
    }

    // Decodes the code point starting at it_.
    explicit basic_utf16_in_iterator (const char_iterator &it_) :
        _it (it_),
        _char (0)
    {
        next ();
    }

    // Returns the most recently decoded code point.
    char_type operator * () const
    {
        return _char;
    }

    bool operator == (const basic_utf16_in_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator != (const basic_utf16_in_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    basic_utf16_in_iterator &operator ++ ()
    {
        next ();
        return *this;
    }

    basic_utf16_in_iterator operator ++ (int)
    {
        basic_utf16_in_iterator temp_ = *this;

        next ();
        return temp_;
    }

private:
    // Note: no private difference_type typedef here. One would hide the
    // public typedef inherited from std::iterator and stop
    // std::iterator_traits from working with this class.
    char_iterator _it;
    char_type _char;

    // Decodes the unit (or surrogate pair) at _it into _char and steps
    // _it past it.
    void next ()
    {
        char_type ch_ = *_it & 0xffff;

        // High surrogate: combine with the unit that follows.
        if (ch_ >= 0xd800 && ch_ <= 0xdbff)
        {
            const char_type surrogate_ = *++_it & 0xffff;

            ch_ = (((ch_ - 0xd800) << 10) | (surrogate_ - 0xdc00)) + 0x10000;
        }

        ++_it;
        _char = ch_;
    }
};
|
||||
|
||||
// Input iterator adaptor that reads code points from char_iterator and
// yields their UTF-16 encoding one unit (wchar_t) at a time.
//
// The iterator encodes ahead: construction reads and encodes the first
// code point immediately and leaves _it positioned after it, so the
// position passed to the constructor must be dereferenceable. Equality
// compares only the underlying (already advanced) positions.
template<typename char_iterator>
class basic_utf16_out_iterator :
    public std::iterator<std::input_iterator_tag, wchar_t>
{
public:
    // Value-initialises every member so that default constructed
    // objects can be compared and copied safely.
    basic_utf16_out_iterator () :
        _it (),
        _chars (),
        _count (0),
        _index (0)
    {
    }

    // Encodes the code point found at it_.
    explicit basic_utf16_out_iterator (const char_iterator& it_) :
        _it (it_),
        _chars (),
        _count (0),
        _index (0)
    {
        next ();
    }

    // Returns the current unit of the current encoded sequence.
    wchar_t operator * () const
    {
        return _chars[_index];
    }

    bool operator == (const basic_utf16_out_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator != (const basic_utf16_out_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    // Steps to the next unit, encoding the next code point once the
    // current sequence has been consumed.
    basic_utf16_out_iterator &operator ++ ()
    {
        ++_index;

        if (_index >= _count)
        {
            next ();
        }

        return *this;
    }

    basic_utf16_out_iterator operator ++ (int)
    {
        basic_utf16_out_iterator temp_ = *this;

        ++_index;

        if (_index >= _count)
        {
            next ();
        }

        return temp_;
    }

private:
    char_iterator _it;
    // Encoded form of the current code point; _count units are valid.
    wchar_t _chars[2];
    unsigned char _count;
    unsigned char _index;

    // Encodes the code point at _it into _chars and steps _it past it.
    void next ()
    {
        const std::size_t ch_ = *_it;

        _count = len (ch_);
        _index = 0;

        switch (_count)
        {
        case 1:
            _chars[0] = static_cast<wchar_t>(ch_);
            break;
        case 2:
            // High (lead) surrogate: 0xd800 + ((ch_ - 0x10000) >> 10).
            // The base must be 0xd800; 0xdc00 is the base of the low
            // (trail) surrogate range.
            _chars[0] = static_cast<wchar_t>((ch_ >> 10) + 0xd800u -
                (0x10000 >> 10));
            // Low (trail) surrogate: 0xdc00 + low ten bits.
            _chars[1] = static_cast<wchar_t>((ch_ & 0x3ff) + 0xdc00u);
            break;
        }

        ++_it;
    }

    // Number of UTF-16 units needed for ch_.
    char len (const std::size_t ch_) const
    {
        return ch_ > 0xffff ? 2 : 1;
    }
};
|
||||
}
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user