From f3d3389f54a10fe3be3d841b6a164306841b4540 Mon Sep 17 00:00:00 2001 From: Markus Hauschild Date: Fri, 31 May 2013 21:35:44 +0200 Subject: [PATCH] Fix typo in grammar. Add lexertl. --- grammar/grammar.y | 5 +- inc/lexertl/bool.hpp | 22 + inc/lexertl/char_traits.hpp | 50 + inc/lexertl/compile_assert.hpp | 24 + inc/lexertl/containers/bitvector.hpp | 228 ++ inc/lexertl/containers/ptr_list.hpp | 69 + inc/lexertl/containers/ptr_map.hpp | 72 + inc/lexertl/containers/ptr_stack.hpp | 69 + inc/lexertl/containers/ptr_vector.hpp | 106 + inc/lexertl/debug.hpp | 353 +++ inc/lexertl/enums.hpp | 25 + inc/lexertl/generate_cpp.hpp | 1122 ++++++++ inc/lexertl/generator.hpp | 829 ++++++ inc/lexertl/internals.hpp | 80 + inc/lexertl/is_same.hpp | 29 + inc/lexertl/licence_1_0.txt | 24 + inc/lexertl/lookup.hpp | 477 ++++ inc/lexertl/match_results.hpp | 150 ++ inc/lexertl/memory_file.hpp | 112 + inc/lexertl/old/fast_filebuf.hpp | 45 + inc/lexertl/old/string_token.hpp | 561 ++++ inc/lexertl/parser/parser.hpp | 1076 ++++++++ inc/lexertl/parser/tokeniser/re_token.hpp | 100 + inc/lexertl/parser/tokeniser/re_tokeniser.hpp | 829 ++++++ .../parser/tokeniser/re_tokeniser_helper.hpp | 2351 +++++++++++++++++ .../parser/tokeniser/re_tokeniser_state.hpp | 115 + inc/lexertl/parser/tree/end_node.hpp | 112 + inc/lexertl/parser/tree/iteration_node.hpp | 103 + inc/lexertl/parser/tree/leaf_node.hpp | 114 + inc/lexertl/parser/tree/node.hpp | 241 ++ inc/lexertl/parser/tree/selection_node.hpp | 106 + inc/lexertl/parser/tree/sequence_node.hpp | 126 + inc/lexertl/partition/charset.hpp | 73 + inc/lexertl/partition/equivset.hpp | 134 + inc/lexertl/rules.hpp | 743 ++++++ inc/lexertl/runtime_error.hpp | 23 + inc/lexertl/serialise.hpp | 28 + inc/lexertl/size_t.hpp | 12 + inc/lexertl/sm_traits.hpp | 44 + inc/lexertl/state_machine.hpp | 525 ++++ inc/lexertl/stream_shared_iterator.hpp | 350 +++ inc/lexertl/string_token.hpp | 421 +++ inc/lexertl/utf_iterators.hpp | 380 +++ src/test.cpp | 9 + 44 files changed, 12465 
insertions(+), 2 deletions(-) create mode 100644 inc/lexertl/bool.hpp create mode 100644 inc/lexertl/char_traits.hpp create mode 100644 inc/lexertl/compile_assert.hpp create mode 100644 inc/lexertl/containers/bitvector.hpp create mode 100644 inc/lexertl/containers/ptr_list.hpp create mode 100644 inc/lexertl/containers/ptr_map.hpp create mode 100644 inc/lexertl/containers/ptr_stack.hpp create mode 100644 inc/lexertl/containers/ptr_vector.hpp create mode 100644 inc/lexertl/debug.hpp create mode 100644 inc/lexertl/enums.hpp create mode 100644 inc/lexertl/generate_cpp.hpp create mode 100644 inc/lexertl/generator.hpp create mode 100644 inc/lexertl/internals.hpp create mode 100644 inc/lexertl/is_same.hpp create mode 100644 inc/lexertl/licence_1_0.txt create mode 100644 inc/lexertl/lookup.hpp create mode 100644 inc/lexertl/match_results.hpp create mode 100644 inc/lexertl/memory_file.hpp create mode 100644 inc/lexertl/old/fast_filebuf.hpp create mode 100644 inc/lexertl/old/string_token.hpp create mode 100644 inc/lexertl/parser/parser.hpp create mode 100644 inc/lexertl/parser/tokeniser/re_token.hpp create mode 100644 inc/lexertl/parser/tokeniser/re_tokeniser.hpp create mode 100644 inc/lexertl/parser/tokeniser/re_tokeniser_helper.hpp create mode 100644 inc/lexertl/parser/tokeniser/re_tokeniser_state.hpp create mode 100644 inc/lexertl/parser/tree/end_node.hpp create mode 100644 inc/lexertl/parser/tree/iteration_node.hpp create mode 100644 inc/lexertl/parser/tree/leaf_node.hpp create mode 100644 inc/lexertl/parser/tree/node.hpp create mode 100644 inc/lexertl/parser/tree/selection_node.hpp create mode 100644 inc/lexertl/parser/tree/sequence_node.hpp create mode 100644 inc/lexertl/partition/charset.hpp create mode 100644 inc/lexertl/partition/equivset.hpp create mode 100644 inc/lexertl/rules.hpp create mode 100644 inc/lexertl/runtime_error.hpp create mode 100644 inc/lexertl/serialise.hpp create mode 100644 inc/lexertl/size_t.hpp create mode 100644 inc/lexertl/sm_traits.hpp 
create mode 100644 inc/lexertl/state_machine.hpp create mode 100644 inc/lexertl/stream_shared_iterator.hpp create mode 100644 inc/lexertl/string_token.hpp create mode 100644 inc/lexertl/utf_iterators.hpp diff --git a/grammar/grammar.y b/grammar/grammar.y index e067bb2..6264236 100644 --- a/grammar/grammar.y +++ b/grammar/grammar.y @@ -1,5 +1,6 @@ %include { +#include #include #include #include @@ -20,8 +21,8 @@ programm(A) ::= fundefs(B). { A = B; } -fundefs(A) ::= . { A = 0: } -fundefs(A) ::= fundefs fundef(B). { A = A + B: } +fundefs(A) ::= . { A = 0; } +fundefs(A) ::= fundefs fundef(B). { A = A + B; } fundef(A) ::= type(T) T_IDENTIFIER(ID) params(P) T_BEGIN statements(S) T_END. { A = T + ID + P + S; } diff --git a/inc/lexertl/bool.hpp b/inc/lexertl/bool.hpp new file mode 100644 index 0000000..2965d3d --- /dev/null +++ b/inc/lexertl/bool.hpp @@ -0,0 +1,22 @@ +// bool.hpp +// Copyright (c) 2010-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef LEXERTL_BOOL_H +#define LEXERTL_BOOL_H + +namespace lexertl +{ +// Named template param for compiler compatibility +template +struct bool_ +{ +}; + +typedef bool_ true_; +typedef bool_ false_; +} + +#endif diff --git a/inc/lexertl/char_traits.hpp b/inc/lexertl/char_traits.hpp new file mode 100644 index 0000000..64315c7 --- /dev/null +++ b/inc/lexertl/char_traits.hpp @@ -0,0 +1,50 @@ +// char_traits.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef LEXERTL_CHAR_TRAITS_H +#define LEXERTL_CHAR_TRAITS_H + +#include + +namespace lexertl +{ +template +struct basic_char_traits +{ + typedef ch_type char_type; + typedef ch_type index_type; + + static index_type index (const char_type ch) + { + return ch; + } + + static index_type max_val () + { + return sizeof(char_type) > 2 ? 0x10ffff : + static_cast(~0); + } +}; + +template<> +struct basic_char_traits +{ + typedef char char_type; + typedef unsigned char index_type; + + static index_type index (const char ch) + { + return static_cast(ch); + } + + static index_type max_val () + { + return static_cast(~0); + } +}; +} + +#endif diff --git a/inc/lexertl/compile_assert.hpp b/inc/lexertl/compile_assert.hpp new file mode 100644 index 0000000..a36a668 --- /dev/null +++ b/inc/lexertl/compile_assert.hpp @@ -0,0 +1,24 @@ +// compile_assert.hpp +// Copyright (c) 2010-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef LEXERTL_COMPILE_ASSERT_H +#define LEXERTL_COMPILE_ASSERT_H + +namespace lexertl +{ +// Named template param for compiler compatibility +template +struct compile_assert; + +// enum for compiler compatibility +template<> +struct compile_assert +{ + enum {value = 1}; +}; +} + +#endif diff --git a/inc/lexertl/containers/bitvector.hpp b/inc/lexertl/containers/bitvector.hpp new file mode 100644 index 0000000..00c2fd4 --- /dev/null +++ b/inc/lexertl/containers/bitvector.hpp @@ -0,0 +1,228 @@ +// bitvector.hpp +// Copyright (c) 2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef LEXERTL_BITVECTOR_HPP +#define LEXERTL_BITVECTOR_HPP + +#include + +namespace lexertl +{ +template +class basic_bitvector +{ +public: + template + class reference + { + public: + reference (Ty &block_, const std::size_t mask_) : + _block (block_), + _mask (mask_) + { + } + + operator bool () const + { + return (_block & _mask) != 0; + } + + reference &operator = (const bool bit_) + { + if (bit_) + { + _block |= _mask; + } + else + { + _block &= ~_mask; + } + + return *this; + } + + reference &operator = (reference &rhs_) + { + if (rhs_) + { + _block |= _mask; + } + else + { + _block &= ~_mask; + } + } + + private: + Ty &_block; + const std::size_t _mask; + }; + + basic_bitvector (const std::size_t size_) : + _vec (block (size_) + (bit (size_) ? 1 : 0), 0) + { + } + + basic_bitvector (const basic_bitvector &rhs_) : + _vec (rhs_._vec) + { + } + + basic_bitvector &operator = (const basic_bitvector &rhs_) + { + if (&rhs_ != this) + { + _vec = rhs_._vec; + } + + return *this; + } + + bool operator [] (const std::size_t index_) const + { + return (_vec[block (index_)] & (1 << bit (index_))) != 0; + } + + reference operator [] (const std::size_t index_) + { + return reference (_vec[block (index_)], (1 << bit (index_))); + } + + basic_bitvector &operator |= (const basic_bitvector &rhs_) + { + typename t_vector::iterator lhs_iter_ = _vec.begin (); + typename t_vector::iterator lhs_end_ = _vec.end (); + typename t_vector::const_iterator rhs_iter_ = rhs_._vec.begin (); + typename t_vector::const_iterator rhs_end_ = rhs_._vec.end (); + + for (; lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_; + ++lhs_iter_, ++rhs_iter_) + { + *lhs_iter_ |= *rhs_iter_; + } + + return *this; + } + + basic_bitvector &operator &= (const basic_bitvector &rhs_) + { + typename t_vector::iterator lhs_iter_ = _vec.begin (); + typename t_vector::iterator lhs_end_ = _vec.end (); + typename 
t_vector::const_iterator rhs_iter_ = rhs_._vec.begin (); + typename t_vector::const_iterator rhs_end_ = rhs_._vec.end (); + + for (; lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_; + ++lhs_iter_, ++rhs_iter_) + { + *lhs_iter_ &= *rhs_iter_; + } + + return *this; + } + + void clear () + { + typename t_vector::iterator iter_ = _vec.begin (); + typename t_vector::iterator end_ = _vec.end (); + + for (; iter_ != end_; ++iter_) + { + *iter_ = 0; + } + } + + bool any () const + { + typename t_vector::const_iterator iter_ = _vec.begin (); + typename t_vector::const_iterator end_ = _vec.end (); + + for (; iter_ != end_; ++iter_) + { + if (*iter_) break; + } + + return iter_ != end_; + } + + void negate () + { + typename t_vector::iterator iter_ = _vec.begin (); + typename t_vector::iterator end_ = _vec.end (); + + for (; iter_ != end_; ++iter_) + { + *iter_ = ~*iter_; + } + } + + std::size_t find_first () const + { + return find_next (npos ()); + } + + std::size_t find_next (const std::size_t index_) const + { + std::size_t ret_ = npos (); + const std::size_t block_ = index_ == npos () ? 0 : block (index_ + 1); + std::size_t bit_ = index_ == npos () ? 
0 : bit (index_ + 1); + typename t_vector::const_iterator iter_ = _vec.begin () + block_; + typename t_vector::const_iterator end_ = _vec.end (); + + for (std::size_t i_ = block_; iter_ != end_; ++iter_, ++i_) + { + const bool bits_ = (*iter_ & (static_cast(~0) << bit_)) != 0; + + if (bits_) + { + std::size_t j_ = bit_; + std::size_t b_ = 1 << bit_; + bool found_ = false; + + for (; j_ < sizeof(T) * 8; ++j_, b_ <<= 1) + { + if (*iter_ & b_) + { + found_ = true; + break; + } + } + + if (found_) + { + ret_ = i_ * sizeof(T) * 8 + j_; + break; + } + } + + bit_ = 0; + } + + return ret_; + } + + std::size_t npos () const + { + return static_cast(~0); + } + +private: + typedef std::vector t_vector; + + t_vector _vec; + + std::size_t block (const std::size_t index_) const + { + return index_ / (sizeof(T) * 8); + } + + std::size_t bit (const std::size_t index_) const + { + return index_ % (sizeof(T) * 8); + } +}; +} + +#endif diff --git a/inc/lexertl/containers/ptr_list.hpp b/inc/lexertl/containers/ptr_list.hpp new file mode 100644 index 0000000..53630e6 --- /dev/null +++ b/inc/lexertl/containers/ptr_list.hpp @@ -0,0 +1,69 @@ +// ptr_list.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_PTR_LIST_HPP +#define LEXERTL_PTR_LIST_HPP + +#include + +namespace lexertl +{ +namespace detail +{ +template +class ptr_list +{ +public: + typedef std::list list; + + ptr_list () : + _list () + { + } + + ~ptr_list () + { + clear (); + } + + list *operator -> () + { + return &_list; + } + + const list *operator -> () const + { + return &_list; + } + + list &operator * () + { + return _list; + } + + const list &operator * () const + { + return _list; + } + + void clear () + { + while (!_list.empty ()) + { + delete _list.front (); + _list.pop_front (); + } + } + +private: + list _list; + + ptr_list (const ptr_list &); // No copy construction. + ptr_list &operator = (const ptr_list &); // No assignment. +}; +} +} + +#endif diff --git a/inc/lexertl/containers/ptr_map.hpp b/inc/lexertl/containers/ptr_map.hpp new file mode 100644 index 0000000..28a7aa4 --- /dev/null +++ b/inc/lexertl/containers/ptr_map.hpp @@ -0,0 +1,72 @@ +// ptr_map.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_PTR_MAP_HPP +#define LEXERTL_PTR_MAP_HPP + +#include + +namespace lexertl +{ +namespace detail +{ +template +class ptr_map +{ +public: + typedef std::map map; + typedef std::pair pair; + typedef std::pair iter_pair; + + ptr_map () + { + } + + ~ptr_map () + { + clear (); + } + + map *operator -> () + { + return &_map; + } + + const map *operator -> () const + { + return &_map; + } + + map &operator * () + { + return _map; + } + + const map &operator * () const + { + return _map; + } + + void clear () + { + for (typename map::iterator iter_ = _map.begin (), end_ = _map.end (); + iter_ != end_; ++iter_) + { + delete iter_->second; + } + + _map.clear (); + } + +private: + map _map; + + ptr_map (const ptr_map &); // No copy construction. + ptr_map &operator = (const ptr_map &); // No assignment. +}; +} +} + +#endif diff --git a/inc/lexertl/containers/ptr_stack.hpp b/inc/lexertl/containers/ptr_stack.hpp new file mode 100644 index 0000000..291067f --- /dev/null +++ b/inc/lexertl/containers/ptr_stack.hpp @@ -0,0 +1,69 @@ +// ptr_stack.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_PTR_STACK_HPP +#define LEXERTL_PTR_STACK_HPP + +#include + +namespace lexertl +{ +namespace detail +{ +template +class ptr_stack +{ +public: + typedef std::stack stack; + + ptr_stack () : + _stack () + { + } + + ~ptr_stack () + { + clear (); + } + + stack *operator -> () + { + return &_stack; + } + + const stack *operator -> () const + { + return &_stack; + } + + stack &operator * () + { + return _stack; + } + + const stack &operator * () const + { + return _stack; + } + + void clear () + { + while (!_stack.empty ()) + { + delete _stack.top (); + _stack.pop (); + } + } + +private: + stack _stack; + + ptr_stack (const ptr_stack &); // No copy construction. + ptr_stack &operator = (const ptr_stack &); // No assignment. +}; +} +} + +#endif diff --git a/inc/lexertl/containers/ptr_vector.hpp b/inc/lexertl/containers/ptr_vector.hpp new file mode 100644 index 0000000..0108b83 --- /dev/null +++ b/inc/lexertl/containers/ptr_vector.hpp @@ -0,0 +1,106 @@ +// ptr_vector.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_PTR_VECTOR_HPP +#define LEXERTL_PTR_VECTOR_HPP + +#include "../size_t.hpp" +#include + +namespace lexertl +{ +namespace detail +{ +template +class ptr_vector +{ +public: + typedef std::vector vector; + + ptr_vector () : + _vector () + { + } + + ~ptr_vector () + { + clear (); + } + + vector *operator -> () + { + return &_vector; + } + + const vector *operator -> () const + { + return &_vector; + } + + vector &operator * () + { + return _vector; + } + + const vector &operator * () const + { + return _vector; + } + + ptr_type * &operator [] (const std::size_t index_) + { + return _vector[index_]; + } + + ptr_type * const &operator [] (const std::size_t index_) const + { + return _vector[index_]; + } + + bool operator == (const ptr_vector &rhs_) const + { + bool equal_ = _vector.size () == rhs_._vector.size (); + + if (equal_) + { + typename vector::const_iterator lhs_iter_ = _vector.begin (); + typename vector::const_iterator end_ = _vector.end (); + typename vector::const_iterator rhs_iter_ = rhs_._vector.begin (); + + for (; equal_ && lhs_iter_ != end_; ++lhs_iter_, ++rhs_iter_) + { + equal_ = **lhs_iter_ == **rhs_iter_; + } + } + + return equal_; + } + + void clear () + { + if (!_vector.empty ()) + { + ptr_type **iter_ = &_vector.front (); + ptr_type **end_ = iter_ + _vector.size (); + + for (; iter_ != end_; ++iter_) + { + delete *iter_; + } + } + + _vector.clear (); + } + +private: + vector _vector; + + ptr_vector (const ptr_vector &); // No copy construction. + ptr_vector &operator = (const ptr_vector &); // No assignment. +}; +} +} + +#endif diff --git a/inc/lexertl/debug.hpp b/inc/lexertl/debug.hpp new file mode 100644 index 0000000..85a61a0 --- /dev/null +++ b/inc/lexertl/debug.hpp @@ -0,0 +1,353 @@ +// debug.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_DEBUG_HPP +#define LEXERTL_DEBUG_HPP + +#include +#include +#include "rules.hpp" +#include "size_t.hpp" +#include "state_machine.hpp" +#include "string_token.hpp" +#include + +namespace lexertl +{ +template +class basic_debug +{ +public: + typedef lexertl::basic_char_state_machine + char_state_machine; + typedef std::basic_ostream ostream; + typedef lexertl::basic_rules rules; + typedef std::basic_string string; + + static void dump (const sm &sm_, rules &rules_, ostream &stream_) + { + char_state_machine csm_; + + sm_to_csm (sm_, csm_); + dump (csm_, rules_, stream_); + } + + static void dump (const sm &sm_, ostream &stream_) + { + char_state_machine csm_; + + sm_to_csm (sm_, csm_); + dump (csm_, stream_); + } + + static void dump (const char_state_machine &csm_, rules &rules_, + ostream &stream_) + { + for (std::size_t dfa_ = 0, dfas_ = csm_.size (); dfa_ < dfas_; ++dfa_) + { + lexer_state (stream_); + stream_ << rules_.state (dfa_) << std::endl << std::endl; + + dump_ex (csm_._sm_deque[dfa_], stream_); + } + } + + static void dump (const char_state_machine &csm_, ostream &stream_) + { + for (std::size_t dfa_ = 0, dfas_ = csm_.size (); dfa_ < dfas_; ++dfa_) + { + lexer_state (stream_); + stream_ << dfa_ << std::endl << std::endl; + + dump_ex (csm_._sm_deque[dfa_], stream_); + } + } + +protected: + typedef typename char_state_machine::state dfa_state; + typedef typename dfa_state::string_token string_token; + typedef std::basic_stringstream stringstream; + + static void sm_to_csm (const sm &sm_, char_state_machine &csm_) + { + const detail::basic_internals &internals_ = sm_.data (); + const std::size_t dfas_ = internals_._dfa->size (); + + for (id_type i_ = 0; i_ < dfas_; ++i_) + { + if (internals_._dfa_alphabet[i_] == 0) continue; + + const std::size_t alphabet_ = internals_._dfa_alphabet[i_] - + transitions_index; + typename 
char_state_machine::string_token_vector token_vector_ + (alphabet_, string_token ()); + id_type *ptr_ = &internals_._lookup[i_]->front (); + + for (std::size_t c_ = 0; c_ < 256; ++c_, ++ptr_) + { + if (*ptr_ >= transitions_index) + { + string_token &token_ = token_vector_ + [*ptr_ - transitions_index]; + + token_.insert (typename string_token::range + (typename string_token::index_type (c_), + typename string_token::index_type (c_))); + } + } + + csm_.append (token_vector_, internals_, i_); + } + } + + static void dump_ex (const typename char_state_machine::dfa &dfa_, + ostream &stream_) + { + const std::size_t states_ = dfa_._states.size (); + const id_type bol_index_ = dfa_._bol_index; + typename dfa_state::id_type_string_token_map::const_iterator iter_; + typename dfa_state::id_type_string_token_map::const_iterator end_; + + for (std::size_t i_ = 0; i_ < states_; ++i_) + { + const dfa_state &state_ = dfa_._states[i_]; + + state (stream_); + stream_ << i_ << std::endl; + + if (state_._end_state) + { + end_state (stream_); + + if (state_._push_pop_dfa == dfa_state::push_dfa) + { + push (stream_); + stream_ << state_._push_dfa; + } + else if (state_._push_pop_dfa == dfa_state::pop_dfa) + { + pop (stream_); + } + + id (stream_); + stream_ << static_cast(state_._id); + user_id (stream_); + stream_ << static_cast(state_._user_id); + dfa (stream_); + stream_ << static_cast(state_._next_dfa); + stream_ << std::endl; + } + + if (i_ == 0 && bol_index_ != char_state_machine::npos ()) + { + bol (stream_); + stream_ << static_cast(bol_index_) << std::endl; + } + + if (state_._eol_index != char_state_machine::npos ()) + { + eol (stream_); + stream_ << static_cast(state_._eol_index) << + std::endl; + } + + iter_ = state_._transitions.begin (); + end_ = state_._transitions.end (); + + for (; iter_ != end_; ++iter_) + { + string_token token_ = iter_->second; + + open_bracket (stream_); + + if (!iter_->second.any () && iter_->second.negatable ()) + { + token_.negate (); + negated 
(stream_); + } + + string chars_; + typename string_token::range_vector::const_iterator + ranges_iter_ = token_._ranges.begin (); + typename string_token::range_vector::const_iterator + ranges_end_ = token_._ranges.end (); + + for (; ranges_iter_ != ranges_end_; ++ranges_iter_) + { + if (ranges_iter_->first == '^' || + ranges_iter_->first == ']') + { + stream_ << '\\'; + } + + chars_ = string_token::escape_char + (ranges_iter_->first); + + if (ranges_iter_->first != ranges_iter_->second) + { + if (ranges_iter_->first + 1 < ranges_iter_->second) + { + chars_ += '-'; + } + + if (ranges_iter_->second == '^' || + ranges_iter_->second == ']') + { + stream_ << '\\'; + } + + chars_ += string_token::escape_char + (ranges_iter_->second); + } + + stream_ << chars_; + } + + close_bracket (stream_); + stream_ << static_cast(iter_->first) << + std::endl; + } + + stream_ << std::endl; + } + } + + static void lexer_state (std::ostream &stream_) + { + stream_ << "Lexer state: "; + } + + static void lexer_state (std::wostream &stream_) + { + stream_ << L"Lexer state: "; + } + + static void state (std::ostream &stream_) + { + stream_ << "State: "; + } + + static void state (std::wostream &stream_) + { + stream_ << L"State: "; + } + + static void bol (std::ostream &stream_) + { + stream_ << " BOL -> "; + } + + static void bol (std::wostream &stream_) + { + stream_ << L" BOL -> "; + } + + static void eol (std::ostream &stream_) + { + stream_ << " EOL -> "; + } + + static void eol (std::wostream &stream_) + { + stream_ << L" EOL -> "; + } + + static void end_state (std::ostream &stream_) + { + stream_ << " END STATE"; + } + + static void end_state (std::wostream &stream_) + { + stream_ << L" END STATE"; + } + + static void id (std::ostream &stream_) + { + stream_ << ", Id = "; + } + + static void id (std::wostream &stream_) + { + stream_ << L", Id = "; + } + + static void push (std::ostream &stream_) + { + stream_ << ", PUSH "; + } + + static void push (std::wostream &stream_) + { + 
stream_ << L", PUSH "; + } + + static void pop (std::ostream &stream_) + { + stream_ << ", POP"; + } + + static void pop (std::wostream &stream_) + { + stream_ << L", POP"; + } + + static void user_id (std::ostream &stream_) + { + stream_ << ", User Id = "; + } + + static void user_id (std::wostream &stream_) + { + stream_ << L", User Id = "; + } + + static void open_bracket (std::ostream &stream_) + { + stream_ << " ["; + } + + static void open_bracket (std::wostream &stream_) + { + stream_ << L" ["; + } + + static void negated (std::ostream &stream_) + { + stream_ << "^"; + } + + static void negated (std::wostream &stream_) + { + stream_ << L"^"; + } + + static void close_bracket (std::ostream &stream_) + { + stream_ << "] -> "; + } + + static void close_bracket (std::wostream &stream_) + { + stream_ << L"] -> "; + } + + static void dfa (std::ostream &stream_) + { + stream_ << ", dfa = "; + } + + static void dfa (std::wostream &stream_) + { + stream_ << L", dfa = "; + } +}; + +typedef basic_debug, char> debug; +typedef basic_debug, wchar_t> wdebug; +} + +#endif diff --git a/inc/lexertl/enums.hpp b/inc/lexertl/enums.hpp new file mode 100644 index 0000000..bec5ddc --- /dev/null +++ b/inc/lexertl/enums.hpp @@ -0,0 +1,25 @@ +// enums.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef LEXERTL_ENUMS_H +#define LEXERTL_ENUMS_H + +namespace lexertl +{ + enum regex_flags {icase = 1, dot_not_newline = 2, skip_ws = 4, + match_zero_len = 8}; + // 0 = end state, 1 = id, 2 = user id, 3 = push_dfa_index + // 4 = next dfa, 5 = dead state, 6 = dfa_start + enum {end_state_index, id_index, user_id_index, push_dfa_index, + next_dfa_index, eol_index, dead_state_index, transitions_index}; + // Rule flags: + enum feature_flags {bol_bit = 1, eol_bit = 2, skip_bit = 4, again_bit = 8, + multi_state_bit = 16, recursive_bit = 32, advance_bit = 64}; + // End state flags: + enum {end_state_bit = 1, pop_dfa_bit = 2}; +} + +#endif diff --git a/inc/lexertl/generate_cpp.hpp b/inc/lexertl/generate_cpp.hpp new file mode 100644 index 0000000..1c0c330 --- /dev/null +++ b/inc/lexertl/generate_cpp.hpp @@ -0,0 +1,1122 @@ +// generate_cpp.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_GENERATE_CPP_HPP +#define LEXERTL_GENERATE_CPP_HPP + +#include "bool.hpp" +#include "enums.hpp" +#include +#include "state_machine.hpp" + +namespace lexertl +{ +class table_based_cpp +{ +public: + template + static void generate_cpp + (const std::string &name_, + const basic_state_machine &sm_, + const bool pointers_, std::ostream &os_) + { + typedef basic_state_machine sm; + typedef typename sm::internals internals; + const internals &internals_ = sm_.data (); + std::size_t additional_tabs_ = 0; + + os_ << "template\n"; + os_ << "void " << name_ << " (lexertl::"; + + if (internals_._features & recursive_bit) + { + os_ << "recursive_match_results"; + } + else + { + os_ << "match_results"; + } + + os_ << " &results_)\n"; + os_ << "{\n"; + os_ << " typedef lexertl::"; + + if (internals_._features & recursive_bit) + { + os_ << "recursive_match_results"; + } + else + { + os_ << "match_results"; + } + + os_ << " results;\n"; + os_ << " typedef typename results::char_type char_type;\n"; + os_ << " typename results::iter_type end_token_ = results_.end;\n"; + + if (internals_._features & skip_bit) + { + os_ << "skip:\n"; + } + + os_ << " typename results::iter_type curr_ = results_.end;\n\n"; + os_ << " results_.start = curr_;\n\n"; + + if (internals_._features & again_bit) + { + os_ << "again:\n"; + } + + os_ << " if (curr_ == results_.eoi)\n"; + os_ << " {\n"; + // We want a number regardless of id_type. 
+ os_ << " results_.id = " << static_cast + (internals_._eoi) << ";\n"; + os_ << " results_.user_id = results::npos ();\n"; + os_ << " return;\n"; + os_ << " }\n\n"; + + if (internals_._features & bol_bit) + { + os_ << " bool bol_ = results_.bol;\n"; + } + + dump_tables (sm_, 1, pointers_, os_); + + if (internals_._dfa->size () > 1) + { + os_ << " const id_type *lookup_ = lookups_[results_.state];\n"; + os_ << " const id_type dfa_alphabet_ = dfa_alphabets_" + "[results_.state];\n"; + os_ << " const "; + + if (pointers_) + { + os_ << "void * const"; + } + else + { + os_ << "id_type"; + } + + os_ << " *dfa_ = dfas_[results_.state];\n"; + } + + os_ << " const "; + + if (pointers_) + { + os_ << "void * const"; + } + else + { + os_ << "id_type"; + } + + os_ << " *ptr_ = dfa_ + dfa_alphabet_;\n"; + os_ << " bool end_state_ = *ptr_ != 0;\n"; + + if (internals_._features & recursive_bit) + { + os_ << " bool pop_ = ("; + + if (pointers_) + { + // Done this way for GCC: + os_ << "static_cast(reinterpret_cast("; + } + + os_ << "*ptr_"; + + if (pointers_) + { + os_ << ')'; + } + + os_ <<" & " << pop_dfa_bit; + + if (pointers_) + { + os_ << ')'; + } + + os_ << ") != 0;\n"; + } + + os_ << " id_type id_ = "; + + if (pointers_) + { + // Done this way for GCC: + os_ << "static_cast(reinterpret_cast("; + } + + os_ << "*(ptr_ + " << id_index << ")"; + + if (pointers_) + { + os_ << "))"; + } + + os_ << ";\n"; + os_ << " id_type uid_ = "; + + if (pointers_) + { + // Done this way for GCC: + os_ << "static_cast(reinterpret_cast("; + } + + os_ << "*(ptr_ + " << user_id_index << ")"; + + if (pointers_) + { + os_ << "))"; + } + + os_ << ";\n"; + + if (internals_._features & recursive_bit) + { + os_ << " id_type push_dfa_ = "; + + if (pointers_) + { + // Done this way for GCC: + os_ << "static_cast(reinterpret_cast("; + } + + os_ << "*(ptr_ + " << push_dfa_index << ")"; + + if (pointers_) + { + os_ << "))"; + } + + os_ << ";\n"; + } + + if (internals_._dfa->size () > 1) + { + os_ << " 
id_type start_state_ = results_.state;\n"; + } + + if (internals_._features & bol_bit) + { + os_ << " bool end_bol_ = bol_;\n"; + } + + if (internals_._features & eol_bit) + { + os_ << " "; + + if (pointers_) + { + os_ << "const void * const *"; + } + else + { + os_ << "id_type "; + } + + os_ << "EOL_state_ = 0;\n"; + } + + os_ << '\n'; + + if (internals_._features & bol_bit) + { + os_ << " if (bol_)\n"; + os_ << " {\n"; + os_ << " const "; + + if (pointers_) + { + os_ << "void *"; + } + else + { + os_ << "id_type "; + } + + os_ << "state_ = *dfa_;\n\n"; + os_ << " if (state_)\n"; + os_ << " {\n"; + os_ << " ptr_ = "; + + if (pointers_) + { + os_ << "reinterpret_cast(state_);\n"; + } + else + { + os_ << "&dfa_[state_ * dfa_alphabet_];\n"; + } + + os_ << " }\n"; + os_ << " }\n\n"; + } + + os_ << " while (curr_ != results_.eoi)\n"; + os_ << " {\n"; + + if (internals_._features & eol_bit) + { + os_ << " EOL_state_ = "; + + if (pointers_) + { + os_ << "reinterpret_cast("; + } + + os_ << "ptr_[" << eol_index << ']'; + + if (pointers_) + { + os_ << ')'; + } + + os_ << ";\n\n"; + os_ << " if (EOL_state_ && *curr_ == '\\n')\n"; + os_ << " {\n"; + os_ << " ptr_ = "; + + if (pointers_) + { + os_ << "EOL_state_"; + } + else + { + os_ << "&dfa_[EOL_state_ * dfa_alphabet_]"; + } + + os_ << ";\n"; + os_ << " }\n"; + os_ << " else\n"; + os_ << " {\n"; + ++additional_tabs_; + } + + output_char_loop (internals_._features, additional_tabs_, pointers_, + os_, bool_<(sizeof (typename sm::traits::input_char_type) > 1)> ()); + + if (internals_._features & eol_bit) + { + output_tabs (additional_tabs_, os_); + os_ << " }\n"; + --additional_tabs_; + } + + os_ << '\n'; + os_ << " if (*ptr_)\n"; + os_ << " {\n"; + os_ << " end_state_ = true;\n"; + + + if (internals_._features & recursive_bit) + { + os_ << " pop_ = ("; + + if (pointers_) + { + // Done this way for GCC: + os_ << "static_cast(reinterpret_cast("; + } + + os_ << "*ptr_"; + + if (pointers_) + { + os_ << ')'; + } + + os_ <<" & " << 
pop_dfa_bit; + + if (pointers_) + { + os_ << ')'; + } + + os_ << ") != 0;\n"; + } + + os_ << " id_ = "; + + if (pointers_) + { + // Done this way for GCC: + os_ << "static_cast(reinterpret_cast("; + } + + os_ << "*(ptr_ + " << id_index << ")"; + + if (pointers_) + { + os_ << "))"; + } + + os_ << ";\n"; + os_ << " uid_ = "; + + if (pointers_) + { + // Done this way for GCC: + os_ << "static_cast(reinterpret_cast("; + } + + os_ << "*(ptr_ + " << user_id_index << ")"; + + if (pointers_) + { + os_ << "))"; + } + + os_ << ";\n"; + + if (internals_._features & recursive_bit) + { + os_ << " push_dfa_ = "; + + if (pointers_) + { + // Done this way for GCC: + os_ << "static_cast(reinterpret_cast("; + } + + os_ << "*(ptr_ + " << push_dfa_index << ')'; + + if (pointers_) + { + os_ << "))"; + } + + os_ << ";\n"; + } + + if (internals_._dfa->size () > 1) + { + os_ << " start_state_ = "; + + if (pointers_) + { + // Done this way for GCC: + os_ << "static_cast(reinterpret_cast("; + } + + os_ << "*(ptr_ + " << next_dfa_index << ')'; + + if (pointers_) + { + os_ << "))"; + } + + os_ << ";\n"; + } + + if (internals_._features & bol_bit) + { + os_ << " end_bol_ = bol_;\n"; + } + + os_ << " end_token_ = curr_;\n"; + os_ << " }\n"; + os_ << " }\n\n"; + output_quit (os_, + bool_<(sizeof (typename sm::traits::input_char_type) > 1)> ()); + + if (internals_._features & eol_bit) + { + os_ << " if (curr_ == results_.eoi)\n"; + os_ << " {\n"; + os_ << " EOL_state_ = "; + + if (pointers_) + { + os_ << "reinterpret_cast("; + } + + os_ << "ptr_[" << eol_index << ']'; + + if (pointers_) + { + os_ << ')'; + } + + os_ << ";\n"; + os_ << "\n"; + os_ << " if (EOL_state_)\n"; + os_ << " {\n"; + os_ << " ptr_ = "; + + if (pointers_) + { + os_ << "EOL_state_"; + } + else + { + os_ << "&dfa_[EOL_state_ * dfa_alphabet_]"; + } + + os_ << ";\n\n"; + os_ << " if (*ptr_)\n"; + os_ << " {\n"; + os_ << " end_state_ = true;\n"; + + + if (internals_._features & recursive_bit) + { + os_ << " pop_ = ("; + + if 
(pointers_) + { + // Done this way for GCC: + os_ << "static_cast(reinterpret_cast("; + } + + os_ << "*ptr_"; + + if (pointers_) + { + os_ << ')'; + } + + os_ <<" & " << pop_dfa_bit; + + if (pointers_) + { + os_ << ')'; + } + + os_ << ") != 0;\n"; + } + + os_ << " id_ = "; + + if (pointers_) + { + // Done this way for GCC: + os_ << "static_cast(reinterpret_cast("; + } + + os_ << "*(ptr_ + " << id_index << ")"; + + if (pointers_) + { + os_ << "))"; + } + + os_ << ";\n"; + os_ << " uid_ = "; + + if (pointers_) + { + // Done this way for GCC: + os_ << "static_cast(reinterpret_cast("; + } + + os_ << "*(ptr_ + " << user_id_index << ")"; + + if (pointers_) + { + os_ << "))"; + } + + os_ <<";\n"; + + if (internals_._features & recursive_bit) + { + os_ << " push_dfa_ = "; + + if (pointers_) + { + // Done this way for GCC: + os_ << "static_cast(reinterpret_cast("; + } + + os_ << "*(ptr_ + " << push_dfa_index << ')'; + + if (pointers_) + { + os_ << "))"; + } + + os_ << ";\n"; + } + + if (internals_._dfa->size () > 1) + { + os_ << " start_state_ = "; + + if (pointers_) + { + // Done this way for GCC: + os_ << "static_cast(reinterpret_cast("; + } + + os_ << "*(ptr_ + " << next_dfa_index << ')'; + + if (pointers_) + { + os_ << "))"; + } + + os_ << ";\n"; + } + + if (internals_._features & bol_bit) + { + os_ << " end_bol_ = bol_;\n"; + } + + os_ << " end_token_ = curr_;\n"; + os_ << " }\n"; + os_ << " }\n"; + os_ << " }\n\n"; + } + + os_ << " if (end_state_)\n"; + os_ << " {\n"; + os_ << " // Return longest match\n"; + + if (internals_._features & recursive_bit) + { + os_ << " if (pop_)\n"; + os_ << " {\n"; + os_ << " start_state_ = results_." 
+ "stack.top ().first;\n"; + os_ << " results_.stack.pop ();\n"; + os_ << " }\n"; + os_ << " else if (push_dfa_ != results_.npos ())\n"; + os_ << " {\n"; + os_ << " results_.stack.push (typename results::" + "id_type_pair\n"; + os_ << " (push_dfa_, id_));\n"; + os_ << " }\n\n"; + } + + if (internals_._dfa->size () > 1) + { + os_ << " results_.state = start_state_;\n"; + } + + if (internals_._features & bol_bit) + { + os_ << " results_.bol = end_bol_;\n"; + } + + os_ << " results_.end = end_token_;\n"; + + if (internals_._features & skip_bit) + { + // We want a number regardless of id_type. + os_ << "\n if (id_ == results_.skip ()) goto skip;\n"; + } + + if (internals_._features & again_bit) + { + // We want a number regardless of id_type. + os_ << "\n if (id_ == " + << static_cast(internals_._eoi); + + if (internals_._features & recursive_bit) + { + os_ << " || (pop_ && !results_.stack.empty () &&\n"; + // We want a number regardless of id_type. + os_ << " results_.stack.top ().second == " + << static_cast(internals_._eoi) << ')'; + } + + os_ << ")\n"; + os_ << " {\n"; + os_ << " curr_ = end_token_;\n"; + os_ << " goto again;\n"; + os_ << " }\n"; + } + + os_ << " }\n"; + os_ << " else\n"; + os_ << " {\n"; + os_ << " // No match causes char to be skipped\n"; + os_ << " results_.end = end_token_;\n"; + + if (internals_._features & bol_bit) + { + os_ << " results_.bol = *results_.end == '\\n';\n"; + } + + os_ << " results_.start = results_.end;\n"; + os_ << " ++results_.end;\n"; + os_ << " id_ = results::npos ();\n"; + os_ << " uid_ = results::npos ();\n"; + os_ << " }\n\n"; + os_ << " results_.id = id_;\n"; + os_ << " results_.user_id = uid_;\n"; + os_ << "}\n"; + } + + template + static void dump_tables + (const basic_state_machine &sm_, + const std::size_t tabs_, const bool pointers_, std::ostream &os_) + { + const typename detail::basic_internals &internals_ = + sm_.data (); + const std::size_t lookup_divisor_ = 8; + // Lookup is always 256 entries long now + 
const std::size_t lookup_quotient_ = 256 / lookup_divisor_; + const std::size_t dfas_ = internals_._lookup->size (); + std::size_t col_ = 1; + std::size_t row_ = 1; + + output_tabs (tabs_, os_); + os_ << "static const id_type lookup"; + + if (dfas_ > 1) + { + os_ << "s_[][" << 256; + } + else + { + os_ << "_["; + } + + os_ << "] = \n"; + output_tabs (tabs_ + 1, os_); + + if (dfas_ > 1) + { + os_ << '{'; + } + + for (std::size_t l_ = 0; l_ < dfas_; ++l_) + { + const id_type *ptr_ = &internals_._lookup[l_]->front (); + + // We want numbers regardless of id_type. + os_ << "{0x" << std::hex << static_cast(*ptr_++); + + for (col_ = 1; col_ < lookup_divisor_; ++col_) + { + // We want numbers regardless of id_type. + os_ << ", 0x" << std::hex << static_cast(*ptr_++); + } + + for (row_ = 1; row_ < lookup_quotient_; ++row_) + { + os_ << ",\n"; + output_tabs (tabs_ + 1, os_); + // We want numbers regardless of id_type. + os_ << "0x" << std::hex << static_cast(*ptr_++); + + for (col_ = 1; col_ < lookup_divisor_; ++col_) + { + // We want numbers regardless of id_type. + os_ << ", 0x" << std::hex << + static_cast(*ptr_++); + } + } + + os_ << '}'; + + if (l_ + 1 < dfas_) + { + os_ << ",\n"; + output_tabs (tabs_ + 1, os_); + } + } + + if (dfas_ > 1) + { + os_ << '}'; + } + + os_ << ";\n"; + output_tabs (tabs_, os_); + os_ << "static const id_type dfa_alphabet"; + + if (dfas_ > 1) + { + os_ << "s_[" << dfas_ << "] = {"; + } + else + { + os_ << "_ = "; + } + + // We want numbers regardless of id_type. + os_ << "0x" << std::hex << static_cast + (internals_._dfa_alphabet[0]); + + for (col_ = 1; col_ < dfas_; ++col_) + { + // We want numbers regardless of id_type. + os_ << ", 0x" << std::hex << static_cast(internals_. 
+ _dfa_alphabet[col_]); + } + + if (dfas_ > 1) + { + os_ << '}'; + } + + os_ << ";\n"; + + // DFAs are usually different sizes, so dump separately + for (std::size_t dfa_ = 0; dfa_ < dfas_; ++dfa_) + { + const id_type dfa_alphabet_ = internals_._dfa_alphabet[dfa_]; + const std::size_t rows_ = internals_._dfa[dfa_]->size () / + dfa_alphabet_; + const id_type *ptr_ = &internals_._dfa[dfa_]->front (); + std::string dfa_name_ = "dfa"; + + output_tabs (tabs_, os_); + os_ << "static const "; + + if (pointers_) + { + os_ << "void *"; + } + else + { + os_ << "id_type "; + } + + os_ << dfa_name_; + + if (dfas_ > 1) + { + std::ostringstream ss_; + + ss_ << dfa_; + dfa_name_ += ss_.str (); + os_ << dfa_; + } + + dfa_name_ += '_'; + os_ << "_[] = {"; + + for (std::size_t row_ = 0; row_ < rows_; ++row_) + { + dump_row (row_ == 0, ptr_, dfa_name_, dfa_alphabet_, + pointers_, os_); + + if (row_ + 1 < rows_) + { + os_ << ",\n"; + output_tabs (tabs_ + 1, os_); + } + } + + os_ << "};\n"; + } + + if (dfas_ > 1) + { + output_tabs (tabs_, os_); + os_ << "static const "; + + if (pointers_) + { + os_ << "void * const"; + } + else + { + os_ << "id_type"; + } + + os_ << " *dfas_[] = {dfa0_"; + + for (col_ = 1; col_ < dfas_; ++col_) + { + os_ << ", dfa" << col_ << '_'; + } + + os_ << "};\n"; + } + } + +protected: + template + static void dump_row (const bool first_, const id_type * &ptr_, + const std::string &dfa_name_, const id_type dfa_alphabet_, + const bool pointers_, std::ostream &os_) + { + if (pointers_) + { + bool zero_ = *ptr_ == 0; + + if (first_) + { + // We want numbers regardless of id_type. + os_ << dfa_name_ << " + 0x" << std::hex << + static_cast(*ptr_++) * dfa_alphabet_; + } + else if (!zero_) + { + os_ << "reinterpret_cast(0x" + // We want numbers regardless of id_type. + << std::hex << static_cast(*ptr_++) << ')'; + } + else + { + // We want numbers regardless of id_type. 
+ os_ << "0x" << std::hex << static_cast(*ptr_++); + } + + for (id_type id_index_ = id_index; id_index_ < transitions_index; + ++id_index_, ++ptr_) + { + os_ << ", "; + zero_ = *ptr_ == 0; + + if (!zero_) + { + os_ << "reinterpret_cast("; + } + + // We want numbers regardless of id_type. + os_ << "0x" << std::hex << static_cast(*ptr_); + + if (!zero_) + { + os_ << ')'; + } + } + + for (id_type alphabet_ = transitions_index; + alphabet_ < dfa_alphabet_; ++alphabet_, ++ptr_) + { + // We want numbers regardless of id_type. + os_ << ", "; + + if (*ptr_ == 0) + { + os_ << 0; + } + else + { + // We want numbers regardless of id_type. + os_ << dfa_name_ + " + 0x" << std::hex << + static_cast(*ptr_) * dfa_alphabet_; + } + } + } + else + { + // We want numbers regardless of id_type. + os_ << "0x" << std::hex << static_cast(*ptr_++); + + for (id_type alphabet_ = 1; alphabet_ < dfa_alphabet_; + ++alphabet_, ++ptr_) + { + // We want numbers regardless of id_type. + os_ << ", 0x" << std::hex << static_cast(*ptr_); + } + } + } + + static void output_tabs (const std::size_t tabs_, std::ostream &os_) + { + for (std::size_t i_ = 0; i_ < tabs_; ++i_) + { + os_ << " "; + } + } + + template + static void output_char_loop (const id_type features_, + const std::size_t additional_tabs_, const bool pointers_, + std::ostream &os_, const false_ &) + { + output_tabs (additional_tabs_, os_); + os_ << " const typename results::char_type prev_char_ = " + "*curr_++;\n"; + output_tabs (additional_tabs_, os_); + os_ << " const "; + + if (pointers_) + { + os_ << "void * const *"; + } + else + { + os_ << "id_type "; + } + + os_ << "state_ = "; + + if (pointers_) + { + os_ << "reinterpret_cast\n "; + output_tabs (additional_tabs_, os_); + os_ << '('; + } + + os_ << "ptr_[lookup_"; + + if (!pointers_) + { + os_ << "\n "; + output_tabs (additional_tabs_, os_); + } + + os_ << "[static_cast"; + + if (pointers_) + { + os_ << "\n "; + output_tabs (additional_tabs_, os_); + } + + os_ << "(prev_char_)]]"; + 
+ if (pointers_) + { + os_ << ')'; + } + + os_ << ";\n\n"; + + if (features_ & bol_bit) + { + output_tabs (additional_tabs_, os_); + os_ << " bol_ = prev_char_ == '\\n';\n\n"; + } + + output_tabs (additional_tabs_, os_); + os_ << " if (state_ == 0)\n"; + output_tabs (additional_tabs_, os_); + os_ << " {\n"; + + if (features_ & eol_bit) + { + output_tabs (additional_tabs_, os_); + os_ << " EOL_state_ = 0;\n"; + } + + output_tabs (additional_tabs_, os_); + os_ << " break;\n"; + output_tabs (additional_tabs_, os_); + os_ << " }\n\n"; + output_tabs (additional_tabs_, os_); + os_ << " ptr_ = "; + + if (pointers_) + { + os_ << "state_"; + } + else + { + os_ << "&dfa_[state_ * dfa_alphabet_]"; + } + + os_ << ";\n"; + } + + template + static void output_char_loop (const id_type features_, + const std::size_t additional_tabs_, const bool pointers_, + std::ostream &os_, const true_ &) + { + output_tabs (additional_tabs_, os_); + os_ << " const std::size_t bytes_ =\n"; + output_tabs (additional_tabs_, os_); + os_ << " sizeof (typename results::char_type) < 3 ?\n"; + output_tabs (additional_tabs_, os_); + os_ << " sizeof (typename results::char_type) : 3;\n"; + output_tabs (additional_tabs_, os_); + os_ << " const std::size_t shift_[] = {0, 8, 16};\n"; + output_tabs (additional_tabs_, os_); + os_ << " typename results::char_type prev_char_ = " + "*curr_++;\n\n"; + + if (features_ & bol_bit) + { + output_tabs (additional_tabs_, os_); + os_ << " bol_ = prev_char_ == '\\n';\n\n"; + } + + output_tabs (additional_tabs_, os_); + os_ << " for (std::size_t i_ = 0; i_ < bytes_; ++i_)\n"; + output_tabs (additional_tabs_, os_); + os_ << " {\n"; + output_tabs (additional_tabs_, os_); + os_ << " const "; + + if (pointers_) + { + os_ << "void * const *"; + } + else + { + os_ << "id_type "; + } + + os_ << "state_ = "; + + if (pointers_) + { + os_ << "reinterpret_cast\n "; + output_tabs (additional_tabs_, os_); + os_ << '('; + } + + os_ << "ptr_[lookup_[static_cast\n"; + output_tabs 
(additional_tabs_, os_); + os_ << " ((prev_char_ >>\n" + " shift_[bytes_ - 1 - i_]) & 0xff)]]"; + + if (pointers_) + { + os_ << ')'; + } + + os_ << ";\n\n"; + output_tabs (additional_tabs_, os_); + os_ << " if (state_ == 0)\n"; + output_tabs (additional_tabs_, os_); + os_ << " {\n"; + + if (features_ & eol_bit) + { + output_tabs (additional_tabs_, os_); + os_ << " EOL_state_ = 0;\n"; + } + + output_tabs (additional_tabs_, os_); + os_ << " goto quit;\n"; + output_tabs (additional_tabs_, os_); + os_ << " }\n\n"; + output_tabs (additional_tabs_, os_); + os_ << " ptr_ = "; + + if (pointers_) + { + os_ << "state_"; + } + else + { + os_ << "&dfa_[state_ * dfa_alphabet_]"; + } + + os_ << ";\n"; + output_tabs (additional_tabs_, os_); + os_ << " }\n"; + } + + static void output_quit (std::ostream &, const false_ &) + { + // Nothing to do + } + + static void output_quit (std::ostream &os_, const true_ &) + { + os_ << "quit:\n"; + } +}; +} + +#endif diff --git a/inc/lexertl/generator.hpp b/inc/lexertl/generator.hpp new file mode 100644 index 0000000..f930d23 --- /dev/null +++ b/inc/lexertl/generator.hpp @@ -0,0 +1,829 @@ +// generator.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_GENERATOR_HPP +#define LEXERTL_GENERATOR_HPP + +#include +#include "bool.hpp" +#include "partition/charset.hpp" +#include "char_traits.hpp" +#include "partition/equivset.hpp" +#include +#include "parser/parser.hpp" +#include "containers/ptr_list.hpp" +#include "rules.hpp" +#include "size_t.hpp" +#include "state_machine.hpp" + +namespace lexertl +{ +template > +class basic_generator +{ +public: + typedef typename rules::id_type id_type; + typedef typename rules::char_type rules_char_type; + typedef typename sm::traits sm_traits; + typedef detail::basic_parser parser; + typedef typename parser::charset_map charset_map; + typedef typename parser::node node; + typedef typename parser::node_ptr_vector node_ptr_vector; + + static void build (const rules &rules_, sm &sm_) + { + const std::size_t size_ = rules_.statemap ().size (); + // Strong exception guarantee + // http://www.boost.org/community/exception_safety.html + internals internals_; + sm temp_sm_; + node_ptr_vector node_ptr_vector_; + + internals_._eoi = rules_.eoi (); + internals_.add_states (size_); + + for (id_type index_ = 0; index_ < size_; ++index_) + { + if (rules_.regexes ()[index_].empty ()) + { + std::ostringstream ss_; + + ss_ << "Lexer states with no rules are not allowed " + "(lexer state " << index_ << ".)"; + throw runtime_error (ss_.str ()); + } + else + { + // Note that the following variables are per DFA. + // Map of regex charset tokens (strings) to index + charset_map charset_map_; + // Used to fix up $ and \n clashes. 
+ id_type nl_id_ = sm_traits::npos (); + // Regex syntax tree + node *root_ = build_tree (rules_, index_, node_ptr_vector_, + charset_map_, nl_id_); + + build_dfa (charset_map_, root_, internals_, temp_sm_, index_, + nl_id_); + + if (internals_._dfa[index_]->size () / + internals_._dfa_alphabet[index_] >= sm_traits::npos ()) + { + // Overflow + throw runtime_error ("The data type you have chosen " + "cannot hold this many DFA rows."); + } + } + } + + // If you get a compile error here the id_type from rules and + // state machine do no match. + create (internals_, temp_sm_, rules_.features (), lookup ()); + sm_.swap (temp_sm_); + } + + static node *build_tree (const rules &rules_, const std::size_t dfa_, + node_ptr_vector &node_ptr_vector_, charset_map &charset_map_, + id_type &nl_id_) + { + typename parser::macro_map macro_map_; + parser parser_ (rules_.locale (), node_ptr_vector_, macro_map_, + charset_map_, rules_.eoi ()); + const typename rules::string_deque_deque ®exes_ = + rules_.regexes (); + typename rules::string_deque::const_iterator regex_iter_ = + regexes_[dfa_].begin (); + typename rules::string_deque::const_iterator regex_iter_end_ = + regexes_[dfa_].end (); + const typename rules::string ®ex_ = *regex_iter_; + const typename rules::id_vector_deque &ids_ = rules_.ids (); + const typename rules::id_vector_deque &user_ids_ = + rules_.user_ids (); + typename rules::id_vector::const_iterator id_iter_ = + ids_[dfa_].begin (); + typename rules::id_vector::const_iterator user_id_iter_ = + user_ids_[dfa_].begin (); + const typename rules::id_vector_deque &next_dfas_ = + rules_.next_dfas (); + const typename rules::id_vector_deque &pushes_ = rules_.pushes (); + const typename rules::bool_vector_deque &pops_ = rules_.pops (); + typename rules::id_vector::const_iterator next_dfa_iter_ = + next_dfas_[dfa_].begin (); + typename rules::id_vector::const_iterator push_dfa_iter_ = + pushes_[dfa_].begin (); + typename rules::bool_vector::const_iterator pop_dfa_iter_ = 
+ pops_[dfa_].begin (); + const bool seen_bol_ = (rules_.features ()[dfa_] & bol_bit) != 0; + node *root_ = 0; + + // Macros have a different context per lexer state + // as equivsets (generally) differ. + build_macros (rules_, macro_map_, node_ptr_vector_, charset_map_, + nl_id_); + root_ = parser_.parse (regex_.c_str (), + regex_.c_str () + regex_.size (), *id_iter_, *user_id_iter_, + *next_dfa_iter_, *push_dfa_iter_, *pop_dfa_iter_, + rules_.flags (), nl_id_, seen_bol_, false); + ++regex_iter_; + ++id_iter_; + ++user_id_iter_; + ++next_dfa_iter_; + ++push_dfa_iter_; + ++pop_dfa_iter_; + + // Build syntax trees + while (regex_iter_ != regex_iter_end_) + { + // Re-declare var, otherwise we perform an assignment..! + const typename rules::string ®ex_ = *regex_iter_; + node *rhs_ = parser_.parse (regex_.c_str (), + regex_.c_str () + regex_.size (), *id_iter_, *user_id_iter_, + *next_dfa_iter_, *push_dfa_iter_, *pop_dfa_iter_, + rules_.flags (), nl_id_, + (rules_.features ()[dfa_] & bol_bit) != 0, false); + + node_ptr_vector_->push_back + (static_cast(0)); + node_ptr_vector_->back () = new selection_node (root_, rhs_); + root_ = node_ptr_vector_->back (); + + ++regex_iter_; + ++id_iter_; + ++user_id_iter_; + ++next_dfa_iter_; + ++push_dfa_iter_; + ++pop_dfa_iter_; + } + + return root_; + } + +protected: + typedef bool_ compressed; + typedef detail::basic_equivset equivset; + typedef detail::ptr_list equivset_list; + typedef std::auto_ptr equivset_ptr; + typedef typename sm_traits::char_type sm_char_type; + typedef detail::basic_charset charset; + typedef std::auto_ptr charset_ptr; + typedef detail::ptr_list charset_list; + typedef detail::basic_internals internals; + typedef typename std::set id_type_set; + typedef typename internals::id_type_vector id_type_vector; + typedef typename charset::index_set index_set; + typedef std::vector index_set_vector; + typedef bool_ is_dfa; + typedef bool_ lookup; + typedef typename parser::macro_map macro_map; + typedef typename 
macro_map::iterator macro_iter; + typedef std::pair macro_iter_pair; + typedef std::set node_set; + typedef detail::ptr_vector node_set_vector; + typedef typename node::node_vector node_vector; + typedef detail::ptr_vector node_vector_vector; + typedef std::pair macro_pair; + typedef typename parser::selection_node selection_node; + typedef typename std::vector size_t_vector; + typedef typename parser::string_token string_token; + + static void build_macros (const rules &rules_, + macro_map ¯o_map_, node_ptr_vector &node_ptr_vector_, + charset_map &charset_map_, id_type &nl_id_) + { + const typename rules::string_pair_deque ¯odeque_ = + rules_.macrodeque (); + + for (typename rules::string_pair_deque::const_iterator iter_ = + macrodeque_.begin (), end_ = macrodeque_.end (); + iter_ != end_; ++iter_) + { + const typename rules::string &name_ = iter_->first; + const typename rules::string ®ex_ = iter_->second; + parser parser_ (rules_.locale (), node_ptr_vector_, macro_map_, + charset_map_, rules_.eoi ()); + node *node_ = parser_.parse (regex_.c_str (), + regex_.c_str () + regex_.size (), 0, 0, 0, false, false, + rules_.flags (), nl_id_, false, true); + macro_iter_pair map_iter_ = macro_map_.insert (macro_pair (name_, + static_cast(0))); + + map_iter_.first->second = node_; + } + } + + static void build_dfa (const charset_map &charset_map_, const node *root_, + internals &internals_, sm &sm_, const id_type dfa_index_, + id_type &nl_id_) + { + // partitioned charset list + charset_list charset_list_; + // vector mapping token indexes to partitioned token index sets + index_set_vector set_mapping_; + typename internals::id_type_vector &dfa_ = + *internals_._dfa[dfa_index_]; + std::size_t dfa_alphabet_ = 0; + const node_vector *followpos_ = &root_->firstpos (); + node_set_vector seen_sets_; + node_vector_vector seen_vectors_; + size_t_vector hash_vector_; + id_type zero_id_ = sm_traits::npos (); + id_type_set eol_set_; + + set_mapping_.resize (charset_map_.size ()); + 
partition_charsets (charset_map_, charset_list_, is_dfa ()); + build_set_mapping (charset_list_, internals_, dfa_index_, + set_mapping_); + + if (nl_id_ != sm_traits::npos ()) + { + nl_id_ = *set_mapping_[nl_id_].begin (); + zero_id_ = sm_traits::compressed ? + *set_mapping_[charset_map_.find (string_token (0, 0))-> + second].begin () : sm_traits::npos (); + } + + dfa_alphabet_ = charset_list_->size () + transitions_index + + (nl_id_ == sm_traits::npos () ? 0 : 1); + + if (dfa_alphabet_ > sm_traits::npos ()) + { + // Overflow + throw runtime_error ("The data type you have chosen cannot hold " + "the dfa alphabet."); + } + + internals_._dfa_alphabet[dfa_index_] = dfa_alphabet_; + // 'jam' state + dfa_.resize (dfa_alphabet_, 0); + closure (followpos_, seen_sets_, seen_vectors_, hash_vector_, + dfa_alphabet_, dfa_); + + for (id_type index_ = 0; index_ < static_cast + (seen_vectors_->size ()); ++index_) + { + equivset_list equiv_list_; + + build_equiv_list (seen_vectors_[index_], set_mapping_, + equiv_list_, is_dfa ()); + + for (typename equivset_list::list::const_iterator iter_ = + equiv_list_->begin (), end_ = equiv_list_->end (); + iter_ != end_; ++iter_) + { + equivset *equivset_ = *iter_; + const id_type transition_ = closure + (&equivset_->_followpos, seen_sets_, seen_vectors_, + hash_vector_, dfa_alphabet_, dfa_); + + if (transition_ != sm_traits::npos ()) + { + id_type *ptr_ = &dfa_.front () + ((index_ + 1) * + dfa_alphabet_); + + // Prune abstemious transitions from end states. 
+ if (*ptr_ && !equivset_->_greedy) continue; + + for (typename equivset::index_vector::const_iterator + equiv_iter_ = equivset_->_index_vector.begin (), + equiv_end_ = equivset_->_index_vector.end (); + equiv_iter_ != equiv_end_; ++equiv_iter_) + { + const id_type i_ = *equiv_iter_; + + if (i_ == parser::bol_token ()) + { + dfa_.front () = transition_; + } + else if (i_ == parser:: eol_token ()) + { + ptr_[eol_index] = transition_; + eol_set_.insert (index_ + 1); + } + else + { + ptr_[i_ + transitions_index] = transition_; + } + } + } + } + } + + fix_clashes (eol_set_, nl_id_, zero_id_, dfa_, dfa_alphabet_, + compressed ()); + append_dfa (charset_list_, internals_, sm_, dfa_index_, lookup ()); + } + + // Uncompressed + static void fix_clashes (const id_type_set &eol_set_, + const id_type nl_id_, const id_type /*zero_id_*/, + typename internals::id_type_vector &dfa_, + const std::size_t dfa_alphabet_, const false_ &) + { + typename id_type_set::const_iterator eol_iter_ = + eol_set_.begin (); + typename id_type_set::const_iterator eol_end_ = + eol_set_.end (); + + for (; eol_iter_ != eol_end_; ++eol_iter_) + { + id_type *ptr_ = &dfa_.front () + *eol_iter_ * dfa_alphabet_; + const id_type eol_state_ = ptr_[eol_index]; + const id_type nl_state_ = ptr_[nl_id_ + transitions_index]; + + if (nl_state_) + { + ptr_[transitions_index + nl_id_] = 0; + ptr_ = &dfa_.front () + eol_state_ * dfa_alphabet_; + + if (ptr_[transitions_index + nl_id_] == 0) + { + ptr_[transitions_index + nl_id_] = nl_state_; + } + } + } + } + + // Compressed + static void fix_clashes (const id_type_set &eol_set_, + const id_type nl_id_, const id_type zero_id_, + typename internals::id_type_vector &dfa_, + const std::size_t dfa_alphabet_, const true_ &) + { + typename id_type_set::const_iterator eol_iter_ = + eol_set_.begin (); + typename id_type_set::const_iterator eol_end_ = + eol_set_.end (); + std::size_t i_ = 0; + + for (; eol_iter_ != eol_end_; ++eol_iter_) + { + id_type *ptr_ = &dfa_.front () + 
*eol_iter_ * dfa_alphabet_; + const id_type eol_state_ = ptr_[eol_index]; + id_type nl_state_ = 0; + + for (; i_ < (sm_traits::char_24_bit ? 2 : 1); ++i_) + { + ptr_ = &dfa_.front () + ptr_[transitions_index + zero_id_] * + dfa_alphabet_; + } + + nl_state_ = ptr_[transitions_index + nl_id_]; + + if (nl_state_) + { + ptr_ = &dfa_.front () + eol_state_ * dfa_alphabet_; + + if (ptr_[transitions_index + zero_id_] != 0) continue; + + ptr_[transitions_index + zero_id_] = dfa_.size () / + dfa_alphabet_; + dfa_.resize (dfa_.size () + dfa_alphabet_, 0); + + for (i_ = 0; i_ < (sm_traits::char_24_bit ? 1 : 0); ++i_) + { + ptr_ = &dfa_.front () + dfa_.size () - dfa_alphabet_; + ptr_[transitions_index + zero_id_] = dfa_.size () / + dfa_alphabet_; + dfa_.resize (dfa_.size () + dfa_alphabet_, 0); + } + + ptr_ = &dfa_.front () + dfa_.size () - dfa_alphabet_; + ptr_[transitions_index + nl_id_] = nl_state_; + } + } + } + + // char_state_machine version + static void append_dfa (const charset_list &charset_list_, + const internals &internals_, sm &sm_, const id_type dfa_index_, + const false_ &) + { + typename charset_list::list::const_iterator list_iter_ = + charset_list_->begin (); + std::size_t size_ = charset_list_->size (); + typename sm::string_token_vector token_vector_; + + token_vector_.reserve (size_); + + for (std::size_t i_ = 0; i_ < size_; ++i_, ++list_iter_) + { + const charset *charset_ = *list_iter_; + + token_vector_.push_back (charset_->_token); + } + + sm_.append (token_vector_, internals_, dfa_index_); + } + + // state_machine version + static void append_dfa (const charset_list &, + const internals &, sm &, const id_type, const true_ &) + { + // Nothing to do - will use create() instead + } + + // char_state_machine version + static void create (internals &, sm &, const id_type_vector &, + const false_ &) + { + // Nothing to do - will use append_dfa() instead + } + + // state_machine version + static void create (internals &internals_, sm &sm_, + const 
id_type_vector &features_, const true_ &) + { + for (std::size_t i_ = 0, size_ = internals_._dfa->size (); + i_ < size_; ++i_) + { + internals_._features |= features_[i_]; + } + + if (internals_._dfa->size () > 1) + { + internals_._features |= multi_state_bit; + } + + sm_.data ().swap (internals_); + } + + // NFA version + static void partition_charsets (const charset_map &map_, + charset_list &lhs_, const false_ &) + { + fill_rhs_list (map_, lhs_); + } + + // DFA version + static void partition_charsets (const charset_map &map_, + charset_list &lhs_, const true_ &) + { + charset_list rhs_; + + fill_rhs_list (map_, rhs_); + + if (!rhs_->empty ()) + { + typename charset_list::list::iterator iter_; + typename charset_list::list::iterator end_; + charset_ptr overlap_ (new charset); + + lhs_->push_back (static_cast(0)); + lhs_->back () = rhs_->front (); + rhs_->pop_front (); + + while (!rhs_->empty ()) + { + charset_ptr r_ (rhs_->front ()); + + rhs_->pop_front (); + iter_ = lhs_->begin (); + end_ = lhs_->end (); + + while (!r_->empty () && iter_ != end_) + { + typename charset_list::list::iterator l_iter_ = iter_; + + (*l_iter_)->intersect (*r_.get (), *overlap_.get ()); + + if (overlap_->empty ()) + { + ++iter_; + } + else if ((*l_iter_)->empty ()) + { + delete *l_iter_; + *l_iter_ = overlap_.release (); + overlap_.reset (new charset); + ++iter_; + } + else if (r_->empty ()) + { + delete r_.release (); + r_ = overlap_; + overlap_.reset (new charset); + break; + } + else + { + iter_ = lhs_->insert (++iter_, + static_cast(0)); + *iter_ = overlap_.release (); + overlap_.reset (new charset); + ++iter_; + end_ = lhs_->end (); + } + } + + if (!r_->empty ()) + { + lhs_->push_back (static_cast(0)); + lhs_->back () = r_.release (); + } + } + } + } + + static void fill_rhs_list (const charset_map &map_, + charset_list &list_) + { + typename charset_map::const_iterator iter_ = map_.begin (); + typename charset_map::const_iterator end_ = map_.end (); + + for (; iter_ != end_; 
++iter_) + { + list_->push_back (static_cast(0)); + list_->back () = new charset (iter_->first, iter_->second); + } + } + + static void build_set_mapping (const charset_list &charset_list_, + internals &internals_, const id_type dfa_index_, + index_set_vector &set_mapping_) + { + typename charset_list::list::const_iterator iter_ = + charset_list_->begin (); + typename charset_list::list::const_iterator end_ = + charset_list_->end (); + typename index_set::const_iterator set_iter_; + typename index_set::const_iterator set_end_; + + for (id_type index_ = 0; iter_ != end_; ++iter_, ++index_) + { + const charset *cs_ = *iter_; + + set_iter_ = cs_->_index_set.begin (); + set_end_ = cs_->_index_set.end (); + fill_lookup (cs_->_token, internals_._lookup[dfa_index_], + index_, lookup ()); + + for (; set_iter_ != set_end_; ++set_iter_) + { + set_mapping_[*set_iter_].insert (index_); + } + } + } + + // char_state_machine version + static void fill_lookup (const string_token &, id_type_vector *, + const id_type, const false_ &) + { + // Do nothing (lookup not used) + } + + // state_machine version + static void fill_lookup (const string_token &charset_, + id_type_vector *lookup_, const id_type index_, const true_ &) + { + typename string_token::range_vector::const_iterator iter_ = + charset_._ranges.begin (); + typename string_token::range_vector::const_iterator end_ = + charset_._ranges.end (); + id_type *ptr_ = &lookup_->front (); + + for (; iter_ != end_; ++iter_) + { + for (typename char_traits::index_type char_ = iter_->first; + char_ < iter_->second; ++char_) + { + // Note char_ must be unsigned + ptr_[char_] = index_ + transitions_index; + } + + // Note iter_->second must be unsigned + ptr_[iter_->second] = index_ + transitions_index; + } + } + + static id_type closure (const node_vector *followpos_, + node_set_vector &seen_sets_, node_vector_vector &seen_vectors_, + size_t_vector &hash_vector_, const id_type size_, id_type_vector &dfa_) + { + bool end_state_ = false; 
+ id_type id_ = 0; + id_type user_id_ = sm_traits::npos (); + id_type next_dfa_ = 0; + id_type push_dfa_ = sm_traits::npos (); + bool pop_dfa_ = false; + std::size_t hash_ = 0; + + if (followpos_->empty ()) return sm_traits::npos (); + + id_type index_ = 0; + std::auto_ptr set_ptr_ (new node_set); + std::auto_ptr vector_ptr_ (new node_vector); + + for (typename node_vector::const_iterator iter_ = + followpos_->begin (), end_ = followpos_->end (); + iter_ != end_; ++iter_) + { + closure_ex (*iter_, end_state_, id_, user_id_, next_dfa_, + push_dfa_, pop_dfa_, set_ptr_.get (), + vector_ptr_.get (), hash_); + } + + bool found_ = false; + typename size_t_vector::const_iterator hash_iter_ = + hash_vector_.begin (); + typename size_t_vector::const_iterator hash_end_ = + hash_vector_.end (); + typename node_set_vector::vector::const_iterator set_iter_ = + seen_sets_->begin (); + + for (; hash_iter_ != hash_end_; ++hash_iter_, ++set_iter_) + { + found_ = *hash_iter_ == hash_ && *(*set_iter_) == *set_ptr_; + ++index_; + + if (found_) break; + } + + if (!found_) + { + seen_sets_->push_back (static_cast(0)); + seen_sets_->back () = set_ptr_.release (); + seen_vectors_->push_back (static_cast(0)); + seen_vectors_->back () = vector_ptr_.release (); + hash_vector_.push_back (hash_); + // State 0 is the jam state... 
+ index_ = static_cast(seen_sets_->size ()); + + const std::size_t old_size_ = dfa_.size (); + + dfa_.resize (old_size_ + size_, 0); + + if (end_state_) + { + dfa_[old_size_] |= end_state_bit; + + if (pop_dfa_) + { + dfa_[old_size_] |= pop_dfa_bit; + } + + dfa_[old_size_ + id_index] = id_; + dfa_[old_size_ + user_id_index] = user_id_; + dfa_[old_size_ + push_dfa_index] = push_dfa_; + dfa_[old_size_ + next_dfa_index] = next_dfa_; + } + } + + return index_; + } + + static void closure_ex (node *node_, bool &end_state_, + id_type &id_, id_type &user_id_, id_type &next_dfa_, + id_type &push_dfa_, bool &pop_dfa_, node_set *set_ptr_, + node_vector *vector_ptr_, std::size_t &hash_) + { + const bool temp_end_state_ = node_->end_state (); + + if (temp_end_state_) + { + if (!end_state_) + { + end_state_ = true; + id_ = node_->id (); + user_id_ = node_->user_id (); + next_dfa_ = node_->next_dfa (); + push_dfa_ = node_->push_dfa (); + pop_dfa_ = node_->pop_dfa (); + } + } + + if (set_ptr_->insert (node_).second) + { + vector_ptr_->push_back (node_); + hash_ += reinterpret_cast (node_); + } + } + + // NFA version + static void build_equiv_list (const node_vector *vector_, + const index_set_vector &set_mapping_, equivset_list &lhs_, + const false_ &) + { + fill_rhs_list (vector_, set_mapping_, lhs_); + } + + // DFA version + static void build_equiv_list (const node_vector *vector_, + const index_set_vector &set_mapping_, equivset_list &lhs_, + const true_ &) + { + equivset_list rhs_; + + fill_rhs_list (vector_, set_mapping_, rhs_); + + if (!rhs_->empty ()) + { + typename equivset_list::list::iterator iter_; + typename equivset_list::list::iterator end_; + equivset_ptr overlap_ (new equivset); + + lhs_->push_back (static_cast(0)); + lhs_->back () = rhs_->front (); + rhs_->pop_front (); + + while (!rhs_->empty ()) + { + equivset_ptr r_ (rhs_->front ()); + + rhs_->pop_front (); + iter_ = lhs_->begin (); + end_ = lhs_->end (); + + while (!r_->empty () && iter_ != end_) + { + 
typename equivset_list::list::iterator l_iter_ = iter_; + + (*l_iter_)->intersect (*r_.get (), *overlap_.get ()); + + if (overlap_->empty ()) + { + ++iter_; + } + else if ((*l_iter_)->empty ()) + { + delete *l_iter_; + *l_iter_ = overlap_.release (); + overlap_.reset (new equivset); + ++iter_; + } + else if (r_->empty ()) + { + delete r_.release (); + r_ = overlap_; + overlap_.reset (new equivset); + break; + } + else + { + iter_ = lhs_->insert (++iter_, + static_cast(0)); + *iter_ = overlap_.release (); + overlap_.reset (new equivset); + ++iter_; + end_ = lhs_->end (); + } + } + + if (!r_->empty ()) + { + lhs_->push_back (static_cast(0)); + lhs_->back () = r_.release (); + } + } + } + } + + static void fill_rhs_list (const node_vector *vector_, + const index_set_vector &set_mapping_, equivset_list &list_) + { + typename node_vector::const_iterator iter_ = + vector_->begin (); + typename node_vector::const_iterator end_ = + vector_->end (); + + for (; iter_ != end_; ++iter_) + { + const node *node_ = *iter_; + + if (!node_->end_state ()) + { + const id_type token_ = node_->token (); + + if (token_ != node::null_token ()) + { + list_->push_back (static_cast(0)); + + if (token_ == parser::bol_token () || + token_ == parser::eol_token ()) + { + std::set index_set_; + + index_set_.insert (token_); + list_->back () = new equivset (index_set_, + token_, node_->greedy (), node_->followpos ()); + } + else + { + list_->back () = new equivset (set_mapping_[token_], + token_, node_->greedy (), node_->followpos ()); + } + } + } + } + } +}; + +typedef basic_generator generator; +typedef basic_generator wgenerator; +typedef basic_generator char_generator; +typedef basic_generator wchar_generator; +} + +#endif diff --git a/inc/lexertl/internals.hpp b/inc/lexertl/internals.hpp new file mode 100644 index 0000000..10335c6 --- /dev/null +++ b/inc/lexertl/internals.hpp @@ -0,0 +1,80 @@ +// internals.hpp +// Copyright (c) 2009-2012 Ben Hanson (http://www.benhanson.net/) +// +// 
Distributed under the Boost Software License, Version 1.0. (See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_INTERNALS_HPP +#define LEXERTL_INTERNALS_HPP + +#include "enums.hpp" +#include "containers/ptr_vector.hpp" + +namespace lexertl +{ +namespace detail +{ +template +struct basic_internals +{ + typedef std::vector id_type_vector; + typedef ptr_vector id_type_vector_vector; + + id_type _eoi; + id_type_vector_vector _lookup; + id_type_vector _dfa_alphabet; + id_type _features; + id_type_vector_vector _dfa; + + basic_internals () : + _eoi (0), + _lookup (), + _dfa_alphabet (), + _features (0), + _dfa () + { + } + + void clear () + { + _eoi = 0; + _lookup.clear (); + _dfa_alphabet.clear (); + _features = 0; + _dfa.clear (); + } + + bool empty () const + { + return _dfa->empty (); + } + + void add_states (const std::size_t num_) + { + for (std::size_t index_ = 0; index_ < num_; ++index_) + { + _lookup->push_back (static_cast(0)); + // lookup *always* has a size 256 now. + _lookup->back () = new id_type_vector (256, dead_state_index); + _dfa_alphabet.push_back (0); + _dfa->push_back (static_cast(0)); + _dfa->back () = new id_type_vector; + } + } + + void swap (basic_internals &internals_) + { + std::swap (_eoi, internals_._eoi); + _lookup->swap (*internals_._lookup); + _dfa_alphabet.swap (internals_._dfa_alphabet); + std::swap (_features, internals_._features); + _dfa->swap (*internals_._dfa); + } + +private: + basic_internals (const basic_internals &); // No copy construction. + basic_internals &operator = (const basic_internals &); // No assignment. +}; +} +} + +#endif diff --git a/inc/lexertl/is_same.hpp b/inc/lexertl/is_same.hpp new file mode 100644 index 0000000..42684dc --- /dev/null +++ b/inc/lexertl/is_same.hpp @@ -0,0 +1,29 @@ +// is_same.hpp +// Copyright (c) 2010-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
// (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#ifndef LEXERTL_IS_SAME_HPP
#define LEXERTL_IS_SAME_HPP

namespace lexertl
{
namespace detail
{
// Compile time type comparison: is_same<t1, t2>::same is true only when
// both arguments name the same type.
//
// The parameter lists were missing in the source text, which left the
// primary template and the specialisation textually identical (a
// redefinition). They are restored here.

// Primary template: two different types.
template<typename t1, typename t2>
struct is_same
{
    enum {same = false};
};

// Partial specialisation: both arguments are the same type.
template<typename t1>
struct is_same<t1, t1>
{
    enum {same = true};
};
}
}

#endif

// diff --git a/inc/lexertl/licence_1_0.txt b/inc/lexertl/licence_1_0.txt
// new file mode 100644
// index 0000000..d1c4c6c
// --- /dev/null
// +++ b/inc/lexertl/licence_1_0.txt
// @@ -0,0 +1,24 @@
// Boost Software License - Version 1.0 - August 17th, 2003
//
// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
// this license (the "Software") to use, reproduce, display, distribute,
// execute, and transmit the Software, and to prepare derivative works of the
// Software, and to permit third-parties to whom the Software is furnished to
// do so, all subject to the following:
//
// The copyright notices in the Software and this entire statement, including
// the above license grant, this restriction and the following disclaimer,
// must be included in all copies of the Software, in whole or in part, and
// all derivative works of the Software, unless such copies or derivative
// works are solely in the form of machine-executable object code generated by
// a source language processor.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
+ diff --git a/inc/lexertl/lookup.hpp b/inc/lexertl/lookup.hpp new file mode 100644 index 0000000..a54ae42 --- /dev/null +++ b/inc/lexertl/lookup.hpp @@ -0,0 +1,477 @@ +// lookup.hpp +// Copyright (c) 2009-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_LOOKUP_HPP +#define LEXERTL_LOOKUP_HPP + +#include +#include "bool.hpp" +#include "match_results.hpp" +#include "state_machine.hpp" + +namespace lexertl +{ +namespace detail +{ +template +struct bol_state +{ + bol_state (const bool) + { + } +}; + +template<> +struct bol_state +{ + bool _bol; + bool _end_bol; + + bol_state (const bool bol_) : + _bol (bol_), + _end_bol (bol_) + { + } +}; + +template +struct eol_state +{ +}; + +template +struct eol_state +{ + id_type _EOL_state; + + eol_state () : + _EOL_state (0) + { + } +}; + +template +struct multi_state_state +{ + multi_state_state (const id_type) + { + } +}; + +template +struct multi_state_state +{ + id_type _start_state; + + multi_state_state (const id_type state_) : + _start_state (state_) + { + } +}; + +template +struct recursive_state +{ + recursive_state (const id_type *) + { + } +}; + +template +struct recursive_state +{ + bool _pop; + id_type _push_dfa; + + recursive_state (const id_type *ptr_) : + _pop ((*ptr_ & pop_dfa_bit) != 0), + _push_dfa (*(ptr_ + push_dfa_index)) + { + } +}; + +template +struct lookup_state +{ + typedef basic_internals internals; + + const id_type *_lookup; + id_type _dfa_alphabet; + const id_type *_dfa; + const id_type *_ptr; + bool _end_state; + id_type _id; + id_type _uid; + bol_state<(flags & bol_bit) != 0> _bol_state; + eol_state _eol_state; + multi_state_state + _multi_state_state; + recursive_state _recursive_state; + + lookup_state (const internals &internals_, const bool bol_, + const id_type state_) : + _lookup (&internals_._lookup[state_]->front ()), + 
_dfa_alphabet (internals_._dfa_alphabet[state_]), + _dfa (&internals_._dfa[state_]->front ()), + _ptr (_dfa + _dfa_alphabet), + _end_state (*_ptr != 0), + _id (*(_ptr + id_index)), + _uid (*(_ptr + user_id_index)), + _bol_state (bol_), + _eol_state (), + _multi_state_state (state_), + _recursive_state (_ptr) + { + } + + void reset_recursive (const false_ &) + { + // Do nothing + } + + void reset_recursive (const true_ &) + { + _recursive_state._pop = (*_ptr & pop_dfa_bit) != 0; + _recursive_state._push_dfa = *(_ptr + push_dfa_index); + } + + void bol_start_state (const false_ &) + { + // Do nothing + } + + void bol_start_state (const true_ &) + { + if (_bol_state._bol) + { + const id_type state_ = *_dfa; + + if (state_) + { + _ptr = &_dfa[state_ * _dfa_alphabet]; + } + } + } + + template + bool eol (const char_type, const false_ &) + { + return false; + } + + template + bool eol (const char_type curr_, const true_ &) + { + bool ret_ = false; + + _eol_state._EOL_state = _ptr[eol_index]; + ret_ = _eol_state._EOL_state && curr_ == '\n'; + + if (ret_) + { + _ptr = &_dfa[_eol_state._EOL_state * _dfa_alphabet]; + } + + return ret_; + } + + template + id_type next_char (const char_type prev_char_, const false_ &) + { + const id_type state_= _ptr[_lookup + [static_cast(prev_char_)]]; + + if (state_ != 0) + { + _ptr = &_dfa[state_ * _dfa_alphabet]; + } + + return state_; + } + + template + id_type next_char (const char_type prev_char_, const true_ &) + { + const std::size_t bytes_ = sizeof (char_type) < 3 ? 
+ sizeof (char_type) : 3; + const std::size_t shift_[] = {0, 8, 16}; + id_type state_= 0; + + for (std::size_t i_ = 0; i_ < bytes_; ++i_) + { + state_ = _ptr[_lookup[static_cast((prev_char_ >> + shift_[bytes_ - 1 - i_]) & 0xff)]]; + + if (state_ == 0) + { + break; + } + + _ptr = &_dfa[state_ * _dfa_alphabet]; + } + + return state_; + } + + template + void bol (const char_type, const false_ &) + { + // Do nothing + } + + template + void bol (const char_type prev_char_, const true_ &) + { + _bol_state._bol = prev_char_ == '\n'; + } + + void eol (const id_type, const false_ &) + { + // Do nothing + } + + void eol (const id_type err_val_, const true_ &) + { + _eol_state._EOL_state = err_val_; + } + + void reset_start_state (const false_ &) + { + // Do nothing + } + + void reset_start_state (const true_ &) + { + _multi_state_state._start_state = *(_ptr + next_dfa_index); + } + + void reset_end_bol (const false_ &) + { + // Do nothing + } + + void reset_end_bol (const true_ &) + { + _bol_state._end_bol = _bol_state._bol; + } + + template + void end_state (iter_type &end_token_, iter_type &curr_) + { + if (*_ptr) + { + _end_state = true; + reset_end_bol (bool_<(flags & bol_bit) != 0> ()); + _id = *(_ptr + id_index); + _uid = *(_ptr + user_id_index); + reset_recursive (bool_<(flags & recursive_bit) != 0> ()); + reset_start_state (bool_<(flags & multi_state_bit) != 0> ()); + end_token_ = curr_; + } + } + + template + void check_eol (iter_type &, iter_type &, const id_type, + const char_type, const false_ &) + { + // Do nothing + } + + template + void check_eol (iter_type &end_token_, iter_type &curr_, + const id_type npos, const char_type eoi_, const true_ &) + { + if (_eol_state._EOL_state != npos && curr_ == eoi_) + { + _eol_state._EOL_state = _ptr[eol_index]; + + if (_eol_state._EOL_state) + { + _ptr = &_dfa[_eol_state._EOL_state * _dfa_alphabet]; + end_state (end_token_, curr_); + } + } + } + + template + void pop (results &, const false_ &) + { + // Nothing to do + } + 
+ template + void pop (results &results_, const true_ &) + { + if (_recursive_state._pop) + { + _multi_state_state._start_state = results_.stack.top ().first; + results_.stack.pop (); + } + else if (_recursive_state._push_dfa != results::npos ()) + { + results_.stack.push (typename results::id_type_pair + (_recursive_state._push_dfa, _id)); + } + } + + template + bool id_eoi (const id_type eoi_, const results &, const false_ &) + { + return _id == eoi_; + } + + template + bool id_eoi (const id_type eoi_, const results &results_, const true_ &) + { + return _id == eoi_ || (_recursive_state._pop && + !results_.stack.empty () && results_.stack.top ().second == eoi_); + } + + void start_state (id_type &, const false_ &) + { + // Do nothing + } + + void start_state (id_type &start_state_, const true_ &) + { + start_state_ = _multi_state_state._start_state; + } + + void bol (bool &, const false_ &) + { + // Do nothing + } + + void bol (bool &end_bol_, const true_ &) + { + end_bol_ = _bol_state._end_bol; + } +}; + +template +void inc_end (results &, const false_ &) +{ + // Do nothing +} + +template +void inc_end (results &results_, const true_ &) +{ + ++results_.end; +} + +template +void next (const basic_state_machine::value_type, id_type> &sm_, + results &results_, const bool_ &compressed_, + const bool_ &recursive_) +{ + const basic_internals &internals_ = sm_.data (); + typename results::iter_type end_token_ = results_.end; + +skip: + typename results::iter_type curr_ = results_.end; + + results_.start = curr_; + +again: + if (curr_ == results_.eoi) + { + results_.id = internals_._eoi; + results_.user_id = results::npos (); + return; + } + + lookup_state lu_state_ + (internals_, results_.bol, results_.state); + lu_state_.bol_start_state (bool_<(flags & bol_bit) != 0> ()); + + while (curr_ != results_.eoi) + { + if (!lu_state_.eol (*curr_, bool_<(flags & eol_bit) != 0> ())) + { + const typename results::char_type prev_char_ = *curr_++; + const id_type state_ = 
lu_state_.next_char (prev_char_, + compressed_); + + lu_state_.bol (prev_char_, bool_<(flags & bol_bit) != 0> ()); + + if (state_ == 0) + { + lu_state_.eol (results::npos (), + bool_<(flags & eol_bit) != 0> ()); + break; + } + } + + lu_state_.end_state (end_token_, curr_); + } + + lu_state_.check_eol (end_token_, curr_, results::npos (), results_.eoi, + bool_<(flags & eol_bit) != 0> ()); + + if (lu_state_._end_state) + { + // Return longest match + lu_state_.pop (results_, recursive_); + + lu_state_.start_state (results_.state, + bool_<(flags & multi_state_bit) != 0> ()); + lu_state_.bol (results_.bol, bool_<(flags & bol_bit) != 0> ()); + results_.end = end_token_; + + if (lu_state_._id == sm_.skip ()) goto skip; + + if (lu_state_.id_eoi (internals_._eoi, results_, recursive_)) + { + curr_ = end_token_; + goto again; + } + } + else + { + results_.end = end_token_; + results_.bol = *results_.end == '\n'; + results_.start = results_.end; + // No match causes char to be skipped + inc_end (results_, bool_<(flags & advance_bit) != 0> ()); + lu_state_._id = results::npos (); + lu_state_._uid = results::npos (); + } + + results_.id = lu_state_._id; + results_.user_id = lu_state_._uid; +} +} + +template +void lookup (const basic_state_machine::value_type, id_type> &sm_, + match_results &results_) +{ + // If this asserts, you have either not defined all the correct + // flags, or you should be using recursive_match_results instead + // of match_results. 
+ assert ((sm_.data ()._features & flags) == sm_.data ()._features); + detail::next (sm_, results_, bool_<(sizeof + (typename std::iterator_traits::value_type) > 1)> (), + false_ ()); +} + +template +void lookup (const basic_state_machine::value_type, id_type> &sm_, + recursive_match_results &results_) +{ + // If this asserts, you have not defined all the correct flags + assert ((sm_.data ()._features & flags) == sm_.data ()._features); + detail::next (sm_, results_, + bool_<(sizeof(typename std::iterator_traits:: + value_type) > 1)> (), true_ ()); +} +} + +#endif diff --git a/inc/lexertl/match_results.hpp b/inc/lexertl/match_results.hpp new file mode 100644 index 0000000..7a21b9a --- /dev/null +++ b/inc/lexertl/match_results.hpp @@ -0,0 +1,150 @@ +// match_results.hpp +// Copyright (c) 2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_MATCH_RESULTS_HPP +#define LEXERTL_MATCH_RESULTS_HPP + +#include "char_traits.hpp" +#include "enums.hpp" +#include +#include +#include + +namespace lexertl +{ +template +struct match_results +{ + typedef iter iter_type; + typedef typename std::iterator_traits::value_type char_type; + typedef typename basic_char_traits::index_type index_type; + typedef std::basic_string string; + + id_type id; + id_type user_id; + iter_type start; + iter_type end; + iter_type eoi; + bool bol; + id_type state; + + match_results () : + id (0), + user_id (npos ()), + start (iter_type ()), + end (iter_type ()), + eoi (iter_type ()), + bol (true), + state (0) + { + } + + match_results (const iter_type &start_, const iter_type &end_) : + id (0), + user_id (npos ()), + start (start_), + end (start_), + eoi (end_), + bol (true), + state (0) + { + } + + virtual ~match_results () + { + } + + string str () const + { + return string (start, end); + } + + virtual void clear () + { + id = 0; + 
user_id = npos (); + start = eoi; + end = eoi; + bol = true; + state = 0; + } + + virtual void reset (const iter_type &start_, const iter_type &end_) + { + id = 0; + user_id = npos (); + start = start_; + end = start_; + eoi = end_; + bol = true; + state = 0; + } + + static id_type npos () + { + return static_cast(~0); + } + + static id_type skip () + { + return static_cast(~1); + } +}; + +template +struct recursive_match_results : public match_results +{ + typedef std::pair id_type_pair; + std::stack stack; + + recursive_match_results () : + match_results (), + stack () + { + } + + recursive_match_results (const iter &start_, const iter &end_) : + match_results (start_, end_), + stack () + { + } + + virtual ~recursive_match_results () + { + } + + virtual void clear () + { + match_results::clear (); + + while (!stack.empty()) stack.pop (); + } + + virtual void reset (const iter &start_, const iter &end_) + { + match_results::reset (start_, end_); + + while (!stack.empty()) stack.pop (); + } +}; + +typedef match_results smatch; +typedef match_results cmatch; +typedef match_results wsmatch; +typedef match_results wcmatch; + +typedef recursive_match_results + srmatch; +typedef recursive_match_results crmatch; +typedef recursive_match_results + wsrmatch; +typedef recursive_match_results wcrmatch; +} + +#endif diff --git a/inc/lexertl/memory_file.hpp b/inc/lexertl/memory_file.hpp new file mode 100644 index 0000000..2d87b3d --- /dev/null +++ b/inc/lexertl/memory_file.hpp @@ -0,0 +1,112 @@ +// memory_file.hpp +// Copyright (c) 2012 Ben Hanson (http://www.benhanson.net/) +// Inspired by http://en.wikibooks.org/wiki/Optimizing_C%2B%2B/General_optimization_techniques/Input/Output#Memory-mapped_file +// +// Distributed under the Boost Software License, Version 1.0. 
// (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#ifndef LEXERTL_MEMORY_FILE_H
#define LEXERTL_MEMORY_FILE_H

#include <cstddef>

#ifdef __unix__
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#elif defined _WIN32
#include <windows.h>
#endif

// Only files small enough to fit into memory are supported.
namespace lexertl
{
// Read-only memory mapping of a whole file.
//
// If the file cannot be opened or mapped, data () returns 0 and size ()
// returns 0; no exception is thrown. The object owns the mapping and the
// file handle, so it cannot be copied.
//
// NOTE(review): the template parameter list and the <...> include names were
// missing in the source text and have been restored from usage.
template<typename CharT>
class basic_memory_file
{
public:
    // pathname_: path of the file to map.
    basic_memory_file (const char *pathname_) :
        _data (0),
        _size (0)
    {
#ifdef __unix__
        _fh = ::open (pathname_, O_RDONLY);

        if (_fh > -1)
        {
            struct stat sbuf_;

            if (::fstat (_fh, &sbuf_) > -1)
            {
                // Compare against MAP_FAILED before casting to CharT.
                void *map_ = ::mmap (0, sbuf_.st_size, PROT_READ, MAP_SHARED,
                    _fh, 0);

                if (map_ != MAP_FAILED)
                {
                    _data = static_cast<const CharT *>(map_);
                    _size = static_cast<std::size_t>(sbuf_.st_size);
                }
            }
        }
#elif defined _WIN32
        _fh = ::CreateFileA (pathname_, GENERIC_READ, FILE_SHARE_READ, 0,
            OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
        _fmh = 0;

        if (_fh != INVALID_HANDLE_VALUE)
        {
            _fmh = ::CreateFileMapping (_fh, 0, PAGE_READONLY, 0, 0, 0);

            if (_fmh != 0)
            {
                _data = static_cast<const CharT *>(::MapViewOfFile
                    (_fmh, FILE_MAP_READ, 0, 0, 0));

                if (_data) _size = ::GetFileSize(_fh, 0);
            }
        }
#endif
    }

    // Releases only what the constructor actually acquired.
    ~basic_memory_file ()
    {
#if defined(__unix__)
        if (_data) ::munmap(const_cast<CharT *>(_data), _size);
        if (_fh > -1) ::close(_fh);
#elif defined(_WIN32)
        if (_data) ::UnmapViewOfFile(_data);
        if (_fmh != 0) ::CloseHandle(_fmh);
        if (_fh != INVALID_HANDLE_VALUE) ::CloseHandle(_fh);
#endif
    }

    // Start of the mapped file, or 0 if nothing is mapped.
    const CharT *data () const
    {
        return _data;
    }

    // Length of the mapping as reported by the operating system (the unix
    // branch passes this same value to munmap).
    // NOTE(review): this looks like a byte count rather than a count of
    // CharT elements - confirm before using it with wchar_t.
    std::size_t size () const
    {
        return _size;
    }

private:
    const CharT *_data;
    std::size_t _size;
#ifdef __unix__
    int _fh;
#elif defined _WIN32
    HANDLE _fh;
    HANDLE _fmh;
#else
    #error Only Posix or Windows are supported.
#endif

    // A copy would unmap and close the same resources twice.
    basic_memory_file (const basic_memory_file &); // No copy construction.
    basic_memory_file &operator = (const basic_memory_file &); // No assignment.
};

typedef basic_memory_file<char> memory_file;
typedef basic_memory_file<wchar_t> wmemory_file;
}

#endif
// diff --git a/inc/lexertl/old/fast_filebuf.hpp b/inc/lexertl/old/fast_filebuf.hpp
// new file mode 100644
// index 0000000..f9dd3a9
// --- /dev/null
// +++ b/inc/lexertl/old/fast_filebuf.hpp
// @@ -0,0 +1,45 @@
// Quick hack...
// If you find this really is faster than using std::ifstream, let me know
// as I can always spend some more time to improve it.

#include <cstddef>
#include <cstdio>
#include <streambuf>
#include <string>

namespace lexertl
{
// Minimal input-only stream buffer that reads straight from a C FILE.
// Only bulk reads (xsgetn) are implemented.
//
// NOTE(review): template parameters and arguments were missing in the source
// text and have been restored from usage.
template<typename CharT, typename Traits>
class basic_fast_filebuf : public std::basic_streambuf<CharT, Traits>
{
public:
    // filename_: file to read. If it cannot be opened, every read
    // returns 0 characters.
    basic_fast_filebuf (const char *filename_) :
        _fp (0)
    {
        _fp = ::fopen(filename_, "r");
    }

    virtual ~basic_fast_filebuf()
    {
        // fopen may have failed; fclose must not be given a null pointer.
        if (_fp) ::fclose(_fp);

        _fp = 0;
    }

protected:
    FILE *_fp;

    // Reads up to count_ characters into ptr_ and returns the number read.
    virtual std::streamsize xsgetn (CharT *ptr_, std::streamsize count_)
    {
        if (!_fp || count_ <= 0) return 0;

        return static_cast<std::streamsize>(::fread (ptr_, sizeof(CharT),
            static_cast<std::size_t>(count_), _fp));
    }

private:
    // A copy would close the same FILE twice.
    basic_fast_filebuf (const basic_fast_filebuf &); // No copy construction.
    basic_fast_filebuf &operator = (const basic_fast_filebuf &); // No assignment.
};

typedef basic_fast_filebuf<char, std::char_traits<char> > fast_filebuf;
typedef basic_fast_filebuf<wchar_t, std::char_traits<wchar_t> > wfast_filebuf;
}

// Usage (template arguments reconstructed, they were missing in the source):
// lexertl::rules rules_;
// lexertl::state_machine state_machine_;
// fast_filebuf buf ("Unicode/PropList.txt");
// std::istream if_(&buf);
// lexertl::stream_shared_iterator<char> iter_ (if_);
// lexertl::stream_shared_iterator<char> end_;
// lexertl::match_results<lexertl::stream_shared_iterator<char> >
//     results_(iter_, end_);
// diff --git a/inc/lexertl/old/string_token.hpp b/inc/lexertl/old/string_token.hpp
// new file mode 100644
// index 0000000..eb75f08
// --- /dev/null
// +++ b/inc/lexertl/old/string_token.hpp
// @@ -0,0 +1,561 @@
// string_token.hpp
// Copyright (c) 2005-2010 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0.
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_STRING_TOKEN_HPP +#define LEXERTL_STRING_TOKEN_HPP + +#include "../char_traits.hpp" +#include +#include +#include +#include +#include + +namespace lexertl +{ +template +struct basic_string_token +{ + typedef std::basic_string string; + + bool _negated; + string _chars; + + basic_string_token () : + _negated (false) + { + } + + basic_string_token (const bool negated_, const string &chars_) : + _negated (negated_), + _chars (chars_) + { + } + + void remove_duplicates () + { + const char_type *start_ = _chars.c_str (); + const char_type *end_ = start_ + _chars.size (); + + // Optimisation for very large charsets: + // sorting via pointers is much quicker than + // via iterators... + std::sort (const_cast (start_), const_cast + (end_)); + _chars.erase (std::unique (_chars.begin (), _chars.end ()), + _chars.end ()); + } + + void normalise () + { + const std::size_t max_chars_ = sizeof (char_type) == 1 ? + num_chars : num_wchar_ts; + + if (_chars.length () == max_chars_) + { + _negated = !_negated; + _chars.clear (); + } + else if (_chars.length () > max_chars_ / 2) + { + negate (); + } + } + + void negate () + { + const std::size_t max_chars_ = sizeof (char_type) == 1 ? 
+ num_chars : num_wchar_ts; + char_type curr_char_ = std::numeric_limits::min (); + string temp_; + const char_type *curr_ = _chars.c_str (); + const char_type *chars_end_ = curr_ + _chars.size (); + + _negated = !_negated; + temp_.resize (max_chars_ - _chars.size ()); + + char_type *ptr_ = const_cast (temp_.c_str ()); + std::size_t i_ = 0; + + while (curr_ < chars_end_) + { + while (*curr_ > curr_char_) + { + *ptr_ = curr_char_; + ++ptr_; + ++curr_char_; + ++i_; + } + + ++curr_char_; + ++curr_; + ++i_; + } + + for (; i_ < max_chars_; ++i_) + { + *ptr_ = curr_char_; + ++ptr_; + ++curr_char_; + } + + _chars = temp_; + } + + bool operator < (const basic_string_token &rhs_) const + { + return _negated < rhs_._negated || + (_negated == rhs_._negated && _chars < rhs_._chars); + } + + bool operator == (const basic_string_token &rhs_) const + { + return _negated == rhs_._negated && _chars == rhs_._chars; + } + + bool empty () const + { + return _chars.empty () && !_negated; + } + + bool any () const + { + return _chars.empty () && _negated; + } + + void clear () + { + _negated = false; + _chars.clear (); + } + + void intersect (basic_string_token &rhs_, basic_string_token &overlap_) + { + if ((any () && rhs_.any ()) || (_negated == rhs_._negated && + !any () && !rhs_.any ())) + { + intersect_same_types (rhs_, overlap_); + } + else + { + intersect_diff_types (rhs_, overlap_); + } + } + + void merge (const basic_string_token &rhs_, + basic_string_token &merged_) const + { + if ((any () && rhs_.any ()) || (_negated == rhs_._negated && + !any () && !rhs_.any ())) + { + merge_same_types (rhs_, merged_); + } + else + { + merge_diff_types (rhs_, merged_); + } + } + + static string escape_char (const char_type ch_) + { + string out_; + + switch (ch_) + { + case '\0': + out_ += '\\'; + out_ += '0'; + break; + case '\a': + out_ += '\\'; + out_ += 'a'; + break; + case '\b': + out_ += '\\'; + out_ += 'b'; + break; + case 27: + out_ += '\\'; + out_ += 'x'; + out_ += '1'; + out_ += 
'b'; + break; + case '\f': + out_ += '\\'; + out_ += 'f'; + break; + case '\n': + out_ += '\\'; + out_ += 'n'; + break; + case '\r': + out_ += '\\'; + out_ += 'r'; + break; + case '\t': + out_ += '\\'; + out_ += 't'; + break; + case '\v': + out_ += '\\'; + out_ += 'v'; + break; + case '\\': + out_ += '\\'; + out_ += '\\'; + break; + case '"': + out_ += '\\'; + out_ += '"'; + break; + case '\'': + out_ += '\\'; + out_ += '\''; + break; + default: + { + if (ch_ < 32) + { + std::basic_stringstream ss_; + + out_ += '\\'; + out_ += 'x'; + ss_ << std::hex << + static_cast (ch_); + out_ += ss_.str (); + } + else + { + out_ += ch_; + } + + break; + } + } + + return out_; + } + +private: + void intersect_same_types (basic_string_token &rhs_, + basic_string_token &overlap_) + { + if (any ()) + { + clear (); + overlap_._negated = true; + rhs_.clear (); + } + else + { + typename string::iterator iter_ = _chars.begin (); + typename string::iterator end_ = _chars.end (); + typename string::iterator rhs_iter_ = rhs_._chars.begin (); + typename string::iterator rhs_end_ = rhs_._chars.end (); + + overlap_._negated = _negated; + + while (iter_ != end_ && rhs_iter_ != rhs_end_) + { + if (*iter_ < *rhs_iter_) + { + ++iter_; + } + else if (*iter_ > *rhs_iter_) + { + ++rhs_iter_; + } + else + { + overlap_._chars += *iter_; + iter_ = _chars.erase (iter_); + end_ = _chars.end (); + rhs_iter_ = rhs_._chars.erase (rhs_iter_); + rhs_end_ = rhs_._chars.end (); + } + } + + if (_negated) + { + // duplicates already merged, so safe to merge + // using std lib. + + // src, dest + merge (_chars, overlap_._chars); + // duplicates already merged, so safe to merge + // using std lib. 
+ + // src, dest + merge (rhs_._chars, overlap_._chars); + _negated = false; + rhs_._negated = false; + std::swap (_chars, rhs_._chars); + normalise (); + overlap_.normalise (); + rhs_.normalise (); + } + else if (!overlap_._chars.empty ()) + { + normalise (); + overlap_.normalise (); + rhs_.normalise (); + } + } + } + + void intersect_diff_types (basic_string_token &rhs_, + basic_string_token &overlap_) + { + if (any ()) + { + intersect_any (rhs_, overlap_); + } + else if (_negated) + { + intersect_negated (rhs_, overlap_); + } + else // _negated == false + { + intersect_charset (rhs_, overlap_); + } + } + + void intersect_any (basic_string_token &rhs_, basic_string_token &overlap_) + { + if (rhs_._negated) + { + rhs_.intersect_negated (*this, overlap_); + } + else // rhs._negated == false + { + rhs_.intersect_charset (*this, overlap_); + } + } + + void intersect_negated (basic_string_token &rhs_, + basic_string_token &overlap_) + { + if (rhs_.any ()) + { + overlap_._negated = true; + overlap_._chars = _chars; + rhs_._negated = false; + rhs_._chars = _chars; + clear (); + } + else // rhs._negated == false + { + rhs_.intersect_charset (*this, overlap_); + } + } + + void intersect_charset (basic_string_token &rhs_, + basic_string_token &overlap_) + { + if (rhs_.any ()) + { + overlap_._chars = _chars; + rhs_._negated = true; + rhs_._chars = _chars; + clear (); + } + else // rhs_._negated == true + { + typename string::iterator iter_ = _chars.begin (); + typename string::iterator end_ = _chars.end (); + typename string::iterator rhs_iter_ = rhs_._chars.begin (); + typename string::iterator rhs_end_ = rhs_._chars.end (); + + while (iter_ != end_ && rhs_iter_ != rhs_end_) + { + if (*iter_ < *rhs_iter_) + { + overlap_._chars += *iter_; + rhs_iter_ = rhs_._chars.insert (rhs_iter_, *iter_); + ++rhs_iter_; + rhs_end_ = rhs_._chars.end (); + iter_ = _chars.erase (iter_); + end_ = _chars.end (); + } + else if (*iter_ > *rhs_iter_) + { + ++rhs_iter_; + } + else + { + ++iter_; 
+ ++rhs_iter_; + } + } + + if (iter_ != end_) + { + // nothing bigger in rhs_ than iter_, + // so safe to merge using std lib. + string temp_ (iter_, end_); + + // src, dest + merge (temp_, overlap_._chars); + _chars.erase (iter_, end_); + } + + if (!overlap_._chars.empty ()) + { + merge (overlap_._chars, rhs_._chars); + // possible duplicates, so check for any and erase. + rhs_._chars.erase (std::unique (rhs_._chars.begin (), + rhs_._chars.end ()), rhs_._chars.end ()); + normalise (); + overlap_.normalise (); + rhs_.normalise (); + } + } + } + + void merge (string &src_, string &dest_) + { + string tmp_ (src_.size () + dest_.size (), 0); + + std::merge (src_.begin (), src_.end (), dest_.begin (), dest_.end (), + tmp_.begin ()); + dest_ = tmp_; + } + + void merge_same_types (const basic_string_token &rhs_, + basic_string_token &merged_) const + { + if (any ()) + { + merged_._negated = true; + } + else if (_negated) + { + typename string::const_iterator iter_ = _chars.begin (); + typename string::const_iterator end_ = _chars.end (); + typename string::const_iterator rhs_iter_ = rhs_._chars.begin (); + typename string::const_iterator rhs_end_ = rhs_._chars.end (); + + merged_._negated = _negated; + + while (iter_ != end_ && rhs_iter_ != rhs_end_) + { + if (*iter_ < *rhs_iter_) + { + ++iter_; + } + else if (*iter_ > *rhs_iter_) + { + ++rhs_iter_; + } + else + { + merged_._chars += *iter_; + ++iter_; + ++rhs_iter_; + } + } + + merged_.normalise (); + } + else + { + typename string::const_iterator iter_ = _chars.begin (); + typename string::const_iterator end_ = _chars.end (); + typename string::const_iterator rhs_iter_ = rhs_._chars.begin (); + typename string::const_iterator rhs_end_ = rhs_._chars.end (); + + while (iter_ != end_ && rhs_iter_ != rhs_end_) + { + if (*iter_ < *rhs_iter_) + { + merged_._chars += *iter_; + ++iter_; + } + else if (*iter_ > *rhs_iter_) + { + merged_._chars += *rhs_iter_; + ++rhs_iter_; + } + else + { + merged_._chars += *iter_; + ++iter_; + 
++rhs_iter_; + } + } + + // Include any trailing chars + if (iter_ != end_) + { + string temp_ (iter_, end_); + + merged_._chars += temp_; + } + else if (rhs_iter_ != rhs_end_) + { + string temp_ (rhs_iter_, rhs_end_); + + merged_._chars += temp_; + } + + merged_.normalise (); + } + } + + void merge_diff_types (const basic_string_token &rhs_, + basic_string_token &merged_) const + { + if (_negated) + { + merge_negated (*this, rhs_, merged_); + } + else + { + merge_negated (rhs_, *this, merged_); + } + + merged_.normalise (); + } + + void merge_negated (const basic_string_token &lhs_, + const basic_string_token &rhs_, basic_string_token &merged_) const + { + typename string::const_iterator lhs_iter_ = lhs_._chars.begin (); + typename string::const_iterator lhs_end_ = lhs_._chars.end (); + typename string::const_iterator rhs_iter_ = rhs_._chars.begin (); + typename string::const_iterator rhs_end_ = rhs_._chars.end (); + + merged_._negated = true; + + while (lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_) + { + if (*lhs_iter_ < *rhs_iter_) + { + merged_._chars += *lhs_iter_; + ++lhs_iter_; + } + else if (*lhs_iter_ > *rhs_iter_) + { + ++rhs_iter_; + } + else + { + ++lhs_iter_; + ++rhs_iter_; + } + } + + // Only interested in any remaining 'negated' chars + if (lhs_iter_ != lhs_end_) + { + string temp_ (lhs_iter_, lhs_end_); + + merged_._chars += temp_; + } + } +}; +} + +#endif diff --git a/inc/lexertl/parser/parser.hpp b/inc/lexertl/parser/parser.hpp new file mode 100644 index 0000000..06869fa --- /dev/null +++ b/inc/lexertl/parser/parser.hpp @@ -0,0 +1,1076 @@ +// parser.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_PARSER_HPP +#define LEXERTL_PARSER_HPP + +#include +#include +#include "../bool.hpp" +#include "tree/end_node.hpp" +#include "tree/iteration_node.hpp" +#include "tree/leaf_node.hpp" +#include +#include "../containers/ptr_stack.hpp" +#include "tokeniser/re_tokeniser.hpp" +#include "../runtime_error.hpp" +#include "tree/selection_node.hpp" +#include "tree/sequence_node.hpp" +#include "../size_t.hpp" +#include + +namespace lexertl +{ +namespace detail +{ +/* + General principles of regex parsing: + - Every regex is a sequence of sub-regexes. + - Regexes consist of operands and operators + - All operators decompose to sequence, selection ('|') and iteration ('*') + - Regex tokens are stored on a stack. + - When a complete sequence of regex tokens is on the stack it is processed. + +Grammar: + + -> + -> | '|' + -> + -> | + -> + -> charset | macro | '('')' | + -> '?' | '??' | '*' | '*?' | '+' | '+?' | '{n[,[m]]}' | + '{n[,[m]]}?' 
+*/ + +template +class basic_parser +{ +public: + enum {char_24_bit = sm_traits::char_24_bit}; + typedef typename sm_traits::char_type char_type; + typedef typename sm_traits::id_type id_type; + typedef basic_end_node end_node; + typedef typename sm_traits::input_char_type input_char_type; + typedef basic_string_token input_string_token; + typedef basic_iteration_node iteration_node; + typedef basic_leaf_node leaf_node; + typedef basic_re_tokeniser + tokeniser; + typedef basic_node node; + typedef typename node::node_ptr_vector node_ptr_vector; + typedef std::basic_string string; + typedef basic_string_token string_token; + typedef std::map macro_map; + typedef basic_selection_node selection_node; + typedef basic_sequence_node sequence_node; + typedef std::map charset_map; + typedef std::pair charset_pair; + typedef bool_ compressed; + + basic_parser (const std::locale &locale_, + node_ptr_vector &node_ptr_vector_, const macro_map ¯o_map_, + charset_map &charset_map_, const id_type eoi_) : + _locale (locale_), + _node_ptr_vector (node_ptr_vector_), + _macro_map (macro_map_), + _charset_map (charset_map_), + _eoi (eoi_), + _token_stack (), + _tree_node_stack () + { + } + + node *parse (const rules_char_type *start_, + const rules_char_type * const end_, const id_type id_, + const id_type user_id_, const id_type next_dfa_, + const id_type push_dfa_, const bool pop_dfa_, + const std::size_t flags_, id_type &nl_id_, const bool seen_bol_, + const bool macro_) + { + node *root_ = 0; + state state_ (start_, end_, id_, flags_, _locale, macro_); + token *lhs_token_ = 0; + std::auto_ptr rhs_token_ (new token); + char action_ = 0; + + _token_stack->push (static_cast(0)); + _token_stack->top () = rhs_token_.release (); + rhs_token_.reset (new token); + tokeniser::next (_token_stack->top (), state_, rhs_token_.get ()); + + do + { + lhs_token_ = _token_stack->top (); + action_ = lhs_token_->precedence (rhs_token_->_type); + + switch (action_) + { + case '<': + case '=': + 
_token_stack->push (static_cast(0)); + _token_stack->top () = rhs_token_.release (); + rhs_token_.reset (new token); + tokeniser::next (_token_stack->top (), state_, + rhs_token_.get ()); + break; + case '>': + reduce (state_); + break; + default: + { + std::ostringstream ss_; + + ss_ << "A syntax error occurred: '" << + lhs_token_->precedence_string () << + "' against '" << rhs_token_->precedence_string () << + "' preceding index " << state_.index () << + " in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + break; + } + } + } while (!_token_stack->empty ()); + + if (_tree_node_stack.empty ()) + { + std::ostringstream ss_; + + ss_ << "Empty rules are not allowed in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + assert (_tree_node_stack.size () == 1); + + node *lhs_node_ = _tree_node_stack.top (); + + _tree_node_stack.pop (); + + if (macro_) + { + // Macros have no end state... + root_ = lhs_node_; + } + else + { + _node_ptr_vector->push_back (static_cast(0)); + + node *rhs_node_ = new end_node (id_, user_id_, next_dfa_, + push_dfa_, pop_dfa_); + + _node_ptr_vector->back () = rhs_node_; + _node_ptr_vector->push_back (static_cast(0)); + _node_ptr_vector->back () = new sequence_node + (lhs_node_, rhs_node_); + root_ = _node_ptr_vector->back (); + } + + if (seen_bol_) + { + fixup_bol (root_); + } + + if (state_._nl_id != static_cast(~0)) + { + nl_id_ = state_._nl_id; + } + + if ((flags_ & match_zero_len) == 0) + { + const typename node::node_vector &firstpos_ = root_->firstpos(); + typename node::node_vector::const_iterator iter_ = + firstpos_.begin (); + typename node::node_vector::const_iterator end_ = + firstpos_.end (); + + for (; iter_ != end_; ++iter_) + { + const node *node_ = *iter_; + + if (node_->end_state ()) + { + std::ostringstream ss_; + + ss_ << "Rules that match zero characters are not allowed " + "as this can cause an infinite loop in user code. The " + "match_zero_len flag overrides this check. 
Rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + } + + return root_; + } + + static id_type bol_token () + { + return static_cast(~1); + } + + static id_type eol_token () + { + return static_cast(~2); + } + +private: + typedef typename input_string_token::range input_range; + typedef typename tokeniser::state state; + typedef basic_re_token token; + typedef typename string_token::range range; + typedef ptr_vector string_token_vector; + typedef ptr_stack token_stack; + typedef typename node::node_stack tree_node_stack; + + const std::locale &_locale; + node_ptr_vector &_node_ptr_vector; + const macro_map &_macro_map; + charset_map &_charset_map; + id_type _eoi; + token_stack _token_stack; + tree_node_stack _tree_node_stack; + + struct find_functor + { + // Pointer to stop warning about cannot create assignment operator. + const string_token *_token; + + find_functor (const string_token &token_) : + _token (&token_) + { + } + + bool operator () (const string_token *rhs_) + { + return *_token == *rhs_; + } + }; + + void reduce (state &state_) + { + token *lhs_ = 0; + token *rhs_ = 0; + token_stack handle_; + char action_ = 0; + + do + { + rhs_ = _token_stack->top (); + handle_->push (static_cast(0)); + _token_stack->pop (); + handle_->top () = rhs_; + + if (!_token_stack->empty ()) + { + lhs_ = _token_stack->top (); + action_ = lhs_->precedence (rhs_->_type); + } + } while (!_token_stack->empty () && action_ == '='); + + assert (_token_stack->empty () || action_ == '<'); + + switch (rhs_->_type) + { + case BEGIN: + // finished processing so exit + break; + case REGEX: + // finished parsing, nothing to do + break; + case OREXP: + orexp (handle_); + break; + case SEQUENCE: + _token_stack->push (static_cast(0)); + _token_stack->top () = new token (OREXP); + break; + case SUB: + sub (handle_); + break; + case EXPRESSION: + _token_stack->push (static_cast(0)); + _token_stack->top () = new token (SUB); + break; + case REPEAT: + repeat 
(handle_); + break; + case BOL: + bol (handle_); + break; + case EOL: + eol (handle_, state_); + break; + case CHARSET: + charset (handle_, compressed ()); + break; + case MACRO: + macro (handle_, state_); + break; + case OPENPAREN: + openparen (handle_); + break; + case OPT: + case AOPT: + optional (rhs_->_type == OPT); + _token_stack->push (static_cast(0)); + _token_stack->top () = new token (DUP); + break; + case ZEROORMORE: + case AZEROORMORE: + zero_or_more (rhs_->_type == ZEROORMORE); + _token_stack->push (static_cast(0)); + _token_stack->top () = new token (DUP); + break; + case ONEORMORE: + case AONEORMORE: + one_or_more (rhs_->_type == ONEORMORE); + _token_stack->push (static_cast(0)); + _token_stack->top () = new token (DUP); + break; + case REPEATN: + case AREPEATN: + repeatn (rhs_->_type == REPEATN, handle_->top ()); + _token_stack->push (static_cast(0)); + _token_stack->top () = new token (DUP); + break; + default: + throw runtime_error + ("Internal error in regex_parser::reduce."); + break; + } + } + + void orexp (token_stack &handle_) + { + assert (handle_->top ()->_type == OREXP && + (handle_->size () == 1 || handle_->size () == 3)); + + if (handle_->size () == 1) + { + std::auto_ptr token_ (new token (REGEX)); + + _token_stack->push (static_cast(0)); + _token_stack->top () = token_.release (); + } + else + { + token *token_ = handle_->top (); + + handle_->pop (); + delete token_; + token_ = 0; + assert (handle_->top ()->_type == OR); + token_ = handle_->top (); + handle_->pop (); + delete token_; + token_ = 0; + assert (handle_->top ()->_type == SEQUENCE); + perform_or (); + _token_stack->push (static_cast(0)); + _token_stack->top () = new token (OREXP); + } + } + + void perform_or () + { + // perform or + node *rhs_ = _tree_node_stack.top (); + + _tree_node_stack.pop (); + + node *lhs_ = _tree_node_stack.top (); + + _node_ptr_vector->push_back (static_cast(0)); + _node_ptr_vector->back () = new selection_node (lhs_, rhs_); + _tree_node_stack.top 
() = _node_ptr_vector->back (); + } + + void sub (token_stack &handle_) + { + assert ((handle_->top ()->_type == SUB && + handle_->size () == 1) || handle_->size () == 2); + + if (handle_->size () == 1) + { + _token_stack->push (static_cast(0)); + _token_stack->top () = new token (SEQUENCE); + } + else + { + token *token_ = handle_->top (); + + handle_->pop (); + delete token_; + token_ = 0; + assert (handle_->top ()->_type == EXPRESSION); + // perform join + sequence (); + _token_stack->push (static_cast(0)); + _token_stack->top () = new token (SUB); + } + } + + void repeat (token_stack &handle_) + { + assert (handle_->top ()->_type == REPEAT && + handle_->size () >= 1 && handle_->size () <= 3); + + if (handle_->size () == 1) + { + _token_stack->push (static_cast(0)); + _token_stack->top () = new token (EXPRESSION); + } + else + { + token *token_ = handle_->top (); + + handle_->pop (); + delete token_; + token_ = 0; + assert (handle_->top ()->_type == DUP); + _token_stack->push (static_cast(0)); + _token_stack->top () = new token (REPEAT); + } + } + +#ifndef NDEBUG + void bol (token_stack &handle_) +#else + void bol (token_stack &) +#endif + { + assert (handle_->top ()->_type == BOL && + handle_->size () == 1); + + // store charset + _node_ptr_vector->push_back (static_cast(0)); + _node_ptr_vector->back () = new leaf_node (bol_token (), true); + _tree_node_stack.push (_node_ptr_vector->back ()); + _token_stack->push (static_cast(0)); + _token_stack->top () = new token (REPEAT); + } + +#ifndef NDEBUG + void eol (token_stack &handle_, state &state_) +#else + void eol (token_stack &, state &state_) +#endif + { + // Done in two parts for VC6. 
+ const string_token nl_ ('\n'); + + assert (handle_->top ()->_type == EOL && + handle_->size () == 1); + state_._nl_id = lookup (nl_); + // store charset + _node_ptr_vector->push_back (static_cast(0)); + _node_ptr_vector->back () = new leaf_node (eol_token (), true); + _tree_node_stack.push (_node_ptr_vector->back ()); + _token_stack->push (static_cast(0)); + _token_stack->top () = new token (REPEAT); + } + + // Uncompressed + void charset (token_stack &handle_, const false_ &) + { + assert (handle_->top ()->_type == CHARSET && + handle_->size () == 1); + + const id_type id_ = lookup (handle_->top ()->_str); + + // store charset + _node_ptr_vector->push_back (static_cast(0)); + _node_ptr_vector->back () = new leaf_node (id_, true); + _tree_node_stack.push (_node_ptr_vector->back ()); + _token_stack->push (static_cast(0)); + _token_stack->top () = new token (REPEAT); + } + + // Compressed + void charset (token_stack &handle_, const true_ &) + { + assert (handle_->top ()->_type == CHARSET && + handle_->size () == 1); + + std::auto_ptr token_ (handle_->top ()); + + handle_->pop (); + create_sequence (token_); + } + + // Slice wchar_t into sequence of char. + void create_sequence (std::auto_ptr &token_) + { + typename token::string_token::range_vector::iterator iter_ = + token_->_str._ranges.begin (); + typename token::string_token::range_vector::const_iterator end_ = + token_->_str._ranges.end (); + + string_token_vector data_[char_24_bit ? 
3 : 2];
+
+        for (; iter_ != end_; ++iter_)
+        {
+            slice_range (*iter_, data_, bool_ ());
+        }
+
+        push_ranges (data_, bool_ ());
+
+        _token_stack->push (static_cast(0));
+        _token_stack->top () = new token (OPENPAREN);
+        _token_stack->push (static_cast(0));
+        _token_stack->top () = new token (REGEX);
+        _token_stack->push (static_cast(0));
+        _token_stack->top () = new token (CLOSEPAREN);
+    }
+
+    // 16 bit unicode
+    // Splits the inclusive range [range_.first, range_.second] into pieces
+    // that can each be described as (high byte range, low byte range) and
+    // records every piece in data_ via insert_range ():
+    //   - both ends share a high byte: one piece;
+    //   - otherwise: the tail of the first high byte, every complete high
+    //     byte strictly in between (if any), and the head of the last one.
+    void slice_range (const input_range &range_, string_token_vector data_[2],
+        const false_ &)
+    {
+        const unsigned char first_msb_ = static_cast<unsigned char>
+            ((range_.first >> 8) & 0xff);
+        const unsigned char first_lsb_ = static_cast<unsigned char>
+            (range_.first & 0xff);
+        const unsigned char second_msb_ = static_cast<unsigned char>
+            ((range_.second >> 8) & 0xff);
+        const unsigned char second_lsb_ = static_cast<unsigned char>
+            (range_.second & 0xff);
+
+        if (first_msb_ == second_msb_)
+        {
+            insert_range (first_msb_, first_msb_, first_lsb_,
+                second_lsb_, data_);
+        }
+        else
+        {
+            // Remainder of the first high byte.
+            insert_range (first_msb_, first_msb_, first_lsb_, 0xff, data_);
+
+            // Complete high bytes strictly between the two ends.
+            if (second_msb_ > first_msb_ + 1)
+            {
+                insert_range (first_msb_ + 1, second_msb_ - 1, 0, 0xff, data_);
+            }
+
+            // Start of the last high byte.
+            insert_range (second_msb_, second_msb_, 0, second_lsb_, data_);
+        }
+    }
+
+    // 24 bit unicode
+    // Same idea as the 16 bit overload, but each piece is a triple of
+    // (top byte range, middle byte range, low byte range) recorded in
+    // data_[0], data_[1] and data_[2] via insert_range ().
+    void slice_range (const input_range &range_, string_token_vector data_[3],
+        const true_ &)
+    {
+        const unsigned char first_msb_ = static_cast<unsigned char>
+            ((range_.first >> 16) & 0xff);
+        const unsigned char first_mid_ = static_cast<unsigned char>
+            ((range_.first >> 8) & 0xff);
+        const unsigned char first_lsb_ = static_cast<unsigned char>
+            (range_.first & 0xff);
+        const unsigned char second_msb_ = static_cast<unsigned char>
+            ((range_.second >> 16) & 0xff);
+        const unsigned char second_mid_ = static_cast<unsigned char>
+            ((range_.second >> 8) & 0xff);
+        const unsigned char second_lsb_ = static_cast<unsigned char>
+            (range_.second & 0xff);
+
+        if (first_msb_ == second_msb_)
+        {
+            string_token_vector data2_[2];
+
+            // Re-use 16 bit slice function
+            // (it only looks at the two low bytes of range_), then prefix
+            // every (middle, low) pair it produced with the shared top byte.
+            slice_range (range_, data2_, false_ ());
+
+            for (std::size_t i_ = 0, size_ = data2_[0]->size ();
+                i_ < size_; ++i_)
+            {
+                insert_range (string_token (first_msb_, first_msb_),
+                    *(*data2_[0])[i_], *(*data2_[1])[i_], data_);
+            }
+        }
+        else
+        {
+            // Remainder of the first (top, middle) pair.
+            insert_range (first_msb_, first_msb_,
+                first_mid_, first_mid_,
+                first_lsb_, 0xff, data_);
+
+            // Remaining middle bytes under the first top byte.
+            if (first_mid_ != 0xff)
+            {
+                insert_range (first_msb_, first_msb_,
+                    first_mid_ + 1, 0xff,
+                    0, 0xff, data_);
+            }
+
+            // Complete top bytes strictly between the two ends. The bounds
+            // of this piece are top bytes (matching the guard above and the
+            // 16 bit overload), not middle bytes.
+            if (second_msb_ > first_msb_ + 1)
+            {
+                insert_range (first_msb_ + 1, second_msb_ - 1,
+                    0, 0xff,
+                    0, 0xff, data_);
+            }
+
+            // Start of the last top byte.
+            if (second_mid_ != 0)
+            {
+                insert_range (second_msb_, second_msb_,
+                    0, second_mid_ - 1,
+                    0, 0xff, data_);
+                insert_range (second_msb_, second_msb_,
+                    second_mid_, second_mid_,
+                    0, second_lsb_, data_);
+            }
+            else
+            {
+                insert_range (second_msb_, second_msb_,
+                    0, second_mid_,
+                    0, second_lsb_, data_);
+            }
+        }
+    }
+
+    // 16 bit unicode
+    void insert_range (const unsigned char first_, const unsigned char second_,
+        const unsigned char first2_, const unsigned char second2_,
+        string_token_vector data_[2])
+    {
+        const string_token token_ (first_ > second_ ? second_ : first_,
+            first_ > second_ ? first_ : second_);
+        const string_token token2_ (first2_ > second2_ ? second2_ : first2_,
+            first2_ > second2_ ?
first2_ : second2_); + + insert_range (token_, token2_, data_); + } + + void insert_range (const string_token &token_, const string_token &token2_, + string_token_vector data_[2]) + { + typename string_token_vector::vector::const_iterator iter_ = + std::find_if (data_[0]->begin (), data_[0]->end (), + find_functor (token_)); + + if (iter_ == data_[0]->end ()) + { + data_[0]->push_back (0); + data_[0]->back () = new string_token (token_); + data_[1]->push_back (0); + data_[1]->back () = new string_token (token2_); + } + else + { + const std::size_t index_ = iter_ - data_[0]->begin (); + + (*data_[1])[index_]->insert (token2_); + } + } + + // 24 bit unicode + void insert_range (const unsigned char first_, const unsigned char second_, + const unsigned char first2_, const unsigned char second2_, + const unsigned char first3_, const unsigned char second3_, + string_token_vector data_[3]) + { + const string_token token_ (first_ > second_ ? second_ : first_, + first_ > second_ ? first_ : second_); + const string_token token2_ (first2_ > second2_ ? second2_ : first2_, + first2_ > second2_ ? first2_ : second2_); + const string_token token3_ (first3_ > second3_ ? second3_ : first3_, + first3_ > second3_ ? 
first3_ : second3_); + + insert_range (token_, token2_, token3_, data_); + } + + void insert_range (const string_token &token_, const string_token &token2_, + const string_token &token3_, string_token_vector data_[3]) + { + typename string_token_vector::vector::const_iterator iter_ = + data_[0]->begin (); + typename string_token_vector::vector::const_iterator end_ = + data_[0]->end (); + bool finished_ = false; + + do + { + iter_ = std::find_if (iter_, end_, find_functor (token_)); + + if (iter_ == end_) + { + data_[0]->push_back (0); + data_[0]->back () = new string_token (token_); + data_[1]->push_back (0); + data_[1]->back () = new string_token (token2_); + data_[2]->push_back (0); + data_[2]->back () = new string_token (token3_); + finished_ = true; + } + else + { + const std::size_t index_ = iter_ - data_[0]->begin (); + + if (*(*data_[1])[index_] == token2_) + { + (*data_[2])[index_]->insert (token3_); + finished_ = true; + } + else + { + ++iter_; + } + } + } while (!finished_); + } + + // 16 bit unicode + void push_ranges (string_token_vector data_[2], const false_ &) + { + typename string_token_vector::vector::const_iterator viter_ = + data_[0]->begin (); + typename string_token_vector::vector::const_iterator vend_ = + data_[0]->end (); + typename string_token_vector::vector::const_iterator viter2_ = + data_[1]->begin (); + + push_range (*viter_++); + push_range (*viter2_++); + sequence (); + + while (viter_ != vend_) + { + push_range (*viter_++); + push_range (*viter2_++); + sequence (); + perform_or (); + } + } + + // 24 bit unicode + void push_ranges (string_token_vector data_[3], const true_ &) + { + typename string_token_vector::vector::const_iterator viter_ = + data_[0]->begin (); + typename string_token_vector::vector::const_iterator vend_ = + data_[0]->end (); + typename string_token_vector::vector::const_iterator viter2_ = + data_[1]->begin (); + typename string_token_vector::vector::const_iterator viter3_ = + data_[2]->begin (); + + push_range 
(*viter_++); + push_range (*viter2_++); + sequence (); + push_range (*viter3_++); + sequence (); + + while (viter_ != vend_) + { + push_range (*viter_++); + push_range (*viter2_++); + sequence (); + push_range (*viter3_++); + sequence (); + perform_or (); + } + } + + void push_range (const string_token *token_) + { + const id_type id_ = lookup (*token_); + + _node_ptr_vector->push_back (static_cast(0)); + _node_ptr_vector->back () = new leaf_node (id_, true); + _tree_node_stack.push (_node_ptr_vector->back ()); + } + + id_type lookup (const string_token &charset_) + { + // Converted to id_type below. + std::size_t id_ = sm_traits::npos (); + typename charset_map::const_iterator iter_ = + _charset_map.find (charset_); + + if (iter_ == _charset_map.end ()) + { + id_ = _charset_map.size (); + _charset_map.insert (charset_pair (charset_, id_)); + } + else + { + id_ = iter_->second; + } + + if (static_cast(id_) < id_) + { + throw runtime_error ("id_type is not large enough " + "to hold all ids."); + } + + return static_cast(id_); + } + + void macro (token_stack &handle_, const state &state_) + { + const token *top_ = handle_->top (); + + assert (top_->_type == MACRO && handle_->size () == 1); + + typename macro_map::const_iterator iter_ = + _macro_map.find (top_->_extra); + + if (iter_ == _macro_map.end ()) + { + const rules_char_type *name_ = top_->_extra.c_str (); + std::basic_stringstream ss_; + std::ostringstream os_; + + os_ << "Unknown MACRO name '"; + + while (*name_) + { + os_ << ss_.narrow (*name_++, ' '); + } + + os_ << "' in rule id " << state_._id << '.'; + throw runtime_error (os_.str ()); + } + + _tree_node_stack.push (iter_->second->copy (_node_ptr_vector)); + _token_stack->push (static_cast(0)); + _token_stack->top () = new token (REPEAT); + } + + void openparen (token_stack &handle_) + { + token *token_ = handle_->top (); + + assert (token_->_type == OPENPAREN && + handle_->size () == 3); + + handle_->pop (); + delete token_; + token_ = handle_->top (); 
+        assert (token_->_type == REGEX);
+        handle_->pop ();
+        delete token_;
+        token_ = 0;
+        assert (handle_->top ()->_type == CLOSEPAREN);
+        // '(' REGEX ')' has been consumed; replace it with a REPEAT token.
+        _token_stack->push (static_cast(0));
+        _token_stack->top () = new token (REPEAT);
+    }
+
+    // Pops the top sub-tree (rhs_), then replaces the new top (lhs_) with a
+    // sequence_node joining lhs_ and rhs_. The tree node stack ends up one
+    // entry shorter.
+    void sequence ()
+    {
+        node *rhs_ = _tree_node_stack.top ();
+
+        _tree_node_stack.pop ();
+
+        node *lhs_ = _tree_node_stack.top ();
+
+        // A null slot is appended first and filled in afterwards
+        // (presumably so the slot already exists when the node is
+        // allocated -- TODO confirm against ptr_vector).
+        _node_ptr_vector->push_back (static_cast(0));
+        _node_ptr_vector->back () = new sequence_node (lhs_, rhs_);
+        _tree_node_stack.top () = _node_ptr_vector->back ();
+    }
+
+    // Replaces the top sub-tree with a selection between it and a new leaf
+    // holding node::null_token (). greedy_ is applied to every node in the
+    // sub-tree's firstpos set and to the new leaf.
+    void optional (const bool greedy_)
+    {
+        // perform ?
+        node *lhs_ = _tree_node_stack.top ();
+        // Don't know if lhs_ is a leaf_node, so get firstpos.
+        typename node::node_vector &firstpos_ = lhs_->firstpos ();
+
+        for (typename node::node_vector::iterator iter_ = firstpos_.begin (),
+            end_ = firstpos_.end (); iter_ != end_; ++iter_)
+        {
+            // These are leaf_nodes!
+            (*iter_)->greedy (greedy_);
+        }
+
+        _node_ptr_vector->push_back (static_cast(0));
+
+        node *rhs_ = new leaf_node (node::null_token (), greedy_);
+
+        _node_ptr_vector->back () = rhs_;
+        _node_ptr_vector->push_back (static_cast(0));
+        _node_ptr_vector->back () = new selection_node (lhs_, rhs_);
+        _tree_node_stack.top () = _node_ptr_vector->back ();
+    }
+
+    // Replaces the top sub-tree with an iteration_node wrapping it.
+    void zero_or_more (const bool greedy_)
+    {
+        // perform *
+        node *ptr_ = _tree_node_stack.top ();
+
+        _node_ptr_vector->push_back (static_cast(0));
+        _node_ptr_vector->back () = new iteration_node (ptr_, greedy_);
+        _tree_node_stack.top () = _node_ptr_vector->back ();
+    }
+
+    // Replaces the top sub-tree x with the sequence x followed by an
+    // iteration_node over a copy of x.
+    void one_or_more (const bool greedy_)
+    {
+        // perform +
+        node *lhs_ = _tree_node_stack.top ();
+        node *copy_ = lhs_->copy (_node_ptr_vector);
+
+        _node_ptr_vector->push_back (static_cast(0));
+
+        node *rhs_ = new iteration_node (copy_, greedy_);
+
+        _node_ptr_vector->back () = rhs_;
+        _node_ptr_vector->push_back (static_cast(0));
+        _node_ptr_vector->back () = new sequence_node (lhs_, rhs_);
+        _tree_node_stack.top () = _node_ptr_vector->back ();
+    }
+
+    // perform {n[,[m]]}
+    // Semantic checks have already been performed.
+    // {0,} = *
+    // {0,1} = ?
+    // {1,} = +
+    // therefore we do not check for these cases.
+    //
+    // token_->_extra is read as "min[,[max]]": decimal digits, an optional
+    // comma, then optional decimal digits. The top sub-tree is expanded by
+    // copying it and joining the copies with sequence (); optional () and
+    // zero_or_more () supply the variable part when a comma was given.
+    void repeatn (const bool greedy_, const token *token_)
+    {
+        const rules_char_type *str_ = token_->_extra.c_str ();
+        std::size_t min_ = 0;
+        bool comma_ = false;
+        std::size_t max_ = 0;
+
+        // Leading digits: minimum count.
+        while (*str_>= '0' && *str_ <= '9')
+        {
+            min_ *= 10;
+            min_ += *str_ - '0';
+            ++str_;
+        }
+
+        comma_ = *str_ == ',';
+
+        if (comma_) ++str_;
+
+        // Digits after the comma: maximum count (stays 0 when absent).
+        while (*str_>= '0' && *str_ <= '9')
+        {
+            max_ *= 10;
+            max_ += *str_ - '0';
+            ++str_;
+        }
+
+        // {1} leaves the sub-tree unchanged.
+        if (!(min_ == 1 && !comma_))
+        {
+            // Number of copies joined by the first loop: min_, or max_ when
+            // min_ is zero.
+            const std::size_t top_ = min_ > 0 ? min_ : max_;
+
+            if (min_ == 0)
+            {
+                // Make the sub-tree itself optional before it is copied.
+                optional (greedy_);
+            }
+
+            node *prev_ = _tree_node_stack.top ()->
+                copy (_node_ptr_vector);
+            node *curr_ = 0;
+
+            // Each pass takes a fresh copy, then joins prev_ onto the tree
+            // on the stack; the fresh copy becomes the next prev_.
+            for (std::size_t i_ = 2; i_ < top_; ++i_)
+            {
+                node *temp_ = prev_->copy (_node_ptr_vector);
+
+                curr_ = temp_;
+                _tree_node_stack.push (static_cast(0));
+                _tree_node_stack.top () = prev_;
+                sequence ();
+                prev_ = curr_;
+            }
+
+            if (comma_ && min_ > 0)
+            {
+                if (min_ > 1)
+                {
+                    // One more mandatory copy is joined here, keeping a
+                    // spare copy in prev_ for the variable part below.
+                    node *temp_ = prev_->copy (_node_ptr_vector);
+
+                    curr_ = temp_;
+                    _tree_node_stack.push (static_cast(0));
+                    _tree_node_stack.top () = prev_;
+                    sequence ();
+                    prev_ = curr_;
+                }
+
+                if (comma_ && max_)
+                {
+                    // {n,m}: turn prev_ into an optional sub-tree and join
+                    // further copies of it (max_ - min_ in total once the
+                    // final join after this block has run).
+                    _tree_node_stack.push (static_cast(0));
+                    _tree_node_stack.top () = prev_;
+                    optional (greedy_);
+
+                    node *temp_ = _tree_node_stack.top ();
+
+                    _tree_node_stack.pop ();
+                    prev_ = temp_;
+
+                    const std::size_t count_ = max_ - min_;
+
+                    for (std::size_t i_ = 1; i_ < count_; ++i_)
+                    {
+                        node *temp_ = prev_->copy (_node_ptr_vector);
+
+                        curr_ = temp_;
+                        _tree_node_stack.push (static_cast(0));
+                        _tree_node_stack.top () = prev_;
+                        sequence ();
+                        prev_ = curr_;
+                    }
+                }
+                else
+                {
+                    // {n,}: the trailing part is prev_ repeated zero or
+                    // more times.
+                    _tree_node_stack.push (static_cast(0));
+                    _tree_node_stack.top () = prev_;
+                    zero_or_more (greedy_);
+
+                    node *temp_ = _tree_node_stack.top ();
+
+                    prev_ = temp_;
+                    _tree_node_stack.pop ();
+                }
+            }
+
+
_tree_node_stack.push (static_cast(0)); + _tree_node_stack.top () = prev_; + sequence (); + } + } + + void fixup_bol (node * &root_)const + { + typename node::node_vector *first_ = &root_->firstpos (); + bool found_ = false; + typename node::node_vector::const_iterator iter_ = + first_->begin (); + typename node::node_vector::const_iterator end_ = + first_->end (); + + for (; iter_ != end_; ++iter_) + { + const node *node_ = *iter_; + + found_ = !node_->end_state () && node_->token () == bol_token (); + + if (found_) break; + } + + if (!found_) + { + _node_ptr_vector->push_back (static_cast(0)); + _node_ptr_vector->back () = new leaf_node (bol_token (), true); + + node *lhs_ = _node_ptr_vector->back (); + + _node_ptr_vector->push_back (static_cast(0)); + _node_ptr_vector->back () = new leaf_node + (node::null_token (), true); + + node *rhs_ = _node_ptr_vector->back (); + + _node_ptr_vector->push_back (static_cast(0)); + _node_ptr_vector->back () = new selection_node (lhs_, rhs_); + lhs_ = _node_ptr_vector->back (); + + _node_ptr_vector->push_back (static_cast(0)); + _node_ptr_vector->back () = new sequence_node (lhs_, root_); + root_ = _node_ptr_vector->back (); + } + } +}; +} +} + +#endif diff --git a/inc/lexertl/parser/tokeniser/re_token.hpp b/inc/lexertl/parser/tokeniser/re_token.hpp new file mode 100644 index 0000000..449ad65 --- /dev/null +++ b/inc/lexertl/parser/tokeniser/re_token.hpp @@ -0,0 +1,100 @@ +// re_token.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying
+// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#ifndef LEXERTL_RE_TOKEN_HPP
+#define LEXERTL_RE_TOKEN_HPP
+
+#include "../../string_token.hpp"
+
+namespace lexertl
+{
+namespace detail
+{
+// Symbols handled by the regex parser. The enumerator values are used as
+// row/column indexes into precedence_table_ and as indexes into
+// precedence_strings_ below, so the order here must match both tables.
+enum token_type {BEGIN, REGEX, OREXP, SEQUENCE, SUB, EXPRESSION, REPEAT,
+    DUP, OR, CHARSET, BOL, EOL, MACRO, OPENPAREN, CLOSEPAREN, OPT, AOPT,
+    ZEROORMORE, AZEROORMORE, ONEORMORE, AONEORMORE, REPEATN, AREPEATN,
+    END};
+
+// A single regex token: its type plus optional payload.
+// NOTE(review): the template parameter list and the template arguments of
+// the two typedefs below are missing from this text -- restore from the
+// original header before compiling.
+template
+struct basic_re_token
+{
+    typedef basic_string_token string_token;
+    typedef std::basic_string string;
+
+    // Kind of token.
+    token_type _type;
+    // Extra text attached to the token; the parser reads it as the macro
+    // name for MACRO tokens and as the "n[,[m]]" text for repeat tokens.
+    string _extra;
+    // Character set; the parser reads it for CHARSET tokens.
+    string_token _str;
+
+    basic_re_token (const token_type type_ = BEGIN) :
+        _type (type_),
+        _extra (),
+        _str ()
+    {
+    }
+
+    // Resets the token to an empty BEGIN token.
+    void clear ()
+    {
+        _type = BEGIN;
+        _extra.clear ();
+        _str.clear ();
+    }
+
+    // Member-wise copy assignment.
+    basic_re_token &operator = (const basic_re_token &rhs_)
+    {
+        _type = rhs_._type;
+        _extra = rhs_._extra;
+        _str = rhs_._str;
+        return *this;
+    }
+
+    // Returns the precedence relation between this token (row, _type) and
+    // a token of type type_ (column): '<', '=', '>' or ' '.
+    // basic_parser::parse calls this on the token at the top of its stack
+    // with the type of the incoming token; it shifts on '<' and '=',
+    // reduces on '>' and reports a syntax error for anything else.
+    char precedence (const token_type type_) const
+    {
+        // Moved in here for Solaris compiler.
+        static const char precedence_table_[END + 1][END + 1] = {
+//        BEG, REG, ORE, SEQ, SUB, EXP, RPT, DUP,  | , CHR, BOL, EOL, MCR,  ( ,  ) ,  ? , ?? ,  * , *? ,  + , +?, {n}, {n}?, END
+/*BEGIN*/{' ', '<', '<', '<', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
+/*REGEX*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
+/*OREXP*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', '>', '>', '>', '>', ' ', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
+/* SEQ */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', ' ', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
+/* SUB */{' ', ' ', ' ', ' ', ' ', '=', '<', ' ', '>', '<', '<', '<', '<', '<', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
+/*EXPRE*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
+/* RPT */{' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', '>', '>', '>', '>', '>', '>', '>', '<', '<', '<', '<', '<', '<', '<', '<', '>'},
+/*DUPLI*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
+/*  |  */{' ', ' ', ' ', '=', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '},
+/*CHARA*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
+/* BOL */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
+/* EOL */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
+/*MACRO*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
+/*  (  */{' ', '=', '<', '<', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '},
+/*  )  */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
+/*  ?  */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
+/* ??  */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
+/*  *  */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
+/* *?  */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
+/*  +  */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
+/* +?  */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
+/*{n,m}*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
+/*{nm}?*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
+/* END */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '}
+};
+
+        return precedence_table_[_type][type_];
+    }
+
+    // Returns a printable name for this token's type; basic_parser::parse
+    // uses it when building its syntax error message.
+    const char *precedence_string () const
+    {
+        // Moved in here for Solaris compiler.
+ static const char *precedence_strings_[END + 1] = + {"BEGIN", "REGEX", "OREXP", "SEQUENCE", "SUB", "EXPRESSION", + "REPEAT", "DUPLICATE", "|", "CHARSET", "^", "$", "MACRO", "(", ")", + "?", "??", "*", "*?", "+", "+?", "{n[,[m]]}", "{n[,[m]]}?", "END"}; + + return precedence_strings_[_type]; + } +}; +} +} + +#endif diff --git a/inc/lexertl/parser/tokeniser/re_tokeniser.hpp b/inc/lexertl/parser/tokeniser/re_tokeniser.hpp new file mode 100644 index 0000000..394eea4 --- /dev/null +++ b/inc/lexertl/parser/tokeniser/re_tokeniser.hpp @@ -0,0 +1,829 @@ +// tokeniser.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_RE_TOKENISER_HPP +#define LEXERTL_RE_TOKENISER_HPP + +#include +#include "re_token.hpp" +#include "../../runtime_error.hpp" +#include "../../size_t.hpp" +#include +#include "../../string_token.hpp" +#include "re_tokeniser_helper.hpp" + +namespace lexertl +{ +namespace detail +{ +template +class basic_re_tokeniser +{ +public: + typedef basic_re_token re_token; + typedef basic_re_tokeniser_helper + tokeniser_helper; + typedef typename tokeniser_helper::char_state char_state; + typedef typename tokeniser_helper::state state; + typedef basic_string_token string_token; + + static void next (re_token *lhs_, state &state_, re_token *token_) + { + rules_char_type ch_ = 0; + bool eos_ = state_.next (ch_); + bool skipped_ = false; + + token_->clear (); + + do + { + // string begin/end + while (!eos_ && ch_ == '"') + { + state_._in_string ^= 1; + eos_ = state_.next (ch_); + } + + // (?# ...) 
+ skipped_ = comment (eos_, ch_, state_); + // skip_ws set + skipped_ |= skip (eos_, ch_, state_); + } while (skipped_); + + if (eos_) + { + if (state_._in_string) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex (missing '\"') in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + if (state_._paren_count) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex (missing ')') in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + token_->_type = END; + } + else + { + if (ch_ == '\\') + { + // Even if we are in a string, respect escape sequences... + token_->_type = CHARSET; + escape (state_, token_->_str); + } + else if (state_._in_string) + { + // All other meta characters lose their special meaning + // inside a string. + token_->_type = CHARSET; + token_->_str.insert (typename string_token::range (ch_, ch_)); + } + else + { + // Not an escape sequence and not inside a string, so + // check for meta characters. 
+ switch (ch_) + { + case '(': + token_->_type = OPENPAREN; + ++state_._paren_count; + read_options (state_); + break; + case ')': + --state_._paren_count; + + if (state_._paren_count < 0) + { + std::ostringstream ss_; + + ss_ << "Number of open parenthesis < 0 " + "at index " << state_.index () - 1 << + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + token_->_type = CLOSEPAREN; + + if (!state_._flags_stack.empty ()) + { + state_._flags = state_._flags_stack.top (); + state_._flags_stack.pop (); + } + + break; + case '?': + if (!state_.eos () && *state_._curr == '?') + { + token_->_type = AOPT; + state_.increment (); + } + else + { + token_->_type = OPT; + } + + break; + case '*': + if (!state_.eos () && *state_._curr == '?') + { + token_->_type = AZEROORMORE; + state_.increment (); + } + else + { + token_->_type = ZEROORMORE; + } + + break; + case '+': + if (!state_.eos () && *state_._curr == '?') + { + token_->_type = AONEORMORE; + state_.increment (); + } + else + { + token_->_type = ONEORMORE; + } + + break; + case '{': + open_curly (lhs_, state_, token_); + break; + case '|': + token_->_type = OR; + break; + case '^': + if (!state_._macro && state_._curr - 1 == state_._start) + { + token_->_type = BOL; + } + else + { + token_->_type = CHARSET; + token_->_str.insert (typename string_token::range + (ch_, ch_)); + } + + break; + case '$': + if (!state_._macro && state_._curr == state_._end) + { + token_->_type = EOL; + } + else + { + token_->_type = CHARSET; + token_->_str.insert (typename string_token::range + (ch_, ch_)); + } + + break; + case '.': + { + token_->_type = CHARSET; + + if (state_._flags & dot_not_newline) + { + token_->_str.insert (typename string_token::range + ('\n', '\n')); + } + + token_->_str.negate (); + break; + } + case '[': + { + token_->_type = CHARSET; + tokeniser_helper::charset (state_, token_->_str); + break; + } + case '/': + { + std::ostringstream ss_; + + ss_ << "Lookahead ('/') is not supported 
yet in " << + "rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + break; + } + default: + token_->_type = CHARSET; + + if ((state_._flags & icase) && + (std::isupper (ch_, state_._locale) || + std::islower (ch_, state_._locale))) + { + char_type upper_ = std::toupper + (ch_, state_._locale); + char_type lower_ = std::tolower + (ch_, state_._locale); + + token_->_str.insert (typename string_token::range + (upper_, upper_)); + token_->_str.insert (typename string_token::range + (lower_, lower_)); + } + else + { + token_->_str.insert (typename string_token::range + (ch_, ch_)); + } + + break; + } + } + } + } + +private: + static bool comment (bool &eos_, rules_char_type &ch_, state &state_) + { + bool skipped_ = false; + + if (!eos_ && !state_._in_string && ch_ == '(' && + !state_.eos () && *state_._curr == '?' && + state_._curr + 1 < state_._end && *(state_._curr + 1) == '#') + { + std::size_t paren_count_ = 1; + + state_.increment (); + state_.increment (); + + do + { + eos_ = state_.next (ch_); + + if (ch_ == '(') + { + ++paren_count_; + } + else if (ch_ == ')') + { + --paren_count_; + } + } while (!eos_ && !(ch_ == ')' && paren_count_ == 0)); + + if (eos_) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex (unterminated comment) " << + "in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + else + { + eos_ = state_.next (ch_); + } + + skipped_ = true; + } + + return skipped_; + } + + static bool skip (bool &eos_, rules_char_type &ch_, state &state_) + { + bool skipped_ = false; + + if (!eos_ && (state_._flags & skip_ws) && !state_._in_string) + { + bool c_comment_ = false; + bool skip_ws_ = false; + + do + { + c_comment_ = ch_ == '/' && !state_.eos () && + *state_._curr == '*'; + skip_ws_ = !c_comment_ && (ch_ == ' ' || ch_ == '\t' || + ch_ == '\n' || ch_ == '\r' || ch_ == '\f' || ch_ == '\v'); + + if (c_comment_) + { + state_.increment (); + eos_ = 
state_.next (ch_); + + while (!eos_ && !(ch_ == '*' && !state_.eos () && + *state_._curr == '/')) + { + eos_ = state_.next (ch_); + } + + if (eos_) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex (unterminated " << + "C style comment) in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + else + { + state_.increment (); + eos_ = state_.next (ch_); + } + + skipped_ = true; + } + else if (skip_ws_) + { + eos_ = state_.next (ch_); + skipped_ = true; + } + } while (c_comment_ || skip_ws_); + } + + return skipped_; + } + + static void read_options (state &state_) + { + if (!state_.eos () && *state_._curr == '?') + { + rules_char_type ch_ = 0; + bool eos_ = false; + bool negate_ = false; + + state_.increment (); + eos_ = state_.next (ch_); + state_._flags_stack.push (state_._flags); + + while (!eos_ && ch_ != ':') + { + switch (ch_) + { + case '-': + negate_ ^= 1; + break; + case 'i': + if (negate_) + { + state_._flags = state_._flags & ~icase; + } + else + { + state_._flags = state_._flags | icase; + } + + negate_ = false; + break; + case 's': + if (negate_) + { + state_._flags = state_._flags | dot_not_newline; + } + else + { + state_._flags = state_._flags & ~dot_not_newline; + } + + negate_ = false; + break; + case 'x': + if (negate_) + { + state_._flags = state_._flags & ~skip_ws; + } + else + { + state_._flags = state_._flags | skip_ws; + } + + negate_ = false; + break; + default: + { + std::ostringstream ss_; + + ss_ << "Unknown option at index " << + state_.index () - 1 << " in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + eos_ = state_.next (ch_); + } + + // End of string handler will handle early termination + } + else if (!state_._flags_stack.empty ()) + { + state_._flags_stack.push (state_._flags); + } + } + + static void escape (state &state_, string_token &token_) + { + char_type ch_ = 0; + std::size_t str_len_ = 0; + const char 
*str_ = tokeniser_helper::escape_sequence (state_, + ch_, str_len_); + + if (str_) + { + char_state state2_ (str_ + 1, str_ + str_len_, state_._id, + state_._flags, state_._locale, false); + + tokeniser_helper::charset (state2_, token_); + } + else + { + token_.insert (typename string_token::range (ch_, ch_)); + } + } + + static void open_curly (re_token *lhs_, state &state_, + re_token *token_) + { + if (state_.eos ()) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex (missing '}') in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + else if (*state_._curr == '-') + { + charset_difference (lhs_, state_, token_); + } + else if (*state_._curr == '+') + { + charset_union (lhs_, state_, token_); + } + else if (*state_._curr >= '0' && *state_._curr <= '9') + { + repeat_n (state_, token_); + } + else + { + macro (state_, token_); + } + } + + static void charset_difference (re_token *lhs_, state &state_, + re_token *token_) + { + rules_char_type ch_ = 0; + + if (lhs_->_type != CHARSET) + { + std::ostringstream ss_; + + ss_ << "CHARSET must precede {-} at index " << + state_.index () - 1 << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + state_.next (ch_); + + if (state_.next (ch_)) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex (missing '}') in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + if (ch_ != '}') + { + std::ostringstream ss_; + + ss_ << "Missing '}' at index " << state_.index () - 1 << + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + re_token rhs_; + + next (lhs_, state_, &rhs_); + + if (rhs_._type != CHARSET) + { + std::ostringstream ss_; + + ss_ << "CHARSET must follow {-} at index " << + state_.index () - 1 << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + 
lhs_->_str.remove (rhs_._str); + + if (lhs_->_str.empty ()) + { + std::ostringstream ss_; + + ss_ << "Empty charset created by {-} at index " << + state_.index () - 1 << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + next (lhs_, state_, token_); + } + + static void charset_union (re_token *lhs_, state &state_, + re_token *token_) + { + rules_char_type ch_ = 0; + + if (lhs_->_type != CHARSET) + { + std::ostringstream ss_; + + ss_ << "CHARSET must precede {+} at index " << + state_.index () - 1 << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + state_.next (ch_); + + if (state_.next (ch_)) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex (missing '}') in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + if (ch_ != '}') + { + std::ostringstream ss_; + + ss_ << "Missing '}' at index " << state_.index () - 1 << + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + re_token rhs_; + + next (lhs_, state_, &rhs_); + + if (rhs_._type != CHARSET) + { + std::ostringstream ss_; + + ss_ << "CHARSET must follow {+} at index " << + state_.index () - 1 << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + lhs_->_str.insert (rhs_._str); + next (lhs_, state_, token_); + } + + // SYNTAX: + // {n[,[n]]} + // SEMANTIC RULES: + // {0} - INVALID (throw exception) + // {0,} = * + // {0,0} - INVALID (throw exception) + // {0,1} = ? 
+ // {1,} = + + // {min,max} where min == max - {min} + // {min,max} where max < min - INVALID (throw exception) + static void repeat_n (state &state_, re_token *token_) + { + rules_char_type ch_ = 0; + bool eos_ = state_.next (ch_); + std::size_t min_ = 0; + std::size_t max_ = 0; + + while (!eos_ && ch_ >= '0' && ch_ <= '9') + { + min_ *= 10; + min_ += ch_ - '0'; + token_->_extra += ch_; + eos_ = state_.next (ch_); + } + + if (eos_) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex (missing '}') in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + bool min_max_ = false; + bool repeatn_ = true; + + if (ch_ == ',') + { + token_->_extra += ch_; + eos_ = state_.next (ch_); + + if (eos_) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex (missing '}') in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + if (ch_ == '}') + { + // Small optimisation: Check for '*' equivalency. + if (min_ == 0) + { + token_->_type = ZEROORMORE; + repeatn_ = false; + } + // Small optimisation: Check for '+' equivalency. + else if (min_ == 1) + { + token_->_type = ONEORMORE; + repeatn_ = false; + } + } + else + { + if (ch_ < '0' || ch_ > '9') + { + std::ostringstream ss_; + + ss_ << "Missing '}' at index " << state_.index () - 1 << + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + min_max_ = true; + + do + { + max_ *= 10; + max_ += ch_ - '0'; + token_->_extra += ch_; + eos_ = state_.next (ch_); + } while (!eos_ && ch_ >= '0' && ch_ <= '9'); + + if (eos_) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex (missing '}') " + "in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + // Small optimisation: Check for '?' equivalency. 
+ if (min_ == 0 && max_ == 1) + { + token_->_type = OPT; + repeatn_ = false; + } + // Small optimisation: if min == max, then min. + else if (min_ == max_) + { + token_->_extra.erase (token_->_extra.find (',')); + min_max_ = false; + max_ = 0; + } + } + } + + if (ch_ != '}') + { + std::ostringstream ss_; + + ss_ << "Missing '}' at index " << state_.index () - 1 << + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + if (repeatn_) + { + // SEMANTIC VALIDATION follows: + // NOTE: {0,} has already become * + // therefore we don't check for a comma. + if (min_ == 0 && max_ == 0) + { + std::ostringstream ss_; + + ss_ << "Cannot have exactly zero repeats preceding index " << + state_.index () << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + if (min_max_ && max_ < min_) + { + std::ostringstream ss_; + + ss_ << "Max less than min preceding index " << + state_.index () << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + if (!state_.eos () && *state_._curr == '?') + { + token_->_type = AREPEATN; + state_.increment (); + } + else + { + token_->_type = REPEATN; + } + } + else if (token_->_type == ZEROORMORE) + { + if (!state_.eos () && *state_._curr == '?') + { + token_->_type = AZEROORMORE; + state_.increment (); + } + } + else if (token_->_type == ONEORMORE) + { + if (!state_.eos () && *state_._curr == '?') + { + token_->_type = AONEORMORE; + state_.increment (); + } + } + else if (token_->_type == OPT) + { + if (!state_.eos () && *state_._curr == '?') + { + token_->_type = AOPT; + state_.increment (); + } + } + } + + static void macro (state &state_, re_token *token_) + { + rules_char_type ch_ = 0; + bool eos_ = false; + + state_.next (ch_); + + if (ch_ != '_' && !(ch_ >= 'A' && ch_ <= 'Z') && + !(ch_ >= 'a' && ch_ <= 'z')) + { + std::ostringstream ss_; + + ss_ << "Invalid MACRO name at index " << state_.index () - 1 << + " in rule id " << state_._id << '.'; + throw 
runtime_error (ss_.str ()); + } + + do + { + token_->_extra += ch_; + eos_ = state_.next (ch_); + + if (eos_) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex " << + "(missing '}') in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } while (ch_ == '_' || ch_ == '-' || (ch_ >= 'A' && ch_ <= 'Z') || + (ch_ >= 'a' && ch_ <= 'z') || (ch_ >= '0' && ch_ <= '9')); + + if (ch_ != '}') + { + std::ostringstream ss_; + + ss_ << "Missing '}' at index " << state_.index () - 1 << + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + token_->_type = MACRO; + } +}; +} +} + +#endif diff --git a/inc/lexertl/parser/tokeniser/re_tokeniser_helper.hpp b/inc/lexertl/parser/tokeniser/re_tokeniser_helper.hpp new file mode 100644 index 0000000..4507ce2 --- /dev/null +++ b/inc/lexertl/parser/tokeniser/re_tokeniser_helper.hpp @@ -0,0 +1,2351 @@ +// tokeniser_helper.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_RE_TOKENISER_HELPER_H +#define LEXERTL_RE_TOKENISER_HELPER_H + +#include "../../bool.hpp" +#include "../../char_traits.hpp" +// strlen() +#include +#include "../../size_t.hpp" +#include "re_tokeniser_state.hpp" +#include "../../runtime_error.hpp" +#include +#include "../../string_token.hpp" + +namespace lexertl +{ +namespace detail +{ +template > +class basic_re_tokeniser_helper +{ +public: + typedef basic_re_tokeniser_state char_state; + typedef basic_re_tokeniser_state state; + typedef basic_string_token string_token; + + template + struct size + { + }; + + typedef size<1> one; + typedef size<2> two; + typedef size<4> four; + + template + static const char *escape_sequence (state_type &state_, + char_type &ch_, std::size_t &str_len_) + { + bool eos_ = state_.eos (); + + if (eos_) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex following '\\' in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + const char *str_ = charset_shortcut (state_, str_len_); + + if (str_) + { + state_.increment (); + } + else + { + ch_ = chr (state_); + } + + return str_; + } + + // This function can call itself. 
+ template + static void charset (state_type &state_, string_token &token_) + { + bool negated_ = false; + typename state_type::char_type ch_ = 0; + bool eos_ = state_.next (ch_); + + if (eos_) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex following '[' in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + negated_ = ch_ == '^'; + + if (negated_) + { + eos_ = state_.next (ch_); + + if (eos_) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex following '^' in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + bool chset_ = false; + typename string_token::char_type prev_ = 0; + + while (ch_ != ']') + { + if (ch_ == '\\') + { + std::size_t str_len_ = 0; + const char *str_ = escape_sequence (state_, prev_, + str_len_); + + chset_ = str_ != 0; + + if (chset_) + { + char_state temp_state_ (str_ + 1, str_ + str_len_, + state_._id, state_._flags, state_._locale, false); + string_token temp_token_; + + charset (temp_state_, temp_token_); + token_.insert (temp_token_); + } + } + else if (ch_ == '[' && !state_.eos () && *state_._curr == ':') + { + state_.increment (); + posix (state_, token_); + chset_ = true; + } + else + { + chset_ = false; + prev_ = ch_; + } + + eos_ = state_.next (ch_); + + // Covers preceding if, else if and else + if (eos_) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex (missing ']') in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + if (ch_ == '-') + { + charset_range (chset_, state_, eos_, ch_, prev_, + token_); + } + else if (!chset_) + { + token_.insert (typename string_token::range (prev_, prev_)); + + if (state_._flags & icase) + { + const input_char_type folded_ = fold (prev_, + state_._locale, size ()); + + if (prev_ != folded_) + { + token_.insert 
(typename string_token::range + (folded_, folded_)); + } + } + } + } + + if (negated_) + { + token_.negate (); + } + + if (token_.empty ()) + { + std::ostringstream ss_; + + ss_ << "Empty charsets not allowed preceding index " << + state_.index () << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + +private: + struct char_pair + { + input_char_type first; + input_char_type second; + }; + + struct fold_pair + { + char_pair from; + char_pair to; + }; + + template + static void posix (state_type &state_, string_token &token_) + { + bool negate_ = false; + + if (!state_.eos () && *state_._curr == '^') + { + negate_ = true; + state_.increment (); + } + + if (!state_.eos ()) + { + switch (*state_._curr) + { + case 'a': + // alnum + // alpha + alnum_alpha (state_, token_, negate_); + break; + case 'b': + // blank + blank (state_, token_, negate_); + break; + case 'c': + // cntrl + cntrl (state_, token_, negate_); + break; + case 'd': + // digit + digit (state_, token_, negate_); + break; + case 'g': + // graph + graph (state_, token_, negate_); + break; + case 'l': + // lower + lower (state_, token_, negate_); + break; + case 'p': + // print + // punct + print_punct (state_, token_, negate_); + break; + case 's': + // space + space (state_, token_, negate_); + break; + case 'u': + // upper + upper (state_, token_, negate_); + break; + case 'x': + // xdigit + xdigit (state_, token_, negate_); + break; + default: + { + std::ostringstream ss_; + + ss_ << "Unknown POSIX charset at index " << + state_.index () << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + break; + } + } + } + else + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex (unterminated POSIX charset) " << + "in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + template + static void alnum_alpha (state_type &state_, string_token &token_, + const bool negate_) 
+ { + bool alnum_ = true; + + state_.increment (); + + if (!state_.eos () && *state_._curr == 'l') + { + state_.increment (); + + if (!state_.eos ()) + { + if (*state_._curr == 'n') + { + state_.increment (); + + if (!state_.eos () && *state_._curr == 'u') + { + state_.increment (); + + if (!state_.eos () && *state_._curr == 'm') + { + state_.increment (); + } + } + } + else if (*state_._curr == 'p') + { + state_.increment (); + + if (!state_.eos () && *state_._curr == 'h') + { + state_.increment (); + + if (!state_.eos () && *state_._curr == 'a') + { + state_.increment (); + alnum_ = false; + } + } + } + } + } + + if (!state_.eos () && *state_._curr == ':') + { + state_.increment (); + } + + if (!state_.eos () && *state_._curr == ']') + { + std::string str_; + + state_.increment (); + + if (alnum_) + { + // alnum + str_ = sizeof(input_char_type) == 1 ? + make_alnum (state_._locale) : + std::string ("[\\p{Ll}\\p{Lu}\\p{Nd}]"); + } + else + { + // alpha + str_ = sizeof(input_char_type) == 1 ? + make_alpha (state_._locale) : + std::string ("[\\p{Ll}\\p{Lu}]"); + } + + insert_charset (str_.c_str (), state_, token_, negate_); + } + else + { + std::ostringstream ss_; + + ss_ << "Unknown POSIX charset at index " << + state_.index () << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + static std::string make_alnum (std::locale &locale_) + { + std::string str_ (1, '['); + + for (std::size_t i_ = 0; i_ < 256; ++i_) + { + if (std::use_facet > (locale_). + is (std::ctype_base::alnum, static_cast(i_))) + { + str_ += static_cast(i_); + } + } + + str_ += ']'; + return str_; + } + + static std::string make_alpha (std::locale &locale_) + { + std::string str_ (1, '['); + + for (std::size_t i_ = 0; i_ < 256; ++i_) + { + if (std::use_facet > (locale_). 
+ is (std::ctype_base::alpha, static_cast(i_))) + { + str_ += static_cast(i_); + } + } + + str_ += ']'; + return str_; + } + + template + static void blank (state_type &state_, string_token &token_, + const bool negate_) + { + const char *blank_ = "lank"; + + state_.increment (); + + while (!state_.eos () && *blank_ && *state_._curr == *blank_++) + { + state_.increment (); + } + + if (!*blank_ && !state_.eos () && *state_._curr == ':') + { + state_.increment (); + } + + if (!*blank_ && !state_.eos () && *state_._curr == ']') + { + const char *str_ = sizeof(input_char_type) == 1 ? + "[ \t]" : "[\\p{Zs}\t]"; + + state_.increment (); + insert_charset (str_, state_, token_, negate_); + } + else + { + std::ostringstream ss_; + + ss_ << "Unknown POSIX charset at index " << + state_.index () << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + template + static void cntrl (state_type &state_, string_token &token_, + const bool negate_) + { + const char *cntrl_ = "ntrl"; + + state_.increment (); + + while (!state_.eos () && *cntrl_ && *state_._curr == *cntrl_++) + { + state_.increment (); + } + + if (!*cntrl_ && !state_.eos () && *state_._curr == ':') + { + state_.increment (); + } + + if (!*cntrl_ && !state_.eos () && *state_._curr == ']') + { + const char *str_ = sizeof(input_char_type) == 1 ? 
+ "[\\x00-\x1f\x7f]" : "[\\p{Cc}]"; + + state_.increment (); + insert_charset (str_, state_, token_, negate_); + } + else + { + std::ostringstream ss_; + + ss_ << "Unknown POSIX charset at index " << + state_.index () << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + template + static void digit (state_type &state_, string_token &token_, + const bool negate_) + { + const char *digit_ = "igit"; + + state_.increment (); + + while (!state_.eos () && *digit_ && *state_._curr == *digit_++) + { + state_.increment (); + } + + if (!*digit_ && !state_.eos () && *state_._curr == ':') + { + state_.increment (); + } + + if (!*digit_ && !state_.eos () && *state_._curr == ']') + { + const char *str_ = sizeof(input_char_type) == 1 ? + "[0-9]" : "[\\p{Nd}]"; + + state_.increment (); + insert_charset (str_, state_, token_, negate_); + } + else + { + std::ostringstream ss_; + + ss_ << "Unknown POSIX charset at index " << state_.index () << + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + template + static void graph (state_type &state_, string_token &token_, + const bool negate_) + { + const char *graph_ = "raph"; + + state_.increment (); + + while (!state_.eos () && *graph_ && *state_._curr == *graph_++) + { + state_.increment (); + } + + if (!*graph_ && !state_.eos () && *state_._curr == ':') + { + state_.increment (); + } + + if (!*graph_ && !state_.eos () && *state_._curr == ']') + { + const char *str_ = sizeof(input_char_type) == 1 ? 
+ "[\x21-\x7e]" : "[^\\p{Z}\\p{C}]"; + + state_.increment (); + insert_charset (str_, state_, token_, negate_); + } + else + { + std::ostringstream ss_; + + ss_ << "Unknown POSIX charset at index " << state_.index () << + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + template + static void lower (state_type &state_, string_token &token_, + const bool negate_) + { + const char *lower_ = "ower"; + + state_.increment (); + + while (!state_.eos () && *lower_ && *state_._curr == *lower_++) + { + state_.increment (); + } + + if (!*lower_ && !state_.eos () && *state_._curr == ':') + { + state_.increment (); + } + + if (!*lower_ && !state_.eos () && *state_._curr == ']') + { + std::string str_ = sizeof(input_char_type) == 1 ? + create_lower (state_._locale) : + std::string ("[\\p{Ll}]"); + + state_.increment (); + insert_charset (str_.c_str (), state_, token_, negate_); + } + else + { + std::ostringstream ss_; + + ss_ << "Unknown POSIX charset at index " << state_.index () << + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + static std::string create_lower (std::locale &locale_) + { + std::string str_ (1, '['); + + for (std::size_t i_ = 0; i_ < 256; ++i_) + { + if (std::use_facet > (locale_). 
+ is (std::ctype_base::lower, static_cast(i_))) + { + str_ += static_cast(i_); + } + } + + str_ += ']'; + return str_; + } + + template + static void print_punct (state_type &state_, string_token &token_, + const bool negate_) + { + bool print_ = true; + + state_.increment (); + + if (!state_.eos ()) + { + if (*state_._curr == 'r') + { + state_.increment (); + + if (!state_.eos () && *state_._curr == 'i') + { + state_.increment (); + + if (!state_.eos () && *state_._curr == 'n') + { + state_.increment (); + + if (!state_.eos () && *state_._curr == 't') + { + state_.increment (); + } + } + } + } + else if (*state_._curr == 'u') + { + state_.increment (); + + if (!state_.eos () && *state_._curr == 'n') + { + state_.increment (); + + if (!state_.eos () && *state_._curr == 'c') + { + state_.increment (); + + if (!state_.eos () && *state_._curr == 't') + { + state_.increment (); + print_ = false; + } + } + } + } + } + + if (!state_.eos () && *state_._curr == ':') + { + state_.increment (); + } + + if (!state_.eos () && *state_._curr == ']') + { + const char *str_ = 0; + + state_.increment (); + + if (print_) + { + // print + str_ = sizeof(input_char_type) == 1 ? + "[\x20-\x7e]" : "[\\p{C}]"; + } + else + { + // punct + str_ = sizeof(input_char_type) == 1 ? 
+ "[!\"#$%&'()*+,\\-./:;<=>?@[\\\\\\]^_`{|}~]" : + "[\\p{P}\\p{S}]"; + } + + insert_charset (str_, state_, token_, negate_); + } + else + { + std::ostringstream ss_; + + ss_ << "Unknown POSIX charset at index " << state_.index () << + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + template + static void space (state_type &state_, string_token &token_, + const bool negate_) + { + const char *space_ = "pace"; + + state_.increment (); + + while (!state_.eos () && *space_ && *state_._curr == *space_++) + { + state_.increment (); + } + + if (!*space_ && !state_.eos () && *state_._curr == ':') + { + state_.increment (); + } + + if (!*space_ && !state_.eos () && *state_._curr == ']') + { + const char *str_ = sizeof(input_char_type) == 1 ? + "[ \t\r\n\v\f]" : "[\\p{Z}\t\r\n\v\f]"; + + state_.increment (); + insert_charset (str_, state_, token_, negate_); + } + else + { + std::ostringstream ss_; + + ss_ << "Unknown POSIX charset at index " << state_.index () << + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + template + static void upper (state_type &state_, string_token &token_, + const bool negate_) + { + const char *upper_ = "pper"; + + state_.increment (); + + while (!state_.eos () && *upper_ && *state_._curr == *upper_++) + { + state_.increment (); + } + + if (!*upper_ && !state_.eos () && *state_._curr == ':') + { + state_.increment (); + } + + if (!*upper_ && !state_.eos () && *state_._curr == ']') + { + std::string str_ = sizeof(input_char_type) == 1 ? 
+ create_upper (state_._locale) : + std::string ("[\\p{Lu}]"); + + state_.increment (); + insert_charset (str_.c_str (), state_, token_, negate_); + } + else + { + std::ostringstream ss_; + + ss_ << "Unknown POSIX charset at index " << state_.index () << + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + static std::string create_upper (std::locale &locale_) + { + std::string str_ (1, '['); + + for (std::size_t i_ = 0; i_ < 256; ++i_) + { + if (std::use_facet > (locale_). + is (std::ctype_base::upper, static_cast(i_))) + { + str_ += static_cast(i_); + } + } + + str_ += ']'; + return str_; + } + + template + static void xdigit (state_type &state_, string_token &token_, + const bool negate_) + { + const char *xdigit_ = "digit"; + + state_.increment (); + + while (!state_.eos () && *xdigit_ && *state_._curr == *xdigit_++) + { + state_.increment (); + } + + if (!*xdigit_ && !state_.eos () && *state_._curr == ':') + { + state_.increment (); + } + + if (!*xdigit_ && !state_.eos () && *state_._curr == ']') + { + const char *str_ = "[0-9A-Fa-f]"; + + state_.increment (); + insert_charset (str_, state_, token_, negate_); + } + else + { + std::ostringstream ss_; + + ss_ << "Unknown POSIX charset at index " << state_.index () << + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + template + static void insert_charset (const char *str_, state_type &state_, + string_token &token_, const bool negate_) + { + // Some systems have strlen in namespace std. 
+ using namespace std; + + char_state temp_state_ (str_ + 1, str_ + strlen (str_), + state_._id, state_._flags, state_._locale, false); + string_token temp_token_; + + charset (temp_state_, temp_token_); + + if (negate_) temp_token_.negate (); + + token_.insert (temp_token_); + } + + template + static const char *charset_shortcut + (state_type &state_, std::size_t &str_len_) + { + const char *str_ = 0; + + switch (*state_._curr) + { + case 'd': + str_ = "[0-9]"; + break; + case 'D': + str_ = "[^0-9]"; + break; + case 'p': + str_ = unicode_escape (state_); + break; + case 's': + str_ = "[ \t\n\r\f\v]"; + break; + case 'S': + str_ = "[^ \t\n\r\f\v]"; + break; + case 'w': + str_ = "[_0-9A-Za-z]"; + break; + case 'W': + str_ = "[^_0-9A-Za-z]"; + break; + } + + if (str_) + { + // Some systems have strlen in namespace std. + using namespace std; + + str_len_ = strlen (str_); + } + else + { + str_len_ = 0; + } + + return str_; + } + + template + static const char *unicode_escape (state_type &state_) + { + const char *str_ = 0; + + state_.increment (); + + if (state_.eos ()) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex following \\p in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + if (*state_._curr != '{') + { + std::ostringstream ss_; + + ss_ << "Syntax error following \\p at index " << + state_.index () << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + state_.increment (); + + if (state_.eos ()) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex following \\p{ in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + switch (*state_._curr) + { + case 'C': + state_.increment (); + + if (state_.eos ()) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex following \\p{C " + "in rule id " 
<< state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + switch (*state_._curr) + { + case '}': + str_ = "[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cs}]"; + break; + case 'c': + str_ = other_control (); + state_.increment (); + break; + case 'f': + str_ = other_format (); + state_.increment (); + break; +// case 'n': +// break; + case 'o': + str_ = other_private (); + state_.increment (); + break; + case 's': + str_ = other_surrogate (); + state_.increment (); + break; + default: + { + std::ostringstream ss_; + + ss_ << "Syntax error following \\p{C at index " << + state_.index () << " in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + break; + case 'L': + state_.increment (); + + if (state_.eos ()) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex following \\p{L " + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + switch (*state_._curr) + { + case '}': + str_ = "[\\p{Ll}\\p{Lm}\\p{Lo}\\p{Lt}\\p{Lu}]"; + break; + case 'C': + str_ = "[\\p{Ll}\\p{Lt}\\p{Lu}]"; + state_.increment (); + break; + case 'l': + str_ = letter_lowercase (); + state_.increment (); + break; + case 'm': + str_ = letter_modifier (); + state_.increment (); + break; + case 'o': + str_ = letter_other (); + state_.increment (); + break; + case 't': + str_ = letter_titlecase (); + state_.increment (); + break; + case 'u': + str_ = letter_uppercase (); + state_.increment (); + break; + default: + { + std::ostringstream ss_; + + ss_ << "Syntax error following \\p{L at index " << + state_.index () << " in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + break; + case 'M': + state_.increment (); + + if (state_.eos ()) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex following \\p{M " + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + switch 
(*state_._curr) + { + case '}': + str_ = "[\\p{Mc}\\p{Me}\\p{Mn}]"; + break; + case 'c': + str_ = mark_combining (); + state_.increment (); + break; + case 'e': + str_ = mark_enclosing (); + state_.increment (); + break; + case 'n': + str_ = mark_nonspacing (); + state_.increment (); + break; + default: + { + std::ostringstream ss_; + + ss_ << "Syntax error following \\p{M at index " << + state_.index () << " in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + break; + case 'N': + state_.increment (); + + if (state_.eos ()) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex following \\p{N " + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + switch (*state_._curr) + { + case '}': + str_ = "[\\p{Nd}\\p{Nl}\\p{No}]"; + break; + case 'd': + str_ = number_decimal (); + state_.increment (); + break; + case 'l': + str_ = number_letter (); + state_.increment (); + break; + case 'o': + str_ = number_other (); + state_.increment (); + break; + default: + { + std::ostringstream ss_; + + ss_ << "Syntax error following \\p{N at index " << + state_.index () << " in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + break; + case 'P': + state_.increment (); + + if (state_.eos ()) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex following \\p{P " + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + switch (*state_._curr) + { + case '}': + str_ = "[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}" + "\\p{Ps}]"; + break; + case 'c': + str_ = punctuation_connector (); + state_.increment (); + break; + case 'd': + str_ = punctuation_dash (); + state_.increment (); + break; + case 'e': + str_ = punctuation_close (); + state_.increment (); + break; + case 'f': + str_ = punctuation_final (); + state_.increment (); + break; + case 'i': + 
str_ = punctuation_initial (); + state_.increment (); + break; + case 'o': + str_ = punctuation_other (); + state_.increment (); + break; + case 's': + str_ = punctuation_open (); + state_.increment (); + break; + default: + { + std::ostringstream ss_; + + ss_ << "Syntax error following \\p{P at index " << + state_.index () << " in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + break; + case 'S': + state_.increment (); + + if (state_.eos ()) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex following \\p{S " + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + switch (*state_._curr) + { + case '}': + str_ = "[\\p{Sc}\\p{Sk}\\p{Sm}\\p{So}]"; + break; + case 'c': + str_ = symbol_currency (); + state_.increment (); + break; + case 'k': + str_ = symbol_modifier (); + state_.increment (); + break; + case 'm': + str_ = symbol_math (); + state_.increment (); + break; + case 'o': + str_ = symbol_other (); + state_.increment (); + break; + default: + { + std::ostringstream ss_; + + ss_ << "Syntax error following \\p{S at index " << + state_.index () << " in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + break; + case 'Z': + state_.increment (); + + if (state_.eos ()) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex following \\p{Z " + " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + switch (*state_._curr) + { + case '}': + str_ = "[\\p{Zl}\\p{Zp}\\p{Zs}]"; + break; + case 'l': + str_ = separator_line (); + state_.increment (); + break; + case 'p': + str_ = separator_paragraph (); + state_.increment (); + break; + case 's': + str_ = separator_space (); + state_.increment (); + break; + default: + { + std::ostringstream ss_; + + ss_ << "Syntax error following \\p{Z at index " << + state_.index () << " in rule id " 
<< + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + break; + default: + { + std::ostringstream ss_; + + ss_ << "Syntax error following \\p{ at index " << + state_.index () << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + if (*state_._curr != '}') + { + std::ostringstream ss_; + + ss_ << "Missing } at index " << state_.index () << + " in rule id << " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + return str_; + } + + static const char *letter_uppercase () + { + return "[\\x41-\\x5a\\xc0-\\xd6\\xd8-\\xde\\x100\\x102\\x104\\x106" + "\\x108\\x10a\\x10c\\x10e\\x110\\x112\\x114\\x116\\x118\\x11a" + "\\x11c\\x11e\\x120\\x122\\x124\\x126\\x128\\x12a\\x12c\\x12e" + "\\x130\\x132\\x134\\x136\\x139\\x13b\\x13d\\x13f\\x141\\x143" + "\\x145\\x147\\x14a\\x14c\\x14e\\x150\\x152\\x154\\x156\\x158" + "\\x15a\\x15c\\x15e\\x160\\x162\\x164\\x166\\x168\\x16a\\x16c" + "\\x16e\\x170\\x172\\x174\\x176\\x178\\x179\\x17b\\x17d\\x181" + "\\x182\\x184\\x186\\x187\\x189-\\x18b\\x18e-\\x191\\x193\\x194" + "\\x196-\\x198\\x19c\\x19d\\x19f\\x1a0\\x1a2\\x1a4\\x1a6\\x1a7" + "\\x1a9\\x1ac\\x1ae\\x1af\\x1b1-\\x1b3\\x1b5\\x1b7\\x1b8\\x1bc" + "\\x1c4\\x1c7\\x1ca\\x1cd\\x1cf\\x1d1\\x1d3\\x1d5\\x1d7\\x1d9" + "\\x1db\\x1de\\x1e0\\x1e2\\x1e4\\x1e6\\x1e8\\x1ea\\x1ec\\x1ee" + "\\x1f1\\x1f4\\x1f6-\\x1f8\\x1fa\\x1fc\\x1fe\\x200\\x202\\x204" + "\\x206\\x208\\x20a\\x20c\\x20e\\x210\\x212\\x214\\x216\\x218" + "\\x21a\\x21c\\x21e\\x220\\x222\\x224\\x226\\x228\\x22a\\x22c" + "\\x22e\\x230\\x232\\x23a\\x23b\\x23d\\x23e\\x241\\x243-\\x246" + "\\x248\\x24a\\x24c\\x24e\\x370\\x372\\x376\\x386\\x388-\\x38a" + "\\x38c\\x38e\\x38f\\x391-\\x3a1\\x3a3-\\x3ab\\x3cf\\x3d2-\\x3d4" + "\\x3d8\\x3da\\x3dc\\x3de\\x3e0\\x3e2\\x3e4\\x3e6\\x3e8\\x3ea" + "\\x3ec\\x3ee\\x3f4\\x3f7\\x3f9\\x3fa\\x3fd-\\x42f\\x460\\x462" + "\\x464\\x466\\x468\\x46a\\x46c\\x46e\\x470\\x472\\x474\\x476" + "\\x478\\x47a\\x47c\\x47e\\x480\\x48a\\x48c\\x48e\\x490\\x492" + 
"\\x494\\x496\\x498\\x49a\\x49c\\x49e\\x4a0\\x4a2\\x4a4\\x4a6" + "\\x4a8\\x4aa\\x4ac\\x4ae\\x4b0\\x4b2\\x4b4\\x4b6\\x4b8\\x4ba" + "\\x4bc\\x4be\\x4c0\\x4c1\\x4c3\\x4c5\\x4c7\\x4c9\\x4cb\\x4cd" + "\\x4d0\\x4d2\\x4d4\\x4d6\\x4d8\\x4da\\x4dc\\x4de\\x4e0\\x4e2" + "\\x4e4\\x4e6\\x4e8\\x4ea\\x4ec\\x4ee\\x4f0\\x4f2\\x4f4\\x4f6" + "\\x4f8\\x4fa\\x4fc\\x4fe\\x500\\x502\\x504\\x506\\x508\\x50a" + "\\x50c\\x50e\\x510\\x512\\x514\\x516\\x518\\x51a\\x51c\\x51e" + "\\x520\\x522\\x524\\x526\\x531-\\x556\\x10a0-\\x10c5\\x1e00" + "\\x1e02\\x1e04\\x1e06\\x1e08\\x1e0a\\x1e0c\\x1e0e\\x1e10\\x1e12" + "\\x1e14\\x1e16\\x1e18\\x1e1a\\x1e1c\\x1e1e\\x1e20\\x1e22\\x1e24" + "\\x1e26\\x1e28\\x1e2a\\x1e2c\\x1e2e\\x1e30\\x1e32\\x1e34\\x1e36" + "\\x1e38\\x1e3a\\x1e3c\\x1e3e\\x1e40\\x1e42\\x1e44\\x1e46\\x1e48" + "\\x1e4a\\x1e4c\\x1e4e\\x1e50\\x1e52\\x1e54\\x1e56\\x1e58\\x1e5a" + "\\x1e5c\\x1e5e\\x1e60\\x1e62\\x1e64\\x1e66\\x1e68\\x1e6a\\x1e6c" + "\\x1e6e\\x1e70\\x1e72\\x1e74\\x1e76\\x1e78\\x1e7a\\x1e7c\\x1e7e" + "\\x1e80\\x1e82\\x1e84\\x1e86\\x1e88\\x1e8a\\x1e8c\\x1e8e\\x1e90" + "\\x1e92\\x1e94\\x1e9e\\x1ea0\\x1ea2\\x1ea4\\x1ea6\\x1ea8\\x1eaa" + "\\x1eac\\x1eae\\x1eb0\\x1eb2\\x1eb4\\x1eb6\\x1eb8\\x1eba\\x1ebc" + "\\x1ebe\\x1ec0\\x1ec2\\x1ec4\\x1ec6\\x1ec8\\x1eca\\x1ecc\\x1ece" + "\\x1ed0\\x1ed2\\x1ed4\\x1ed6\\x1ed8\\x1eda\\x1edc\\x1ede\\x1ee0" + "\\x1ee2\\x1ee4\\x1ee6\\x1ee8\\x1eea\\x1eec\\x1eee\\x1ef0\\x1ef2" + "\\x1ef4\\x1ef6\\x1ef8\\x1efa\\x1efc\\x1efe\\x1f08-\\x1f0f" + "\\x1f18-\\x1f1d\\x1f28-\\x1f2f\\x1f38-\\x1f3f\\x1f48-\\x1f4d" + "\\x1f59\\x1f5b\\x1f5d\\x1f5f\\x1f68-\\x1f6f\\x1fb8-\\x1fbb" + "\\x1fc8-\\x1fcb\\x1fd8-\\x1fdb\\x1fe8-\\x1fec\\x1ff8-\\x1ffb" + "\\x2102\\x2107\\x210b-\\x210d\\x2110-\\x2112\\x2115" + "\\x2119-\\x211d\\x2124\\x2126\\x2128\\x212a-\\x212d" + "\\x2130-\\x2133\\x213e\\x213f\\x2145\\x2183\\x2c00-\\x2c2e" + "\\x2c60\\x2c62-\\x2c64\\x2c67\\x2c69\\x2c6b\\x2c6d-\\x2c70" + "\\x2c72\\x2c75\\x2c7e-\\x2c80\\x2c82\\x2c84\\x2c86\\x2c88\\x2c8a" + 
"\\x2c8c\\x2c8e\\x2c90\\x2c92\\x2c94\\x2c96\\x2c98\\x2c9a\\x2c9c" + "\\x2c9e\\x2ca0\\x2ca2\\x2ca4\\x2ca6\\x2ca8\\x2caa\\x2cac\\x2cae" + "\\x2cb0\\x2cb2\\x2cb4\\x2cb6\\x2cb8\\x2cba\\x2cbc\\x2cbe\\x2cc0" + "\\x2cc2\\x2cc4\\x2cc6\\x2cc8\\x2cca\\x2ccc\\x2cce\\x2cd0\\x2cd2" + "\\x2cd4\\x2cd6\\x2cd8\\x2cda\\x2cdc\\x2cde\\x2ce0\\x2ce2\\x2ceb" + "\\x2ced\\xa640\\xa642\\xa644\\xa646\\xa648\\xa64a\\xa64c\\xa64e" + "\\xa650\\xa652\\xa654\\xa656\\xa658\\xa65a\\xa65c\\xa65e\\xa660" + "\\xa662\\xa664\\xa666\\xa668\\xa66a\\xa66c\\xa680\\xa682\\xa684" + "\\xa686\\xa688\\xa68a\\xa68c\\xa68e\\xa690\\xa692\\xa694\\xa696" + "\\xa722\\xa724\\xa726\\xa728\\xa72a\\xa72c\\xa72e\\xa732\\xa734" + "\\xa736\\xa738\\xa73a\\xa73c\\xa73e\\xa740\\xa742\\xa744\\xa746" + "\\xa748\\xa74a\\xa74c\\xa74e\\xa750\\xa752\\xa754\\xa756\\xa758" + "\\xa75a\\xa75c\\xa75e\\xa760\\xa762\\xa764\\xa766\\xa768\\xa76a" + "\\xa76c\\xa76e\\xa779\\xa77b\\xa77d\\xa77e\\xa780\\xa782\\xa784" + "\\xa786\\xa78b\\xa78d\\xa790\\xa7a0\\xa7a2\\xa7a4\\xa7a6\\xa7a8" + "\\xff21-\\xff3a\\x10400-\\x10427\\x1d400-\\x1d419" + "\\x1d434-\\x1d44d\\x1d468-\\x1d481\\x1d49c\\x1d49e\\x1d49f" + "\\x1d4a2\\x1d4a5\\x1d4a6\\x1d4a9-\\x1d4ac\\x1d4ae-\\x1d4b5" + "\\x1d4d0-\\x1d4e9\\x1d504\\x1d505\\x1d507-\\x1d50a" + "\\x1d50d-\\x1d514\\x1d516-\\x1d51c\\x1d538\\x1d539" + "\\x1d53b-\\x1d53e\\x1d540-\\x1d544\\x1d546\\x1d54a-\\x1d550" + "\\x1d56c-\\x1d585\\x1d5a0-\\x1d5b9\\x1d5d4-\\x1d5ed" + "\\x1d608-\\x1d621\\x1d63c-\\x1d655\\x1d670-\\x1d689" + "\\x1d6a8-\\x1d6c0\\x1d6e2-\\x1d6fa\\x1d71c-\\x1d734" + "\\x1d756-\\x1d76e\\x1d790-\\x1d7a8\\x1d7ca]"; + } + + static const char *letter_lowercase () + { + return "[\\x61-\\x7a\\xaa\\xb5\\xba\\xdf-\\xf6\\xf8-\\xff\\x101" + "\\x103\\x105\\x107\\x109\\x10b\\x10d\\x10f\\x111\\x113\\x115" + "\\x117\\x119\\x11b\\x11d\\x11f\\x121\\x123\\x125\\x127\\x129" + "\\x12b\\x12d\\x12f\\x131\\x133\\x135\\x137\\x138\\x13a\\x13c" + "\\x13e\\x140\\x142\\x144\\x146\\x148\\x149\\x14b\\x14d\\x14f" + 
"\\x151\\x153\\x155\\x157\\x159\\x15b\\x15d\\x15f\\x161\\x163" + "\\x165\\x167\\x169\\x16b\\x16d\\x16f\\x171\\x173\\x175\\x177" + "\\x17a\\x17c\\x17e-\\x180\\x183\\x185\\x188\\x18c\\x18d\\x192" + "\\x195\\x199-\\x19b\\x19e\\x1a1\\x1a3\\x1a5\\x1a8\\x1aa\\x1ab" + "\\x1ad\\x1b0\\x1b4\\x1b6\\x1b9\\x1ba\\x1bd-\\x1bf\\x1c6\\x1c9" + "\\x1cc\\x1ce\\x1d0\\x1d2\\x1d4\\x1d6\\x1d8\\x1da\\x1dc\\x1dd" + "\\x1df\\x1e1\\x1e3\\x1e5\\x1e7\\x1e9\\x1eb\\x1ed\\x1ef\\x1f0" + "\\x1f3\\x1f5\\x1f9\\x1fb\\x1fd\\x1ff\\x201\\x203\\x205\\x207" + "\\x209\\x20b\\x20d\\x20f\\x211\\x213\\x215\\x217\\x219\\x21b" + "\\x21d\\x21f\\x221\\x223\\x225\\x227\\x229\\x22b\\x22d\\x22f" + "\\x231\\x233-\\x239\\x23c\\x23f\\x240\\x242\\x247\\x249\\x24b" + "\\x24d\\x24f-\\x293\\x295-\\x2af\\x371\\x373\\x377\\x37b-\\x37d" + "\\x390\\x3ac-\\x3ce\\x3d0\\x3d1\\x3d5-\\x3d7\\x3d9\\x3db\\x3dd" + "\\x3df\\x3e1\\x3e3\\x3e5\\x3e7\\x3e9\\x3eb\\x3ed\\x3ef-\\x3f3" + "\\x3f5\\x3f8\\x3fb\\x3fc\\x430-\\x45f\\x461\\x463\\x465\\x467" + "\\x469\\x46b\\x46d\\x46f\\x471\\x473\\x475\\x477\\x479\\x47b" + "\\x47d\\x47f\\x481\\x48b\\x48d\\x48f\\x491\\x493\\x495\\x497" + "\\x499\\x49b\\x49d\\x49f\\x4a1\\x4a3\\x4a5\\x4a7\\x4a9\\x4ab" + "\\x4ad\\x4af\\x4b1\\x4b3\\x4b5\\x4b7\\x4b9\\x4bb\\x4bd\\x4bf" + "\\x4c2\\x4c4\\x4c6\\x4c8\\x4ca\\x4cc\\x4ce\\x4cf\\x4d1\\x4d3" + "\\x4d5\\x4d7\\x4d9\\x4db\\x4dd\\x4df\\x4e1\\x4e3\\x4e5\\x4e7" + "\\x4e9\\x4eb\\x4ed\\x4ef\\x4f1\\x4f3\\x4f5\\x4f7\\x4f9\\x4fb" + "\\x4fd\\x4ff\\x501\\x503\\x505\\x507\\x509\\x50b\\x50d\\x50f" + "\\x511\\x513\\x515\\x517\\x519\\x51b\\x51d\\x51f\\x521\\x523" + "\\x525\\x527\\x561-\\x587\\x1d00-\\x1d2b\\x1d62-\\x1d77" + "\\x1d79-\\x1d9a\\x1e01\\x1e03\\x1e05\\x1e07\\x1e09\\x1e0b" + "\\x1e0d\\x1e0f\\x1e11\\x1e13\\x1e15\\x1e17\\x1e19\\x1e1b" + "\\x1e1d\\x1e1f\\x1e21\\x1e23\\x1e25\\x1e27\\x1e29\\x1e2b\\x1e2d" + "\\x1e2f\\x1e31\\x1e33\\x1e35\\x1e37\\x1e39\\x1e3b\\x1e3d\\x1e3f" + "\\x1e41\\x1e43\\x1e45\\x1e47\\x1e49\\x1e4b\\x1e4d\\x1e4f\\x1e51" + 
"\\x1e53\\x1e55\\x1e57\\x1e59\\x1e5b\\x1e5d\\x1e5f\\x1e61\\x1e63" + "\\x1e65\\x1e67\\x1e69\\x1e6b\\x1e6d\\x1e6f\\x1e71\\x1e73\\x1e75" + "\\x1e77\\x1e79\\x1e7b\\x1e7d\\x1e7f\\x1e81\\x1e83\\x1e85\\x1e87" + "\\x1e89\\x1e8b\\x1e8d\\x1e8f\\x1e91\\x1e93\\x1e95-\\x1e9d\\x1e9f" + "\\x1ea1\\x1ea3\\x1ea5\\x1ea7\\x1ea9\\x1eab\\x1ead\\x1eaf\\x1eb1" + "\\x1eb3\\x1eb5\\x1eb7\\x1eb9\\x1ebb\\x1ebd\\x1ebf\\x1ec1\\x1ec3" + "\\x1ec5\\x1ec7\\x1ec9\\x1ecb\\x1ecd\\x1ecf\\x1ed1\\x1ed3\\x1ed5" + "\\x1ed7\\x1ed9\\x1edb\\x1edd\\x1edf\\x1ee1\\x1ee3\\x1ee5\\x1ee7" + "\\x1ee9\\x1eeb\\x1eed\\x1eef\\x1ef1\\x1ef3\\x1ef5\\x1ef7\\x1ef9" + "\\x1efb\\x1efd\\x1eff-\\x1f07\\x1f10-\\x1f15\\x1f20-\\x1f27" + "\\x1f30-\\x1f37\\x1f40-\\x1f45\\x1f50-\\x1f57\\x1f60-\\x1f67" + "\\x1f70-\\x1f7d\\x1f80-\\x1f87\\x1f90-\\x1f97\\x1fa0-\\x1fa7" + "\\x1fb0-\\x1fb4\\x1fb6\\x1fb7\\x1fbe\\x1fc2-\\x1fc4\\x1fc6" + "\\x1fc7\\x1fd0-\\x1fd3\\x1fd6\\x1fd7\\x1fe0-\\x1fe7" + "\\x1ff2-\\x1ff4\\x1ff6\\x1ff7\\x210a\\x210e\\x210f\\x2113" + "\\x212f\\x2134\\x2139\\x213c\\x213d\\x2146-\\x2149\\x214e" + "\\x2184\\x2c30-\\x2c5e\\x2c61\\x2c65\\x2c66\\x2c68\\x2c6a" + "\\x2c6c\\x2c71\\x2c73\\x2c74\\x2c76-\\x2c7c\\x2c81\\x2c83" + "\\x2c85\\x2c87\\x2c89\\x2c8b\\x2c8d\\x2c8f\\x2c91\\x2c93\\x2c95" + "\\x2c97\\x2c99\\x2c9b\\x2c9d\\x2c9f\\x2ca1\\x2ca3\\x2ca5\\x2ca7" + "\\x2ca9\\x2cab\\x2cad\\x2caf\\x2cb1\\x2cb3\\x2cb5\\x2cb7\\x2cb9" + "\\x2cbb\\x2cbd\\x2cbf\\x2cc1\\x2cc3\\x2cc5\\x2cc7\\x2cc9\\x2ccb" + "\\x2ccd\\x2ccf\\x2cd1\\x2cd3\\x2cd5\\x2cd7\\x2cd9\\x2cdb\\x2cdd" + "\\x2cdf\\x2ce1\\x2ce3\\x2ce4\\x2cec\\x2cee\\x2d00-\\x2d25\\xa641" + "\\xa643\\xa645\\xa647\\xa649\\xa64b\\xa64d\\xa64f\\xa651\\xa653" + "\\xa655\\xa657\\xa659\\xa65b\\xa65d\\xa65f\\xa661\\xa663\\xa665" + "\\xa667\\xa669\\xa66b\\xa66d\\xa681\\xa683\\xa685\\xa687\\xa689" + "\\xa68b\\xa68d\\xa68f\\xa691\\xa693\\xa695\\xa697\\xa723\\xa725" + "\\xa727\\xa729\\xa72b\\xa72d\\xa72f-\\xa731\\xa733\\xa735\\xa737" + "\\xa739\\xa73b\\xa73d\\xa73f\\xa741\\xa743\\xa745\\xa747\\xa749" + 
"\\xa74b\\xa74d\\xa74f\\xa751\\xa753\\xa755\\xa757\\xa759\\xa75b" + "\\xa75d\\xa75f\\xa761\\xa763\\xa765\\xa767\\xa769\\xa76b\\xa76d" + "\\xa76f\\xa771-\\xa778\\xa77a\\xa77c\\xa77f\\xa781\\xa783" + "\\xa785\\xa787\\xa78c\\xa78e\\xa791\\xa7a1\\xa7a3\\xa7a5\\xa7a7" + "\\xa7a9\\xa7fa\\xfb00-\\xfb06\\xfb13-\\xfb17\\xff41-\\xff5a" + "\\x10428-\\x1044f\\x1d41a-\\x1d433\\x1d44e-\\x1d454" + "\\x1d456-\\x1d467\\x1d482-\\x1d49b\\x1d4b6-\\x1d4b9\\x1d4bb" + "\\x1d4bd-\\x1d4c3\\x1d4c5-\\x1d4cf\\x1d4ea-\\x1d503" + "\\x1d51e-\\x1d537\\x1d552-\\x1d56b\\x1d586-\\x1d59f" + "\\x1d5ba-\\x1d5d3\\x1d5ee-\\x1d607\\x1d622-\\x1d63b" + "\\x1d656-\\x1d66f\\x1d68a-\\x1d6a5\\x1d6c2-\\x1d6da" + "\\x1d6dc-\\x1d6e1\\x1d6fc-\\x1d714\\x1d716-\\x1d71b" + "\\x1d736-\\x1d74e\\x1d750-\\x1d755\\x1d770-\\x1d788" + "\\x1d78a-\\x1d78f\\x1d7aa-\\x1d7c2\\x1d7c4-\\x1d7c9\\x1d7cb]"; + } + + static const char *letter_titlecase () + { + return "[\\x1c5\\x1c8\\x1cb\\x1f2\\x1f88-\\x1f8f\\x1f98-\\x1f9f" + "\\x1fa8-\\x1faf\\x1fbc\\x1fcc\\x1ffc]"; + } + + static const char *letter_modifier () + { + return "[\\x2b0-\\x2c1\\x2c6-\\x2d1\\x2e0-\\x2e4\\x2ec\\x2ee\\x374" + "\\x37a\\x559\\x640\\x6e5\\x6e6\\x7f4\\x7f5\\x7fa\\x81a\\x824" + "\\x828\\x971\\xe46\\xec6\\x10fc\\x17d7\\x1843\\x1aa7" + "\\x1c78-\\x1c7d\\x1d2c-\\x1d61\\x1d78\\x1d9b-\\x1dbf\\x2071" + "\\x207f\\x2090-\\x209c\\x2c7d\\x2d6f\\x2e2f\\x3005" + "\\x3031-\\x3035\\x303b\\x309d\\x309e\\x30fc-\\x30fe\\xa015" + "\\xa4f8-\\xa4fd\\xa60c\\xa67f\\xa717-\\xa71f\\xa770\\xa788" + "\\xa9cf\\xaa70\\xaadd\\xff70\\xff9e\\xff9f]"; + } + + static const char *letter_other () + { + return "[\\x1bb\\x1c0-\\x1c3\\x294\\x5d0-\\x5ea\\x5f0-\\x5f2" + "\\x620-\\x63f\\x641-\\x64a\\x66e\\x66f\\x671-\\x6d3\\x6d5\\x6ee" + "\\x6ef\\x6fa-\\x6fc\\x6ff\\x710\\x712-\\x72f\\x74d-\\x7a5\\x7b1" + "\\x7ca-\\x7ea\\x800-\\x815\\x840-\\x858\\x904-\\x939\\x93d" + "\\x950\\x958-\\x961\\x972-\\x977\\x979-\\x97f\\x985-\\x98c\\x98f" + "\\x990\\x993-\\x9a8\\x9aa-\\x9b0\\x9b2\\x9b6-\\x9b9\\x9bd\\x9ce" + 
"\\x9dc\\x9dd\\x9df-\\x9e1\\x9f0\\x9f1\\xa05-\\xa0a\\xa0f\\xa10" + "\\xa13-\\xa28\\xa2a-\\xa30\\xa32\\xa33\\xa35\\xa36\\xa38\\xa39" + "\\xa59-\\xa5c\\xa5e\\xa72-\\xa74\\xa85-\\xa8d\\xa8f-\\xa91" + "\\xa93-\\xaa8\\xaaa-\\xab0\\xab2\\xab3\\xab5-\\xab9\\xabd\\xad0" + "\\xae0\\xae1\\xb05-\\xb0c\\xb0f\\xb10\\xb13-\\xb28\\xb2a-\\xb30" + "\\xb32\\xb33\\xb35-\\xb39\\xb3d\\xb5c\\xb5d\\xb5f-\\xb61\\xb71" + "\\xb83\\xb85-\\xb8a\\xb8e-\\xb90\\xb92-\\xb95\\xb99\\xb9a\\xb9c" + "\\xb9e\\xb9f\\xba3\\xba4\\xba8-\\xbaa\\xbae-\\xbb9\\xbd0" + "\\xc05-\\xc0c\\xc0e-\\xc10\\xc12-\\xc28\\xc2a-\\xc33" + "\\xc35-\\xc39\\xc3d\\xc58\\xc59\\xc60\\xc61\\xc85-\\xc8c" + "\\xc8e-\\xc90\\xc92-\\xca8\\xcaa-\\xcb3\\xcb5-\\xcb9\\xcbd" + "\\xcde\\xce0\\xce1\\xcf1\\xcf2\\xd05-\\xd0c\\xd0e-\\xd10" + "\\xd12-\\xd3a\\xd3d\\xd4e\\xd60\\xd61\\xd7a-\\xd7f\\xd85-\\xd96" + "\\xd9a-\\xdb1\\xdb3-\\xdbb\\xdbd\\xdc0-\\xdc6\\xe01-\\xe30\\xe32" + "\\xe33\\xe40-\\xe45\\xe81\\xe82\\xe84\\xe87\\xe88\\xe8a\\xe8d" + "\\xe94-\\xe97\\xe99-\\xe9f\\xea1-\\xea3\\xea5\\xea7\\xeaa\\xeab" + "\\xead-\\xeb0\\xeb2\\xeb3\\xebd\\xec0-\\xec4\\xedc\\xedd\\xf00" + "\\xf40-\\xf47\\xf49-\\xf6c\\xf88-\\xf8c\\x1000-\\x102a\\x103f" + "\\x1050-\\x1055\\x105a-\\x105d\\x1061\\x1065\\x1066" + "\\x106e-\\x1070\\x1075-\\x1081\\x108e\\x10d0-\\x10fa" + "\\x1100-\\x1248\\x124a-\\x124d\\x1250-\\x1256\\x1258" + "\\x125a-\\x125d\\x1260-\\x1288\\x128a-\\x128d\\x1290-\\x12b0" + "\\x12b2-\\x12b5\\x12b8-\\x12be\\x12c0\\x12c2-\\x12c5" + "\\x12c8-\\x12d6\\x12d8-\\x1310\\x1312-\\x1315\\x1318-\\x135a" + "\\x1380-\\x138f\\x13a0-\\x13f4\\x1401-\\x166c\\x166f-\\x167f" + "\\x1681-\\x169a\\x16a0-\\x16ea\\x1700-\\x170c\\x170e-\\x1711" + "\\x1720-\\x1731\\x1740-\\x1751\\x1760-\\x176c\\x176e-\\x1770" + "\\x1780-\\x17b3\\x17dc\\x1820-\\x1842\\x1844-\\x1877" + "\\x1880-\\x18a8\\x18aa\\x18b0-\\x18f5\\x1900-\\x191c" + "\\x1950-\\x196d\\x1970-\\x1974\\x1980-\\x19ab\\x19c1-\\x19c7" + "\\x1a00-\\x1a16\\x1a20-\\x1a54\\x1b05-\\x1b33\\x1b45-\\x1b4b" + 
"\\x1b83-\\x1ba0\\x1bae\\x1baf\\x1bc0-\\x1be5\\x1c00-\\x1c23" + "\\x1c4d-\\x1c4f\\x1c5a-\\x1c77\\x1ce9-\\x1cec\\x1cee-\\x1cf1" + "\\x2135-\\x2138\\x2d30-\\x2d65\\x2d80-\\x2d96\\x2da0-\\x2da6" + "\\x2da8-\\x2dae\\x2db0-\\x2db6\\x2db8-\\x2dbe\\x2dc0-\\x2dc6" + "\\x2dc8-\\x2dce\\x2dd0-\\x2dd6\\x2dd8-\\x2dde\\x3006\\x303c" + "\\x3041-\\x3096\\x309f\\x30a1-\\x30fa\\x30ff\\x3105-\\x312d" + "\\x3131-\\x318e\\x31a0-\\x31ba\\x31f0-\\x31ff\\x3400\\x4db5" + "\\x4e00\\x9fcb\\xa000-\\xa014\\xa016-\\xa48c\\xa4d0-\\xa4f7" + "\\xa500-\\xa60b\\xa610-\\xa61f\\xa62a\\xa62b\\xa66e" + "\\xa6a0-\\xa6e5\\xa7fb-\\xa801\\xa803-\\xa805\\xa807-\\xa80a" + "\\xa80c-\\xa822\\xa840-\\xa873\\xa882-\\xa8b3\\xa8f2-\\xa8f7" + "\\xa8fb\\xa90a-\\xa925\\xa930-\\xa946\\xa960-\\xa97c" + "\\xa984-\\xa9b2\\xaa00-\\xaa28\\xaa40-\\xaa42\\xaa44-\\xaa4b" + "\\xaa60-\\xaa6f\\xaa71-\\xaa76\\xaa7a\\xaa80-\\xaaaf\\xaab1" + "\\xaab5\\xaab6\\xaab9-\\xaabd\\xaac0\\xaac2\\xaadb\\xaadc" + "\\xab01-\\xab06\\xab09-\\xab0e\\xab11-\\xab16\\xab20-\\xab26" + "\\xab28-\\xab2e\\xabc0-\\xabe2\\xac00\\xd7a3\\xd7b0-\\xd7c6" + "\\xd7cb-\\xd7fb\\xf900-\\xfa2d\\xfa30-\\xfa6d\\xfa70-\\xfad9" + "\\xfb1d\\xfb1f-\\xfb28\\xfb2a-\\xfb36\\xfb38-\\xfb3c\\xfb3e" + "\\xfb40\\xfb41\\xfb43\\xfb44\\xfb46-\\xfbb1\\xfbd3-\\xfd3d" + "\\xfd50-\\xfd8f\\xfd92-\\xfdc7\\xfdf0-\\xfdfb\\xfe70-\\xfe74" + "\\xfe76-\\xfefc\\xff66-\\xff6f\\xff71-\\xff9d\\xffa0-\\xffbe" + "\\xffc2-\\xffc7\\xffca-\\xffcf\\xffd2-\\xffd7\\xffda-\\xffdc" + "\\x10000-\\x1000b\\x1000d-\\x10026\\x10028-\\x1003a\\x1003c" + "\\x1003d\\x1003f-\\x1004d\\x10050-\\x1005d\\x10080-\\x100fa" + "\\x10280-\\x1029c\\x102a0-\\x102d0\\x10300-\\x1031e" + "\\x10330-\\x10340\\x10342-\\x10349\\x10380-\\x1039d" + "\\x103a0-\\x103c3\\x103c8-\\x103cf\\x10450-\\x1049d" + "\\x10800-\\x10805\\x10808\\x1080a-\\x10835\\x10837\\x10838" + "\\x1083c\\x1083f-\\x10855\\x10900-\\x10915\\x10920-\\x10939" + "\\x10a00\\x10a10-\\x10a13\\x10a15-\\x10a17\\x10a19-\\x10a33" + 
"\\x10a60-\\x10a7c\\x10b00-\\x10b35\\x10b40-\\x10b55" + "\\x10b60-\\x10b72\\x10c00-\\x10c48\\x11003-\\x11037" + "\\x11083-\\x110af\\x12000-\\x1236e\\x13000-\\x1342e" + "\\x16800-\\x16a38\\x1b000\\x1b001\\x20000\\x2a6d6\\x2a700" + "\\x2b734\\x2b740\\x2b81d\\x2f800-\\x2fa1d]"; + } + + static const char *mark_nonspacing () + { + return "[\\x300-\\x36f\\x483-\\x487\\x591-\\x5bd\\x5bf\\x5c1\\x5c2" + "\\x5c4\\x5c5\\x5c7\\x610-\\x61a\\x64b-\\x65f\\x670\\x6d6-\\x6dc" + "\\x6df-\\x6e4\\x6e7\\x6e8\\x6ea-\\x6ed\\x711\\x730-\\x74a" + "\\x7a6-\\x7b0\\x7eb-\\x7f3\\x816-\\x819\\x81b-\\x823" + "\\x825-\\x827\\x829-\\x82d\\x859-\\x85b\\x900-\\x902\\x93a\\x93c" + "\\x941-\\x948\\x94d\\x951-\\x957\\x962\\x963\\x981\\x9bc" + "\\x9c1-\\x9c4\\x9cd\\x9e2\\x9e3\\xa01\\xa02\\xa3c\\xa41\\xa42" + "\\xa47\\xa48\\xa4b-\\xa4d\\xa51\\xa70\\xa71\\xa75\\xa81\\xa82" + "\\xabc\\xac1-\\xac5\\xac7\\xac8\\xacd\\xae2\\xae3\\xb01\\xb3c" + "\\xb3f\\xb41-\\xb44\\xb4d\\xb56\\xb62\\xb63\\xb82\\xbc0\\xbcd" + "\\xc3e-\\xc40\\xc46-\\xc48\\xc4a-\\xc4d\\xc55\\xc56\\xc62\\xc63" + "\\xcbc\\xcbf\\xcc6\\xccc\\xccd\\xce2\\xce3\\xd41-\\xd44\\xd4d" + "\\xd62\\xd63\\xdca\\xdd2-\\xdd4\\xdd6\\xe31\\xe34-\\xe3a" + "\\xe47-\\xe4e\\xeb1\\xeb4-\\xeb9\\xebb\\xebc\\xec8-\\xecd\\xf18" + "\\xf19\\xf35\\xf37\\xf39\\xf71-\\xf7e\\xf80-\\xf84\\xf86\\xf87" + "\\xf8d-\\xf97\\xf99-\\xfbc\\xfc6\\x102d-\\x1030\\x1032-\\x1037" + "\\x1039\\x103a\\x103d\\x103e\\x1058\\x1059\\x105e-\\x1060" + "\\x1071-\\x1074\\x1082\\x1085\\x1086\\x108d\\x109d" + "\\x135d-\\x135f\\x1712-\\x1714\\x1732-\\x1734\\x1752\\x1753" + "\\x1772\\x1773\\x17b7-\\x17bd\\x17c6\\x17c9-\\x17d3\\x17dd" + "\\x180b-\\x180d\\x18a9\\x1920-\\x1922\\x1927\\x1928\\x1932" + "\\x1939-\\x193b\\x1a17\\x1a18\\x1a56\\x1a58-\\x1a5e\\x1a60" + "\\x1a62\\x1a65-\\x1a6c\\x1a73-\\x1a7c\\x1a7f\\x1b00-\\x1b03" + "\\x1b34\\x1b36-\\x1b3a\\x1b3c\\x1b42\\x1b6b-\\x1b73\\x1b80" + "\\x1b81\\x1ba2-\\x1ba5\\x1ba8\\x1ba9\\x1be6\\x1be8\\x1be9\\x1bed" + 
"\\x1bef-\\x1bf1\\x1c2c-\\x1c33\\x1c36\\x1c37\\x1cd0-\\x1cd2" + "\\x1cd4-\\x1ce0\\x1ce2-\\x1ce8\\x1ced\\x1dc0-\\x1de6" + "\\x1dfc-\\x1dff\\x20d0-\\x20dc\\x20e1\\x20e5-\\x20f0" + "\\x2cef-\\x2cf1\\x2d7f\\x2de0-\\x2dff\\x302a-\\x302f\\x3099" + "\\x309a\\xa66f\\xa67c\\xa67d\\xa6f0\\xa6f1\\xa802\\xa806\\xa80b" + "\\xa825\\xa826\\xa8c4\\xa8e0-\\xa8f1\\xa926-\\xa92d" + "\\xa947-\\xa951\\xa980-\\xa982\\xa9b3\\xa9b6-\\xa9b9\\xa9bc" + "\\xaa29-\\xaa2e\\xaa31\\xaa32\\xaa35\\xaa36\\xaa43\\xaa4c\\xaab0" + "\\xaab2-\\xaab4\\xaab7\\xaab8\\xaabe\\xaabf\\xaac1\\xabe5\\xabe8" + "\\xabed\\xfb1e\\xfe00-\\xfe0f\\xfe20-\\xfe26\\x101fd" + "\\x10a01-\\x10a03\\x10a05\\x10a06\\x10a0c-\\x10a0f" + "\\x10a38-\\x10a3a\\x10a3f\\x11001\\x11038-\\x11046\\x11080" + "\\x11081\\x110b3-\\x110b6\\x110b9\\x110ba\\x1d167-\\x1d169" + "\\x1d17b-\\x1d182\\x1d185-\\x1d18b\\x1d1aa-\\x1d1ad" + "\\x1d242-\\x1d244\\xe0100-\\xe01ef]"; + } + + static const char *mark_combining () + { + return "[\\x903\\x93b\\x93e-\\x940\\x949-\\x94c\\x94e\\x94f\\x982" + "\\x983\\x9be-\\x9c0\\x9c7\\x9c8\\x9cb\\x9cc\\x9d7\\xa03" + "\\xa3e-\\xa40\\xa83\\xabe-\\xac0\\xac9\\xacb\\xacc\\xb02\\xb03" + "\\xb3e\\xb40\\xb47\\xb48\\xb4b\\xb4c\\xb57\\xbbe\\xbbf\\xbc1" + "\\xbc2\\xbc6-\\xbc8\\xbca-\\xbcc\\xbd7\\xc01-\\xc03\\xc41-\\xc44" + "\\xc82\\xc83\\xcbe\\xcc0-\\xcc4\\xcc7\\xcc8\\xcca\\xccb\\xcd5" + "\\xcd6\\xd02\\xd03\\xd3e-\\xd40\\xd46-\\xd48\\xd4a-\\xd4c\\xd57" + "\\xd82\\xd83\\xdcf-\\xdd1\\xdd8-\\xddf\\xdf2\\xdf3\\xf3e\\xf3f" + "\\xf7f\\x102b\\x102c\\x1031\\x1038\\x103b\\x103c\\x1056\\x1057" + "\\x1062-\\x1064\\x1067-\\x106d\\x1083\\x1084\\x1087-\\x108c" + "\\x108f\\x109a-\\x109c\\x17b6\\x17be-\\x17c5\\x17c7\\x17c8" + "\\x1923-\\x1926\\x1929-\\x192b\\x1930\\x1931\\x1933-\\x1938" + "\\x19b0-\\x19c0\\x19c8\\x19c9\\x1a19-\\x1a1b\\x1a55\\x1a57" + "\\x1a61\\x1a63\\x1a64\\x1a6d-\\x1a72\\x1b04\\x1b35\\x1b3b" + "\\x1b3d-\\x1b41\\x1b43\\x1b44\\x1b82\\x1ba1\\x1ba6\\x1ba7\\x1baa" + "\\x1be7\\x1bea-\\x1bec\\x1bee\\x1bf2\\x1bf3\\x1c24-\\x1c2b" + 
"\\x1c34\\x1c35\\x1ce1\\x1cf2\\xa823\\xa824\\xa827\\xa880\\xa881" + "\\xa8b4-\\xa8c3\\xa952\\xa953\\xa983\\xa9b4\\xa9b5\\xa9ba" + "\\xa9bb\\xa9bd-\\xa9c0\\xaa2f\\xaa30\\xaa33\\xaa34\\xaa4d\\xaa7b" + "\\xabe3\\xabe4\\xabe6\\xabe7\\xabe9\\xabea\\xabec\\x11000" + "\\x11002\\x11082\\x110b0-\\x110b2\\x110b7\\x110b8\\x1d165" + "\\x1d166\\x1d16d-\\x1d172]"; + } + + static const char *mark_enclosing () + { + return "[\\x488\\x489\\x20dd-\\x20e0\\x20e2-\\x20e4\\xa670-\\xa672]"; + } + + static const char *number_decimal () + { + return "[\\x30-\\x39\\x660-\\x669\\x6f0-\\x6f9\\x7c0-\\x7c9" + "\\x966-\\x96f\\x9e6-\\x9ef\\xa66-\\xa6f\\xae6-\\xaef" + "\\xb66-\\xb6f\\xbe6-\\xbef\\xc66-\\xc6f\\xce6-\\xcef" + "\\xd66-\\xd6f\\xe50-\\xe59\\xed0-\\xed9\\xf20-\\xf29" + "\\x1040-\\x1049\\x1090-\\x1099\\x17e0-\\x17e9\\x1810-\\x1819" + "\\x1946-\\x194f\\x19d0-\\x19d9\\x1a80-\\x1a89\\x1a90-\\x1a99" + "\\x1b50-\\x1b59\\x1bb0-\\x1bb9\\x1c40-\\x1c49\\x1c50-\\x1c59" + "\\xa620-\\xa629\\xa8d0-\\xa8d9\\xa900-\\xa909\\xa9d0-\\xa9d9" + "\\xaa50-\\xaa59\\xabf0-\\xabf9\\xff10-\\xff19\\x104a0-\\x104a9" + "\\x11066-\\x1106f\\x1d7ce-\\x1d7ff]"; + } + + static const char *number_letter () + { + return "[\\x16ee-\\x16f0\\x2160-\\x2182\\x2185-\\x2188\\x3007" + "\\x3021-\\x3029\\x3038-\\x303a\\xa6e6-\\xa6ef\\x10140-\\x10174" + "\\x10341\\x1034a\\x103d1-\\x103d5\\x12400-\\x12462]"; + } + + static const char *number_other () + { + return "[\\xb2\\xb3\\xb9\\xbc-\\xbe\\x9f4-\\x9f9\\xb72-\\xb77" + "\\xbf0-\\xbf2\\xc78-\\xc7e\\xd70-\\xd75\\xf2a-\\xf33" + "\\x1369-\\x137c\\x17f0-\\x17f9\\x19da\\x2070\\x2074-\\x2079" + "\\x2080-\\x2089\\x2150-\\x215f\\x2189\\x2460-\\x249b" + "\\x24ea-\\x24ff\\x2776-\\x2793\\x2cfd\\x3192-\\x3195" + "\\x3220-\\x3229\\x3251-\\x325f\\x3280-\\x3289\\x32b1-\\x32bf" + "\\xa830-\\xa835\\x10107-\\x10133\\x10175-\\x10178\\x1018a" + "\\x10320-\\x10323\\x10858-\\x1085f\\x10916-\\x1091b" + "\\x10a40-\\x10a47\\x10a7d\\x10a7e\\x10b58-\\x10b5f" + 
"\\x10b78-\\x10b7f\\x10e60-\\x10e7e\\x11052-\\x11065" + "\\x1d360-\\x1d371\\x1f100-\\x1f10a]"; + } + + static const char *punctuation_connector () + { + return "[\\x5f\\x203f\\x2040\\x2054\\xfe33\\xfe34\\xfe4d-\\xfe4f" + "\\xff3f]"; + } + + static const char *punctuation_dash () + { + return "[\\x2d\\x58a\\x5be\\x1400\\x1806\\x2010-\\x2015\\x2e17\\x2e1a" + "\\x301c\\x3030\\x30a0\\xfe31\\xfe32\\xfe58\\xfe63\\xff0d]"; + } + + static const char *punctuation_open () + { + return "[\\x28\\x5b\\x7b\\xf3a\\xf3c\\x169b\\x201a\\x201e\\x2045" + "\\x207d\\x208d\\x2329\\x2768\\x276a\\x276c\\x276e\\x2770\\x2772" + "\\x2774\\x27c5\\x27e6\\x27e8\\x27ea\\x27ec\\x27ee\\x2983\\x2985" + "\\x2987\\x2989\\x298b\\x298d\\x298f\\x2991\\x2993\\x2995\\x2997" + "\\x29d8\\x29da\\x29fc\\x2e22\\x2e24\\x2e26\\x2e28\\x3008\\x300a" + "\\x300c\\x300e\\x3010\\x3014\\x3016\\x3018\\x301a\\x301d\\xfd3e" + "\\xfe17\\xfe35\\xfe37\\xfe39\\xfe3b\\xfe3d\\xfe3f\\xfe41\\xfe43" + "\\xfe47\\xfe59\\xfe5b\\xfe5d\\xff08\\xff3b\\xff5b\\xff5f\\xff62]"; + } + + static const char *punctuation_close () + { + return "[\\x29\\x5d\\x7d\\xf3b\\xf3d\\x169c\\x2046\\x207e\\x208e" + "\\x232a\\x2769\\x276b\\x276d\\x276f\\x2771\\x2773\\x2775\\x27c6" + "\\x27e7\\x27e9\\x27eb\\x27ed\\x27ef\\x2984\\x2986\\x2988\\x298a" + "\\x298c\\x298e\\x2990\\x2992\\x2994\\x2996\\x2998\\x29d9\\x29db" + "\\x29fd\\x2e23\\x2e25\\x2e27\\x2e29\\x3009\\x300b\\x300d\\x300f" + "\\x3011\\x3015\\x3017\\x3019\\x301b\\x301e\\x301f\\xfd3f\\xfe18" + "\\xfe36\\xfe38\\xfe3a\\xfe3c\\xfe3e\\xfe40\\xfe42\\xfe44\\xfe48" + "\\xfe5a\\xfe5c\\xfe5e\\xff09\\xff3d\\xff5d\\xff60\\xff63]"; + } + + static const char *punctuation_initial () + { + return "[\\x00AB\\x2018\\x201B\\x201C\\x201F\\x2039\\x2E02\\x2E04" + "\\x2E09\\x2E0C\\x2E1C\\x2E20]"; + } + + static const char *punctuation_final () + { + return "[\\x00BB\\x2019\\x201D\\x203A\\x2E03\\x2E05\\x2E0A\\x2E0D" + "\\x2E1D\\x2E21]"; + } + + static const char *punctuation_other () + { + return 
"[\\x21-\\x23\\x25-\\x27\\x2a\\x2c\\x2e\\x2f\\x3a\\x3b\\x3f" + "\\x40\\x5c\\xa1\\xb7\\xbf\\x37e\\x387\\x55a-\\x55f\\x589\\x5c0" + "\\x5c3\\x5c6\\x5f3\\x5f4\\x609\\x60a\\x60c\\x60d\\x61b\\x61e" + "\\x61f\\x66a-\\x66d\\x6d4\\x700-\\x70d\\x7f7-\\x7f9\\x830-\\x83e" + "\\x85e\\x964\\x965\\x970\\xdf4\\xe4f\\xe5a\\xe5b\\xf04-\\xf12" + "\\xf85\\xfd0-\\xfd4\\xfd9\\xfda\\x104a-\\x104f\\x10fb" + "\\x1361-\\x1368\\x166d\\x166e\\x16eb-\\x16ed\\x1735\\x1736" + "\\x17d4-\\x17d6\\x17d8-\\x17da\\x1800-\\x1805\\x1807-\\x180a" + "\\x1944\\x1945\\x1a1e\\x1a1f\\x1aa0-\\x1aa6\\x1aa8-\\x1aad" + "\\x1b5a-\\x1b60\\x1bfc-\\x1bff\\x1c3b-\\x1c3f\\x1c7e\\x1c7f" + "\\x1cd3\\x2016\\x2017\\x2020-\\x2027\\x2030-\\x2038" + "\\x203b-\\x203e\\x2041-\\x2043\\x2047-\\x2051\\x2053" + "\\x2055-\\x205e\\x2cf9-\\x2cfc\\x2cfe\\x2cff\\x2d70\\x2e00" + "\\x2e01\\x2e06-\\x2e08\\x2e0b\\x2e0e-\\x2e16\\x2e18\\x2e19" + "\\x2e1b\\x2e1e\\x2e1f\\x2e2a-\\x2e2e\\x2e30\\x2e31" + "\\x3001-\\x3003\\x303d\\x30fb\\xa4fe\\xa4ff\\xa60d-\\xa60f" + "\\xa673\\xa67e\\xa6f2-\\xa6f7\\xa874-\\xa877\\xa8ce\\xa8cf" + "\\xa8f8-\\xa8fa\\xa92e\\xa92f\\xa95f\\xa9c1-\\xa9cd\\xa9de" + "\\xa9df\\xaa5c-\\xaa5f\\xaade\\xaadf\\xabeb\\xfe10-\\xfe16" + "\\xfe19\\xfe30\\xfe45\\xfe46\\xfe49-\\xfe4c\\xfe50-\\xfe52" + "\\xfe54-\\xfe57\\xfe5f-\\xfe61\\xfe68\\xfe6a\\xfe6b" + "\\xff01-\\xff03\\xff05-\\xff07\\xff0a\\xff0c\\xff0e\\xff0f" + "\\xff1a\\xff1b\\xff1f\\xff20\\xff3c\\xff61\\xff64\\xff65" + "\\x10100\\x10101\\x1039f\\x103d0\\x10857\\x1091f\\x1093f" + "\\x10a50-\\x10a58\\x10a7f\\x10b39-\\x10b3f\\x11047-\\x1104d" + "\\x110bb\\x110bc\\x110be-\\x110c1\\x12470-\\x12473]"; + } + + static const char *symbol_math () + { + return "[\\x2b\\x3c-\\x3e\\x7c\\x7e\\xac\\xb1\\xd7\\xf7\\x3f6" + "\\x606-\\x608\\x2044\\x2052\\x207a-\\x207c\\x208a-\\x208c" + "\\x2118\\x2140-\\x2144\\x214b\\x2190-\\x2194\\x219a\\x219b" + "\\x21a0\\x21a3\\x21a6\\x21ae\\x21ce\\x21cf\\x21d2\\x21d4" + "\\x21f4-\\x22ff\\x2308-\\x230b\\x2320\\x2321\\x237c" + 
"\\x239b-\\x23b3\\x23dc-\\x23e1\\x25b7\\x25c1\\x25f8-\\x25ff" + "\\x266f\\x27c0-\\x27c4\\x27c7-\\x27ca\\x27cc\\x27ce-\\x27e5" + "\\x27f0-\\x27ff\\x2900-\\x2982\\x2999-\\x29d7\\x29dc-\\x29fb" + "\\x29fe-\\x2aff\\x2b30-\\x2b44\\x2b47-\\x2b4c\\xfb29\\xfe62" + "\\xfe64-\\xfe66\\xff0b\\xff1c-\\xff1e\\xff5c\\xff5e\\xffe2" + "\\xffe9-\\xffec\\x1d6c1\\x1d6db\\x1d6fb\\x1d715\\x1d735\\x1d74f" + "\\x1d76f\\x1d789\\x1d7a9\\x1d7c3]"; + } + + static const char *symbol_currency () + { + return "[\\x24\\xa2-\\xa5\\x60b\\x9f2\\x9f3\\x9fb\\xaf1\\xbf9\\xe3f" + "\\x17db\\x20a0-\\x20b9\\xa838\\xfdfc\\xfe69\\xff04\\xffe0\\xffe1" + "\\xffe5\\xffe6]"; + } + + static const char *symbol_modifier () + { + return "[\\x5e\\x60\\xa8\\xaf\\xb4\\xb8\\x2c2-\\x2c5\\x2d2-\\x2df" + "\\x2e5-\\x2eb\\x2ed\\x2ef-\\x2ff\\x375\\x384\\x385\\x1fbd" + "\\x1fbf-\\x1fc1\\x1fcd-\\x1fcf\\x1fdd-\\x1fdf\\x1fed-\\x1fef" + "\\x1ffd\\x1ffe\\x309b\\x309c\\xa700-\\xa716\\xa720\\xa721" + "\\xa789\\xa78a\\xfbb2-\\xfbc1\\xff3e\\xff40\\xffe3]"; + } + + static const char *symbol_other () + { + return "[\\xa6\\xa7\\xa9\\xae\\xb0\\xb6\\x482\\x60e\\x60f\\x6de" + "\\x6e9\\x6fd\\x6fe\\x7f6\\x9fa\\xb70\\xbf3-\\xbf8\\xbfa\\xc7f" + "\\xd79\\xf01-\\xf03\\xf13-\\xf17\\xf1a-\\xf1f\\xf34\\xf36\\xf38" + "\\xfbe-\\xfc5\\xfc7-\\xfcc\\xfce\\xfcf\\xfd5-\\xfd8" + "\\x109e\\x109f\\x1360\\x1390-\\x1399\\x1940\\x19de-\\x19ff" + "\\x1b61-\\x1b6a\\x1b74-\\x1b7c\\x2100\\x2101\\x2103-\\x2106" + "\\x2108\\x2109\\x2114\\x2116\\x2117\\x211e-\\x2123\\x2125" + "\\x2127\\x2129\\x212e\\x213a\\x213b\\x214a\\x214c\\x214d\\x214f" + "\\x2195-\\x2199\\x219c-\\x219f\\x21a1\\x21a2\\x21a4\\x21a5" + "\\x21a7-\\x21ad\\x21af-\\x21cd\\x21d0\\x21d1\\x21d3" + "\\x21d5-\\x21f3\\x2300-\\x2307\\x230c-\\x231f\\x2322-\\x2328" + "\\x232b-\\x237b\\x237d-\\x239a\\x23b4-\\x23db\\x23e2-\\x23f3" + "\\x2400-\\x2426\\x2440-\\x244a\\x249c-\\x24e9\\x2500-\\x25b6" + "\\x25b8-\\x25c0\\x25c2-\\x25f7\\x2600-\\x266e\\x2670-\\x26ff" + 
"\\x2701-\\x2767\\x2794-\\x27bf\\x2800-\\x28ff\\x2b00-\\x2b2f" + "\\x2b45\\x2b46\\x2b50-\\x2b59\\x2ce5-\\x2cea\\x2e80-\\x2e99" + "\\x2e9b-\\x2ef3\\x2f00-\\x2fd5\\x2ff0-\\x2ffb\\x3004\\x3012" + "\\x3013\\x3020\\x3036\\x3037\\x303e\\x303f\\x3190\\x3191" + "\\x3196-\\x319f\\x31c0-\\x31e3\\x3200-\\x321e\\x322a-\\x3250" + "\\x3260-\\x327f\\x328a-\\x32b0\\x32c0-\\x32fe\\x3300-\\x33ff" + "\\x4dc0-\\x4dff\\xa490-\\xa4c6\\xa828-\\xa82b\\xa836\\xa837" + "\\xa839\\xaa77-\\xaa79\\xfdfd\\xffe4\\xffe8\\xffed\\xffee" + "\\xfffc\\xfffd\\x10102\\x10137-\\x1013f\\x10179-\\x10189" + "\\x10190-\\x1019b\\x101d0-\\x101fc\\x1d000-\\x1d0f5" + "\\x1d100-\\x1d126\\x1d129-\\x1d164\\x1d16a-\\x1d16c\\x1d183" + "\\x1d184\\x1d18c-\\x1d1a9\\x1d1ae-\\x1d1dd\\x1d200-\\x1d241" + "\\x1d245\\x1d300-\\x1d356\\x1f000-\\x1f02b\\x1f030-\\x1f093" + "\\x1f0a0-\\x1f0ae\\x1f0b1-\\x1f0be\\x1f0c1-\\x1f0cf" + "\\x1f0d1-\\x1f0df\\x1f110-\\x1f12e\\x1f130-\\x1f169" + "\\x1f170-\\x1f19a\\x1f1e6-\\x1f202\\x1f210-\\x1f23a" + "\\x1f240-\\x1f248\\x1f250\\x1f251\\x1f300-\\x1f320" + "\\x1f330-\\x1f335\\x1f337-\\x1f37c\\x1f380-\\x1f393" + "\\x1f3a0-\\x1f3c4\\x1f3c6-\\x1f3ca\\x1f3e0-\\x1f3f0" + "\\x1f400-\\x1f43e\\x1f440\\x1f442-\\x1f4f7\\x1f4f9-\\x1f4fc" + "\\x1f500-\\x1f53d\\x1f550-\\x1f567\\x1f5fb-\\x1f5ff" + "\\x1f601-\\x1f610\\x1f612-\\x1f614\\x1f616\\x1f618\\x1f61a" + "\\x1f61c-\\x1f61e\\x1f620-\\x1f625\\x1f628-\\x1f62b\\x1f62d" + "\\x1f630-\\x1f633\\x1f635-\\x1f640\\x1f645-\\x1f64f" + "\\x1f680-\\x1f6c5\\x1f700-\\x1f773]"; + } + + static const char *separator_space () + { + return "[\\x20\\xa0\\x1680\\x180e\\x2000-\\x200a\\x202f\\x205f" + "\\x3000]"; + } + + static const char *separator_line () + { + return "[\\x2028]"; + } + + static const char *separator_paragraph () + { + return "[\\x2029]"; + } + + static const char *other_control () + { + return "[\\x0-\\x1f\\x7f-\\x9f]"; + } + + static const char *other_format () + { + return "[\\xad\\x600-\\x603\\x6dd\\x70f\\x17b4\\x17b5\\x200b-\\x200f" + 
"\\x202a-\\x202e\\x2060-\\x2064\\x206a-\\x206f\\xfeff" + "\\xfff9-\\xfffb\\x110bd\\x1d173-\\x1d17a\\xe0001" + "\\xe0020-\\xe007f]"; + } + + static const char *other_surrogate () + { + return "[\\xD800\\xDB7F\\xDB80\\xDBFF\\xDC00\\xDFFF]"; + } + + static const char *other_private () + { + return "[\\xE000\\xF8FF\\xF0000\\xFFFFD\\x100000\\x10FFFD]"; + } + + static input_char_type fold (const input_char_type char_, + const std::locale &locale_, const one &) + { + const input_char_type upper_ = std::toupper + (char_, locale_); + const input_char_type lower_ = std::tolower + (char_, locale_); + + return upper_ != char_ ? upper_ : lower_; + } + + static input_char_type fold (const input_char_type char_, + const std::locale &, const two &) + { + const fold_pair mapping_[] = + {{{0x0041, 0x005a}, {0x0061, 0x007a}}, + {{0x0061, 0x007a}, {0x0041, 0x005a}}, + {{0x00b5, 0x00b5}, {0x039c, 0x039c}}, + {{0x00c0, 0x00d6}, {0x00e0, 0x00f6}}, + {{0x00d8, 0x00de}, {0x00f8, 0x00fe}}, + {{0x00e0, 0x00f6}, {0x00c0, 0x00d6}}, + {{0x00f8, 0x0137}, {0x00d8, 0x0117}}, + {{0x0139, 0x0148}, {0x013a, 0x0149}}, + {{0x014a, 0x018c}, {0x014b, 0x018d}}, + {{0x018e, 0x019a}, {0x01dd, 0x01e9}}, + {{0x019c, 0x01a9}, {0x026f, 0x027c}}, + {{0x01ac, 0x01b9}, {0x01ad, 0x01ba}}, + {{0x01bc, 0x01bd}, {0x01bd, 0x01be}}, + {{0x01bf, 0x01bf}, {0x01f7, 0x01f7}}, + {{0x01c4, 0x01c4}, {0x01c6, 0x01c6}}, + {{0x01c6, 0x01c7}, {0x01c4, 0x01c5}}, + {{0x01c9, 0x01ca}, {0x01c7, 0x01c8}}, + {{0x01cc, 0x01ef}, {0x01ca, 0x01ed}}, + {{0x01f1, 0x01f1}, {0x01f3, 0x01f3}}, + {{0x01f3, 0x0220}, {0x01f1, 0x021e}}, + {{0x0222, 0x0233}, {0x0223, 0x0234}}, + {{0x023a, 0x0254}, {0x2c65, 0x2c7f}}, + {{0x0256, 0x0257}, {0x0189, 0x018a}}, + {{0x0259, 0x0259}, {0x018f, 0x018f}}, + {{0x025b, 0x025b}, {0x0190, 0x0190}}, + {{0x0260, 0x0260}, {0x0193, 0x0193}}, + {{0x0263, 0x0263}, {0x0194, 0x0194}}, + {{0x0265, 0x0265}, {0xa78d, 0xa78d}}, + {{0x0268, 0x0269}, {0x0197, 0x0198}}, + {{0x026b, 0x026b}, {0x2c62, 0x2c62}}, + {{0x026f, 0x026f}, 
{0x019c, 0x019c}}, + {{0x0271, 0x0272}, {0x2c6e, 0x2c6f}}, + {{0x0275, 0x0275}, {0x019f, 0x019f}}, + {{0x027d, 0x027d}, {0x2c64, 0x2c64}}, + {{0x0280, 0x0280}, {0x01a6, 0x01a6}}, + {{0x0283, 0x0283}, {0x01a9, 0x01a9}}, + {{0x0288, 0x028c}, {0x01ae, 0x01b2}}, + {{0x0292, 0x0292}, {0x01b7, 0x01b7}}, + {{0x0370, 0x0373}, {0x0371, 0x0374}}, + {{0x0376, 0x0377}, {0x0377, 0x0378}}, + {{0x037b, 0x037d}, {0x03fd, 0x03ff}}, + {{0x0386, 0x0386}, {0x03ac, 0x03ac}}, + {{0x0388, 0x038a}, {0x03ad, 0x03af}}, + {{0x038c, 0x038c}, {0x03cc, 0x03cc}}, + {{0x038e, 0x038f}, {0x03cd, 0x03ce}}, + {{0x0391, 0x03a1}, {0x03b1, 0x03c1}}, + {{0x03a3, 0x03af}, {0x03c3, 0x03cf}}, + {{0x03b1, 0x03d1}, {0x0391, 0x03b1}}, + {{0x03d5, 0x03f2}, {0x03a6, 0x03c3}}, + {{0x03f4, 0x03f5}, {0x03b8, 0x03b9}}, + {{0x03f7, 0x03fb}, {0x03f8, 0x03fc}}, + {{0x03fd, 0x0481}, {0x037b, 0x03ff}}, + {{0x048a, 0x0527}, {0x048b, 0x0528}}, + {{0x0531, 0x0556}, {0x0561, 0x0586}}, + {{0x0561, 0x0586}, {0x0531, 0x0556}}, + {{0x10a0, 0x10c5}, {0x2d00, 0x2d25}}, + {{0x1d79, 0x1d79}, {0xa77d, 0xa77d}}, + {{0x1d7d, 0x1d7d}, {0x2c63, 0x2c63}}, + {{0x1e00, 0x1e95}, {0x1e01, 0x1e96}}, + {{0x1e9b, 0x1e9b}, {0x1e60, 0x1e60}}, + {{0x1e9e, 0x1e9e}, {0x00df, 0x00df}}, + {{0x1ea0, 0x1f15}, {0x1ea1, 0x1f16}}, + {{0x1f18, 0x1f1d}, {0x1f10, 0x1f15}}, + {{0x1f20, 0x1f45}, {0x1f28, 0x1f4d}}, + {{0x1f48, 0x1f4d}, {0x1f40, 0x1f45}}, + {{0x1f51, 0x1f51}, {0x1f59, 0x1f59}}, + {{0x1f53, 0x1f53}, {0x1f5b, 0x1f5b}}, + {{0x1f55, 0x1f55}, {0x1f5d, 0x1f5d}}, + {{0x1f57, 0x1f57}, {0x1f5f, 0x1f5f}}, + {{0x1f59, 0x1f59}, {0x1f51, 0x1f51}}, + {{0x1f5b, 0x1f5b}, {0x1f53, 0x1f53}}, + {{0x1f5d, 0x1f5d}, {0x1f55, 0x1f55}}, + {{0x1f5f, 0x1f7d}, {0x1f57, 0x1f75}}, + {{0x1f80, 0x1f87}, {0x1f88, 0x1f8f}}, + {{0x1f90, 0x1f97}, {0x1f98, 0x1f9f}}, + {{0x1fa0, 0x1fa7}, {0x1fa8, 0x1faf}}, + {{0x1fb0, 0x1fb1}, {0x1fb8, 0x1fb9}}, + {{0x1fb3, 0x1fb3}, {0x1fbc, 0x1fbc}}, + {{0x1fb8, 0x1fbb}, {0x1fb0, 0x1fb3}}, + {{0x1fbe, 0x1fbe}, {0x0399, 0x0399}}, + {{0x1fc3, 0x1fc3}, 
{0x1fcc, 0x1fcc}}, + {{0x1fc8, 0x1fcb}, {0x1f72, 0x1f75}}, + {{0x1fd0, 0x1fd1}, {0x1fd8, 0x1fd9}}, + {{0x1fd8, 0x1fdb}, {0x1fd0, 0x1fd3}}, + {{0x1fe0, 0x1fe1}, {0x1fe8, 0x1fe9}}, + {{0x1fe5, 0x1fe5}, {0x1fec, 0x1fec}}, + {{0x1fe8, 0x1fec}, {0x1fe0, 0x1fe4}}, + {{0x1ff3, 0x1ff3}, {0x1ffc, 0x1ffc}}, + {{0x1ff8, 0x1ffb}, {0x1f78, 0x1f7b}}, + {{0x2126, 0x2126}, {0x03c9, 0x03c9}}, + {{0x212a, 0x212b}, {0x006b, 0x006c}}, + {{0x2132, 0x2132}, {0x214e, 0x214e}}, + {{0x214e, 0x214e}, {0x2132, 0x2132}}, + {{0x2183, 0x2184}, {0x2184, 0x2185}}, + {{0x2c00, 0x2c2e}, {0x2c30, 0x2c5e}}, + {{0x2c30, 0x2c5e}, {0x2c00, 0x2c2e}}, + {{0x2c60, 0x2c70}, {0x2c61, 0x2c71}}, + {{0x2c72, 0x2c73}, {0x2c73, 0x2c74}}, + {{0x2c75, 0x2c76}, {0x2c76, 0x2c77}}, + {{0x2c7e, 0x2ce3}, {0x023f, 0x02a4}}, + {{0x2ceb, 0x2cee}, {0x2cec, 0x2cef}}, + {{0x2d00, 0x2d25}, {0x10a0, 0x10c5}}, + {{0xa640, 0xa66d}, {0xa641, 0xa66e}}, + {{0xa680, 0xa697}, {0xa681, 0xa698}}, + {{0xa722, 0xa72f}, {0xa723, 0xa730}}, + {{0xa732, 0xa76f}, {0xa733, 0xa770}}, + {{0xa779, 0xa787}, {0xa77a, 0xa788}}, + {{0xa78b, 0xa78d}, {0xa78c, 0xa78e}}, + {{0xa790, 0xa791}, {0xa791, 0xa792}}, + {{0xa7a0, 0xa7a9}, {0xa7a1, 0xa7aa}}, + {{0xff21, 0xff3a}, {0xff41, 0xff5a}}, + {{0xff41, 0xff5a}, {0xff21, 0xff3a}}, + {{0, 0}, {0, 0}}}; + input_char_type ret_ = char_; + const fold_pair *ptr_ = mapping_; + + for (; ptr_->from.first != 0 && (char_ < ptr_->from.first || + char_ > ptr_->from.second); ++ptr_); + + if (ptr_->to.first != 0) + { + ret_ = ptr_->to.first + (char_ - ptr_->from.first); + } + + return ret_; + } + + static input_char_type fold (const input_char_type char_, + const std::locale &locale_, const four &) + { + if (char_ < 0x10000) + { + return fold (char_, locale_, two ()); + } + else + { + const fold_pair mapping_[] = + {{{0x10400, 0x1044f}, {0x10428, 0x10477}}, + {{0, 0}, {0, 0}}}; + input_char_type ret_ = char_; + const fold_pair *ptr_ = mapping_; + + for (; ptr_->from.first != 0 && (char_ < ptr_->from.first || + char_ > 
ptr_->from.second); ++ptr_); + + if (ptr_->to.first != 0) + { + ret_ = ptr_->to.first + (char_ - ptr_->from.first); + } + + return ret_; + } + } + + template + static input_char_type chr (state_type &state_) + { + input_char_type ch_ = 0; + + // eos_ has already been checked for. + switch (*state_._curr) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + ch_ = decode_octal (state_); + break; + case 'a': + ch_ = '\a'; + state_.increment (); + break; + case 'b': + ch_ = '\b'; + state_.increment (); + break; + case 'c': + ch_ = decode_control_char (state_); + break; + case 'e': + ch_ = 27; // '\e' not recognised by compiler + state_.increment (); + break; + case 'f': + ch_ = '\f'; + state_.increment (); + break; + case 'n': + ch_ = '\n'; + state_.increment (); + break; + case 'r': + ch_ = '\r'; + state_.increment (); + break; + case 't': + ch_ = '\t'; + state_.increment (); + break; + case 'v': + ch_ = '\v'; + state_.increment (); + break; + case 'x': + ch_ = decode_hex (state_); + break; + default: + ch_ = *state_._curr; + state_.increment (); + break; + } + + return ch_; + } + + template + static input_char_type decode_octal (state_type &state_) + { + std::size_t oct_ = 0; + typename state_type::char_type ch_ = *state_._curr; + unsigned short count_ = 3; + bool eos_ = false; + + for (;;) + { + oct_ *= 8; + oct_ += ch_ - '0'; + --count_; + state_.increment (); + eos_ = state_.eos (); + + if (!count_ || eos_) break; + + ch_ = *state_._curr; + + // Don't consume invalid chars! 
+ if (ch_ < '0' || ch_ > '7') + { + break; + } + } + + if (oct_ > static_cast(char_traits::max_val ())) + { + std::ostringstream ss_; + + ss_ << "Escape \\" << std::oct << oct_ << + " is too big for the state machine char type " + "preceding index " << state_.index () << " in rule " << + std::dec << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + return static_cast (oct_); + } + + template + static input_char_type decode_control_char (state_type &state_) + { + // Skip over 'c' + state_.increment (); + + typename state_type::char_type ch_ = 0; + bool eos_ = state_.next (ch_); + + if (eos_) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex following \\c in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + else + { + if (ch_ >= 'a' && ch_ <= 'z') + { + ch_ -= 'a' - 1; + } + else if (ch_ >= 'A' && ch_ <= 'Z') + { + ch_ -= 'A' - 1; + } + else if (ch_ == '@') + { + // Apparently... + ch_ = 0; + } + else + { + std::ostringstream ss_; + + ss_ << "Invalid control char at index " << + state_.index () - 1 << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + + return ch_; + } + + template + static input_char_type decode_hex (state_type &state_) + { + // Skip over 'x' + state_.increment (); + + typename state_type::char_type ch_ = 0; + bool eos_ = state_.next (ch_); + + if (eos_) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex following \\x in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + if (!((ch_ >= '0' && ch_ <= '9') || (ch_ >= 'a' && ch_ <= 'f') || + (ch_ >= 'A' && ch_ <= 'F'))) + { + std::ostringstream ss_; + + ss_ << "Illegal char following \\x at index " << + state_.index () - 1 << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + std::size_t hex_ = 0; + + do + { + hex_ *= 16; + + if (ch_ >= '0' && ch_ 
<= '9') + { + hex_ += ch_ - '0'; + } + else if (ch_ >= 'a' && ch_ <= 'f') + { + hex_ += 10 + (ch_ - 'a'); + } + else + { + hex_ += 10 + (ch_ - 'A'); + } + + eos_ = state_.eos (); + + if (!eos_) + { + ch_ = *state_._curr; + + // Don't consume invalid chars! + if (((ch_ >= '0' && ch_ <= '9') || + (ch_ >= 'a' && ch_ <= 'f') || (ch_ >= 'A' && ch_ <= 'F'))) + { + state_.increment (); + } + else + { + eos_ = true; + } + } + } while (!eos_); + + if (hex_ > static_cast(char_traits::max_val ())) + { + std::ostringstream ss_; + + ss_ << "Escape \\x" << std::hex << hex_ << + " is too big for the state machine char type at index " << + state_.index () << " in rule id " << std::dec << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + return static_cast (hex_); + } + + template + static void charset_range (const bool chset_, state_type &state_, + bool &eos_, typename state_type::char_type &ch_, + const input_char_type prev_, string_token &chars_) + { + if (chset_) + { + std::ostringstream ss_; + + ss_ << "Charset cannot form start of range preceding " + "index " << state_.index () - 1 << " in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + eos_ = state_.next (ch_); + + if (eos_) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex following '-' in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + input_char_type curr_ = 0; + + if (ch_ == '\\') + { + std::size_t str_len_ = 0; + + if (escape_sequence (state_, curr_, str_len_)) + { + std::ostringstream ss_; + + ss_ << "Charset cannot form end of range preceding index " + << state_.index () << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + } + else if (ch_ == '[' && !state_.eos () && *state_._curr == ':') + { + std::ostringstream ss_; + + ss_ << "POSIX char class cannot form end of range at " + "index " << state_.index () - 1 << " in rule id " << + state_._id << '.'; 
+ throw runtime_error (ss_.str ()); + } + else + { + curr_ = ch_; + } + + eos_ = state_.next (ch_); + + // Covers preceding if and else + if (eos_) + { + std::ostringstream ss_; + + // Pointless returning index if at end of string + ss_ << "Unexpected end of regex (missing ']') in rule id " << + state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + // Use size_t because we need to go past one past the maximum value. + // if we use index_type, we will wrap around to 0 at max + 1. + std::size_t start_ = static_cast + (prev_); + std::size_t end_ = static_cast + (curr_); + + // Semanic check + if (end_ < start_) + { + std::ostringstream ss_; + + ss_ << "Invalid range in charset preceding index " << + state_.index () - 1 << " in rule id " << state_._id << '.'; + throw runtime_error (ss_.str ()); + } + + // Even though ranges are used now, we still need to consider + // each character if icase is set. + if (state_._flags & icase) + { + for (; start_ <= end_; ++start_) + { + const input_char_type ch_ = static_cast + (start_); + const input_char_type folded_ = fold (ch_, state_._locale, + size ()); + + chars_.insert (typename string_token::range (ch_, ch_)); + + if (ch_ != folded_) + { + chars_.insert (typename string_token::range + (folded_, folded_)); + } + } + } + else + { + chars_.insert (typename string_token::range (prev_, curr_)); + } + } +}; +} +} + +#endif diff --git a/inc/lexertl/parser/tokeniser/re_tokeniser_state.hpp b/inc/lexertl/parser/tokeniser/re_tokeniser_state.hpp new file mode 100644 index 0000000..a3548d6 --- /dev/null +++ b/inc/lexertl/parser/tokeniser/re_tokeniser_state.hpp @@ -0,0 +1,115 @@ +// tokeniser_state.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_RE_TOKENISER_STATE_HPP +#define LEXERTL_RE_TOKENISER_STATE_HPP + +#include "../../char_traits.hpp" +#include "../../enums.hpp" +#include +#include "../../size_t.hpp" +#include + +namespace lexertl +{ +namespace detail +{ +template +struct basic_re_tokeniser_state +{ + typedef ch_type char_type; + typedef typename basic_char_traits::index_type index_type; + + const char_type * const _start; + const char_type * const _end; + const char_type *_curr; + id_type _id; + std::size_t _flags; + std::stack _flags_stack; + std::locale _locale; + bool _macro; + long _paren_count; + bool _in_string; + id_type _nl_id; + + basic_re_tokeniser_state (const char_type *start_, + const char_type * const end_, id_type id_, const std::size_t flags_, + const std::locale locale_, const bool macro_) : + _start (start_), + _end (end_), + _curr (start_), + _id (id_), + _flags (flags_), + _flags_stack (), + _locale (locale_), + _macro (macro_), + _paren_count (0), + _in_string (false), + _nl_id (static_cast(~0)) + { + } + + basic_re_tokeniser_state (const basic_re_tokeniser_state &rhs_) + { + assign (rhs_); + } + + // prevent VC++ 7.1 warning: + const basic_re_tokeniser_state &operator = + (const basic_re_tokeniser_state &rhs_) + { + assign (rhs_); + } + + void assign (const basic_re_tokeniser_state &rhs_) + { + _start = rhs_._start; + _end = rhs_._end; + _curr = rhs_._curr; + _id = rhs_._id; + _flags = rhs_._flags; + _flags_stack = rhs_._flags_stack; + _locale = rhs_._locale; + _macro = rhs_._macro; + _paren_count = rhs_._paren_count; + _in_string = rhs_._in_string; + _nl_id = rhs_._nl_id; + return this; + } + + inline bool next (char_type &ch_) + { + if (_curr >= _end) + { + ch_ = 0; + return true; + } + else + { + ch_ = *_curr; + increment (); + return false; + } + } + + inline void increment () + { + ++_curr; + } + + inline std::size_t index () + { + return _curr - _start; + } + + 
inline bool eos () + { + return _curr >= _end; + } +}; +} +} + +#endif diff --git a/inc/lexertl/parser/tree/end_node.hpp b/inc/lexertl/parser/tree/end_node.hpp new file mode 100644 index 0000000..74cb512 --- /dev/null +++ b/inc/lexertl/parser/tree/end_node.hpp @@ -0,0 +1,112 @@ +// end_node.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_END_NODE_HPP +#define LEXERTL_END_NODE_HPP + +#include "node.hpp" +#include "../../size_t.hpp" + +namespace lexertl +{ +namespace detail +{ +template +class basic_end_node : public basic_node +{ +public: + typedef basic_node node; + typedef typename node::bool_stack bool_stack; + typedef typename node::const_node_stack const_node_stack; + typedef typename node::node_ptr_vector node_ptr_vector; + typedef typename node::node_stack node_stack; + typedef typename node::node_type node_type; + typedef typename node::node_vector node_vector; + + basic_end_node (const id_type id_, const id_type user_id_, + const id_type next_dfa_, const id_type push_dfa_, + const bool pop_dfa_) : + basic_node (false), + _id (id_), + _user_id (user_id_), + _next_dfa (next_dfa_), + _push_dfa (push_dfa_), + _pop_dfa (pop_dfa_), + _followpos () + { + basic_node::_firstpos.push_back (this); + basic_node::_lastpos.push_back (this); + } + + virtual ~basic_end_node () + { + } + + virtual node_type what_type () const + { + return node::END; + } + + virtual bool traverse (const_node_stack &/*node_stack_*/, + bool_stack &/*perform_op_stack_*/) const + { + return false; + } + + virtual const node_vector &followpos () const + { + // _followpos is always empty..! 
+ return _followpos; + } + + virtual bool end_state () const + { + return true; + } + + virtual id_type id () const + { + return _id; + } + + virtual id_type user_id () const + { + return _user_id; + } + + virtual id_type next_dfa () const + { + return _next_dfa; + } + + virtual id_type push_dfa () const + { + return _push_dfa; + } + + virtual bool pop_dfa () const + { + return _pop_dfa; + } + +private: + id_type _id; + id_type _user_id; + id_type _next_dfa; + id_type _push_dfa; + bool _pop_dfa; + node_vector _followpos; + + virtual void copy_node (node_ptr_vector &/*node_ptr_vector_*/, + node_stack &/*new_node_stack_*/, bool_stack &/*perform_op_stack_*/, + bool &/*down_*/) const + { + // Nothing to do, as end_nodes are not copied. + } +}; +} +} + +#endif diff --git a/inc/lexertl/parser/tree/iteration_node.hpp b/inc/lexertl/parser/tree/iteration_node.hpp new file mode 100644 index 0000000..51b852e --- /dev/null +++ b/inc/lexertl/parser/tree/iteration_node.hpp @@ -0,0 +1,103 @@ +// iteration_node.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_ITERATION_NODE_HPP +#define LEXERTL_ITERATION_NODE_HPP + +#include "node.hpp" + +namespace lexertl +{ +namespace detail +{ +template +class basic_iteration_node : public basic_node +{ +public: + typedef basic_node node; + typedef typename node::bool_stack bool_stack; + typedef typename node::const_node_stack const_node_stack; + typedef typename node::node_ptr_vector node_ptr_vector; + typedef typename node::node_stack node_stack; + typedef typename node::node_type node_type; + typedef typename node::node_vector node_vector; + + basic_iteration_node (basic_node *next_, const bool greedy_) : + basic_node (true), + _next (next_), + _greedy (greedy_) + { + typename node_vector::iterator iter_; + typename node_vector::iterator end_; + + _next->append_firstpos (node::_firstpos); + _next->append_lastpos (node::_lastpos); + + for (iter_ = node::_lastpos.begin (), end_ = node::_lastpos.end (); + iter_ != end_; ++iter_) + { + (*iter_)->append_followpos (node::_firstpos); + } + + for (iter_ = node::_firstpos.begin (), end_ = node::_firstpos.end (); + iter_ != end_; ++iter_) + { + (*iter_)->greedy (greedy_); + } + } + + virtual ~basic_iteration_node () + { + } + + virtual node_type what_type () const + { + return node::ITERATION; + } + + virtual bool traverse (const_node_stack &node_stack_, + bool_stack &perform_op_stack_) const + { + perform_op_stack_.push (true); + node_stack_.push (_next); + return true; + } + +private: + // Not owner of this pointer... 
+ basic_node *_next; + bool _greedy; + + virtual void copy_node (node_ptr_vector &node_ptr_vector_, + node_stack &new_node_stack_, bool_stack &perform_op_stack_, + bool &down_) const + { + if (perform_op_stack_.top ()) + { + basic_node *ptr_ = new_node_stack_.top (); + + node_ptr_vector_->push_back + (static_cast *>(0)); + node_ptr_vector_->back () = new basic_iteration_node + (ptr_, _greedy); + new_node_stack_.top () = node_ptr_vector_->back (); + } + else + { + down_ = true; + } + + perform_op_stack_.pop (); + } + + // No copy construction. + basic_iteration_node (const basic_iteration_node &); + // No assignment. + const basic_iteration_node &operator = (const basic_iteration_node &); +}; +} +} + +#endif diff --git a/inc/lexertl/parser/tree/leaf_node.hpp b/inc/lexertl/parser/tree/leaf_node.hpp new file mode 100644 index 0000000..010cbd4 --- /dev/null +++ b/inc/lexertl/parser/tree/leaf_node.hpp @@ -0,0 +1,114 @@ +// leaf_node.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_LEAF_NODE_HPP +#define LEXERTL_LEAF_NODE_HPP + +#include "../../enums.hpp" // null_token +#include "node.hpp" +#include "../../size_t.hpp" + +namespace lexertl +{ +namespace detail +{ +template +class basic_leaf_node : public basic_node +{ +public: + typedef basic_node node; + typedef typename node::bool_stack bool_stack; + typedef typename node::const_node_stack const_node_stack; + typedef typename node::node_ptr_vector node_ptr_vector; + typedef typename node::node_stack node_stack; + typedef typename node::node_type node_type; + typedef typename node::node_vector node_vector; + + basic_leaf_node (const id_type token_, const bool greedy_) : + basic_node (token_ == node::null_token ()), + _token (token_), + _set_greedy (!greedy_), + _greedy (greedy_), + _followpos () + { + if (!node::_nullable) + { + node::_firstpos.push_back (this); + node::_lastpos.push_back (this); + } + } + + virtual ~basic_leaf_node () + { + } + + virtual void append_followpos (const node_vector &followpos_) + { + for (typename node_vector::const_iterator iter_ = followpos_.begin (), + end_ = followpos_.end (); iter_ != end_; ++iter_) + { + _followpos.push_back (*iter_); + } + } + + virtual node_type what_type () const + { + return node::LEAF; + } + + virtual bool traverse (const_node_stack &/*node_stack_*/, + bool_stack &/*perform_op_stack_*/) const + { + return false; + } + + virtual id_type token () const + { + return _token; + } + + virtual void greedy (const bool greedy_) + { + if (!_set_greedy) + { + _greedy = greedy_; + _set_greedy = true; + } + } + + virtual bool greedy () const + { + return _greedy; + } + + virtual const node_vector &followpos () const + { + return _followpos; + } + + virtual node_vector &followpos () + { + return _followpos; + } + +private: + id_type _token; + bool _set_greedy; + bool _greedy; + node_vector _followpos; + + virtual void copy_node 
(node_ptr_vector &node_ptr_vector_, + node_stack &new_node_stack_, bool_stack &/*perform_op_stack_*/, + bool &/*down_*/) const + { + node_ptr_vector_->push_back (static_cast(0)); + node_ptr_vector_->back () = new basic_leaf_node (_token, _greedy); + new_node_stack_.push (node_ptr_vector_->back ()); + } +}; +} +} + +#endif diff --git a/inc/lexertl/parser/tree/node.hpp b/inc/lexertl/parser/tree/node.hpp new file mode 100644 index 0000000..cb54cd2 --- /dev/null +++ b/inc/lexertl/parser/tree/node.hpp @@ -0,0 +1,241 @@ +// node.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_NODE_HPP +#define LEXERTL_NODE_HPP + +#include +#include "../../containers/ptr_vector.hpp" +#include "../../runtime_error.hpp" +#include "../../size_t.hpp" +#include +#include + +namespace lexertl +{ +namespace detail +{ +template +class basic_node +{ +public: + enum node_type {LEAF, SEQUENCE, SELECTION, ITERATION, END}; + + typedef std::stack bool_stack; + typedef std::stack *> node_stack; + // stack and vector not owner of node pointers + typedef std::stack *> const_node_stack; + typedef std::vector *> node_vector; + typedef ptr_vector > node_ptr_vector; + + basic_node () : + _nullable (false), + _firstpos (), + _lastpos () + { + } + + basic_node (const bool nullable_) : + _nullable (nullable_), + _firstpos (), + _lastpos () + { + } + + virtual ~basic_node () + { + } + + static id_type null_token () + { + return static_cast(~0); + } + + bool nullable () const + { + return _nullable; + } + + void append_firstpos (node_vector &firstpos_) const + { + firstpos_.insert (firstpos_.end (), + _firstpos.begin (), _firstpos.end ()); + } + + void append_lastpos (node_vector &lastpos_) const + { + lastpos_.insert (lastpos_.end (), + _lastpos.begin (), _lastpos.end ()); + } + + virtual void append_followpos (const 
node_vector &/*followpos_*/) + { + throw runtime_error ("Internal error node::append_followpos()."); + } + + basic_node *copy (node_ptr_vector &node_ptr_vector_) const + { + basic_node *new_root_ = 0; + const_node_stack node_stack_; + bool_stack perform_op_stack_; + bool down_ = true; + node_stack new_node_stack_; + + node_stack_.push (this); + + while (!node_stack_.empty ()) + { + while (down_) + { + down_ = node_stack_.top ()->traverse (node_stack_, + perform_op_stack_); + } + + while (!down_ && !node_stack_.empty ()) + { + const basic_node *top_ = node_stack_.top (); + + top_->copy_node (node_ptr_vector_, new_node_stack_, + perform_op_stack_, down_); + + if (!down_) node_stack_.pop (); + } + } + + assert (new_node_stack_.size () == 1); + new_root_ = new_node_stack_.top (); + new_node_stack_.pop (); + return new_root_; + } + + virtual node_type what_type () const = 0; + + virtual bool traverse (const_node_stack &node_stack_, + bool_stack &perform_op_stack_) const = 0; + + node_vector &firstpos () + { + return _firstpos; + } + + const node_vector &firstpos () const + { + return _firstpos; + } + + // _lastpos modified externally, so not const & + node_vector &lastpos () + { + return _lastpos; + } + + virtual bool end_state () const + { + return false; + } + + virtual id_type id () const + { + throw runtime_error ("Internal error node::id()."); +#ifdef __SUNPRO_CC + // Stop bogus Solaris compiler warning + return id_type (); +#endif + } + + virtual id_type user_id () const + { + throw runtime_error ("Internal error node::user_id()."); +#ifdef __SUNPRO_CC + // Stop bogus Solaris compiler warning + return id_type (); +#endif + } + + virtual id_type next_dfa () const + { + throw runtime_error ("Internal error node::next_dfa()."); +#ifdef __SUNPRO_CC + // Stop bogus Solaris compiler warning + return id_type (); +#endif + } + + virtual id_type push_dfa () const + { + throw runtime_error ("Internal error node::push_dfa()."); +#ifdef __SUNPRO_CC + // Stop bogus Solaris 
compiler warning + return id_type (); +#endif + } + + virtual bool pop_dfa () const + { + throw runtime_error ("Internal error node::pop_dfa()."); +#ifdef __SUNPRO_CC + // Stop bogus Solaris compiler warning + return false; +#endif + } + + virtual id_type token () const + { + throw runtime_error ("Internal error node::token()."); +#ifdef __SUNPRO_CC + // Stop bogus Solaris compiler warning + return id_type (); +#endif + } + + virtual void greedy (const bool /*greedy_*/) + { + throw runtime_error ("Internal error node::greedy(bool)."); + } + + virtual bool greedy () const + { + throw runtime_error ("Internal error node::greedy()."); +#ifdef __SUNPRO_CC + // Stop bogus Solaris compiler warning + return false; +#endif + } + + virtual const node_vector &followpos () const + { + throw runtime_error ("Internal error node::followpos()."); +#ifdef __SUNPRO_CC + // Stop bogus Solaris compiler warning + return firstpos; +#endif + } + + virtual node_vector &followpos () + { + throw runtime_error ("Internal error node::followpos()."); +#ifdef __SUNPRO_CC + // Stop bogus Solaris compiler warning + return firstpos; +#endif + } + +protected: + const bool _nullable; + node_vector _firstpos; + node_vector _lastpos; + + virtual void copy_node (node_ptr_vector &node_ptr_vector_, + node_stack &new_node_stack_, bool_stack &perform_op_stack_, + bool &down_) const = 0; + +private: + basic_node (const basic_node &); // No copy construction. + const basic_node &operator = (const basic_node &); // No assignment. +}; +} +} + +#endif diff --git a/inc/lexertl/parser/tree/selection_node.hpp b/inc/lexertl/parser/tree/selection_node.hpp new file mode 100644 index 0000000..ab0802a --- /dev/null +++ b/inc/lexertl/parser/tree/selection_node.hpp @@ -0,0 +1,106 @@ +// selection_node.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_SELECTION_NODE_HPP +#define LEXERTL_SELECTION_NODE_HPP + +#include "node.hpp" + +namespace lexertl +{ +namespace detail +{ +template +class basic_selection_node : public basic_node +{ +public: + typedef basic_node node; + typedef typename node::bool_stack bool_stack; + typedef typename node::const_node_stack const_node_stack; + typedef typename node::node_ptr_vector node_ptr_vector; + typedef typename node::node_stack node_stack; + typedef typename node::node_type node_type; + + basic_selection_node (basic_node *left_, + basic_node *right_) : + basic_node (left_->nullable () || right_->nullable ()), + _left (left_), + _right (right_) + { + _left->append_firstpos (node::_firstpos); + _right->append_firstpos (node::_firstpos); + _left->append_lastpos (node::_lastpos); + _right->append_lastpos (node::_lastpos); + } + + virtual ~basic_selection_node () + { + } + + virtual node_type what_type () const + { + return node::SELECTION; + } + + virtual bool traverse (const_node_stack &node_stack_, + bool_stack &perform_op_stack_) const + { + perform_op_stack_.push (true); + + switch (_right->what_type ()) + { + case node::SEQUENCE: + case node::SELECTION: + case node::ITERATION: + perform_op_stack_.push (false); + break; + default: + break; + } + + node_stack_.push (_right); + node_stack_.push (_left); + return true; + } + +private: + // Not owner of these pointers... 
+ basic_node *_left; + basic_node *_right; + + virtual void copy_node (node_ptr_vector &node_ptr_vector_, + node_stack &new_node_stack_, bool_stack &perform_op_stack_, + bool &down_) const + { + if (perform_op_stack_.top ()) + { + basic_node *rhs_ = new_node_stack_.top (); + + new_node_stack_.pop (); + + basic_node *lhs_ = new_node_stack_.top (); + + node_ptr_vector_->push_back + (static_cast(0)); + node_ptr_vector_->back () = new basic_selection_node (lhs_, rhs_); + new_node_stack_.top () = node_ptr_vector_->back (); + } + else + { + down_ = true; + } + + perform_op_stack_.pop (); + } + + // No copy construction. + basic_selection_node (const basic_selection_node &); + // No assignment. + const basic_selection_node &operator = (const basic_selection_node &); +}; +} +} + +#endif diff --git a/inc/lexertl/parser/tree/sequence_node.hpp b/inc/lexertl/parser/tree/sequence_node.hpp new file mode 100644 index 0000000..6e46f14 --- /dev/null +++ b/inc/lexertl/parser/tree/sequence_node.hpp @@ -0,0 +1,126 @@ +// sequence_node.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_SEQUENCE_NODE_HPP +#define LEXERTL_SEQUENCE_NODE_HPP + +#include "node.hpp" + +namespace lexertl +{ +namespace detail +{ +template +class basic_sequence_node : public basic_node +{ +public: + typedef basic_node node; + typedef typename node::bool_stack bool_stack; + typedef typename node::const_node_stack const_node_stack; + typedef typename node::node_ptr_vector node_ptr_vector; + typedef typename node::node_stack node_stack; + typedef typename node::node_type node_type; + typedef typename node::node_vector node_vector; + + basic_sequence_node (basic_node *left_, + basic_node *right_) : + basic_node (left_->nullable () && right_->nullable ()), + _left (left_), + _right (right_) + { + _left->append_firstpos (node::_firstpos); + + if (_left->nullable ()) + { + _right->append_firstpos (node::_firstpos); + } + + if (_right->nullable ()) + { + _left->append_lastpos (node::_lastpos); + } + + _right->append_lastpos (node::_lastpos); + + node_vector &lastpos_ = _left->lastpos (); + const node_vector &firstpos_ = _right->firstpos (); + + for (typename node_vector::iterator iter_ = lastpos_.begin (), + end_ = lastpos_.end (); iter_ != end_; ++iter_) + { + (*iter_)->append_followpos (firstpos_); + } + } + + virtual ~basic_sequence_node () + { + } + + virtual node_type what_type () const + { + return node::SEQUENCE; + } + + virtual bool traverse (const_node_stack &node_stack_, + bool_stack &perform_op_stack_) const + { + perform_op_stack_.push (true); + + switch (_right->what_type ()) + { + case node::SEQUENCE: + case node::SELECTION: + case node::ITERATION: + perform_op_stack_.push (false); + break; + default: + break; + } + + node_stack_.push (_right); + node_stack_.push (_left); + return true; + } + +private: + // Not owner of these pointers... 
+ basic_node *_left; + basic_node *_right; + + virtual void copy_node (node_ptr_vector &node_ptr_vector_, + node_stack &new_node_stack_, bool_stack &perform_op_stack_, + bool &down_) const + { + if (perform_op_stack_.top ()) + { + basic_node *rhs_ = new_node_stack_.top (); + + new_node_stack_.pop (); + + basic_node *lhs_ = new_node_stack_.top (); + + node_ptr_vector_->push_back + (static_cast *>(0)); + node_ptr_vector_->back () = new basic_sequence_node + (lhs_, rhs_); + new_node_stack_.top () = node_ptr_vector_->back (); + } + else + { + down_ = true; + } + + perform_op_stack_.pop (); + } + + // No copy construction. + basic_sequence_node (const basic_sequence_node &); + // No assignment. + const basic_sequence_node &operator = (const basic_sequence_node &); +}; +} +} + +#endif diff --git a/inc/lexertl/partition/charset.hpp b/inc/lexertl/partition/charset.hpp new file mode 100644 index 0000000..d97cf03 --- /dev/null +++ b/inc/lexertl/partition/charset.hpp @@ -0,0 +1,73 @@ +// charset.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef LEXERTL_CHARSET_HPP +#define LEXERTL_CHARSET_HPP + +#include +#include +#include +#include "../size_t.hpp" +#include "../string_token.hpp" + +namespace lexertl +{ +namespace detail +{ +template +struct basic_charset +{ + typedef basic_string_token token; + typedef std::set index_set; + + token _token; + index_set _index_set; + + basic_charset () : + _token (), + _index_set () + { + } + + basic_charset (const token &token_, const std::size_t index_) : + _token (token_), + _index_set () + { + _index_set.insert (index_); + } + + bool empty () const + { + return _token.empty () && _index_set.empty (); + } + + void intersect (basic_charset &rhs_, basic_charset &overlap_) + { + _token.intersect (rhs_._token, overlap_._token); + + if (!overlap_._token.empty ()) + { + std::merge (_index_set.begin (), _index_set.end (), + rhs_._index_set.begin (), rhs_._index_set.end (), + std::inserter (overlap_._index_set, + overlap_._index_set.end ())); + + if (_token.empty ()) + { + _index_set.clear (); + } + + if (rhs_._token.empty ()) + { + rhs_._index_set.clear (); + } + } + } +}; +} +} + +#endif diff --git a/inc/lexertl/partition/equivset.hpp b/inc/lexertl/partition/equivset.hpp new file mode 100644 index 0000000..6c25c6b --- /dev/null +++ b/inc/lexertl/partition/equivset.hpp @@ -0,0 +1,134 @@ +// equivset.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_EQUIVSET_HPP +#define LEXERTL_EQUIVSET_HPP + +#include +#include "../parser/tree/node.hpp" +#include + +namespace lexertl +{ +namespace detail +{ +template +struct basic_equivset +{ + typedef std::set index_set; + typedef std::vector index_vector; + // Not owner of nodes: + typedef basic_node node; + typedef std::vector node_vector; + + index_vector _index_vector; + id_type _id; + bool _greedy; + node_vector _followpos; + + basic_equivset () : + _index_vector (), + _id (0), + _greedy (true), + _followpos () + { + } + + basic_equivset (const index_set &index_set_, const id_type id_, + const bool greedy_, const node_vector &followpos_) : + _index_vector (index_set_.begin (), index_set_.end ()), + _id (id_), + _greedy (greedy_), + _followpos (followpos_) + { + } + + bool empty () const + { + return _index_vector.empty () && _followpos.empty (); + } + + void intersect (basic_equivset &rhs_, basic_equivset &overlap_) + { + intersect_indexes (rhs_._index_vector, overlap_._index_vector); + + if (!overlap_._index_vector.empty ()) + { + // Note that the LHS takes priority in order to + // respect rule ordering priority in the lex spec. 
+ overlap_._id = _id; + overlap_._greedy = _greedy; + overlap_._followpos = _followpos; + + typename node_vector::const_iterator overlap_begin_ = + overlap_._followpos.begin (); + typename node_vector::const_iterator overlap_end_ = + overlap_._followpos.end (); + typename node_vector::const_iterator rhs_iter_ = + rhs_._followpos.begin (); + typename node_vector::const_iterator rhs_end_ = + rhs_._followpos.end (); + + for (; rhs_iter_ != rhs_end_; ++rhs_iter_) + { + node *node_ = *rhs_iter_; + + if (std::find (overlap_begin_, overlap_end_, node_) == + overlap_end_) + { + overlap_._followpos.push_back (node_); + overlap_begin_ = overlap_._followpos.begin (); + overlap_end_ = overlap_._followpos.end (); + } + } + + if (_index_vector.empty ()) + { + _followpos.clear (); + } + + if (rhs_._index_vector.empty ()) + { + rhs_._followpos.clear (); + } + } + } + +private: + void intersect_indexes (index_vector &rhs_, index_vector &overlap_) + { + typename index_vector::iterator iter_ = _index_vector.begin (); + typename index_vector::iterator end_ = _index_vector.end (); + typename index_vector::iterator rhs_iter_ = rhs_.begin (); + typename index_vector::iterator rhs_end_ = rhs_.end (); + + while (iter_ != end_ && rhs_iter_ != rhs_end_) + { + const id_type index_ = *iter_; + const id_type rhs_index_ = *rhs_iter_; + + if (index_ < rhs_index_) + { + ++iter_; + } + else if (index_ > rhs_index_) + { + ++rhs_iter_; + } + else + { + overlap_.push_back (index_); + iter_ = _index_vector.erase (iter_); + end_ = _index_vector.end (); + rhs_iter_ = rhs_.erase (rhs_iter_); + rhs_end_ = rhs_.end (); + } + } + } +}; +} +} + +#endif diff --git a/inc/lexertl/rules.hpp b/inc/lexertl/rules.hpp new file mode 100644 index 0000000..713341b --- /dev/null +++ b/inc/lexertl/rules.hpp @@ -0,0 +1,743 @@ +// rules.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_RULES_HPP +#define LEXERTL_RULES_HPP + +#include "compile_assert.hpp" +#include +#include "enums.hpp" +#include "internals.hpp" +#include +#include +#include "runtime_error.hpp" +#include +#include "size_t.hpp" +#include +#include +#include + +namespace lexertl +{ +template +class basic_rules +{ +public: + typedef std::vector bool_vector; + typedef std::deque bool_vector_deque; + typedef ch_type char_type; + typedef id_ty id_type; + typedef std::vector id_vector; + typedef std::deque id_vector_deque; + typedef std::basic_string string; + typedef std::deque string_deque; + typedef std::deque string_deque_deque; + typedef std::set string_set; + typedef std::pair string_pair; + typedef std::deque string_pair_deque; + typedef std::map string_id_type_map; + typedef std::pair string_id_type_pair; + + // If you get a compile error here you have + // failed to define an unsigned id type. 
+ compile_assert<(static_cast(~0) > 0)> + _valid_id_type; + + basic_rules (const std::size_t flags_ = dot_not_newline) : + _valid_id_type (), + _statemap (), + _macrodeque (), + _macroset (), + _regexes (), + _features (), + _ids (), + _user_ids (), + _next_dfas (), + _pushes (), + _pops (), + _flags (flags_), + _locale (), + _lexer_state_names (), + _eoi (0) + { + add_state (initial ()); + } + + void clear () + { + _statemap.clear (); + _macrodeque.clear (); + _macroset.clear (); + _regexes.clear (); + _features.clear (); + _ids.clear (); + _user_ids.clear (); + _next_dfas.clear (); + _pushes.clear (); + _pops.clear (); + _flags = dot_not_newline; + _locale = std::locale (); + _lexer_state_names.clear (); + _eoi = 0; + add_state (initial ()); + } + + void clear (const id_type dfa_) + { + if (_regexes.size () > dfa_) + { + _regexes[dfa_].clear (); + _features[dfa_] = 0; + _ids[dfa_].clear (); + _user_ids[dfa_].clear (); + _next_dfas[dfa_].clear (); + _pushes[dfa_].clear (); + _pops[dfa_].clear (); + } + } + + void flags (const std::size_t flags_) + { + _flags = flags_; + } + + std::size_t flags () const + { + return _flags; + } + + static id_type skip () + { + return static_cast(~1); + } + + void eoi (const id_type eoi_) + { + _eoi = eoi_; + } + + id_type eoi () const + { + return _eoi; + } + + std::locale imbue (const std::locale &locale_) + { + std::locale loc_ = _locale; + + _locale = locale_; + return loc_; + } + + const std::locale &locale () const + { + return _locale; + } + + const char_type *state (const id_type index_) const + { + if (index_ == 0) + { + return initial (); + } + else + { + const id_type i_ = index_ - 1; + + if (_lexer_state_names.size () > i_) + { + return _lexer_state_names[i_].c_str (); + } + else + { + return 0; + } + } + } + + id_type state (const char_type *name_) const + { + typename string_id_type_map::const_iterator iter_ = + _statemap.find (name_); + + if (iter_ == _statemap.end ()) + { + return npos (); + } + else + { + return 
iter_->second; + } + } + + id_type add_state (const char_type *name_) + { + validate (name_); + + if (_statemap.insert (string_id_type_pair (name_, + _statemap.size ())).second) + { + _regexes.push_back (string_deque ()); + _features.push_back (0); + _ids.push_back (id_vector ()); + _user_ids.push_back (id_vector ()); + _next_dfas.push_back (id_vector ()); + _pushes.push_back (id_vector ()); + _pops.push_back (bool_vector ()); + + if (string (name_) != initial ()) + { + _lexer_state_names.push_back (name_); + } + } + else + { + return _statemap.find (name_)->second; + } + + if (_next_dfas.size () > npos ()) + { + // Overflow + throw runtime_error ("The data type you have chosen cannot hold " + "this many lexer start states."); + } + + // Initial is not stored, so no need to - 1. + return static_cast(_lexer_state_names.size ()); + } + + void add_macro (const char_type *name_, const char_type *regex_) + { + add_macro (name_, string (regex_)); + } + + void add_macro (const char_type *name_, const char_type *regex_start_, + const char_type *regex_end_) + { + add_macro (name_, string (regex_start_, regex_end_)); + } + + void add_macro (const char_type *name_, const string ®ex_) + { + validate (name_); + + typename string_set::const_iterator iter_ = _macroset.find (name_); + + if (iter_ == _macroset.end ()) + { + _macrodeque.push_back (string_pair (name_, regex_)); + _macroset.insert (name_); + } + else + { + std::basic_stringstream ss_; + std::ostringstream os_; + + os_ << "Attempt to redefine MACRO '"; + + while (*name_) + { + os_ << ss_.narrow (*name_++, static_cast (' ')); + } + + os_ << "'."; + throw runtime_error (os_.str ()); + } + } + + void add_macros (const basic_rules &rules_) + { + const string_pair_deque ¯os_ = rules_.macrodeque (); + typename string_pair_deque::const_iterator macro_iter_ = + macros_.begin (); + typename string_pair_deque::const_iterator macro_end_ = + macros_.end (); + + for (; macro_iter_ != macro_end_; ++macro_iter_) + { + add_macro 
(macro_iter_->first.c_str (), + macro_iter_->second.c_str ()); + } + } + + void merge_macros (const basic_rules &rules_) + { + const string_pair_deque ¯os_ = rules_.macrodeque (); + typename string_pair_deque::const_iterator macro_iter_ = + macros_.begin (); + typename string_pair_deque::const_iterator macro_end_ = + macros_.end (); + typename string_set::const_iterator macro_dest_iter_; + typename string_set::const_iterator macro_dest_end_ = _macroset.end (); + + for (; macro_iter_ != macro_end_; ++macro_iter_) + { + macro_dest_iter_ = _macroset.find (macro_iter_->first); + + if (macro_dest_iter_ == macro_dest_end_) + { + add_macro (macro_iter_->first.c_str (), + macro_iter_->second.c_str ()); + } + } + } + + // Add rule to INITIAL + void add (const char_type *regex_, const id_type id_, + const id_type user_id_ = npos ()) + { + add (string (regex_), id_, user_id_); + } + + void add (const char_type *regex_start_, const char_type *regex_end_, + const id_type id_, const id_type user_id_ = npos ()) + { + add (string (regex_start_, regex_end_), id_, user_id_); + } + + void add (const string ®ex_, const id_type id_, + const id_type user_id_ = npos ()) + { + check_for_invalid_id (id_); + _regexes.front ().push_back (regex_); + + if (regex_[0] == '^') + { + _features.front () |= bol_bit; + } + + if (regex_.size () > 0 && regex_[regex_.size () - 1] == '$') + { + _features.front () |= eol_bit; + } + + if (id_ == skip ()) + { + _features.front () |= skip_bit; + } + else if (id_ == eoi ()) + { + _features.front () |= again_bit; + } + + _ids.front ().push_back (id_); + _user_ids.front ().push_back (user_id_); + _next_dfas.front ().push_back (0); + _pushes.front ().push_back (npos ()); + _pops.front ().push_back (false); + } + + // Add rule with no id + void add (const char_type *curr_dfa_, + const char_type *regex_, const char_type *new_dfa_) + { + add (curr_dfa_, string (regex_), new_dfa_); + } + + void add (const char_type *curr_dfa_, + const char_type *regex_start_, const 
char_type *regex_end_, + const char_type *new_dfa_) + { + add (curr_dfa_, string (regex_start_, regex_end_), new_dfa_); + } + + void add (const char_type *curr_dfa_, const string ®ex_, + const char_type *new_dfa_) + { + add (curr_dfa_, regex_, _eoi, new_dfa_, false); + } + + // Add rule with id + void add (const char_type *curr_dfa_, + const char_type *regex_, const id_type id_, + const char_type *new_dfa_, const id_type user_id_ = npos ()) + { + add (curr_dfa_, string (regex_), id_, new_dfa_, user_id_); + } + + void add (const char_type *curr_dfa_, const char_type *regex_start_, + const char_type *regex_end_, const id_type id_, + const char_type *new_dfa_, const id_type user_id_ = npos ()) + { + add (curr_dfa_, string (regex_start_, regex_end_), + id_, new_dfa_, user_id_); + } + + void add (const char_type *curr_dfa_, const string ®ex_, + const id_type id_, const char_type *new_dfa_, + const id_type user_id_ = npos ()) + { + add (curr_dfa_, regex_, id_, new_dfa_, true, user_id_); + } + + const string_id_type_map &statemap () const + { + return _statemap; + } + + const string_pair_deque ¯odeque () const + { + return _macrodeque; + } + + const string_deque_deque ®exes () const + { + return _regexes; + } + + const id_vector &features () const + { + return _features; + } + + const id_vector_deque &ids () const + { + return _ids; + } + + const id_vector_deque &user_ids () const + { + return _user_ids; + } + + const id_vector_deque &next_dfas () const + { + return _next_dfas; + } + + const id_vector_deque &pushes () const + { + return _pushes; + } + + const bool_vector_deque &pops () const + { + return _pops; + } + + bool empty () const + { + typename string_deque_deque::const_iterator iter_ = _regexes.begin (); + typename string_deque_deque::const_iterator end_ = _regexes.end (); + bool empty_ = true; + + for (; iter_ != end_; ++iter_) + { + if (!iter_->empty ()) + { + empty_ = false; + break; + } + } + + return empty_; + } + + static const char_type *initial () + { + 
static const char_type initial_[] = + {'I', 'N', 'I', 'T', 'I', 'A', 'L', 0}; + + return initial_; + } + + static const char_type *dot () + { + static const char_type dot_[] = {'.', 0}; + + return dot_; + } + + static const char_type *all_states () + { + static const char_type star_[] = {'*', 0}; + + return star_; + } + + static id_type npos () + { + return static_cast(~0); + } + +private: + string_id_type_map _statemap; + string_pair_deque _macrodeque; + string_set _macroset; + string_deque_deque _regexes; + id_vector _features; + id_vector_deque _ids; + id_vector_deque _user_ids; + id_vector_deque _next_dfas; + id_vector_deque _pushes; + bool_vector_deque _pops; + std::size_t _flags; + std::locale _locale; + string_deque _lexer_state_names; + id_type _eoi; + + void add (const char_type *curr_dfa_, const string ®ex_, + const id_type id_, const char_type *new_dfa_, + const bool check_, const id_type user_id_ = npos ()) + { + const bool star_ = *curr_dfa_ == '*' && *(curr_dfa_ + 1) == 0; + const bool dot_ = *new_dfa_ == '.' && *(new_dfa_ + 1) == 0; + const bool push_ = *new_dfa_ == '>'; + const char_type *push_dfa_ = 0; + const bool pop_ = *new_dfa_ == '<'; + + if (push_ || pop_) + { + ++new_dfa_; + } + + if (check_) + { + check_for_invalid_id (id_); + } + + if (!dot_ && !pop_) + { + const char_type *temp_ = new_dfa_; + + while (*temp_ && *temp_ != ':') + { + ++temp_; + } + + if (*temp_) push_dfa_ = temp_ + 1; + + validate (new_dfa_, *temp_ ? 
temp_ : 0); + + if (push_dfa_) + { + validate (push_dfa_); + } + } + + // npos means pop here + id_type new_dfa_id_ = npos (); + id_type push_dfa_id_ = npos (); + typename string_id_type_map::const_iterator iter_; + typename string_id_type_map::const_iterator end_ = _statemap.end (); + id_vector next_dfas_; + + if (!dot_ && !pop_) + { + if (push_dfa_) + { + iter_ = _statemap.find (string (new_dfa_, push_dfa_ - 1)); + } + else + { + iter_ = _statemap.find (new_dfa_); + } + + if (iter_ == end_) + { + std::basic_stringstream ss_; + std::ostringstream os_; + + os_ << "Unknown state name '"; + + while (*new_dfa_) + { + os_ << ss_.narrow (*new_dfa_++, ' '); + } + + os_ << "'."; + throw runtime_error (os_.str ()); + } + + new_dfa_id_ = iter_->second; + + if (push_dfa_) + { + iter_ = _statemap.find (push_dfa_); + + if (iter_ == end_) + { + std::basic_stringstream ss_; + std::ostringstream os_; + + os_ << "Unknown state name '"; + + while (*push_dfa_) + { + os_ << ss_.narrow (*push_dfa_++, ' '); + } + + os_ << "'."; + throw runtime_error (os_.str ()); + } + + push_dfa_id_ = iter_->second; + } + } + + if (star_) + { + const std::size_t size_ = _statemap.size (); + + for (id_type i_ = 0; i_ < size_; ++i_) + { + next_dfas_.push_back (i_); + } + } + else + { + const char_type *start_ = curr_dfa_; + string next_dfa_; + + while (*curr_dfa_) + { + while (*curr_dfa_ && *curr_dfa_ != ',') + { + ++curr_dfa_; + } + + next_dfa_.assign (start_, curr_dfa_); + + if (*curr_dfa_) + { + ++curr_dfa_; + start_ = curr_dfa_; + } + + validate (next_dfa_.c_str ()); + iter_ = _statemap.find (next_dfa_.c_str ()); + + if (iter_ == end_) + { + std::basic_stringstream ss_; + std::ostringstream os_; + + os_ << "Unknown state name '"; + curr_dfa_ = next_dfa_.c_str (); + + while (*curr_dfa_) + { + os_ << ss_.narrow (*curr_dfa_++, ' '); + } + + os_ << "'."; + throw runtime_error (os_.str ()); + } + + next_dfas_.push_back (iter_->second); + } + } + + for (std::size_t i_ = 0, size_ = next_dfas_.size (); + i_ 
< size_; ++i_) + { + const id_type curr_ = next_dfas_[i_]; + + _regexes[curr_].push_back (regex_); + + if (regex_[0] == '^') + { + _features[curr_] |= bol_bit; + } + + if (regex_[regex_.size () - 1] == '$') + { + _features[curr_] |= eol_bit; + } + + if (id_ == skip ()) + { + _features[curr_] |= skip_bit; + } + else if (id_ == eoi ()) + { + _features[curr_] |= again_bit; + } + + if (push_ || pop_) + { + _features[curr_] |= recursive_bit; + } + + _ids[curr_].push_back (id_); + _user_ids[curr_].push_back (user_id_); + _next_dfas[curr_].push_back (dot_ ? curr_ : new_dfa_id_); + _pushes[curr_].push_back (push_ ? (push_dfa_ ? + push_dfa_id_ : curr_) : npos ()); + _pops[curr_].push_back (pop_); + } + } + + void validate (const char_type *name_, const char_type *end_ = 0) const + { + const char_type *start_ = name_; + + if (*name_ != '_' && !(*name_ >= 'A' && *name_ <= 'Z') && + !(*name_ >= 'a' && *name_ <= 'z')) + { + std::basic_stringstream ss_; + std::ostringstream os_; + + os_ << "Invalid name '"; + + while (*name_) + { + os_ << ss_.narrow (*name_++, ' '); + } + + os_ << "'."; + throw runtime_error (os_.str ()); + } + else if (*name_) + { + ++name_; + } + + while (*name_ && name_ != end_) + { + if (*name_ != '_' && *name_ != '-' && + !(*name_ >= 'A' && *name_ <= 'Z') && + !(*name_ >= 'a' && *name_ <= 'z') && + !(*name_ >= '0' && *name_ <= '9')) + { + std::basic_stringstream ss_; + std::ostringstream os_; + + os_ << "Invalid name '"; + name_ = start_; + + while (*name_) + { + os_ << ss_.narrow (*name_++, ' '); + } + + os_ << "'."; + throw runtime_error (os_.str ()); + } + + ++name_; + } + } + + void check_for_invalid_id (const id_type id_) const + { + if (id_ == _eoi) + { + throw runtime_error ("Cannot resuse the id for eoi."); + } + + if (id_ == npos ()) + { + throw runtime_error ("id npos is reserved for the " + "UNKNOWN token."); + } + } +}; + +typedef basic_rules rules; +typedef basic_rules wrules; +} + +#endif diff --git a/inc/lexertl/runtime_error.hpp 
b/inc/lexertl/runtime_error.hpp new file mode 100644 index 0000000..bd525e8 --- /dev/null +++ b/inc/lexertl/runtime_error.hpp @@ -0,0 +1,23 @@ +// runtime_error.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_RUNTIME_ERROR_HPP +#define LEXERTL_RUNTIME_ERROR_HPP + +#include + +namespace lexertl +{ +class runtime_error : public std::runtime_error +{ +public: + runtime_error (const std::string &what_arg_) : + std::runtime_error (what_arg_) + { + } +}; +} + +#endif diff --git a/inc/lexertl/serialise.hpp b/inc/lexertl/serialise.hpp new file mode 100644 index 0000000..9fcab9a --- /dev/null +++ b/inc/lexertl/serialise.hpp @@ -0,0 +1,28 @@ +// serialise.hpp +// Copyright (c) 2007-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_SERIALISE_HPP +#define LEXERTL_SERIALISE_HPP + +#include "state_machine.hpp" +#include + +namespace lexertl +{ +// IMPORTANT! This won't work if you don't enable RTTI! +template +void serialise (basic_state_machine &sm_, Archive &ar_) +{ + detail::basic_internals &internals_ = sm_.data (); + + ar_ & internals_._eoi; + ar_ & *internals_._lookup; + ar_ & internals_._dfa_alphabet; + ar_ & internals_._features; + ar_ & *internals_._dfa; +} +} + +#endif diff --git a/inc/lexertl/size_t.hpp b/inc/lexertl/size_t.hpp new file mode 100644 index 0000000..866ba28 --- /dev/null +++ b/inc/lexertl/size_t.hpp @@ -0,0 +1,12 @@ +// size_t.h +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_SIZE_T_H +#define LEXERTL_SIZE_T_H + +#include // ptrdiff_t +#include + +#endif diff --git a/inc/lexertl/sm_traits.hpp b/inc/lexertl/sm_traits.hpp new file mode 100644 index 0000000..889a1a9 --- /dev/null +++ b/inc/lexertl/sm_traits.hpp @@ -0,0 +1,44 @@ +// sm_traits.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef LEXERTL_SM_TRAITS_H +#define LEXERTL_SM_TRAITS_H + +namespace lexertl +{ +template +struct basic_sm_traits +{ + enum {char_24_bit = sizeof(ch_type) > 2, compressed = comp, lookup = look, + is_dfa = dfa_nfa}; + typedef ch_type input_char_type; + typedef ch_type char_type; + typedef sm_type id_type; + + static id_type npos () + { + return static_cast(~0); + } +}; + +template +struct basic_sm_traits +{ + enum {char_24_bit = sizeof(ch_type) > 2, compressed = true, lookup = look, + is_dfa = dfa_nfa}; + typedef ch_type input_char_type; + typedef unsigned char char_type; + typedef sm_type id_type; + + static id_type npos () + { + return static_cast(~0); + } +}; +} + +#endif diff --git a/inc/lexertl/state_machine.hpp b/inc/lexertl/state_machine.hpp new file mode 100644 index 0000000..e14786c --- /dev/null +++ b/inc/lexertl/state_machine.hpp @@ -0,0 +1,525 @@ +// state_machine.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_STATE_MACHINE_HPP +#define LEXERTL_STATE_MACHINE_HPP + +#include "compile_assert.hpp" +// memcmp() +#include +#include +#include "internals.hpp" +#include +#include +#include "sm_traits.hpp" +#include "string_token.hpp" + +namespace lexertl +{ +template +class basic_state_machine +{ +public: + typedef basic_sm_traits 1), true, true> traits; + typedef detail::basic_internals internals; + + // If you get a compile error here you have + // failed to define an unsigned id type. + compile_assert<(static_cast(~0) > 0)> + _valid_id_type; + + basic_state_machine () : + _valid_id_type (), + _internals () + { + } + + void clear () + { + _internals.clear (); + } + + internals &data () + { + return _internals; + } + + const internals &data () const + { + return _internals; + } + + bool empty () const + { + return _internals.empty (); + } + + id_type eoi () const + { + return _internals._eoi; + } + + void minimise () + { + const id_type dfas_ = static_cast(_internals. 
+ _dfa->size ()); + + for (id_type i_ = 0; i_ < dfas_; ++i_) + { + const id_type dfa_alphabet_ = _internals._dfa_alphabet[i_]; + id_type_vector *dfa_ = _internals._dfa[i_]; + + if (dfa_alphabet_ != 0) + { + std::size_t size_ = 0; + + do + { + size_ = dfa_->size (); + minimise_dfa (dfa_alphabet_, *dfa_, size_); + } while (dfa_->size () != size_); + } + } + } + + static id_type npos () + { + return static_cast(~0); + } + + static id_type skip () + { + return static_cast(~1); + } + + void swap (basic_state_machine &rhs_) + { + _internals.swap (rhs_._internals); + } + +private: + typedef typename internals::id_type_vector id_type_vector; + typedef std::set index_set; + internals _internals; + + void minimise_dfa (const id_type dfa_alphabet_, + id_type_vector &dfa_, std::size_t size_) + { + const id_type *first_ = &dfa_.front (); + const id_type *end_ = first_ + size_; + id_type index_ = 1; + id_type new_index_ = 1; + id_type_vector lookup_ (size_ / dfa_alphabet_, npos ()); + id_type *lookup_ptr_ = &lookup_.front (); + index_set index_set_; + const id_type bol_index_ = dfa_.front (); + + *lookup_ptr_ = 0; + // Only one 'jam' state, so skip it. + first_ += dfa_alphabet_; + + for (; first_ < end_; first_ += dfa_alphabet_, ++index_) + { + const id_type *second_ = first_ + dfa_alphabet_; + + for (id_type curr_index_ = index_ + 1; second_ < end_; + ++curr_index_, second_ += dfa_alphabet_) + { + if (index_set_.find (curr_index_) != index_set_.end ()) + { + continue; + } + + // Some systems have memcmp in namespace std. 
+ using namespace std; + + if (memcmp (first_, second_, sizeof (id_type) * + dfa_alphabet_) == 0) + { + index_set_.insert (curr_index_); + lookup_ptr_[curr_index_] = new_index_; + } + } + + if (lookup_ptr_[index_] == npos ()) + { + lookup_ptr_[index_] = new_index_; + ++new_index_; + } + } + + if (!index_set_.empty ()) + { + const id_type *front_ = &dfa_.front (); + id_type_vector new_dfa_ (front_, front_ + dfa_alphabet_); + typename index_set::const_iterator set_end_ = index_set_.end (); + const id_type *ptr_ = front_ + dfa_alphabet_; + id_type *new_ptr_ = 0; + + new_dfa_.resize (size_ - index_set_.size () * dfa_alphabet_, 0); + new_ptr_ = &new_dfa_.front () + dfa_alphabet_; + size_ /= dfa_alphabet_; + + if (bol_index_) + { + new_dfa_.front () = lookup_ptr_[bol_index_]; + } + + for (index_ = 1; index_ < size_; ++index_) + { + if (index_set_.find (index_) != set_end_) + { + ptr_ += dfa_alphabet_; + continue; + } + + new_ptr_[end_state_index] = ptr_[end_state_index]; + new_ptr_[id_index] = ptr_[id_index]; + new_ptr_[user_id_index] = ptr_[user_id_index]; + new_ptr_[push_dfa_index] = ptr_[push_dfa_index]; + new_ptr_[next_dfa_index] = ptr_[next_dfa_index]; + new_ptr_[eol_index] = lookup_ptr_[ptr_[eol_index]]; + new_ptr_ += transitions_index; + ptr_ += transitions_index; + + for (id_type i_ = transitions_index; i_ < dfa_alphabet_; ++i_) + { + *new_ptr_++ = lookup_ptr_[*ptr_++]; + } + } + + dfa_.swap (new_dfa_); + } + } +}; + +typedef basic_state_machine state_machine; +typedef basic_state_machine wstate_machine; + +template +struct basic_char_state_machine +{ + typedef basic_sm_traits traits; + typedef detail::basic_internals internals; + typedef typename internals::id_type_vector id_type_vector; + + struct state + { + typedef basic_string_token string_token; + typedef std::map id_type_string_token_map; + typedef std::pair id_type_string_token_pair; + enum push_pop_dfa {neither, push_dfa, pop_dfa}; + + bool _end_state; + push_pop_dfa _push_pop_dfa; + id_type _id; + 
id_type _user_id; + id_type _push_dfa; + id_type _next_dfa; + id_type _eol_index; + id_type_string_token_map _transitions; + + state () : + _end_state (false), + _push_pop_dfa (neither), + _id (0), + _user_id (traits::npos ()), + _push_dfa (traits::npos ()), + _next_dfa (0), + _eol_index (traits::npos ()), + _transitions () + { + } + + bool operator == (const state rhs_) const + { + return _end_state == rhs_._end_state && + _push_pop_dfa == rhs_._push_pop_dfa && + _id == rhs_._id && + _user_id == rhs_._user_id && + _push_dfa == rhs_._push_dfa && + _next_dfa == rhs_._next_dfa && + _eol_index == rhs_._eol_index && + _transitions == rhs_._transitions; + } + }; + + typedef typename state::string_token string_token; + typedef std::vector state_vector; + typedef std::vector string_token_vector; + typedef typename state::id_type_string_token_pair + id_type_string_token_pair; + + struct dfa + { + id_type _bol_index; + state_vector _states; + + dfa (const std::size_t size_) : + _bol_index (traits::npos ()), + _states (state_vector (size_)) + { + } + + std::size_t size () const + { + return _states.size (); + } + + void swap (dfa &rhs_) + { + std::swap (_bol_index, rhs_._bol_index); + _states.swap (rhs_._states); + } + }; + + typedef std::deque dfa_deque; + + dfa_deque _sm_deque; + + // If you get a compile error here you have + // failed to define an unsigned id type. 
+ compile_assert<(static_cast(~0) > 0)> + _valid_id_type; + + basic_char_state_machine () : + _sm_deque (), + _valid_id_type () + { + } + + void append (const string_token_vector &token_vector_, + const internals &internals_, const id_type dfa_index_) + { + const std::size_t dfa_alphabet_ = internals_._dfa_alphabet[dfa_index_]; + const std::size_t alphabet_ = dfa_alphabet_ - transitions_index; + const id_type_vector &source_dfa_ = *internals_._dfa[dfa_index_]; + const id_type *ptr_ = &source_dfa_.front (); + const std::size_t size_ = (source_dfa_.size () - dfa_alphabet_) / + dfa_alphabet_; + typename state::id_type_string_token_map::iterator trans_iter_; + + _sm_deque.push_back (dfa (size_)); + + dfa &dest_dfa_ = _sm_deque.back (); + + if (*ptr_) + { + dest_dfa_._bol_index = *ptr_ - 1; + } + + ptr_ += dfa_alphabet_; + + for (id_type i_ = 0; i_ < size_; ++i_) + { + state &state_ = dest_dfa_._states[i_]; + + state_._end_state = ptr_[end_state_index] != 0; + + if (ptr_[push_dfa_index] != npos ()) + { + state_._push_pop_dfa = state::push_dfa; + } + else if (ptr_[end_state_index] & pop_dfa_bit) + { + state_._push_pop_dfa = state::pop_dfa; + } + + state_._id = ptr_[id_index]; + state_._user_id = ptr_[user_id_index]; + state_._push_dfa = ptr_[push_dfa_index]; + state_._next_dfa = ptr_[next_dfa_index]; + + if (ptr_[eol_index]) + { + state_._eol_index = ptr_[eol_index] - 1; + } + + ptr_ += transitions_index; + + for (id_type col_index_ = 0; col_index_ < alphabet_; + ++col_index_, ++ptr_) + { + const id_type next_ = *ptr_; + + if (next_ > 0) + { + trans_iter_ = state_._transitions.find (next_ - 1); + + if (trans_iter_ == state_._transitions.end ()) + { + trans_iter_ = state_._transitions.insert + (id_type_string_token_pair (next_ - 1, + token_vector_[col_index_])).first; + } + else + { + trans_iter_->second.insert (token_vector_[col_index_]); + } + } + } + } + } + + void clear () + { + _sm_deque.clear (); + } + + bool empty () const + { + return _sm_deque.empty (); + } + + 
void minimise () + { + const id_type dfas_ = static_cast(_sm_deque.size ()); + + for (id_type i_ = 0; i_ < dfas_; ++i_) + { + dfa *dfa_ = &_sm_deque[i_]; + + if (dfa_->size () > 0) + { + std::size_t size_ = 0; + + do + { + size_ = dfa_->size (); + minimise_dfa (*dfa_, size_); + } while (dfa_->size () != size_); + } + } + } + + static id_type npos () + { + return traits::npos (); + } + + id_type size () const + { + return static_cast(_sm_deque.size ()); + } + + static id_type skip () + { + return static_cast(~1); + } + + void swap (basic_char_state_machine &csm_) + { + _sm_deque.swap (csm_._sm_deque); + } + +private: + typedef std::set index_set; + + void minimise_dfa (dfa &dfa_, std::size_t size_) + { + const state *first_ = &dfa_._states.front (); + const state *end_ = first_ + size_; + id_type index_ = 0; + id_type new_index_ = 0; + id_type_vector lookup_ (size_, npos ()); + id_type *lookup_ptr_ = &lookup_.front (); + index_set index_set_; + + for (; first_ != end_; ++first_, ++index_) + { + const state *second_ = first_ + 1; + + for (id_type curr_index_ = index_ + 1; second_ != end_; + ++curr_index_, ++second_) + { + if (index_set_.find (curr_index_) != index_set_.end ()) + { + continue; + } + + if (*first_ == *second_) + { + index_set_.insert (curr_index_); + lookup_ptr_[curr_index_] = new_index_; + } + } + + if (lookup_ptr_[index_] == npos ()) + { + lookup_ptr_[index_] = new_index_; + ++new_index_; + } + } + + if (!index_set_.empty ()) + { + const state *front_ = &dfa_._states.front (); + dfa new_dfa_ (new_index_); + typename index_set::const_iterator set_end_ = index_set_.end (); + const state *ptr_ = front_; + state *new_ptr_ = &new_dfa_._states.front (); + + if (dfa_._bol_index != npos ()) + { + new_dfa_._bol_index = lookup_ptr_[dfa_._bol_index]; + } + + for (index_ = 0; index_ < size_; ++index_) + { + if (index_set_.find (index_) != set_end_) + { + ++ptr_; + continue; + } + + new_ptr_->_end_state = ptr_->_end_state; + new_ptr_->_id = ptr_->_end_state; + 
new_ptr_->_user_id = ptr_->_user_id; + new_ptr_->_next_dfa = ptr_->_next_dfa; + + if (ptr_->_eol_index != npos ()) + { + new_ptr_->_eol_index = lookup_ptr_[ptr_->_eol_index]; + } + + typename state::id_type_string_token_map::const_iterator + iter_ = ptr_->_transitions.begin (); + typename state::id_type_string_token_map::const_iterator end_ = + ptr_->_transitions.end (); + typename state::id_type_string_token_map::iterator find_; + + for (; iter_ != end_; ++iter_) + { + find_ = new_ptr_->_transitions.find + (lookup_ptr_[iter_->first]); + + if (find_ == new_ptr_->_transitions.end ()) + { + new_ptr_->_transitions.insert + (id_type_string_token_pair + (lookup_ptr_[iter_->first], iter_->second)); + } + else + { + find_->second.insert (iter_->second); + } + } + + ++ptr_; + ++new_ptr_; + } + + dfa_.swap (new_dfa_); + } + } +}; + +typedef basic_char_state_machine char_state_machine; +typedef basic_char_state_machine wchar_state_machine; +} + +#endif diff --git a/inc/lexertl/stream_shared_iterator.hpp b/inc/lexertl/stream_shared_iterator.hpp new file mode 100644 index 0000000..61b529e --- /dev/null +++ b/inc/lexertl/stream_shared_iterator.hpp @@ -0,0 +1,350 @@ +// stream_shared_iterator.hpp +// Copyright (c) 2010-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef LEXERTL_STREAM_SHARED_ITERATOR_H +#define LEXERTL_STREAM_SHARED_ITERATOR_H + +#include +// memcpy +#include +#include +#include +#include +#include "runtime_error.hpp" +#include "size_t.hpp" +#include + +namespace lexertl +{ +template +class basic_stream_shared_iterator +{ +public: + typedef std::basic_istream istream; + typedef std::forward_iterator_tag iterator_category; + typedef std::size_t difference_type; + typedef char_type value_type; + typedef char_type *pointer; + typedef char_type &reference; + + basic_stream_shared_iterator () : + _master (false), + _live (false), + _index (shared::npos ()), + _shared (0) + { + } + + basic_stream_shared_iterator (istream &stream_, + const std::size_t buff_size_ = 1024, + const std::size_t increment_ = 1024) : + _master (true), + _live (false), + _index (shared::npos ()), + // For exception safety don't call new yet + _shared (0) + { + // Safe to call potentially throwing new now. + _shared = new shared (stream_, buff_size_, increment_); + ++_shared->_ref_count; + _iter = _shared->_clients.insert (_shared->_clients.end (), this); + } + + basic_stream_shared_iterator (const basic_stream_shared_iterator &rhs_) : + _master (false), + _live (false), + _index (rhs_._master ? rhs_._shared->lowest () : rhs_._index), + _shared (rhs_._shared) + { + if (_shared) + { + // New copy of an iterator. + // The assumption is that any copy must be live + // even if the rhs is not (otherwise we will never + // have a record of the start of the current range!) 
+ ++_shared->_ref_count; + _iter = _shared->_clients.insert (_shared->_clients.end (), this); + _live = true; + } + } + + ~basic_stream_shared_iterator () + { + if (_shared) + { + --_shared->_ref_count; + _shared->erase (this); + + if (_shared->_ref_count == 0) + { + delete _shared; + _shared = 0; + } + } + } + + basic_stream_shared_iterator &operator = + (const basic_stream_shared_iterator &rhs_) + { + if (this != &rhs_) + { + _master = false; + _index = rhs_._master ? rhs_._shared->lowest () : rhs_._index; + + if (_live && !rhs_._live) + { + _shared->erase (this); + + if (!rhs_._shared) + { + --_shared->_ref_count; + } + } + else if (!_live && rhs_._live) + { + rhs_._iter = rhs_._shared->_clients.insert (rhs_._shared-> + _clients.end (), this); + + if (!_shared) + { + ++rhs_._shared->_ref_count; + } + } + + _live = rhs_._live; + _shared = rhs_._shared; + } + + return *this; + } + + bool operator == (const basic_stream_shared_iterator &rhs_) const + { + return _index == rhs_._index && + (_shared == rhs_._shared || + (_index == shared::npos () || rhs_._index == shared::npos ()) && + (!_shared || !rhs_._shared)); + } + + bool operator != (const basic_stream_shared_iterator &rhs_) const + { + return !(*this == rhs_); + } + + const char_type &operator * () + { + check_master (); + return _shared->_buffer[_index]; + } + + basic_stream_shared_iterator &operator ++ () + { + check_master (); + ++_index; + update_state (); + return *this; + } + + basic_stream_shared_iterator operator ++ (int) + { + basic_stream_shared_iterator iter_ = *this; + + check_master (); + ++_index; + update_state (); + return iter_; + } + +private: + class shared + { + public: + std::size_t _ref_count; + typedef std::vector char_vector; + typedef std::list iter_list; + istream &_stream; + std::size_t _increment; + std::size_t _len; + char_vector _buffer; + iter_list _clients; + + shared (istream &stream_, const std::size_t buff_size_, + const std::size_t increment_) : + _ref_count (0), + 
_increment (increment_), + _stream (stream_) + { + _buffer.resize (buff_size_); + _stream.read (&_buffer.front (), _buffer.size ()); + _len = static_cast(_stream.gcount ()); + } + + bool reload_buffer () + { + const std::size_t lowest_ = lowest (); + std::size_t read_ = 0; + + if (lowest_ == 0) + { + // Resize buffer + const std::size_t old_size_ = _buffer.size (); + const std::size_t new_size_ = old_size_ + _increment; + + _buffer.resize (new_size_); + _stream.read (&_buffer.front () + old_size_, _increment); + read_ = static_cast(_stream.gcount ()); + + if (read_) + { + read_ += old_size_; + _len = read_; + } + } + else + { + // Some systems have memcpy in namespace std + using namespace std; + const size_t start_ = _buffer.size () - lowest_; + const size_t len_ = _buffer.size () - start_; + + memcpy (&_buffer.front (), &_buffer[lowest_], start_ * + sizeof (char_type)); + _stream.read (&_buffer.front () + start_, len_); + read_ = static_cast(_stream.gcount ()); + subtract (lowest_); + + if (read_) + { + read_ += start_; + _len = read_; + } + else + { + _len = highest (); + } + } + + return read_ != 0; + } + + void erase (basic_stream_shared_iterator *ptr_) + { + if (ptr_->_iter != _clients.end ()) + { + _clients.erase (ptr_->_iter); + ptr_->_iter = _clients.end (); + } + } + + std::size_t lowest () const + { + std::size_t lowest_ = npos (); + typename iter_list::const_iterator iter_ = _clients.begin (); + typename iter_list::const_iterator end_ = _clients.end (); + + for (; iter_ != end_; ++iter_) + { + const basic_stream_shared_iterator *ptr_ = *iter_; + + if (ptr_->_index < lowest_) + { + lowest_ = ptr_->_index; + } + } + + if (lowest_ == npos ()) + { + lowest_ = 0; + } + + return lowest_; + } + + std::size_t highest () const + { + std::size_t highest_ = 0; + typename iter_list::const_iterator iter_ = _clients.begin (); + typename iter_list::const_iterator end_ = _clients.end (); + + for (; iter_ != end_; ++iter_) + { + const basic_stream_shared_iterator *ptr_ 
= *iter_; + + if (ptr_->_index != npos () && ptr_->_index > highest_) + { + highest_ = ptr_->_index; + } + } + + return highest_; + } + + void subtract (const std::size_t lowest_) + { + typename iter_list::iterator iter_ = _clients.begin (); + typename iter_list::iterator end_ = _clients.end (); + + for (; iter_ != end_; ++iter_) + { + basic_stream_shared_iterator *ptr_ = *iter_; + + if (ptr_->_index != npos ()) + { + ptr_->_index -= lowest_; + } + } + } + + static std::size_t npos () + { + return static_cast(~0); + } + + private: + shared &operator = (const shared &rhs_); + }; + + bool _master; + bool _live; + std::size_t _index; + shared *_shared; + mutable typename shared::iter_list::iterator _iter; + + void check_master () + { + if (!_shared) + { + throw runtime_error ("Cannot manipulate null (end) " + "stream_shared_iterators."); + } + + if (_master) + { + _master = false; + _live = true; + _index = _shared->lowest (); + } + } + + void update_state () + { + if (_index >= _shared->_len) + { + if (!_shared->reload_buffer ()) + { + _shared->erase (this); + _index = shared::npos (); + _live = false; + } + } + } +}; + +typedef basic_stream_shared_iterator stream_shared_iterator; +typedef basic_stream_shared_iterator wstream_shared_iterator; +} + +#endif diff --git a/inc/lexertl/string_token.hpp b/inc/lexertl/string_token.hpp new file mode 100644 index 0000000..0c8c88d --- /dev/null +++ b/inc/lexertl/string_token.hpp @@ -0,0 +1,421 @@ +// string_token.hpp +// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_STRING_TOKEN_HPP +#define LEXERTL_STRING_TOKEN_HPP + +#include "char_traits.hpp" +#include // Needed by GCC 4.4 +#include +#include +#include +#include + +namespace lexertl +{ +template +struct basic_string_token +{ + typedef ch_type char_type; + typedef basic_char_traits char_traits; + typedef typename char_traits::index_type index_type; + typedef std::pair range; + typedef std::vector range_vector; + typedef std::basic_string string; + typedef basic_string_token string_token; + + range_vector _ranges; + + basic_string_token () : + _ranges () + { + } + + basic_string_token (char_type ch_) : + _ranges () + { + insert (range (ch_, ch_)); + } + + basic_string_token (char_type first_, char_type second_) : + _ranges () + { + insert (range (first_, second_)); + } + + void clear () + { + _ranges.clear (); + } + + bool empty () const + { + return _ranges.empty (); + } + + bool any () const + { + return _ranges.size () == 1 && _ranges.front ().first == 0 && + _ranges.front ().second == char_traits::max_val (); + } + + bool operator < (const basic_string_token &rhs_) const + { + return _ranges < rhs_._ranges; + } + + bool operator == (const basic_string_token &rhs_) const + { + return _ranges == rhs_._ranges; + } + + bool negatable () const + { + std::size_t size_ = 0; + typename range_vector::const_iterator iter_ = _ranges.begin (); + typename range_vector::const_iterator end_ = _ranges.end (); + + for (; iter_ != end_; ++iter_) + { + size_ += static_cast(iter_->second) + 1 - + static_cast(iter_->first); + } + + return size_ > static_cast(char_traits::max_val ()) / 2; + } + + void swap (basic_string_token &rhs_) + { + _ranges.swap (rhs_._ranges); + } + + void insert (const basic_string_token &rhs_) + { + typename range_vector::const_iterator iter_ = rhs_._ranges.begin (); + typename range_vector::const_iterator end_ = rhs_._ranges.end (); + + for (; iter_ != end_; 
++iter_) + { + insert (*iter_); + } + } + + // Deliberately pass by value - may modify + typename range_vector::iterator insert (range rhs_) + { + bool insert_ = true; + typename range_vector::iterator iter_ = _ranges.begin (); + typename range_vector::const_iterator end_ = _ranges.end (); + + while (iter_ != end_) + { + // follows current item + if (rhs_.first > iter_->second) + { + if (rhs_.first == iter_->second + 1) + { + // Auto normalise + rhs_.first = iter_->first; + } + else + { + // No intersection, consider next + ++iter_; + continue; + } + } + // Precedes current item + else if (rhs_.second < iter_->first) + { + if (rhs_.second == iter_->first - 1) + { + // Auto normalise + rhs_.second = iter_->second; + } + else + { + // insert here + break; + } + } + else + { + // overlap (under) + if (rhs_.first < iter_->first) + { + if (rhs_.second < iter_->second) + { + rhs_.second = iter_->second; + } + } + // overlap (over) + else if (rhs_.second > iter_->second) + { + if (rhs_.first > iter_->first) + { + rhs_.first = iter_->first; + } + } + // subset + else + { + insert_ = false; + iter_ = _ranges.end (); + break; + } + } + + // Code minimisation: this always applies unless we have already + // exited the loop, or "continue" executed. 
+ iter_ = _ranges.erase (iter_); + end_ = _ranges.end (); + } + + if (insert_) + { + iter_ = _ranges.insert(iter_, rhs_); + } + + return iter_; + } + + void negate () + { + index_type next_ = 0; + const index_type max_ = char_traits::max_val (); + string_token temp_; + typename range_vector::iterator iter_ = _ranges.begin (); + typename range_vector::const_iterator end_ = _ranges.end (); + bool finished_ = false; + + for (; iter_ != end_; ++iter_) + { + if (next_ < iter_->first) + { + temp_.insert (range (next_, iter_->first - 1)); + } + + if (iter_->second < max_) + { + next_ = iter_->second + 1; + } + else + { + finished_ = true; + break; + } + } + + if (!finished_) + { + temp_.insert (range (next_, max_)); + } + + swap (temp_); + } + + void intersect (basic_string_token &rhs_, basic_string_token &overlap_) + { + typename range_vector::iterator lhs_iter_ = _ranges.begin (); + typename range_vector::const_iterator lhs_end_ = _ranges.end (); + typename range_vector::iterator rhs_iter_ = rhs_._ranges.begin (); + typename range_vector::const_iterator rhs_end_ = rhs_._ranges.end (); + + while (lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_) + { + if (rhs_iter_->first > lhs_iter_->second) + { + ++lhs_iter_; + } + else if (rhs_iter_->second < lhs_iter_->first) + { + ++rhs_iter_; + } + else + { + range range_; + + if (rhs_iter_->first > lhs_iter_->first) + { + range_.first = rhs_iter_->first; + } + else + { + range_.first = lhs_iter_->first; + } + + if (rhs_iter_->second < lhs_iter_->second) + { + range_.second = rhs_iter_->second; + } + else + { + range_.second = lhs_iter_->second; + } + + adjust (range_, *this, lhs_iter_, lhs_end_); + adjust (range_, rhs_, rhs_iter_, rhs_end_); + overlap_.insert (range_); + } + } + } + + void remove (basic_string_token &rhs_) + { + typename range_vector::iterator lhs_iter_ = _ranges.begin (); + typename range_vector::const_iterator lhs_end_ = _ranges.end (); + typename range_vector::iterator rhs_iter_ = rhs_._ranges.begin (); + 
typename range_vector::const_iterator rhs_end_ = rhs_._ranges.end (); + + while (lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_) + { + if (rhs_iter_->first > lhs_iter_->second) + { + ++lhs_iter_; + } + else if (rhs_iter_->second < lhs_iter_->first) + { + ++rhs_iter_; + } + else + { + range range_; + + if (rhs_iter_->first > lhs_iter_->first) + { + range_.first = rhs_iter_->first; + } + else + { + range_.first = lhs_iter_->first; + } + + if (rhs_iter_->second < lhs_iter_->second) + { + range_.second = rhs_iter_->second; + } + else + { + range_.second = lhs_iter_->second; + } + + adjust (range_, *this, lhs_iter_, lhs_end_); + } + } + } + + static string escape_char (const typename char_traits::index_type ch_) + { + string out_; + + switch (ch_) + { + case '\0': + out_ += '\\'; + out_ += '0'; + break; + case '\a': + out_ += '\\'; + out_ += 'a'; + break; + case '\b': + out_ += '\\'; + out_ += 'b'; + break; + case 27: + out_ += '\\'; + out_ += 'x'; + out_ += '1'; + out_ += 'b'; + break; + case '\f': + out_ += '\\'; + out_ += 'f'; + break; + case '\n': + out_ += '\\'; + out_ += 'n'; + break; + case '\r': + out_ += '\\'; + out_ += 'r'; + break; + case '\t': + out_ += '\\'; + out_ += 't'; + break; + case '\v': + out_ += '\\'; + out_ += 'v'; + break; + case '\\': + out_ += '\\'; + out_ += '\\'; + break; + case '"': + out_ += '\\'; + out_ += '"'; + break; + case '\'': + out_ += '\\'; + out_ += '\''; + break; + default: + { + if (ch_ < 32 || ch_ > 126) + { + std::basic_stringstream ss_; + + out_ += '\\'; + out_ += 'x'; + ss_ << std::hex << + static_cast (ch_); + out_ += ss_.str (); + } + else + { + out_ += ch_; + } + + break; + } + } + + return out_; + } + +private: + void adjust (const range &range_, basic_string_token &token_, + typename range_vector::iterator &iter_, + typename range_vector::const_iterator &end_) + { + if (range_.first > iter_->first) + { + const index_type second_ = iter_->second; + + iter_->second = range_.first - 1; + + if (range_.second < second_) + { 
+ range new_range_ (range_.second + 1, second_); + + iter_ = token_.insert (new_range_); + end_ = token_._ranges.end (); + } + } + else if (range_.second < iter_->second) + { + iter_->first = range_.second + 1; + } + else + { + iter_ = token_._ranges.erase (iter_); + end_ = token_._ranges.end (); + } + } +}; +} + +#endif diff --git a/inc/lexertl/utf_iterators.hpp b/inc/lexertl/utf_iterators.hpp new file mode 100644 index 0000000..f4251c5 --- /dev/null +++ b/inc/lexertl/utf_iterators.hpp @@ -0,0 +1,380 @@ +// utf_iterators.hpp +// Copyright (c) 2012 Ben Hanson (http://www.benhanson.net/) +// Inspired by http://utfcpp.sourceforge.net/ +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef LEXERTL_UTF_ITERATORS_HPP +#define LEXERTL_UTF_ITERATORS_HPP + +#include + +namespace lexertl +{ +template +class basic_utf8_in_iterator : + public std::iterator +{ +public: + basic_utf8_in_iterator () : + _char (0) + { + } + + explicit basic_utf8_in_iterator (const char_iterator& it_) : + _it (it_), + _char (0) + { + next (); + } + + char_type operator * () const + { + return _char; + } + + bool operator == (const basic_utf8_in_iterator &rhs_) const + { + return _it == rhs_._it; + } + + bool operator != (const basic_utf8_in_iterator &rhs_) const + { + return _it != rhs_._it; + } + + basic_utf8_in_iterator &operator ++ () + { + next (); + return *this; + } + + basic_utf8_in_iterator operator ++ (int) + { + basic_utf8_in_iterator temp_ = *this; + + next (); + return temp_; + } + +private: + typedef typename std::iterator_traits:: + difference_type difference_type; + char_iterator _it; + char_type _char; + + void next () + { + const char len_ = len (_it); + char_type ch_ = *_it & 0xff; + + switch (len_) + { + case 1: + break; + case 2: + ++_it; + ch_ = (ch_ << 6 & 0x7ff) | (*_it & 0x3f); + break; + case 3: + ++_it; + ch_ = (ch_ << 12 & 0xffff) | ((*_it & 0xff) << 6 & 
0xfff); + ++_it; + ch_ |= *_it & 0x3f; + break; + case 4: + ++_it; + ch_ = (ch_ << 18 & 0x1fffff) | ((*_it & 0xff) << 12 & 0x3ffff); + ++_it; + ch_ |= (*_it & 0xff) << 6 & 0xfff; + ++_it; + ch_ |= *_it & 0x3f; + break; + } + + ++_it; + _char = ch_; + } + + char len (const char_iterator &it_) const + { + const unsigned char ch_ = *it_; + + return ch_ < 0x80 ? 1 : + ch_ >> 5 == 0x06 ? 2 : + ch_ >> 4 == 0x0e ? 3 : + ch_ >> 3 == 0x1e ? 4 : 0; + } +}; + +template +class basic_utf8_out_iterator : + public std::iterator +{ +public: + basic_utf8_out_iterator () : + _count (0), + _index (0) + { + } + + explicit basic_utf8_out_iterator (const char_iterator& it_) : + _it (it_), + _count (0), + _index (0) + { + next (); + } + + char operator * () const + { + return _bytes[_index]; + } + + bool operator == (const basic_utf8_out_iterator &rhs_) const + { + return _it == rhs_._it; + } + + bool operator != (const basic_utf8_out_iterator &rhs_) const + { + return _it != rhs_._it; + } + + basic_utf8_out_iterator &operator ++ () + { + ++_index; + + if (_index >= _count) + { + next (); + } + + return *this; + } + + basic_utf8_out_iterator operator ++ (int) + { + basic_utf8_out_iterator temp_ = *this; + + ++_index; + + if (_index >= _count) + { + next (); + } + + return temp_; + } + +private: + char_iterator _it; + char _bytes[4]; + unsigned char _count; + unsigned char _index; + + void next () + { + const std::size_t ch_ = *_it; + + _count = len (ch_); + _index = 0; + + switch (_count) + { + case 1: + _bytes[0] = static_cast(ch_); + break; + case 2: + _bytes[0] = static_cast((ch_ >> 6) | 0xc0); + _bytes[1] = (ch_ & 0x3f) | 0x80; + break; + case 3: + _bytes[0] = static_cast((ch_ >> 12) | 0xe0); + _bytes[1] = ((ch_ >> 6) & 0x3f) | 0x80; + _bytes[2] = (ch_ & 0x3f) | 0x80; + break; + case 4: + _bytes[0] = static_cast((ch_ >> 18) | 0xf0); + _bytes[1] = ((ch_ >> 12) & 0x3f) | 0x80; + _bytes[2] = ((ch_ >> 6) & 0x3f) | 0x80; + _bytes[3] = (ch_ & 0x3f) | 0x80; + break; + } + + ++_it; + } + + 
char len (const std::size_t ch_) const + { + return ch_ < 0x80 ? 1 : + ch_ < 0x800 ? 2 : + ch_ < 0x10000 ? 3 : + 4; + } +}; + +template +class basic_utf16_in_iterator : + public std::iterator +{ +public: + basic_utf16_in_iterator () : + _char (0) + { + } + + explicit basic_utf16_in_iterator (const char_iterator &it_) : + _it (it_), + _char (0) + { + next (); + } + + char_type operator * () const + { + return _char; + } + + bool operator == (const basic_utf16_in_iterator &rhs_) const + { + return _it == rhs_._it; + } + + bool operator != (const basic_utf16_in_iterator &rhs_) const + { + return _it != rhs_._it; + } + + basic_utf16_in_iterator &operator ++ () + { + next (); + return *this; + } + + basic_utf16_in_iterator operator ++ (int) + { + basic_utf16_in_iterator temp_ = *this; + + next (); + return temp_; + } + +private: + typedef typename std::iterator_traits:: + difference_type difference_type; + char_iterator _it; + char_type _char; + + void next () + { + char_type ch_ = *_it & 0xffff; + + if (ch_ >= 0xd800 && ch_ <= 0xdbff) + { + const char_type surrogate_ = *++_it & 0xffff; + + ch_ = (((ch_ - 0xd800) << 10) | (surrogate_ - 0xdc00)) + 0x10000; + } + + ++_it; + _char = ch_; + } +}; + +template +class basic_utf16_out_iterator : + public std::iterator +{ +public: + basic_utf16_out_iterator () : + _count (0), + _index (0) + { + } + + explicit basic_utf16_out_iterator (const char_iterator& it_) : + _it (it_), + _count (0), + _index (0) + { + next (); + } + + wchar_t operator * () const + { + return _chars[_index]; + } + + bool operator == (const basic_utf16_out_iterator &rhs_) const + { + return _it == rhs_._it; + } + + bool operator != (const basic_utf16_out_iterator &rhs_) const + { + return _it != rhs_._it; + } + + basic_utf16_out_iterator &operator ++ () + { + ++_index; + + if (_index >= _count) + { + next (); + } + + return *this; + } + + basic_utf16_out_iterator operator ++ (int) + { + basic_utf16_out_iterator temp_ = *this; + + ++_index; + + if (_index >= 
_count) + { + next (); + } + + return temp_; + } + +private: + char_iterator _it; + wchar_t _chars[2]; + unsigned char _count; + unsigned char _index; + + void next () + { + const std::size_t ch_ = *_it; + + _count = len (ch_); + _index = 0; + + switch (_count) + { + case 1: + _chars[0] = static_cast(ch_); + break; + case 2: + _chars[0] = static_cast((ch_ >> 10) + 0xdc00u - + (0x10000 >> 10)); + _chars[1] = static_cast((ch_ & 0x3ff) + 0xdc00u); + break; + } + + ++_it; + } + + char len (const std::size_t ch_) const + { + return ch_ > 0xffff ? 2 : 1; + } +}; +} + +#endif diff --git a/src/test.cpp b/src/test.cpp index 7730549..c4c52f7 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -4,6 +4,15 @@ * Licensed under the GNU GPL v2. */ +// this file is auto generated from grammar/grammar.y +// but it does not work yet +// #include "grammar.h" + +#include "lexertl/generator.hpp" +#include "lexertl/lookup.hpp" +#include "lexertl/rules.hpp" +#include "lexertl/state_machine.hpp" + int main() { return 0; }