Fix typo in grammar.

Add lexertl.
This commit is contained in:
Markus Hauschild
2013-05-31 21:35:44 +02:00
parent 7be9545e02
commit f3d3389f54
44 changed files with 12465 additions and 2 deletions

View File

@@ -1,5 +1,6 @@
%include { %include {
#include <cstdio>
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <assert.h> #include <assert.h>
@@ -20,8 +21,8 @@
programm(A) ::= fundefs(B). { A = B; } programm(A) ::= fundefs(B). { A = B; }
fundefs(A) ::= . { A = 0: } fundefs(A) ::= . { A = 0; }
fundefs(A) ::= fundefs fundef(B). { A = A + B: } fundefs(A) ::= fundefs fundef(B). { A = A + B; }
fundef(A) ::= type(T) T_IDENTIFIER(ID) params(P) T_BEGIN statements(S) T_END. { A = T + ID + P + S; } fundef(A) ::= type(T) T_IDENTIFIER(ID) params(P) T_BEGIN statements(S) T_END. { A = T + ID + P + S; }

22
inc/lexertl/bool.hpp Normal file
View File

@@ -0,0 +1,22 @@
// bool.hpp
// Copyright (c) 2010-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_BOOL_H
#define LEXERTL_BOOL_H
namespace lexertl
{
// Empty tag type carrying a compile-time boolean. The template parameter is
// deliberately named (rather than anonymous) for compiler compatibility.
template<bool value_>
struct bool_ {};

// Convenience aliases for the two possible tags.
typedef bool_<true> true_;
typedef bool_<false> false_;
}
#endif

View File

@@ -0,0 +1,50 @@
// char_traits.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_CHAR_TRAITS_H
#define LEXERTL_CHAR_TRAITS_H
#include <cstddef>
namespace lexertl
{
// Generic character traits: the index type is the character type itself.
template<typename ch_type>
struct basic_char_traits
{
    typedef ch_type char_type;
    typedef ch_type index_type;

    // Maps a character to the value used to index lookup tables.
    static index_type index (const char_type ch)
    {
        return ch;
    }

    // Largest index value: 0x10ffff for character types wider than two
    // bytes, otherwise all bits set in index_type.
    static index_type max_val ()
    {
        const bool wide_ = sizeof(char_type) > 2;

        return wide_ ? static_cast<index_type>(0x10ffff) :
            static_cast<index_type>(~0);
    }
};

// Specialisation for plain char: indexes through unsigned char so that the
// index is never negative, whatever the signedness of char.
template<>
struct basic_char_traits<char>
{
    typedef char char_type;
    typedef unsigned char index_type;

    static index_type index (const char ch)
    {
        const index_type idx_ = static_cast<index_type>(ch);

        return idx_;
    }

    static index_type max_val ()
    {
        const index_type all_bits_ = static_cast<index_type>(~0);

        return all_bits_;
    }
};
}
#endif

View File

@@ -0,0 +1,24 @@
// compile_assert.hpp
// Copyright (c) 2010-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_COMPILE_ASSERT_H
#define LEXERTL_COMPILE_ASSERT_H
namespace lexertl
{
// Compile-time assertion helper. Only the <true> specialisation is defined,
// so using compile_assert<false> as a complete type fails to compile.
// The template parameter is named for compiler compatibility.
template<bool condition_>
struct compile_assert;

// An enum (rather than a static const member) is used for compiler
// compatibility.
template<>
struct compile_assert<true>
{
    enum {value = 1};
};
}
#endif

View File

@@ -0,0 +1,228 @@
// bitvector.hpp
// Copyright (c) 2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_BITVECTOR_HPP
#define LEXERTL_BITVECTOR_HPP
#include <vector>
namespace lexertl
{
// Fixed-size bit set packed into a std::vector of blocks of type T.
// Each block holds sizeof(T) * 8 bits; T should be an unsigned integral type.
// Bit masks are always built in type T so that blocks wider than int
// (e.g. 64-bit blocks) are addressed correctly.
template<typename T>
class basic_bitvector
{
public:
    // Proxy for a single bit, returned by the non-const operator [].
    // It refers to the block holding the bit plus a mask selecting it.
    template<typename Ty>
    class reference
    {
    public:
        reference (Ty &block_, const Ty mask_) :
            _block (block_),
            _mask (mask_)
        {
        }

        // Reads the referenced bit.
        operator bool () const
        {
            return (_block & _mask) != 0;
        }

        // Sets or clears the referenced bit.
        reference<Ty> &operator = (const bool bit_)
        {
            if (bit_)
            {
                _block |= _mask;
            }
            else
            {
                _block &= ~_mask;
            }

            return *this;
        }

        // Copies the VALUE of another bit into this one (the proxy is not
        // re-seated). Takes a const reference so temporaries bind too.
        reference<Ty> &operator = (const reference<Ty> &rhs_)
        {
            return *this = static_cast<bool>(rhs_);
        }

    private:
        Ty &_block;
        const Ty _mask;
    };

    // Creates a vector able to hold size_ bits, all initially clear.
    basic_bitvector (const std::size_t size_) :
        _vec (block (size_) + (bit (size_) ? 1 : 0), 0)
    {
    }

    basic_bitvector (const basic_bitvector &rhs_) :
        _vec (rhs_._vec)
    {
    }

    basic_bitvector &operator = (const basic_bitvector &rhs_)
    {
        if (&rhs_ != this)
        {
            _vec = rhs_._vec;
        }

        return *this;
    }

    // Reads bit index_. index_ must be within the constructed size.
    bool operator [] (const std::size_t index_) const
    {
        return (_vec[block (index_)] & mask (index_)) != 0;
    }

    // Returns a writable proxy for bit index_.
    reference<T> operator [] (const std::size_t index_)
    {
        return reference<T> (_vec[block (index_)], mask (index_));
    }

    // Block-wise OR; only the blocks common to both operands are combined.
    basic_bitvector<T> &operator |= (const basic_bitvector<T> &rhs_)
    {
        typename t_vector::iterator lhs_iter_ = _vec.begin ();
        typename t_vector::iterator lhs_end_ = _vec.end ();
        typename t_vector::const_iterator rhs_iter_ = rhs_._vec.begin ();
        typename t_vector::const_iterator rhs_end_ = rhs_._vec.end ();

        for (; lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_;
            ++lhs_iter_, ++rhs_iter_)
        {
            *lhs_iter_ |= *rhs_iter_;
        }

        return *this;
    }

    // Block-wise AND; only the blocks common to both operands are combined.
    basic_bitvector<T> &operator &= (const basic_bitvector<T> &rhs_)
    {
        typename t_vector::iterator lhs_iter_ = _vec.begin ();
        typename t_vector::iterator lhs_end_ = _vec.end ();
        typename t_vector::const_iterator rhs_iter_ = rhs_._vec.begin ();
        typename t_vector::const_iterator rhs_end_ = rhs_._vec.end ();

        for (; lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_;
            ++lhs_iter_, ++rhs_iter_)
        {
            *lhs_iter_ &= *rhs_iter_;
        }

        return *this;
    }

    // Clears every bit; the size is unchanged.
    void clear ()
    {
        typename t_vector::iterator iter_ = _vec.begin ();
        typename t_vector::iterator end_ = _vec.end ();

        for (; iter_ != end_; ++iter_)
        {
            *iter_ = 0;
        }
    }

    // True if at least one block is non-zero.
    bool any () const
    {
        typename t_vector::const_iterator iter_ = _vec.begin ();
        typename t_vector::const_iterator end_ = _vec.end ();

        for (; iter_ != end_; ++iter_)
        {
            if (*iter_) break;
        }

        return iter_ != end_;
    }

    // Flips every bit of every block (including any padding bits in the
    // last block).
    void negate ()
    {
        typename t_vector::iterator iter_ = _vec.begin ();
        typename t_vector::iterator end_ = _vec.end ();

        for (; iter_ != end_; ++iter_)
        {
            *iter_ = ~*iter_;
        }
    }

    // Index of the first set bit, or npos () if none is set.
    std::size_t find_first () const
    {
        return find_next (npos ());
    }

    // Index of the first set bit strictly after index_, or npos () if there
    // is none. Passing npos () starts the search at bit 0. A start position
    // beyond the last block simply yields npos ().
    std::size_t find_next (const std::size_t index_) const
    {
        const std::size_t bits_per_block_ = sizeof(T) * 8;
        const bool from_start_ = index_ == npos ();
        const std::size_t first_block_ = from_start_ ? 0 : block (index_ + 1);
        std::size_t bit_ = from_start_ ? 0 : bit (index_ + 1);
        const std::size_t blocks_ = _vec.size ();

        // Only the first block examined starts mid-block; bit_ is reset to
        // zero for every following block.
        for (std::size_t i_ = first_block_; i_ < blocks_; ++i_, bit_ = 0)
        {
            const T value_ = _vec[i_];

            if (!value_) continue;

            T probe_ = static_cast<T>(static_cast<T>(1) << bit_);

            for (std::size_t j_ = bit_; j_ < bits_per_block_; ++j_)
            {
                if (value_ & probe_)
                {
                    return i_ * bits_per_block_ + j_;
                }

                probe_ = static_cast<T>(probe_ << 1);
            }
        }

        return npos ();
    }

    // Sentinel meaning "no bit" / "search from the start".
    std::size_t npos () const
    {
        return static_cast<std::size_t>(~0);
    }

private:
    typedef std::vector<T> t_vector;

    t_vector _vec;

    // Index of the block containing bit index_.
    std::size_t block (const std::size_t index_) const
    {
        return index_ / (sizeof(T) * 8);
    }

    // Position of bit index_ inside its block.
    std::size_t bit (const std::size_t index_) const
    {
        return index_ % (sizeof(T) * 8);
    }

    // Single-bit mask for bit index_, computed in T (not int) so that the
    // shift is valid for every bit position of the block.
    T mask (const std::size_t index_) const
    {
        return static_cast<T>(static_cast<T>(1) << bit (index_));
    }
};
}
#endif

View File

@@ -0,0 +1,69 @@
// ptr_list.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_PTR_LIST_HPP
#define LEXERTL_PTR_LIST_HPP
#include <list>
namespace lexertl
{
namespace detail
{
// std::list of raw pointers that deletes every pointee on clear () and on
// destruction. The underlying list is reached through -> and *.
template<typename ptr_type>
class ptr_list
{
public:
    typedef std::list<ptr_type *> list;

    ptr_list () :
        _list ()
    {
    }

    ~ptr_list ()
    {
        clear ();
    }

    // Access to the wrapped list.
    list *operator -> () { return &_list; }
    const list *operator -> () const { return &_list; }
    list &operator * () { return _list; }
    const list &operator * () const { return _list; }

    // Deletes each pointee front to back, removing its entry as it goes.
    void clear ()
    {
        for (; !_list.empty (); _list.pop_front ())
        {
            delete _list.front ();
        }
    }

private:
    list _list;

    // Non-copyable: declared but not defined.
    ptr_list (const ptr_list &);
    ptr_list &operator = (const ptr_list &);
};
}
}
#endif

View File

@@ -0,0 +1,72 @@
// ptr_map.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_PTR_MAP_HPP
#define LEXERTL_PTR_MAP_HPP
#include <map>
namespace lexertl
{
namespace detail
{
// std::map whose mapped values are raw pointers; every pointee is deleted
// on clear () and on destruction. The underlying map is reached through
// -> and *.
template<typename key_type, typename ptr_type>
class ptr_map
{
public:
    typedef std::map<key_type, ptr_type *> map;
    typedef std::pair<key_type, ptr_type *> pair;
    typedef std::pair<typename map::iterator, bool> iter_pair;

    ptr_map ()
    {
    }

    ~ptr_map ()
    {
        clear ();
    }

    // Access to the wrapped map.
    map *operator -> () { return &_map; }
    const map *operator -> () const { return &_map; }
    map &operator * () { return _map; }
    const map &operator * () const { return _map; }

    // Deletes every mapped pointee, then empties the map.
    void clear ()
    {
        typename map::iterator cursor_ = _map.begin ();
        const typename map::iterator stop_ = _map.end ();

        while (cursor_ != stop_)
        {
            delete cursor_->second;
            ++cursor_;
        }

        _map.clear ();
    }

private:
    map _map;

    // Non-copyable: declared but not defined.
    ptr_map (const ptr_map &);
    ptr_map &operator = (const ptr_map &);
};
}
}
#endif

View File

@@ -0,0 +1,69 @@
// ptr_stack.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_PTR_STACK_HPP
#define LEXERTL_PTR_STACK_HPP
#include <stack>
namespace lexertl
{
namespace detail
{
// std::stack of raw pointers that deletes every pointee on clear () and on
// destruction. The underlying stack is reached through -> and *.
template<typename ptr_type>
class ptr_stack
{
public:
    typedef std::stack<ptr_type *> stack;

    ptr_stack () :
        _stack ()
    {
    }

    ~ptr_stack ()
    {
        clear ();
    }

    // Access to the wrapped stack.
    stack *operator -> () { return &_stack; }
    const stack *operator -> () const { return &_stack; }
    stack &operator * () { return _stack; }
    const stack &operator * () const { return _stack; }

    // Deletes each pointee from the top down, popping its entry as it goes.
    void clear ()
    {
        for (; !_stack.empty (); _stack.pop ())
        {
            delete _stack.top ();
        }
    }

private:
    stack _stack;

    // Non-copyable: declared but not defined.
    ptr_stack (const ptr_stack &);
    ptr_stack &operator = (const ptr_stack &);
};
}
}
#endif

View File

@@ -0,0 +1,106 @@
// ptr_vector.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_PTR_VECTOR_HPP
#define LEXERTL_PTR_VECTOR_HPP
#include "../size_t.hpp"
#include <vector>
namespace lexertl
{
namespace detail
{
// std::vector of raw pointers that deletes every pointee on clear () and on
// destruction. The underlying vector is reached through -> and *, and
// elements (the stored pointers) through [].
template<typename ptr_type>
class ptr_vector
{
public:
    typedef std::vector<ptr_type *> vector;

    ptr_vector () :
        _vector ()
    {
    }

    ~ptr_vector ()
    {
        clear ();
    }

    // Access to the wrapped vector.
    vector *operator -> () { return &_vector; }
    const vector *operator -> () const { return &_vector; }
    vector &operator * () { return _vector; }
    const vector &operator * () const { return _vector; }

    // Unchecked access to the stored pointer at index_.
    ptr_type * &operator [] (const std::size_t index_)
    {
        return _vector[index_];
    }

    ptr_type * const &operator [] (const std::size_t index_) const
    {
        return _vector[index_];
    }

    // Deep comparison: equal sizes and pairwise-equal pointees. Every
    // stored pointer is dereferenced, so none may be null.
    bool operator == (const ptr_vector &rhs_) const
    {
        const std::size_t count_ = _vector.size ();

        if (count_ != rhs_._vector.size ())
        {
            return false;
        }

        for (std::size_t pos_ = 0; pos_ < count_; ++pos_)
        {
            const bool same_ = *_vector[pos_] == *rhs_._vector[pos_];

            if (!same_)
            {
                return false;
            }
        }

        return true;
    }

    // Deletes every pointee in order, then empties the vector.
    void clear ()
    {
        typename vector::iterator cursor_ = _vector.begin ();
        const typename vector::iterator stop_ = _vector.end ();

        for (; cursor_ != stop_; ++cursor_)
        {
            delete *cursor_;
        }

        _vector.clear ();
    }

private:
    vector _vector;

    // Non-copyable: declared but not defined.
    ptr_vector (const ptr_vector &);
    ptr_vector &operator = (const ptr_vector &);
};
}
}
#endif

353
inc/lexertl/debug.hpp Normal file
View File

@@ -0,0 +1,353 @@
// debug.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_DEBUG_HPP
#define LEXERTL_DEBUG_HPP
#include <map>
#include <ostream>
#include "rules.hpp"
#include "size_t.hpp"
#include "state_machine.hpp"
#include "string_token.hpp"
#include <vector>
namespace lexertl
{
template<typename sm, typename char_type, typename id_type = std::size_t,
bool is_dfa = true>
class basic_debug
{
public:
typedef lexertl::basic_char_state_machine<char_type, id_type, is_dfa>
char_state_machine;
typedef std::basic_ostream<char_type> ostream;
typedef lexertl::basic_rules<char_type, id_type> rules;
typedef std::basic_string<char_type> string;
static void dump (const sm &sm_, rules &rules_, ostream &stream_)
{
char_state_machine csm_;
sm_to_csm (sm_, csm_);
dump (csm_, rules_, stream_);
}
static void dump (const sm &sm_, ostream &stream_)
{
char_state_machine csm_;
sm_to_csm (sm_, csm_);
dump (csm_, stream_);
}
static void dump (const char_state_machine &csm_, rules &rules_,
ostream &stream_)
{
for (std::size_t dfa_ = 0, dfas_ = csm_.size (); dfa_ < dfas_; ++dfa_)
{
lexer_state (stream_);
stream_ << rules_.state (dfa_) << std::endl << std::endl;
dump_ex (csm_._sm_deque[dfa_], stream_);
}
}
static void dump (const char_state_machine &csm_, ostream &stream_)
{
for (std::size_t dfa_ = 0, dfas_ = csm_.size (); dfa_ < dfas_; ++dfa_)
{
lexer_state (stream_);
stream_ << dfa_ << std::endl << std::endl;
dump_ex (csm_._sm_deque[dfa_], stream_);
}
}
protected:
typedef typename char_state_machine::state dfa_state;
typedef typename dfa_state::string_token string_token;
typedef std::basic_stringstream<char_type> stringstream;
// Converts the table-based state machine sm_ into csm_.
// For every DFA with a non-zero alphabet entry, one string_token is built per
// transition column by scanning that DFA's lookup table, and the tokens are
// appended to csm_ together with the internals and the DFA index.
static void sm_to_csm (const sm &sm_, char_state_machine &csm_)
{
const detail::basic_internals<id_type> &internals_ = sm_.data ();
const std::size_t dfas_ = internals_._dfa->size ();
for (id_type i_ = 0; i_ < dfas_; ++i_)
{
// DFAs whose alphabet entry is zero are skipped
// (presumably lexer states that were never built - TODO confirm).
if (internals_._dfa_alphabet[i_] == 0) continue;
// Number of transition columns: the alphabet width minus the
// transitions_index leading columns of each row.
const std::size_t alphabet_ = internals_._dfa_alphabet[i_] -
transitions_index;
typename char_state_machine::string_token_vector token_vector_
(alphabet_, string_token ());
id_type *ptr_ = &internals_._lookup[i_]->front ();
// Walks the first 256 lookup entries (assumes the lookup table holds at
// least 256 entries, one per character value - TODO confirm).
for (std::size_t c_ = 0; c_ < 256; ++c_, ++ptr_)
{
// Entries below transitions_index do not name a transition column.
if (*ptr_ >= transitions_index)
{
string_token &token_ = token_vector_
[*ptr_ - transitions_index];
// Add the single character c_ to the token of its column.
token_.insert (typename string_token::range
(typename string_token::index_type (c_),
typename string_token::index_type (c_)));
}
}
csm_.append (token_vector_, internals_, i_);
}
}
// Writes a human readable description of one DFA to stream_.
// For each state it prints the state number, the end-state details
// (push/pop, id, user id, next dfa) when the state is an end state, the
// BOL transition (state 0 only) and EOL transition when present, and then
// one "[chars] -> target" line per outgoing transition.
static void dump_ex (const typename char_state_machine::dfa &dfa_,
    ostream &stream_)
{
    const std::size_t states_ = dfa_._states.size ();
    const id_type bol_index_ = dfa_._bol_index;
    typename dfa_state::id_type_string_token_map::const_iterator iter_;
    typename dfa_state::id_type_string_token_map::const_iterator end_;

    for (std::size_t i_ = 0; i_ < states_; ++i_)
    {
        const dfa_state &state_ = dfa_._states[i_];

        state (stream_);
        stream_ << i_ << std::endl;

        if (state_._end_state)
        {
            end_state (stream_);

            if (state_._push_pop_dfa == dfa_state::push_dfa)
            {
                push (stream_);
                stream_ << state_._push_dfa;
            }
            else if (state_._push_pop_dfa == dfa_state::pop_dfa)
            {
                pop (stream_);
            }

            // Ids are cast to std::size_t so that character-sized id types
            // print as numbers rather than characters.
            id (stream_);
            stream_ << static_cast<std::size_t>(state_._id);
            user_id (stream_);
            stream_ << static_cast<std::size_t>(state_._user_id);
            dfa (stream_);
            stream_ << static_cast<std::size_t>(state_._next_dfa);
            stream_ << std::endl;
        }

        // The BOL transition is reported against the first state only.
        if (i_ == 0 && bol_index_ != char_state_machine::npos ())
        {
            bol (stream_);
            stream_ << static_cast<std::size_t>(bol_index_) << std::endl;
        }

        if (state_._eol_index != char_state_machine::npos ())
        {
            eol (stream_);
            stream_ << static_cast<std::size_t>(state_._eol_index) <<
                std::endl;
        }

        iter_ = state_._transitions.begin ();
        end_ = state_._transitions.end ();

        for (; iter_ != end_; ++iter_)
        {
            string_token token_ = iter_->second;

            open_bracket (stream_);

            // Print the token in negated form ("[^...]") when the stored
            // token reports !any () and negatable ().
            if (!iter_->second.any () && iter_->second.negatable ())
            {
                token_.negate ();
                negated (stream_);
            }

            string chars_;
            typename string_token::range_vector::const_iterator
                ranges_iter_ = token_._ranges.begin ();
            typename string_token::range_vector::const_iterator
                ranges_end_ = token_._ranges.end ();

            for (; ranges_iter_ != ranges_end_; ++ranges_iter_)
            {
                // The text for one range is assembled in chars_ so that each
                // escaping backslash sits directly in front of the character
                // it escapes.
                chars_.clear ();

                if (ranges_iter_->first == '^' ||
                    ranges_iter_->first == ']')
                {
                    chars_ += '\\';
                }

                chars_ += string_token::escape_char
                    (ranges_iter_->first);

                if (ranges_iter_->first != ranges_iter_->second)
                {
                    // Adjacent characters are printed side by side; wider
                    // ranges get a '-' between their bounds.
                    if (ranges_iter_->first + 1 < ranges_iter_->second)
                    {
                        chars_ += '-';
                    }

                    // Previously this backslash was written straight to the
                    // stream, i.e. before the buffered lower bound, so it
                    // appeared in front of the wrong character.
                    if (ranges_iter_->second == '^' ||
                        ranges_iter_->second == ']')
                    {
                        chars_ += '\\';
                    }

                    chars_ += string_token::escape_char
                        (ranges_iter_->second);
                }

                stream_ << chars_;
            }

            close_bracket (stream_);
            stream_ << static_cast<std::size_t>(iter_->first) <<
                std::endl;
        }

        stream_ << std::endl;
    }
}
// Label printers used by the dump routines. Each label exists as a narrow
// and a wide overload so that overload resolution on the stream type picks
// a literal of the matching character width.

static void lexer_state (std::ostream &stream_)
{
    const char *label_ = "Lexer state: ";
    stream_ << label_;
}

static void lexer_state (std::wostream &stream_)
{
    const wchar_t *label_ = L"Lexer state: ";
    stream_ << label_;
}

static void state (std::ostream &stream_)
{
    const char *label_ = "State: ";
    stream_ << label_;
}

static void state (std::wostream &stream_)
{
    const wchar_t *label_ = L"State: ";
    stream_ << label_;
}

static void bol (std::ostream &stream_)
{
    const char *label_ = " BOL -> ";
    stream_ << label_;
}

static void bol (std::wostream &stream_)
{
    const wchar_t *label_ = L" BOL -> ";
    stream_ << label_;
}

static void eol (std::ostream &stream_)
{
    const char *label_ = " EOL -> ";
    stream_ << label_;
}

static void eol (std::wostream &stream_)
{
    const wchar_t *label_ = L" EOL -> ";
    stream_ << label_;
}

static void end_state (std::ostream &stream_)
{
    const char *label_ = " END STATE";
    stream_ << label_;
}

static void end_state (std::wostream &stream_)
{
    const wchar_t *label_ = L" END STATE";
    stream_ << label_;
}

static void id (std::ostream &stream_)
{
    const char *label_ = ", Id = ";
    stream_ << label_;
}

static void id (std::wostream &stream_)
{
    const wchar_t *label_ = L", Id = ";
    stream_ << label_;
}

static void push (std::ostream &stream_)
{
    const char *label_ = ", PUSH ";
    stream_ << label_;
}

static void push (std::wostream &stream_)
{
    const wchar_t *label_ = L", PUSH ";
    stream_ << label_;
}

static void pop (std::ostream &stream_)
{
    const char *label_ = ", POP";
    stream_ << label_;
}

static void pop (std::wostream &stream_)
{
    const wchar_t *label_ = L", POP";
    stream_ << label_;
}

static void user_id (std::ostream &stream_)
{
    const char *label_ = ", User Id = ";
    stream_ << label_;
}

static void user_id (std::wostream &stream_)
{
    const wchar_t *label_ = L", User Id = ";
    stream_ << label_;
}

static void open_bracket (std::ostream &stream_)
{
    const char *label_ = " [";
    stream_ << label_;
}

static void open_bracket (std::wostream &stream_)
{
    const wchar_t *label_ = L" [";
    stream_ << label_;
}

static void negated (std::ostream &stream_)
{
    const char *label_ = "^";
    stream_ << label_;
}

static void negated (std::wostream &stream_)
{
    const wchar_t *label_ = L"^";
    stream_ << label_;
}

static void close_bracket (std::ostream &stream_)
{
    const char *label_ = "] -> ";
    stream_ << label_;
}

static void close_bracket (std::wostream &stream_)
{
    const wchar_t *label_ = L"] -> ";
    stream_ << label_;
}

static void dfa (std::ostream &stream_)
{
    const char *label_ = ", dfa = ";
    stream_ << label_;
}

static void dfa (std::wostream &stream_)
{
    const wchar_t *label_ = L", dfa = ";
    stream_ << label_;
}
};
typedef basic_debug<basic_state_machine<char>, char> debug;
typedef basic_debug<basic_state_machine<wchar_t>, wchar_t> wdebug;
}
#endif

25
inc/lexertl/enums.hpp Normal file
View File

@@ -0,0 +1,25 @@
// enums.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_ENUMS_H
#define LEXERTL_ENUMS_H
namespace lexertl
{
// Bit flags controlling how regexes are interpreted (values are powers of
// two so that they can be combined).
enum regex_flags {icase = 1, dot_not_newline = 2, skip_ws = 4,
match_zero_len = 8};
// Column indexes within a DFA row:
// 0 = end state, 1 = id, 2 = user id, 3 = push dfa, 4 = next dfa,
// 5 = eol, 6 = dead state, 7 = first transition column.
enum {end_state_index, id_index, user_id_index, push_dfa_index,
next_dfa_index, eol_index, dead_state_index, transitions_index};
// Rule flags:
enum feature_flags {bol_bit = 1, eol_bit = 2, skip_bit = 4, again_bit = 8,
multi_state_bit = 16, recursive_bit = 32, advance_bit = 64};
// End state flags:
enum {end_state_bit = 1, pop_dfa_bit = 2};
}
#endif

1122
inc/lexertl/generate_cpp.hpp Normal file

File diff suppressed because it is too large Load Diff

829
inc/lexertl/generator.hpp Normal file
View File

@@ -0,0 +1,829 @@
// generator.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_GENERATOR_HPP
#define LEXERTL_GENERATOR_HPP
#include <algorithm>
#include "bool.hpp"
#include "partition/charset.hpp"
#include "char_traits.hpp"
#include "partition/equivset.hpp"
#include <memory>
#include "parser/parser.hpp"
#include "containers/ptr_list.hpp"
#include "rules.hpp"
#include "size_t.hpp"
#include "state_machine.hpp"
namespace lexertl
{
template<typename rules, typename sm, typename char_traits = basic_char_traits
<typename sm::traits::input_char_type> >
class basic_generator
{
public:
typedef typename rules::id_type id_type;
typedef typename rules::char_type rules_char_type;
typedef typename sm::traits sm_traits;
typedef detail::basic_parser<rules_char_type, sm_traits> parser;
typedef typename parser::charset_map charset_map;
typedef typename parser::node node;
typedef typename parser::node_ptr_vector node_ptr_vector;
static void build (const rules &rules_, sm &sm_)
{
const std::size_t size_ = rules_.statemap ().size ();
// Strong exception guarantee
// http://www.boost.org/community/exception_safety.html
internals internals_;
sm temp_sm_;
node_ptr_vector node_ptr_vector_;
internals_._eoi = rules_.eoi ();
internals_.add_states (size_);
for (id_type index_ = 0; index_ < size_; ++index_)
{
if (rules_.regexes ()[index_].empty ())
{
std::ostringstream ss_;
ss_ << "Lexer states with no rules are not allowed "
"(lexer state " << index_ << ".)";
throw runtime_error (ss_.str ());
}
else
{
// Note that the following variables are per DFA.
// Map of regex charset tokens (strings) to index
charset_map charset_map_;
// Used to fix up $ and \n clashes.
id_type nl_id_ = sm_traits::npos ();
// Regex syntax tree
node *root_ = build_tree (rules_, index_, node_ptr_vector_,
charset_map_, nl_id_);
build_dfa (charset_map_, root_, internals_, temp_sm_, index_,
nl_id_);
if (internals_._dfa[index_]->size () /
internals_._dfa_alphabet[index_] >= sm_traits::npos ())
{
// Overflow
throw runtime_error ("The data type you have chosen "
"cannot hold this many DFA rows.");
}
}
}
// If you get a compile error here the id_type from rules and
// state machine do no match.
create (internals_, temp_sm_, rules_.features (), lookup ());
sm_.swap (temp_sm_);
}
// Builds the regex syntax tree for lexer state dfa_.
// Macros are parsed first, then every regex of the state is parsed in
// order; the second and later regexes are each joined to the tree built so
// far with a selection_node. Returns the root of the combined tree. All
// selection nodes created here are stored in node_ptr_vector_.
// The first regex is dereferenced unconditionally, so the state must have
// at least one regex.
static node *build_tree (const rules &rules_, const std::size_t dfa_,
node_ptr_vector &node_ptr_vector_, charset_map &charset_map_,
id_type &nl_id_)
{
typename parser::macro_map macro_map_;
parser parser_ (rules_.locale (), node_ptr_vector_, macro_map_,
charset_map_, rules_.eoi ());
const typename rules::string_deque_deque &regexes_ =
rules_.regexes ();
typename rules::string_deque::const_iterator regex_iter_ =
regexes_[dfa_].begin ();
typename rules::string_deque::const_iterator regex_iter_end_ =
regexes_[dfa_].end ();
const typename rules::string &regex_ = *regex_iter_;
// The per-rule attribute containers below are iterated in lock-step with
// the regexes: one entry of each per regex.
const typename rules::id_vector_deque &ids_ = rules_.ids ();
const typename rules::id_vector_deque &user_ids_ =
rules_.user_ids ();
typename rules::id_vector::const_iterator id_iter_ =
ids_[dfa_].begin ();
typename rules::id_vector::const_iterator user_id_iter_ =
user_ids_[dfa_].begin ();
const typename rules::id_vector_deque &next_dfas_ =
rules_.next_dfas ();
const typename rules::id_vector_deque &pushes_ = rules_.pushes ();
const typename rules::bool_vector_deque &pops_ = rules_.pops ();
typename rules::id_vector::const_iterator next_dfa_iter_ =
next_dfas_[dfa_].begin ();
typename rules::id_vector::const_iterator push_dfa_iter_ =
pushes_[dfa_].begin ();
typename rules::bool_vector::const_iterator pop_dfa_iter_ =
pops_[dfa_].begin ();
const bool seen_bol_ = (rules_.features ()[dfa_] & bol_bit) != 0;
node *root_ = 0;
// Macros have a different context per lexer state
// as equivsets (generally) differ.
build_macros (rules_, macro_map_, node_ptr_vector_, charset_map_,
nl_id_);
// The first regex becomes the initial root.
root_ = parser_.parse (regex_.c_str (),
regex_.c_str () + regex_.size (), *id_iter_, *user_id_iter_,
*next_dfa_iter_, *push_dfa_iter_, *pop_dfa_iter_,
rules_.flags (), nl_id_, seen_bol_, false);
++regex_iter_;
++id_iter_;
++user_id_iter_;
++next_dfa_iter_;
++push_dfa_iter_;
++pop_dfa_iter_;
// Build syntax trees
while (regex_iter_ != regex_iter_end_)
{
// Re-declare var, otherwise we perform an assignment..!
const typename rules::string &regex_ = *regex_iter_;
// The bol argument is the same expression that initialised seen_bol_.
node *rhs_ = parser_.parse (regex_.c_str (),
regex_.c_str () + regex_.size (), *id_iter_, *user_id_iter_,
*next_dfa_iter_, *push_dfa_iter_, *pop_dfa_iter_,
rules_.flags (), nl_id_,
(rules_.features ()[dfa_] & bol_bit) != 0, false);
// A null slot is pushed first and filled afterwards, so the vector
// already has room for the pointer when the node is allocated.
node_ptr_vector_->push_back
(static_cast<selection_node *>(0));
node_ptr_vector_->back () = new selection_node (root_, rhs_);
root_ = node_ptr_vector_->back ();
++regex_iter_;
++id_iter_;
++user_id_iter_;
++next_dfa_iter_;
++push_dfa_iter_;
++pop_dfa_iter_;
}
return root_;
}
protected:
typedef bool_<sm_traits::compressed> compressed;
typedef detail::basic_equivset<id_type> equivset;
typedef detail::ptr_list<equivset> equivset_list;
typedef std::auto_ptr<equivset> equivset_ptr;
typedef typename sm_traits::char_type sm_char_type;
typedef detail::basic_charset<sm_char_type, id_type> charset;
typedef std::auto_ptr<charset> charset_ptr;
typedef detail::ptr_list<charset> charset_list;
typedef detail::basic_internals<id_type> internals;
typedef typename std::set<id_type> id_type_set;
typedef typename internals::id_type_vector id_type_vector;
typedef typename charset::index_set index_set;
typedef std::vector<index_set> index_set_vector;
typedef bool_<sm_traits::is_dfa> is_dfa;
typedef bool_<sm_traits::lookup> lookup;
typedef typename parser::macro_map macro_map;
typedef typename macro_map::iterator macro_iter;
typedef std::pair<macro_iter, bool> macro_iter_pair;
typedef std::set<const node *> node_set;
typedef detail::ptr_vector<node_set> node_set_vector;
typedef typename node::node_vector node_vector;
typedef detail::ptr_vector<node_vector> node_vector_vector;
typedef std::pair<typename rules::string, const node *> macro_pair;
typedef typename parser::selection_node selection_node;
typedef typename std::vector<std::size_t> size_t_vector;
typedef typename parser::string_token string_token;
static void build_macros (const rules &rules_,
macro_map &macro_map_, node_ptr_vector &node_ptr_vector_,
charset_map &charset_map_, id_type &nl_id_)
{
const typename rules::string_pair_deque &macrodeque_ =
rules_.macrodeque ();
for (typename rules::string_pair_deque::const_iterator iter_ =
macrodeque_.begin (), end_ = macrodeque_.end ();
iter_ != end_; ++iter_)
{
const typename rules::string &name_ = iter_->first;
const typename rules::string &regex_ = iter_->second;
parser parser_ (rules_.locale (), node_ptr_vector_, macro_map_,
charset_map_, rules_.eoi ());
node *node_ = parser_.parse (regex_.c_str (),
regex_.c_str () + regex_.size (), 0, 0, 0, false, false,
rules_.flags (), nl_id_, false, true);
macro_iter_pair map_iter_ = macro_map_.insert (macro_pair (name_,
static_cast<const node *>(0)));
map_iter_.first->second = node_;
}
}
// Builds the transition table for one lexer state (dfa_index_) from the
// syntax tree rooted at root_.
// Steps, in order: partition the charsets, build the token-index mapping,
// work out the row width (dfa_alphabet_), create the initial row, then
// repeatedly call closure () on follow sets to discover states and fill in
// their transitions. Finally $ / \n clashes are fixed up and the DFA is
// appended to sm_.
// nl_id_ is updated in place to the partitioned index of the newline
// charset when it is not npos ().
// Throws runtime_error if the alphabet does not fit in the id type.
static void build_dfa (const charset_map &charset_map_, const node *root_,
internals &internals_, sm &sm_, const id_type dfa_index_,
id_type &nl_id_)
{
// partitioned charset list
charset_list charset_list_;
// vector mapping token indexes to partitioned token index sets
index_set_vector set_mapping_;
typename internals::id_type_vector &dfa_ =
*internals_._dfa[dfa_index_];
std::size_t dfa_alphabet_ = 0;
const node_vector *followpos_ = &root_->firstpos ();
node_set_vector seen_sets_;
node_vector_vector seen_vectors_;
size_t_vector hash_vector_;
id_type zero_id_ = sm_traits::npos ();
// Rows (state numbers) that received an EOL transition; used by
// fix_clashes () below.
id_type_set eol_set_;
set_mapping_.resize (charset_map_.size ());
partition_charsets (charset_map_, charset_list_, is_dfa ());
build_set_mapping (charset_list_, internals_, dfa_index_,
set_mapping_);
if (nl_id_ != sm_traits::npos ())
{
// Translate the newline id to its partitioned index; for compressed
// state machines also look up the index of the charset for character 0.
nl_id_ = *set_mapping_[nl_id_].begin ();
zero_id_ = sm_traits::compressed ?
*set_mapping_[charset_map_.find (string_token (0, 0))->
second].begin () : sm_traits::npos ();
}
// Row width: one column per partitioned charset plus the
// transitions_index leading columns, plus one extra column when a
// newline id is in use.
dfa_alphabet_ = charset_list_->size () + transitions_index +
(nl_id_ == sm_traits::npos () ? 0 : 1);
if (dfa_alphabet_ > sm_traits::npos ())
{
// Overflow
throw runtime_error ("The data type you have chosen cannot hold "
"the dfa alphabet.");
}
internals_._dfa_alphabet[dfa_index_] = dfa_alphabet_;
// 'jam' state
dfa_.resize (dfa_alphabet_, 0);
closure (followpos_, seen_sets_, seen_vectors_, hash_vector_,
dfa_alphabet_, dfa_);
// seen_vectors_ may grow while iterating (closure () is called inside
// the loop), hence the size is re-read on every pass.
for (id_type index_ = 0; index_ < static_cast<id_type>
(seen_vectors_->size ()); ++index_)
{
equivset_list equiv_list_;
build_equiv_list (seen_vectors_[index_], set_mapping_,
equiv_list_, is_dfa ());
for (typename equivset_list::list::const_iterator iter_ =
equiv_list_->begin (), end_ = equiv_list_->end ();
iter_ != end_; ++iter_)
{
equivset *equivset_ = *iter_;
const id_type transition_ = closure
(&equivset_->_followpos, seen_sets_, seen_vectors_,
hash_vector_, dfa_alphabet_, dfa_);
if (transition_ != sm_traits::npos ())
{
// Row 0 is the 'jam' state, so state index_ lives in row index_ + 1.
id_type *ptr_ = &dfa_.front () + ((index_ + 1) *
dfa_alphabet_);
// Prune abstemious transitions from end states.
if (*ptr_ && !equivset_->_greedy) continue;
for (typename equivset::index_vector::const_iterator
equiv_iter_ = equivset_->_index_vector.begin (),
equiv_end_ = equivset_->_index_vector.end ();
equiv_iter_ != equiv_end_; ++equiv_iter_)
{
const id_type i_ = *equiv_iter_;
if (i_ == parser::bol_token ())
{
// The BOL transition is stored in the first cell of the table.
dfa_.front () = transition_;
}
else if (i_ == parser:: eol_token ())
{
ptr_[eol_index] = transition_;
eol_set_.insert (index_ + 1);
}
else
{
ptr_[i_ + transitions_index] = transition_;
}
}
}
}
}
fix_clashes (eol_set_, nl_id_, zero_id_, dfa_, dfa_alphabet_,
compressed ());
append_dfa (charset_list_, internals_, sm_, dfa_index_, lookup ());
}
// Uncompressed
static void fix_clashes (const id_type_set &eol_set_,
const id_type nl_id_, const id_type /*zero_id_*/,
typename internals::id_type_vector &dfa_,
const std::size_t dfa_alphabet_, const false_ &)
{
typename id_type_set::const_iterator eol_iter_ =
eol_set_.begin ();
typename id_type_set::const_iterator eol_end_ =
eol_set_.end ();
for (; eol_iter_ != eol_end_; ++eol_iter_)
{
id_type *ptr_ = &dfa_.front () + *eol_iter_ * dfa_alphabet_;
const id_type eol_state_ = ptr_[eol_index];
const id_type nl_state_ = ptr_[nl_id_ + transitions_index];
if (nl_state_)
{
ptr_[transitions_index + nl_id_] = 0;
ptr_ = &dfa_.front () + eol_state_ * dfa_alphabet_;
if (ptr_[transitions_index + nl_id_] == 0)
{
ptr_[transitions_index + nl_id_] = nl_state_;
}
}
}
}
// Compressed
// For every row that has an EOL transition: follows the zero_id_ transition
// one or two times (two when sm_traits::char_24_bit), reads the newline
// transition found there and, if it is non-zero and the EOL target row has
// no zero_id_ transition yet, appends new rows to dfa_ that lead from the
// EOL target through zero_id_ to that newline state.
static void fix_clashes (const id_type_set &eol_set_,
const id_type nl_id_, const id_type zero_id_,
typename internals::id_type_vector &dfa_,
const std::size_t dfa_alphabet_, const true_ &)
{
typename id_type_set::const_iterator eol_iter_ =
eol_set_.begin ();
typename id_type_set::const_iterator eol_end_ =
eol_set_.end ();
// NOTE(review): i_ is shared by both inner loops and is not reset at the
// top of each eol_set_ iteration, so from the second row onwards the
// zero_id_ walk below starts from whatever value i_ was left with and may
// run fewer times (or not at all) - confirm whether this is intended.
std::size_t i_ = 0;
for (; eol_iter_ != eol_end_; ++eol_iter_)
{
id_type *ptr_ = &dfa_.front () + *eol_iter_ * dfa_alphabet_;
const id_type eol_state_ = ptr_[eol_index];
id_type nl_state_ = 0;
// Follow the zero_id_ transition(s) to reach the row holding the
// newline transition.
for (; i_ < (sm_traits::char_24_bit ? 2 : 1); ++i_)
{
ptr_ = &dfa_.front () + ptr_[transitions_index + zero_id_] *
dfa_alphabet_;
}
nl_state_ = ptr_[transitions_index + nl_id_];
if (nl_state_)
{
ptr_ = &dfa_.front () + eol_state_ * dfa_alphabet_;
// The EOL target already has a zero_id_ transition: leave it alone.
if (ptr_[transitions_index + zero_id_] != 0) continue;
// dfa_.size () / dfa_alphabet_ is the number of the row about to be
// appended. ptr_ is re-derived from dfa_.front () after each resize
// because resizing may reallocate the table.
ptr_[transitions_index + zero_id_] = dfa_.size () /
dfa_alphabet_;
dfa_.resize (dfa_.size () + dfa_alphabet_, 0);
// One extra intermediate row when sm_traits::char_24_bit is set.
for (i_ = 0; i_ < (sm_traits::char_24_bit ? 1 : 0); ++i_)
{
ptr_ = &dfa_.front () + dfa_.size () - dfa_alphabet_;
ptr_[transitions_index + zero_id_] = dfa_.size () /
dfa_alphabet_;
dfa_.resize (dfa_.size () + dfa_alphabet_, 0);
}
// The last appended row receives the newline transition.
ptr_ = &dfa_.front () + dfa_.size () - dfa_alphabet_;
ptr_[transitions_index + nl_id_] = nl_state_;
}
}
}
// char_state_machine version
static void append_dfa (const charset_list &charset_list_,
const internals &internals_, sm &sm_, const id_type dfa_index_,
const false_ &)
{
typename charset_list::list::const_iterator list_iter_ =
charset_list_->begin ();
std::size_t size_ = charset_list_->size ();
typename sm::string_token_vector token_vector_;
token_vector_.reserve (size_);
for (std::size_t i_ = 0; i_ < size_; ++i_, ++list_iter_)
{
const charset *charset_ = *list_iter_;
token_vector_.push_back (charset_->_token);
}
sm_.append (token_vector_, internals_, dfa_index_);
}
// state_machine version
// Overload selected by the true_ tag. All parameters are ignored and the
// function has no effect; per the note below, create() does the work for
// this kind of state machine.
static void append_dfa (const charset_list &,
const internals &, sm &, const id_type, const true_ &)
{
// Nothing to do - will use create() instead
}
// char_state_machine version
// Overload selected by the false_ tag. All parameters are ignored and the
// function has no effect; per the note below, append_dfa() does the work for
// this kind of state machine.
static void create (internals &, sm &, const id_type_vector &,
const false_ &)
{
// Nothing to do - will use append_dfa() instead
}
// state_machine version
// ORs features_[i] into internals_._features for every DFA held by
// internals_, adds multi_state_bit when there is more than one DFA, then
// swaps internals_ with the data owned by sm_.
static void create (internals &internals_, sm &sm_,
    const id_type_vector &features_, const true_ &)
{
    const std::size_t dfa_count_ = internals_._dfa->size ();

    for (std::size_t idx_ = 0; idx_ != dfa_count_; ++idx_)
    {
        internals_._features |= features_[idx_];
    }

    if (dfa_count_ > 1)
    {
        internals_._features |= multi_state_bit;
    }

    sm_.data ().swap (internals_);
}
// NFA version
// Overload selected by the false_ tag: fills lhs_ straight from map_ via
// fill_rhs_list(); no overlap splitting is performed here.
static void partition_charsets (const charset_map &map_,
charset_list &lhs_, const false_ &)
{
fill_rhs_list (map_, lhs_);
}
// DFA version
// Overload selected by the true_ tag. Builds a temporary list (rhs_) from
// map_, then moves its entries one by one into lhs_, intersecting each
// incoming entry with every entry already in lhs_.
// NOTE(review): charset::intersect() is not visible here; the branches below
// read as if it moves the common part of both operands into overlap_ and
// removes it from each - confirm against charset.
static void partition_charsets (const charset_map &map_,
charset_list &lhs_, const true_ &)
{
charset_list rhs_;
fill_rhs_list (map_, rhs_);
if (!rhs_->empty ())
{
typename charset_list::list::iterator iter_;
typename charset_list::list::iterator end_;
// Scratch charset that receives the result of each intersect() call.
charset_ptr overlap_ (new charset);
// Seed lhs_ with the first rhs_ entry (null slot is appended first,
// then overwritten with the pointer taken from rhs_).
lhs_->push_back (static_cast<charset *>(0));
lhs_->back () = rhs_->front ();
rhs_->pop_front ();
while (!rhs_->empty ())
{
// r_ takes over the next pending entry from rhs_.
charset_ptr r_ (rhs_->front ());
rhs_->pop_front ();
iter_ = lhs_->begin ();
end_ = lhs_->end ();
while (!r_->empty () && iter_ != end_)
{
typename charset_list::list::iterator l_iter_ = iter_;
(*l_iter_)->intersect (*r_.get (), *overlap_.get ());
if (overlap_->empty ())
{
// Nothing in common: move on to the next lhs_ entry.
++iter_;
}
else if ((*l_iter_)->empty ())
{
// The lhs_ entry was emptied: replace it with overlap_.
delete *l_iter_;
*l_iter_ = overlap_.release ();
overlap_.reset (new charset);
++iter_;
}
else if (r_->empty ())
{
// r_ was emptied: it continues as overlap_ and scanning stops.
delete r_.release ();
r_ = overlap_;
overlap_.reset (new charset);
break;
}
else
{
// Both kept something: insert overlap_ right after the current
// lhs_ entry and resume after the inserted element.
iter_ = lhs_->insert (++iter_,
static_cast<charset *>(0));
*iter_ = overlap_.release ();
overlap_.reset (new charset);
++iter_;
end_ = lhs_->end ();
}
}
// Whatever is left of r_ becomes a new entry at the end of lhs_.
if (!r_->empty ())
{
lhs_->push_back (static_cast<charset *>(0));
lhs_->back () = r_.release ();
}
}
}
}
static void fill_rhs_list (const charset_map &map_,
charset_list &list_)
{
typename charset_map::const_iterator iter_ = map_.begin ();
typename charset_map::const_iterator end_ = map_.end ();
for (; iter_ != end_; ++iter_)
{
list_->push_back (static_cast<charset *>(0));
list_->back () = new charset (iter_->first, iter_->second);
}
}
static void build_set_mapping (const charset_list &charset_list_,
internals &internals_, const id_type dfa_index_,
index_set_vector &set_mapping_)
{
typename charset_list::list::const_iterator iter_ =
charset_list_->begin ();
typename charset_list::list::const_iterator end_ =
charset_list_->end ();
typename index_set::const_iterator set_iter_;
typename index_set::const_iterator set_end_;
for (id_type index_ = 0; iter_ != end_; ++iter_, ++index_)
{
const charset *cs_ = *iter_;
set_iter_ = cs_->_index_set.begin ();
set_end_ = cs_->_index_set.end ();
fill_lookup (cs_->_token, internals_._lookup[dfa_index_],
index_, lookup ());
for (; set_iter_ != set_end_; ++set_iter_)
{
set_mapping_[*set_iter_].insert (index_);
}
}
}
// char_state_machine version
// Overload selected by the false_ tag. All parameters are ignored and the
// function has no effect, since (per the note below) the lookup table is not
// used for this kind of state machine.
static void fill_lookup (const string_token &, id_type_vector *,
const id_type, const false_ &)
{
// Do nothing (lookup not used)
}
// state_machine version
static void fill_lookup (const string_token &charset_,
id_type_vector *lookup_, const id_type index_, const true_ &)
{
typename string_token::range_vector::const_iterator iter_ =
charset_._ranges.begin ();
typename string_token::range_vector::const_iterator end_ =
charset_._ranges.end ();
id_type *ptr_ = &lookup_->front ();
for (; iter_ != end_; ++iter_)
{
for (typename char_traits::index_type char_ = iter_->first;
char_ < iter_->second; ++char_)
{
// Note char_ must be unsigned
ptr_[char_] = index_ + transitions_index;
}
// Note iter_->second must be unsigned
ptr_[iter_->second] = index_ + transitions_index;
}
}
// Builds the node set for followpos_ (via closure_ex) and returns its 1-based
// position among the sets seen so far. An unseen set is recorded in
// seen_sets_ / seen_vectors_ / hash_vector_ and a zero-filled row of size_
// entries is appended to dfa_; when the set contains an end state the row's
// end-state fields are filled in. Returns sm_traits::npos () when followpos_
// is empty.
static id_type closure (const node_vector *followpos_,
    node_set_vector &seen_sets_, node_vector_vector &seen_vectors_,
    size_t_vector &hash_vector_, const id_type size_, id_type_vector &dfa_)
{
    if (followpos_->empty ()) return sm_traits::npos ();

    bool end_state_ = false;
    bool pop_dfa_ = false;
    id_type id_ = 0;
    id_type user_id_ = sm_traits::npos ();
    id_type next_dfa_ = 0;
    id_type push_dfa_ = sm_traits::npos ();
    std::size_t hash_ = 0;
    std::auto_ptr<node_set> set_ptr_ (new node_set);
    std::auto_ptr<node_vector> vector_ptr_ (new node_vector);
    typename node_vector::const_iterator node_iter_ = followpos_->begin ();
    const typename node_vector::const_iterator node_end_ =
        followpos_->end ();

    while (node_iter_ != node_end_)
    {
        closure_ex (*node_iter_, end_state_, id_, user_id_, next_dfa_,
            push_dfa_, pop_dfa_, set_ptr_.get (),
            vector_ptr_.get (), hash_);
        ++node_iter_;
    }

    // Look for an identical set: compare the cheap hash first, then the
    // sets themselves. Positions are counted from 1.
    id_type index_ = 0;
    typename node_set_vector::vector::const_iterator set_iter_ =
        seen_sets_->begin ();

    for (typename size_t_vector::const_iterator hash_iter_ =
        hash_vector_.begin (), hash_end_ = hash_vector_.end ();
        hash_iter_ != hash_end_; ++hash_iter_, ++set_iter_)
    {
        ++index_;

        if (*hash_iter_ == hash_ && *(*set_iter_) == *set_ptr_)
        {
            return index_;
        }
    }

    // Not seen before: hand ownership of the set and vector to the caches.
    seen_sets_->push_back (static_cast<node_set *>(0));
    seen_sets_->back () = set_ptr_.release ();
    seen_vectors_->push_back (static_cast<node_vector *>(0));
    seen_vectors_->back () = vector_ptr_.release ();
    hash_vector_.push_back (hash_);
    // State 0 is the jam state...
    index_ = static_cast<id_type>(seen_sets_->size ());

    const std::size_t row_ = dfa_.size ();

    dfa_.resize (row_ + size_, 0);

    if (end_state_)
    {
        dfa_[row_] |= end_state_bit;

        if (pop_dfa_)
        {
            dfa_[row_] |= pop_dfa_bit;
        }

        dfa_[row_ + id_index] = id_;
        dfa_[row_ + user_id_index] = user_id_;
        dfa_[row_ + push_dfa_index] = push_dfa_;
        dfa_[row_ + next_dfa_index] = next_dfa_;
    }

    return index_;
}
// Adds node_ to the set under construction. The first end-state node
// encountered supplies id_, user_id_, next_dfa_, push_dfa_ and pop_dfa_;
// later end-state nodes leave them untouched. A node that was not yet in
// set_ptr_ is also appended to vector_ptr_ and its address is added to hash_.
static void closure_ex (node *node_, bool &end_state_,
    id_type &id_, id_type &user_id_, id_type &next_dfa_,
    id_type &push_dfa_, bool &pop_dfa_, node_set *set_ptr_,
    node_vector *vector_ptr_, std::size_t &hash_)
{
    if (node_->end_state () && !end_state_)
    {
        end_state_ = true;
        id_ = node_->id ();
        user_id_ = node_->user_id ();
        next_dfa_ = node_->next_dfa ();
        push_dfa_ = node_->push_dfa ();
        pop_dfa_ = node_->pop_dfa ();
    }

    const bool inserted_ = set_ptr_->insert (node_).second;

    if (!inserted_) return;

    vector_ptr_->push_back (node_);
    hash_ += reinterpret_cast<std::size_t> (node_);
}
// NFA version
// Overload selected by the false_ tag: fills lhs_ straight from vector_ and
// set_mapping_ via fill_rhs_list(); no overlap splitting is performed here.
static void build_equiv_list (const node_vector *vector_,
const index_set_vector &set_mapping_, equivset_list &lhs_,
const false_ &)
{
fill_rhs_list (vector_, set_mapping_, lhs_);
}
// DFA version
// Overload selected by the true_ tag. Builds a temporary list (rhs_) from
// vector_ and set_mapping_, then moves its entries one by one into lhs_,
// intersecting each incoming entry with every entry already in lhs_.
// NOTE(review): equivset::intersect() is not visible here; the branches
// below read as if it moves the common part of both operands into overlap_
// and removes it from each - confirm against equivset.
static void build_equiv_list (const node_vector *vector_,
const index_set_vector &set_mapping_, equivset_list &lhs_,
const true_ &)
{
equivset_list rhs_;
fill_rhs_list (vector_, set_mapping_, rhs_);
if (!rhs_->empty ())
{
typename equivset_list::list::iterator iter_;
typename equivset_list::list::iterator end_;
// Scratch equivset that receives the result of each intersect() call.
equivset_ptr overlap_ (new equivset);
// Seed lhs_ with the first rhs_ entry (null slot is appended first,
// then overwritten with the pointer taken from rhs_).
lhs_->push_back (static_cast<equivset *>(0));
lhs_->back () = rhs_->front ();
rhs_->pop_front ();
while (!rhs_->empty ())
{
// r_ takes over the next pending entry from rhs_.
equivset_ptr r_ (rhs_->front ());
rhs_->pop_front ();
iter_ = lhs_->begin ();
end_ = lhs_->end ();
while (!r_->empty () && iter_ != end_)
{
typename equivset_list::list::iterator l_iter_ = iter_;
(*l_iter_)->intersect (*r_.get (), *overlap_.get ());
if (overlap_->empty ())
{
// Nothing in common: move on to the next lhs_ entry.
++iter_;
}
else if ((*l_iter_)->empty ())
{
// The lhs_ entry was emptied: replace it with overlap_.
delete *l_iter_;
*l_iter_ = overlap_.release ();
overlap_.reset (new equivset);
++iter_;
}
else if (r_->empty ())
{
// r_ was emptied: it continues as overlap_ and scanning stops.
delete r_.release ();
r_ = overlap_;
overlap_.reset (new equivset);
break;
}
else
{
// Both kept something: insert overlap_ right after the current
// lhs_ entry and resume after the inserted element.
iter_ = lhs_->insert (++iter_,
static_cast<equivset *>(0));
*iter_ = overlap_.release ();
overlap_.reset (new equivset);
++iter_;
end_ = lhs_->end ();
}
}
// Whatever is left of r_ becomes a new entry at the end of lhs_.
if (!r_->empty ())
{
lhs_->push_back (static_cast<equivset *>(0));
lhs_->back () = r_.release ();
}
}
}
}
static void fill_rhs_list (const node_vector *vector_,
const index_set_vector &set_mapping_, equivset_list &list_)
{
typename node_vector::const_iterator iter_ =
vector_->begin ();
typename node_vector::const_iterator end_ =
vector_->end ();
for (; iter_ != end_; ++iter_)
{
const node *node_ = *iter_;
if (!node_->end_state ())
{
const id_type token_ = node_->token ();
if (token_ != node::null_token ())
{
list_->push_back (static_cast<equivset *>(0));
if (token_ == parser::bol_token () ||
token_ == parser::eol_token ())
{
std::set<id_type> index_set_;
index_set_.insert (token_);
list_->back () = new equivset (index_set_,
token_, node_->greedy (), node_->followpos ());
}
else
{
list_->back () = new equivset (set_mapping_[token_],
token_, node_->greedy (), node_->followpos ());
}
}
}
}
}
};
// Ready-made basic_generator instantiations: each pairs a rules type
// (rules / wrules) with a state machine type (state_machine,
// char_state_machine and their w-prefixed counterparts).
typedef basic_generator<rules, state_machine> generator;
typedef basic_generator<wrules, wstate_machine> wgenerator;
typedef basic_generator<rules, char_state_machine> char_generator;
typedef basic_generator<wrules, wchar_state_machine> wchar_generator;
}
#endif

80
inc/lexertl/internals.hpp Normal file
View File

@@ -0,0 +1,80 @@
// internals.hpp
// Copyright (c) 2009-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_INTERNALS_HPP
#define LEXERTL_INTERNALS_HPP
#include "enums.hpp"
#include "containers/ptr_vector.hpp"
namespace lexertl
{
namespace detail
{
// Raw tables behind a state machine: one lookup vector, one alphabet size
// and one DFA vector per lexer state, plus the end-of-input id and the
// accumulated feature bits. Instances cannot be copied; use swap().
template<typename id_type>
struct basic_internals
{
    typedef std::vector<id_type> id_type_vector;
    typedef ptr_vector<id_type_vector> id_type_vector_vector;

    id_type _eoi;
    id_type_vector_vector _lookup;
    id_type_vector _dfa_alphabet;
    id_type _features;
    id_type_vector_vector _dfa;

    // Starts out with zeroed scalars and empty containers.
    basic_internals () :
        _eoi (0),
        _lookup (),
        _dfa_alphabet (),
        _features (0),
        _dfa ()
    {
    }

    // Returns the object to its freshly constructed state.
    void clear ()
    {
        _eoi = 0;
        _features = 0;
        _lookup.clear ();
        _dfa_alphabet.clear ();
        _dfa.clear ();
    }

    // True when no DFA has been added.
    bool empty () const
    {
        return _dfa->empty ();
    }

    // Appends num_ states: each gets a 256-entry lookup vector filled with
    // dead_state_index, an alphabet size of 0 and an empty DFA vector.
    void add_states (const std::size_t num_)
    {
        std::size_t remaining_ = num_;

        while (remaining_ != 0)
        {
            _lookup->push_back (static_cast<id_type_vector *>(0));
            // lookup *always* has a size 256 now.
            _lookup->back () = new id_type_vector (256, dead_state_index);
            _dfa_alphabet.push_back (0);
            _dfa->push_back (static_cast<id_type_vector *>(0));
            _dfa->back () = new id_type_vector;
            --remaining_;
        }
    }

    // Exchanges every member with the corresponding member of internals_.
    void swap (basic_internals &internals_)
    {
        std::swap (_eoi, internals_._eoi);
        std::swap (_features, internals_._features);
        _lookup->swap (*internals_._lookup);
        _dfa_alphabet.swap (internals_._dfa_alphabet);
        _dfa->swap (*internals_._dfa);
    }

private:
    basic_internals (const basic_internals &); // No copy construction.
    basic_internals &operator = (const basic_internals &); // No assignment.
};
}
}
#endif

29
inc/lexertl/is_same.hpp Normal file
View File

@@ -0,0 +1,29 @@
// is_same.hpp
// Copyright (c) 2010-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_IS_SAME_HPP
#define LEXERTL_IS_SAME_HPP
namespace lexertl
{
namespace detail
{
// Compile-time type comparison. The primary template covers two different
// types and reports same == false.
template<typename lhs_type, typename rhs_type>
struct is_same
{
    enum {same = false};
};

// Partial specialisation chosen when both arguments are the same type.
template<typename type>
struct is_same<type, type>
{
    enum {same = true};
};
}
}
#endif

View File

@@ -0,0 +1,24 @@
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

477
inc/lexertl/lookup.hpp Normal file
View File

@@ -0,0 +1,477 @@
// lookup.hpp
// Copyright (c) 2009-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_LOOKUP_HPP
#define LEXERTL_LOOKUP_HPP
#include <assert.h>
#include "bool.hpp"
#include "match_results.hpp"
#include "state_machine.hpp"
namespace lexertl
{
namespace detail
{
// Beginning-of-line bookkeeping, selected by a compile-time flag.
// The primary template holds no data: its constructor accepts the flag and
// discards it.
template<bool uses_bol>
struct bol_state
{
    bol_state (const bool)
    {
    }
};

// Specialisation for 'true': stores the flag twice, once as the current
// value (_bol) and once as the value tied to the last end state (_end_bol).
template<>
struct bol_state<true>
{
    bool _bol;
    bool _end_bol;

    bol_state (const bool bol_) :
        _bol (bol_), _end_bol (bol_)
    {
    }
};
// End-of-line bookkeeping, selected by a compile-time flag.
// The primary template is an empty placeholder.
template<typename id_type, bool uses_eol>
struct eol_state
{
};

// Specialisation for 'true': holds a state id, initially 0.
template<typename id_type>
struct eol_state<id_type, true>
{
    id_type _EOL_state;

    eol_state () : _EOL_state (0)
    {
    }
};
// Start-state bookkeeping, selected by a compile-time flag.
// The primary template holds no data: its constructor accepts the state id
// and discards it.
template<typename id_type, bool uses_states>
struct multi_state_state
{
    multi_state_state (const id_type)
    {
    }
};

// Specialisation for 'true': remembers the state id it was constructed with.
template<typename id_type>
struct multi_state_state<id_type, true>
{
    id_type _start_state;

    multi_state_state (const id_type state_) : _start_state (state_)
    {
    }
};
// Push/pop bookkeeping for recursive lexers, selected by a compile-time flag.
// The primary template holds no data: its constructor accepts the row
// pointer and discards it.
template<typename id_type, bool uses_recursion>
struct recursive_state
{
    recursive_state (const id_type *)
    {
    }
};

// Specialisation for 'true': reads the pop flag (pop_dfa_bit in the first
// entry) and the push target (entry push_dfa_index) from the row at ptr_.
template<typename id_type>
struct recursive_state<id_type, true>
{
    bool _pop;
    id_type _push_dfa;

    recursive_state (const id_type *ptr_) :
        _pop ((ptr_[0] & pop_dfa_bit) != 0),
        _push_dfa (ptr_[push_dfa_index])
    {
    }
};
// Working state for a single token lookup. It caches raw pointers into the
// lookup table and DFA of one lexer state, tracks the current DFA row (_ptr)
// and remembers the details of the most recent end state. Optional features
// (BOL, EOL, multiple states, recursion) are compiled in or out through the
// bits of 'flags': each feature has a pair of overloads tagged with
// false_ / true_, and the false_ variant does nothing.
template<typename id_type, typename index_type, std::size_t flags>
struct lookup_state
{
typedef basic_internals<id_type> internals;
const id_type *_lookup;
id_type _dfa_alphabet;
const id_type *_dfa;
// Current DFA row.
const id_type *_ptr;
// Set once an end state has been reached; _id / _uid then describe it.
bool _end_state;
id_type _id;
id_type _uid;
bol_state<(flags & bol_bit) != 0> _bol_state;
eol_state<id_type, (flags & eol_bit) != 0> _eol_state;
multi_state_state<id_type, (flags & multi_state_bit) != 0>
_multi_state_state;
recursive_state<id_type, (flags & recursive_bit) != 0> _recursive_state;
// Binds to the tables of lexer state state_ and positions _ptr on the row
// at offset _dfa_alphabet (row 1). The initialisers rely on the member
// declaration order above: _ptr is computed from _dfa and _dfa_alphabet.
lookup_state (const internals &internals_, const bool bol_,
const id_type state_) :
_lookup (&internals_._lookup[state_]->front ()),
_dfa_alphabet (internals_._dfa_alphabet[state_]),
_dfa (&internals_._dfa[state_]->front ()),
_ptr (_dfa + _dfa_alphabet),
_end_state (*_ptr != 0),
_id (*(_ptr + id_index)),
_uid (*(_ptr + user_id_index)),
_bol_state (bol_),
_eol_state (),
_multi_state_state (state_),
_recursive_state (_ptr)
{
}
void reset_recursive (const false_ &)
{
// Do nothing
}
// Re-reads the pop flag and push target from the current row.
void reset_recursive (const true_ &)
{
_recursive_state._pop = (*_ptr & pop_dfa_bit) != 0;
_recursive_state._push_dfa = *(_ptr + push_dfa_index);
}
void bol_start_state (const false_ &)
{
// Do nothing
}
// When at the beginning of a line, jumps to the state stored in the very
// first DFA entry, provided that entry is non-zero.
void bol_start_state (const true_ &)
{
if (_bol_state._bol)
{
const id_type state_ = *_dfa;
if (state_)
{
_ptr = &_dfa[state_ * _dfa_alphabet];
}
}
}
template<typename char_type>
bool eol (const char_type, const false_ &)
{
return false;
}
// Takes the current row's EOL transition when one exists and the current
// character is '\n'. Returns true if the transition was taken.
template<typename char_type>
bool eol (const char_type curr_, const true_ &)
{
bool ret_ = false;
_eol_state._EOL_state = _ptr[eol_index];
ret_ = _eol_state._EOL_state && curr_ == '\n';
if (ret_)
{
_ptr = &_dfa[_eol_state._EOL_state * _dfa_alphabet];
}
return ret_;
}
// Uncompressed transition: one table lookup per character. Returns the
// next state; 0 means no transition, in which case _ptr is left unchanged.
template<typename char_type>
id_type next_char (const char_type prev_char_, const false_ &)
{
const id_type state_= _ptr[_lookup
[static_cast<index_type>(prev_char_)]];
if (state_ != 0)
{
_ptr = &_dfa[state_ * _dfa_alphabet];
}
return state_;
}
// Compressed transition: the character is fed through the DFA one byte at
// a time, most significant byte first, using at most three bytes. Stops
// early and returns 0 as soon as a byte has no transition.
template<typename char_type>
id_type next_char (const char_type prev_char_, const true_ &)
{
const std::size_t bytes_ = sizeof (char_type) < 3 ?
sizeof (char_type) : 3;
const std::size_t shift_[] = {0, 8, 16};
id_type state_= 0;
for (std::size_t i_ = 0; i_ < bytes_; ++i_)
{
state_ = _ptr[_lookup[static_cast<unsigned char>((prev_char_ >>
shift_[bytes_ - 1 - i_]) & 0xff)]];
if (state_ == 0)
{
break;
}
_ptr = &_dfa[state_ * _dfa_alphabet];
}
return state_;
}
template<typename char_type>
void bol (const char_type, const false_ &)
{
// Do nothing
}
// Records whether the character just consumed was '\n'.
template<typename char_type>
void bol (const char_type prev_char_, const true_ &)
{
_bol_state._bol = prev_char_ == '\n';
}
void eol (const id_type, const false_ &)
{
// Do nothing
}
// Overwrites the stored EOL state with err_val_.
void eol (const id_type err_val_, const true_ &)
{
_eol_state._EOL_state = err_val_;
}
void reset_start_state (const false_ &)
{
// Do nothing
}
// Takes the next start state from the current row.
void reset_start_state (const true_ &)
{
_multi_state_state._start_state = *(_ptr + next_dfa_index);
}
void reset_end_bol (const false_ &)
{
// Do nothing
}
// Snapshots the current BOL flag as the one tied to the end state.
void reset_end_bol (const true_ &)
{
_bol_state._end_bol = _bol_state._bol;
}
// If the current row is an end state (first entry non-zero), records its
// id, user id and the optional per-feature data, and moves end_token_ to
// curr_.
template<typename iter_type>
void end_state (iter_type &end_token_, iter_type &curr_)
{
if (*_ptr)
{
_end_state = true;
reset_end_bol (bool_<(flags & bol_bit) != 0> ());
_id = *(_ptr + id_index);
_uid = *(_ptr + user_id_index);
reset_recursive (bool_<(flags & recursive_bit) != 0> ());
reset_start_state (bool_<(flags & multi_state_bit) != 0> ());
end_token_ = curr_;
}
}
template<typename iter_type, typename char_type>
void check_eol (iter_type &, iter_type &, const id_type,
const char_type, const false_ &)
{
// Do nothing
}
// When curr_ has reached eoi_ and the stored EOL state is not npos, takes
// the current row's EOL transition (if any) and records a resulting end
// state. Despite its name, char_type is whatever type eoi_ is passed as;
// next () passes the end-of-input iterator.
template<typename iter_type, typename char_type>
void check_eol (iter_type &end_token_, iter_type &curr_,
const id_type npos, const char_type eoi_, const true_ &)
{
if (_eol_state._EOL_state != npos && curr_ == eoi_)
{
_eol_state._EOL_state = _ptr[eol_index];
if (_eol_state._EOL_state)
{
_ptr = &_dfa[_eol_state._EOL_state * _dfa_alphabet];
end_state (end_token_, curr_);
}
}
}
template<typename results>
void pop (results &, const false_ &)
{
// Nothing to do
}
// Applies the recorded pop or push to results_.stack: a pop restores the
// start state from the top entry, otherwise a push target other than
// npos is pushed together with the current id.
template<typename results>
void pop (results &results_, const true_ &)
{
if (_recursive_state._pop)
{
_multi_state_state._start_state = results_.stack.top ().first;
results_.stack.pop ();
}
else if (_recursive_state._push_dfa != results::npos ())
{
results_.stack.push (typename results::id_type_pair
(_recursive_state._push_dfa, _id));
}
}
// True when the recorded id equals eoi_.
template<typename results>
bool id_eoi (const id_type eoi_, const results &, const false_ &)
{
return _id == eoi_;
}
// As above, and also true when a pop was recorded and the second member
// of the top stack entry equals eoi_.
template<typename results>
bool id_eoi (const id_type eoi_, const results &results_, const true_ &)
{
return _id == eoi_ || (_recursive_state._pop &&
!results_.stack.empty () && results_.stack.top ().second == eoi_);
}
void start_state (id_type &, const false_ &)
{
// Do nothing
}
// Copies the tracked start state out to the caller.
void start_state (id_type &start_state_, const true_ &)
{
start_state_ = _multi_state_state._start_state;
}
void bol (bool &, const false_ &)
{
// Do nothing
}
// Copies the BOL flag tied to the last end state out to the caller.
void bol (bool &end_bol_, const true_ &)
{
end_bol_ = _bol_state._end_bol;
}
};
// Tag-dispatched helper used by next () after a failed match.
// false_ overload: leaves results_ untouched.
template<typename results>
void inc_end (results &, const false_ &)
{
// Do nothing
}
// true_ overload: advances results_.end by one position.
template<typename results>
void inc_end (results &results_, const true_ &)
{
++results_.end;
}
// Core matching routine shared by both lookup () overloads. Starting at
// results_.end it runs the DFA of lexer state results_.state and stores the
// outcome in results_ (id, user_id, start, end and, depending on flags, bol
// and state). The longest match wins. Tokens whose id equals sm_.skip () are
// dropped and scanning restarts after them; a token for which id_eoi ()
// holds causes a retry from the end of that token. If nothing matches, id
// and user_id are set to npos.
template<typename iter_type, std::size_t flags, typename id_type,
typename results, bool compressed, bool recursive>
void next (const basic_state_machine<typename std::iterator_traits
<iter_type>::value_type, id_type> &sm_,
results &results_, const bool_<compressed> &compressed_,
const bool_<recursive> &recursive_)
{
const basic_internals<id_type> &internals_ = sm_.data ();
typename results::iter_type end_token_ = results_.end;
// Re-entered after a skipped token: the next token starts where it ended.
skip:
typename results::iter_type curr_ = results_.end;
results_.start = curr_;
// Re-entered with curr_ moved to the end of the previous match.
again:
if (curr_ == results_.eoi)
{
// End of input: report the machine's eoi id.
results_.id = internals_._eoi;
results_.user_id = results::npos ();
return;
}
lookup_state<id_type, typename results::index_type, flags> lu_state_
(internals_, results_.bol, results_.state);
lu_state_.bol_start_state (bool_<(flags & bol_bit) != 0> ());
while (curr_ != results_.eoi)
{
// An EOL transition does not consume the character.
if (!lu_state_.eol (*curr_, bool_<(flags & eol_bit) != 0> ()))
{
const typename results::char_type prev_char_ = *curr_++;
const id_type state_ = lu_state_.next_char (prev_char_,
compressed_);
lu_state_.bol (prev_char_, bool_<(flags & bol_bit) != 0> ());
if (state_ == 0)
{
// No transition: mark the EOL state with npos and stop scanning.
lu_state_.eol (results::npos (),
bool_<(flags & eol_bit) != 0> ());
break;
}
}
lu_state_.end_state (end_token_, curr_);
}
lu_state_.check_eol (end_token_, curr_, results::npos (), results_.eoi,
bool_<(flags & eol_bit) != 0> ());
if (lu_state_._end_state)
{
// Return longest match
lu_state_.pop (results_, recursive_);
lu_state_.start_state (results_.state,
bool_<(flags & multi_state_bit) != 0> ());
lu_state_.bol (results_.bol, bool_<(flags & bol_bit) != 0> ());
results_.end = end_token_;
if (lu_state_._id == sm_.skip ()) goto skip;
if (lu_state_.id_eoi (internals_._eoi, results_, recursive_))
{
curr_ = end_token_;
goto again;
}
}
else
{
results_.end = end_token_;
results_.bol = *results_.end == '\n';
results_.start = results_.end;
// No match causes char to be skipped
inc_end (results_, bool_<(flags & advance_bit) != 0> ());
lu_state_._id = results::npos ();
lu_state_._uid = results::npos ();
}
results_.id = lu_state_._id;
results_.user_id = lu_state_._uid;
}
}
template<typename iter_type, typename id_type, std::size_t flags>
void lookup (const basic_state_machine<typename std::iterator_traits
<iter_type>::value_type, id_type> &sm_,
match_results<iter_type, id_type, flags> &results_)
{
// If this asserts, you have either not defined all the correct
// flags, or you should be using recursive_match_results instead
// of match_results.
assert ((sm_.data ()._features & flags) == sm_.data ()._features);
detail::next<iter_type, flags, id_type> (sm_, results_, bool_<(sizeof
(typename std::iterator_traits<iter_type>::value_type) > 1)> (),
false_ ());
}
template<typename iter_type, typename id_type, std::size_t flags>
void lookup (const basic_state_machine<typename std::iterator_traits
<iter_type>::value_type, id_type> &sm_,
recursive_match_results<iter_type, id_type, flags> &results_)
{
// If this asserts, you have not defined all the correct flags
assert ((sm_.data ()._features & flags) == sm_.data ()._features);
detail::next<iter_type, flags | recursive_bit, id_type> (sm_, results_,
bool_<(sizeof(typename std::iterator_traits<iter_type>::
value_type) > 1)> (), true_ ());
}
}
#endif

View File

@@ -0,0 +1,150 @@
// match_results.hpp
// Copyright (c) 2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_MATCH_RESULTS_HPP
#define LEXERTL_MATCH_RESULTS_HPP
#include "char_traits.hpp"
#include "enums.hpp"
#include <iterator>
#include <stack>
#include <string>
namespace lexertl
{
// Result of a token lookup over the iterator range [start, eoi).
// 'start' and 'end' delimit the most recent token, 'id' / 'user_id' identify
// it, 'bol' is the beginning-of-line flag and 'state' the current lexer
// state. 'flags' lists the lexer features the lookup code is compiled with.
template<typename iter, typename id_type = std::size_t,
std::size_t flags = bol_bit | eol_bit | skip_bit | again_bit |
multi_state_bit | advance_bit>
struct match_results
{
typedef iter iter_type;
typedef typename std::iterator_traits<iter_type>::value_type char_type;
typedef typename basic_char_traits<char_type>::index_type index_type;
typedef std::basic_string<char_type> string;
id_type id;
id_type user_id;
iter_type start;
iter_type end;
iter_type eoi;
bool bol;
id_type state;
// Creates a result over default-constructed iterators.
match_results () :
id (0),
user_id (npos ()),
start (iter_type ()),
end (iter_type ()),
eoi (iter_type ()),
bol (true),
state (0)
{
}
// Creates a result for the input [start_, end_): both 'start' and 'end'
// begin at start_, and end_ becomes 'eoi'.
match_results (const iter_type &start_, const iter_type &end_) :
id (0),
user_id (npos ()),
start (start_),
end (start_),
eoi (end_),
bol (true),
state (0)
{
}
virtual ~match_results ()
{
}
// Returns a copy of the characters in [start, end).
string str () const
{
return string (start, end);
}
// Resets all fields to their initial values and collapses the token range
// onto eoi; eoi itself is kept.
virtual void clear ()
{
id = 0;
user_id = npos ();
start = eoi;
end = eoi;
bol = true;
state = 0;
}
// Rebinds the object to a new input [start_, end_) and resets all other
// fields to their initial values.
virtual void reset (const iter_type &start_, const iter_type &end_)
{
id = 0;
user_id = npos ();
start = start_;
end = start_;
eoi = end_;
bol = true;
state = 0;
}
// Sentinel id with every bit set.
static id_type npos ()
{
return static_cast<id_type>(~0);
}
// Sentinel id with every bit set except the lowest.
static id_type skip ()
{
return static_cast<id_type>(~1);
}
};
// match_results extended with a stack of id pairs, for use with lexers that
// push and pop DFAs. The default flags additionally include recursive_bit.
template<typename iter, typename id_type = std::size_t,
    std::size_t flags = bol_bit | eol_bit | skip_bit | again_bit |
    multi_state_bit | recursive_bit | advance_bit>
struct recursive_match_results : public match_results<iter, id_type, flags>
{
    typedef std::pair<id_type, id_type> id_type_pair;
    std::stack<id_type_pair> stack;

    // Creates a result over default-constructed iterators, with an empty
    // stack.
    recursive_match_results () :
        match_results<iter, id_type, flags> (),
        stack ()
    {
    }

    // Creates a result for the input [start_, end_), with an empty stack.
    recursive_match_results (const iter &start_, const iter &end_) :
        match_results<iter, id_type, flags> (start_, end_),
        stack ()
    {
    }

    virtual ~recursive_match_results ()
    {
    }

    // Clears the base class fields, then empties the stack.
    virtual void clear ()
    {
        match_results<iter, id_type, flags>::clear ();

        for (; !stack.empty (); stack.pop ())
        {
        }
    }

    // Rebinds the base class to [start_, end_), then empties the stack.
    virtual void reset (const iter &start_, const iter &end_)
    {
        match_results<iter, id_type, flags>::reset (start_, end_);

        for (; !stack.empty (); stack.pop ())
        {
        }
    }
};
// Shorthand names for the common iterator types. Prefixes: 's' = std::string
// iterators, 'c' = character pointers, 'w' = wide characters; an 'r' before
// "match" selects recursive_match_results.
typedef match_results<std::string::const_iterator> smatch;
typedef match_results<const char *> cmatch;
typedef match_results<std::wstring::const_iterator> wsmatch;
typedef match_results<const wchar_t *> wcmatch;
typedef recursive_match_results<std::string::const_iterator>
srmatch;
typedef recursive_match_results<const char *> crmatch;
typedef recursive_match_results<std::wstring::const_iterator>
wsrmatch;
typedef recursive_match_results<const wchar_t *> wcrmatch;
}
#endif

112
inc/lexertl/memory_file.hpp Normal file
View File

@@ -0,0 +1,112 @@
// memory_file.hpp
// Copyright (c) 2012 Ben Hanson (http://www.benhanson.net/)
// Inspired by http://en.wikibooks.org/wiki/Optimizing_C%2B%2B/General_optimization_techniques/Input/Output#Memory-mapped_file
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_MEMORY_FILE_H
#define LEXERTL_MEMORY_FILE_H
#ifdef __unix__
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#elif defined _WIN32
#include <windows.h>
#endif
// Only files small enough to fit into memory are supported.
namespace lexertl
{
// Read-only memory mapping of a whole file.
// The constructor opens and maps pathname_. On any failure data () returns 0
// and size () returns 0. size () is the file size reported by fstat () /
// GetFileSize (), i.e. a byte count regardless of CharT.
// The destructor releases only what was actually acquired, and the class is
// non-copyable because a copy would release the same mapping and handle a
// second time.
template<typename CharT>
class basic_memory_file
{
public:
    basic_memory_file (const char *pathname_) :
        _data (0),
        _size (0)
    {
#ifdef __unix__
        _fh = ::open (pathname_, O_RDONLY);

        if (_fh > -1)
        {
            struct stat sbuf_;

            if (::fstat (_fh, &sbuf_) > -1)
            {
                // Check for MAP_FAILED before converting to CharT *.
                void *map_ = ::mmap (0, sbuf_.st_size, PROT_READ,
                    MAP_SHARED, _fh, 0);

                if (map_ != MAP_FAILED)
                {
                    _data = static_cast<const CharT *>(map_);
                    _size = sbuf_.st_size;
                }
            }
        }
#elif defined _WIN32
        _fh = ::CreateFileA (pathname_, GENERIC_READ, FILE_SHARE_READ, 0,
            OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
        _fmh = 0;

        if (_fh != INVALID_HANDLE_VALUE)
        {
            _fmh = ::CreateFileMapping (_fh, 0, PAGE_READONLY, 0, 0, 0);

            if (_fmh != 0)
            {
                _data = static_cast<CharT *>(::MapViewOfFile
                    (_fmh, FILE_MAP_READ, 0, 0, 0));

                if (_data) _size = ::GetFileSize (_fh, 0);
            }
        }
#endif
    }

    ~basic_memory_file ()
    {
#if defined(__unix__)
        // Only undo the steps that succeeded in the constructor.
        if (_data) ::munmap (const_cast<CharT *>(_data), _size);

        if (_fh > -1) ::close (_fh);
#elif defined(_WIN32)
        if (_data) ::UnmapViewOfFile (_data);

        if (_fmh != 0) ::CloseHandle (_fmh);

        if (_fh != INVALID_HANDLE_VALUE) ::CloseHandle (_fh);
#endif
    }

    // Start of the mapped file, or 0 if nothing is mapped.
    const CharT *data () const
    {
        return _data;
    }

    // Size of the mapped file as reported by the OS (bytes), or 0.
    std::size_t size () const
    {
        return _size;
    }

private:
    const CharT *_data;
    std::size_t _size;
#ifdef __unix__
    int _fh;
#elif defined _WIN32
    HANDLE _fh;
    HANDLE _fmh;
#else
    #error Only Posix or Windows are supported.
#endif

    // No copy construction.
    basic_memory_file (const basic_memory_file &);
    // No assignment.
    basic_memory_file &operator = (const basic_memory_file &);
};

typedef basic_memory_file<char> memory_file;
typedef basic_memory_file<wchar_t> wmemory_file;
}
#endif

View File

@@ -0,0 +1,45 @@
// Quick hack...
// If you find this really is faster than using std::ifstream, let me know
// as I can always spend some more time to improve it.
namespace lexertl
{
// Minimal read-only stream buffer that forwards bulk reads straight to
// fread () on a C FILE opened in "r" mode.
// If the file cannot be opened the buffer stays usable: reads return 0 and
// the destructor does nothing. Previously a failed open led to
// fclose (NULL) / fread () on a null FILE *.
template<typename CharT, class Traits>
class basic_fast_filebuf : public std::basic_streambuf<CharT, Traits>
{
public:
    basic_fast_filebuf (const char *filename_) :
        _fp (0)
    {
        _fp = ::fopen (filename_, "r");
    }

    virtual ~basic_fast_filebuf ()
    {
        if (_fp)
        {
            ::fclose (_fp);
            _fp = 0;
        }
    }

protected:
    // Null when the file could not be opened.
    FILE *_fp;

    // Reads up to count_ characters into ptr_ and returns the number of
    // characters actually read (0 when no file is open or count_ <= 0).
    virtual std::streamsize xsgetn (CharT *ptr_, std::streamsize count_)
    {
        // A negative count would otherwise be cast to a huge size_t.
        if (!_fp || count_ <= 0) return 0;

        return static_cast<std::streamsize>(::fread (ptr_, sizeof (CharT),
            static_cast<std::size_t>(count_), _fp));
    }
};

typedef basic_fast_filebuf<char, std::char_traits<char> > fast_filebuf;
typedef basic_fast_filebuf<wchar_t, std::char_traits<wchar_t> > wfast_filebuf;
}
// Usage:
// lexertl::rules rules_;
// lexertl::state_machine state_machine_;
// lexertl::fast_filebuf buf ("Unicode/PropList.txt");
// std::istream if_(&buf);
// lexertl::stream_shared_iterator iter_ (if_);
// lexertl::stream_shared_iterator end_;
// lexertl::match_results<lexertl::stream_shared_iterator>
// results_(iter_, end_);

View File

@@ -0,0 +1,561 @@
// string_token.hpp
// Copyright (c) 2005-2010 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_STRING_TOKEN_HPP
#define LEXERTL_STRING_TOKEN_HPP
#include "../char_traits.hpp"
#include <algorithm>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
namespace lexertl
{
template<typename char_type>
struct basic_string_token
{
typedef std::basic_string<char_type> string;
bool _negated;
string _chars;
// Creates a token with no characters that is not negated; empty () is true
// for such a token.
basic_string_token () :
_negated (false)
{
}
// Creates a token from an explicit negation flag and character string.
// chars_ is copied as given; it is neither sorted nor de-duplicated here.
basic_string_token (const bool negated_, const string &chars_) :
_negated (negated_),
_chars (chars_)
{
}
void remove_duplicates ()
{
const char_type *start_ = _chars.c_str ();
const char_type *end_ = start_ + _chars.size ();
// Optimisation for very large charsets:
// sorting via pointers is much quicker than
// via iterators...
std::sort (const_cast<char_type *> (start_), const_cast<char_type *>
(end_));
_chars.erase (std::unique (_chars.begin (), _chars.end ()),
_chars.end ());
}
// Picks the shorter of the two equivalent representations: a token listing
// every possible character becomes the empty string with the negation flag
// flipped, and a token listing more than half of them is replaced by its
// complement via negate ().
void normalise ()
{
    const std::size_t max_chars_ = sizeof (char_type) == 1 ?
        num_chars : num_wchar_ts;
    const std::size_t count_ = _chars.length ();

    if (count_ == max_chars_)
    {
        _chars.clear ();
        _negated = !_negated;
    }
    else if (count_ > max_chars_ / 2)
    {
        negate ();
    }
}
// Flips _negated and replaces _chars with its complement: every character
// value, counting up from numeric_limits<char_type>::min (), that is not
// currently listed. The scan relies on _chars being in ascending order.
// NOTE(review): presumably _chars is also expected to be free of duplicates
// (see remove_duplicates ()) - confirm.
void negate ()
{
    const std::size_t max_chars_ = sizeof (char_type) == 1 ?
        num_chars : num_wchar_ts;
    // The template parameter is char_type; the undeclared name 'CharT' used
    // here before does not compile on conforming compilers.
    char_type curr_char_ = std::numeric_limits<char_type>::min ();
    string temp_;
    const char_type *curr_ = _chars.c_str ();
    const char_type *chars_end_ = curr_ + _chars.size ();

    _negated = !_negated;
    temp_.resize (max_chars_ - _chars.size ());

    // Write through the string's own storage rather than a const_cast of
    // c_str (). Nothing is written when temp_ is empty.
    char_type *ptr_ = temp_.empty () ? 0 : &temp_[0];
    std::size_t i_ = 0;

    while (curr_ < chars_end_)
    {
        // Emit every value below the next listed character...
        while (*curr_ > curr_char_)
        {
            *ptr_ = curr_char_;
            ++ptr_;
            ++curr_char_;
            ++i_;
        }

        // ...then step over the listed character itself.
        ++curr_char_;
        ++curr_;
        ++i_;
    }

    // Emit everything above the last listed character.
    for (; i_ < max_chars_; ++i_)
    {
        *ptr_ = curr_char_;
        ++ptr_;
        ++curr_char_;
    }

    _chars = temp_;
}
// Orders tokens by negation flag first (non-negated before negated), then
// by character string.
bool operator < (const basic_string_token &rhs_) const
{
    if (_negated != rhs_._negated)
    {
        return _negated < rhs_._negated;
    }

    return _chars < rhs_._chars;
}
// Two tokens are equal when both the negation flag and the character string
// match.
bool operator == (const basic_string_token &rhs_) const
{
    if (_negated != rhs_._negated) return false;

    return _chars == rhs_._chars;
}
// True for a non-negated token with no characters.
bool empty () const
{
    return !_negated && _chars.empty ();
}
// True for a negated token with no characters.
bool any () const
{
    return _negated && _chars.empty ();
}
// Resets the token to the state for which empty () is true.
void clear ()
{
    _chars.clear ();
    _negated = false;
}
// Dispatches to intersect_same_types () when both tokens are any (), or when
// neither is any () and both have the same negation flag; every other
// combination goes to intersect_diff_types ().
void intersect (basic_string_token &rhs_, basic_string_token &overlap_)
{
    const bool same_kind_ = (any () && rhs_.any ()) ||
        (_negated == rhs_._negated && !any () && !rhs_.any ());

    if (same_kind_)
    {
        intersect_same_types (rhs_, overlap_);
        return;
    }

    intersect_diff_types (rhs_, overlap_);
}
// Dispatches to merge_same_types () when both tokens are any (), or when
// neither is any () and both have the same negation flag; every other
// combination goes to merge_diff_types ().
void merge (const basic_string_token &rhs_,
    basic_string_token &merged_) const
{
    const bool same_kind_ = (any () && rhs_.any ()) ||
        (_negated == rhs_._negated && !any () && !rhs_.any ());

    if (same_kind_)
    {
        merge_same_types (rhs_, merged_);
        return;
    }

    merge_diff_types (rhs_, merged_);
}
// Return a printable representation of ch_.
// Characters with a conventional C escape (\0 \a \b \f \n \r \t \v \\ \" \')
// use it, ESC (27) is written as \x1b, any other control character below 32
// is written as \x followed by its hex value, and everything else is
// returned unchanged as a one character string.
static string escape_char (const char_type ch_)
{
    string out_;

    switch (ch_)
    {
    case '\0':
        out_ += '\\';
        out_ += '0';
        break;
    case '\a':
        out_ += '\\';
        out_ += 'a';
        break;
    case '\b':
        out_ += '\\';
        out_ += 'b';
        break;
    case 27:
        out_ += '\\';
        out_ += 'x';
        out_ += '1';
        out_ += 'b';
        break;
    case '\f':
        out_ += '\\';
        out_ += 'f';
        break;
    case '\n':
        out_ += '\\';
        out_ += 'n';
        break;
    case '\r':
        out_ += '\\';
        out_ += 'r';
        break;
    case '\t':
        out_ += '\\';
        out_ += 't';
        break;
    case '\v':
        out_ += '\\';
        out_ += 'v';
        break;
    case '\\':
        out_ += '\\';
        out_ += '\\';
        break;
    case '"':
        out_ += '\\';
        out_ += '"';
        break;
    case '\'':
        out_ += '\\';
        out_ += '\'';
        break;
    default:
    {
        // The lower bound matters when char_type is signed: values with the
        // top bit set are negative, so "< 32" alone would treat them as
        // control characters and the cast to std::size_t would sign-extend
        // them into a long run of 'f' digits.
        if (ch_ >= 0 && ch_ < 32)
        {
            std::basic_stringstream<char_type> ss_;

            out_ += '\\';
            out_ += 'x';
            ss_ << std::hex <<
                static_cast<std::size_t> (ch_);
            out_ += ss_.str ();
        }
        else
        {
            out_ += ch_;
        }

        break;
    }
    }

    return out_;
}
private:
// Split two tokens of the same kind (both 'any', or both negated / both
// non-negated with neither being 'any') into disjoint parts: overlap_
// receives what the two have in common, while *this and rhs_ are reduced to
// what was unique to each of them.
// NOTE(review): overlap_ is only ever appended to here, so it is presumably
// expected to be empty on entry - confirm against callers.
void intersect_same_types (basic_string_token &rhs_,
basic_string_token &overlap_)
{
if (any ())
{
// Both match everything: the overlap is 'any' and nothing is left over.
clear ();
overlap_._negated = true;
rhs_.clear ();
}
else
{
typename string::iterator iter_ = _chars.begin ();
typename string::iterator end_ = _chars.end ();
typename string::iterator rhs_iter_ = rhs_._chars.begin ();
typename string::iterator rhs_end_ = rhs_._chars.end ();
overlap_._negated = _negated;
// Lock-step walk over the two sorted lists: characters present in both
// are moved into overlap_ and erased from both operands.
while (iter_ != end_ && rhs_iter_ != rhs_end_)
{
if (*iter_ < *rhs_iter_)
{
++iter_;
}
else if (*iter_ > *rhs_iter_)
{
++rhs_iter_;
}
else
{
overlap_._chars += *iter_;
// erase() invalidates iterators, so refresh both position and end.
iter_ = _chars.erase (iter_);
end_ = _chars.end ();
rhs_iter_ = rhs_._chars.erase (rhs_iter_);
rhs_end_ = rhs_._chars.end ();
}
}
if (_negated)
{
// For negated tokens the lists hold EXCLUDED characters. The overlap must
// exclude everything either operand excluded, so fold both remainders
// into it.
// duplicates already merged, so safe to merge
// using std lib.
// src, dest
merge (_chars, overlap_._chars);
// duplicates already merged, so safe to merge
// using std lib.
// src, dest
merge (rhs_._chars, overlap_._chars);
// What is left of each operand is exactly the characters that only the
// OTHER operand excluded, expressed as a plain (non-negated) list; hence
// the swap and the cleared flags.
_negated = false;
rhs_._negated = false;
std::swap (_chars, rhs_._chars);
normalise ();
overlap_.normalise ();
rhs_.normalise ();
}
else if (!overlap_._chars.empty ())
{
// Only re-normalise when something actually moved.
normalise ();
overlap_.normalise ();
rhs_.normalise ();
}
}
}
// Intersect two tokens of different kinds by dispatching on what *this is:
// 'any', a negated list, or a plain character list.
void intersect_diff_types (basic_string_token &rhs_,
    basic_string_token &overlap_)
{
    if (any ())
    {
        intersect_any (rhs_, overlap_);
        return;
    }

    if (_negated)
    {
        intersect_negated (rhs_, overlap_);
        return;
    }

    // Plain (non-negated) character list.
    intersect_charset (rhs_, overlap_);
}
// *this is 'any': hand the work to rhs_, which then sees *this as the 'any'
// operand of its own negated / charset intersection routine.
void intersect_any (basic_string_token &rhs_, basic_string_token &overlap_)
{
    if (!rhs_._negated)
    {
        rhs_.intersect_charset (*this, overlap_);
    }
    else
    {
        rhs_.intersect_negated (*this, overlap_);
    }
}
// *this is a negated list. Against 'any' the overlap is *this itself, rhs_
// keeps exactly the characters *this excluded (as a plain list) and *this is
// emptied. Against a plain list the work is delegated to rhs_.
void intersect_negated (basic_string_token &rhs_,
    basic_string_token &overlap_)
{
    if (!rhs_.any ())
    {
        // rhs_ is a plain (non-negated) character list.
        rhs_.intersect_charset (*this, overlap_);
        return;
    }

    overlap_._chars = _chars;
    overlap_._negated = true;
    rhs_._chars = _chars;
    rhs_._negated = false;
    clear ();
}
// *this is a plain (non-negated) character list; rhs_ is either 'any' or a
// negated list. Characters of *this that rhs_ also matches are moved into
// overlap_, and added to rhs_'s exclusion list so rhs_ stops matching them.
// NOTE(review): overlap_ is only appended to / merged into, so it is
// presumably expected to be empty and non-negated on entry - confirm.
void intersect_charset (basic_string_token &rhs_,
basic_string_token &overlap_)
{
if (rhs_.any ())
{
// Everything in *this overlaps; rhs_ becomes "anything except *this".
overlap_._chars = _chars;
rhs_._negated = true;
rhs_._chars = _chars;
clear ();
}
else // rhs_._negated == true
{
typename string::iterator iter_ = _chars.begin ();
typename string::iterator end_ = _chars.end ();
typename string::iterator rhs_iter_ = rhs_._chars.begin ();
typename string::iterator rhs_end_ = rhs_._chars.end ();
// Lock-step walk over the two sorted lists.
while (iter_ != end_ && rhs_iter_ != rhs_end_)
{
if (*iter_ < *rhs_iter_)
{
// Not excluded by rhs_, so rhs_ matches it: it is part of the overlap.
// Exclude it from rhs_ and drop it from *this. insert()/erase()
// invalidate iterators, so positions and ends are refreshed.
overlap_._chars += *iter_;
rhs_iter_ = rhs_._chars.insert (rhs_iter_, *iter_);
++rhs_iter_;
rhs_end_ = rhs_._chars.end ();
iter_ = _chars.erase (iter_);
end_ = _chars.end ();
}
else if (*iter_ > *rhs_iter_)
{
++rhs_iter_;
}
else
{
// Excluded by rhs_: no overlap, the character stays in *this.
++iter_;
++rhs_iter_;
}
}
if (iter_ != end_)
{
// nothing bigger in rhs_ than iter_,
// so safe to merge using std lib.
string temp_ (iter_, end_);
// src, dest
merge (temp_, overlap_._chars);
_chars.erase (iter_, end_);
}
if (!overlap_._chars.empty ())
{
// Make sure rhs_ excludes the whole overlap (this also covers the
// trailing characters handled just above).
merge (overlap_._chars, rhs_._chars);
// possible duplicates, so check for any and erase.
rhs_._chars.erase (std::unique (rhs_._chars.begin (),
rhs_._chars.end ()), rhs_._chars.end ());
normalise ();
overlap_.normalise ();
rhs_.normalise ();
}
}
}
// Merge the sorted string src_ into the sorted string dest_, keeping the
// result sorted. Duplicates are NOT removed; src_ is left untouched.
void merge (string &src_, string &dest_)
{
    string merged_ (src_.size () + dest_.size (), 0);

    std::merge (src_.begin (), src_.end (), dest_.begin (), dest_.end (),
        merged_.begin ());
    // Hand the buffer over instead of copying it a second time.
    dest_.swap (merged_);
}
void merge_same_types (const basic_string_token &rhs_,
basic_string_token &merged_) const
{
if (any ())
{
merged_._negated = true;
}
else if (_negated)
{
typename string::const_iterator iter_ = _chars.begin ();
typename string::const_iterator end_ = _chars.end ();
typename string::const_iterator rhs_iter_ = rhs_._chars.begin ();
typename string::const_iterator rhs_end_ = rhs_._chars.end ();
merged_._negated = _negated;
while (iter_ != end_ && rhs_iter_ != rhs_end_)
{
if (*iter_ < *rhs_iter_)
{
++iter_;
}
else if (*iter_ > *rhs_iter_)
{
++rhs_iter_;
}
else
{
merged_._chars += *iter_;
++iter_;
++rhs_iter_;
}
}
merged_.normalise ();
}
else
{
typename string::const_iterator iter_ = _chars.begin ();
typename string::const_iterator end_ = _chars.end ();
typename string::const_iterator rhs_iter_ = rhs_._chars.begin ();
typename string::const_iterator rhs_end_ = rhs_._chars.end ();
while (iter_ != end_ && rhs_iter_ != rhs_end_)
{
if (*iter_ < *rhs_iter_)
{
merged_._chars += *iter_;
++iter_;
}
else if (*iter_ > *rhs_iter_)
{
merged_._chars += *rhs_iter_;
++rhs_iter_;
}
else
{
merged_._chars += *iter_;
++iter_;
++rhs_iter_;
}
}
// Include any trailing chars
if (iter_ != end_)
{
string temp_ (iter_, end_);
merged_._chars += temp_;
}
else if (rhs_iter_ != rhs_end_)
{
string temp_ (rhs_iter_, rhs_end_);
merged_._chars += temp_;
}
merged_.normalise ();
}
}
void merge_diff_types (const basic_string_token &rhs_,
basic_string_token &merged_) const
{
if (_negated)
{
merge_negated (*this, rhs_, merged_);
}
else
{
merge_negated (rhs_, *this, merged_);
}
merged_.normalise ();
}
void merge_negated (const basic_string_token &lhs_,
const basic_string_token &rhs_, basic_string_token &merged_) const
{
typename string::const_iterator lhs_iter_ = lhs_._chars.begin ();
typename string::const_iterator lhs_end_ = lhs_._chars.end ();
typename string::const_iterator rhs_iter_ = rhs_._chars.begin ();
typename string::const_iterator rhs_end_ = rhs_._chars.end ();
merged_._negated = true;
while (lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_)
{
if (*lhs_iter_ < *rhs_iter_)
{
merged_._chars += *lhs_iter_;
++lhs_iter_;
}
else if (*lhs_iter_ > *rhs_iter_)
{
++rhs_iter_;
}
else
{
++lhs_iter_;
++rhs_iter_;
}
}
// Only interested in any remaining 'negated' chars
if (lhs_iter_ != lhs_end_)
{
string temp_ (lhs_iter_, lhs_end_);
merged_._chars += temp_;
}
}
};
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,100 @@
// re_token.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_RE_TOKEN_HPP
#define LEXERTL_RE_TOKEN_HPP
#include "../../string_token.hpp"
namespace lexertl
{
namespace detail
{
// Regex token / grammar symbol identifiers. The numeric values are used as
// array indexes, so the order must not change and END must remain last.
enum token_type
{
    BEGIN,
    REGEX,
    OREXP,
    SEQUENCE,
    SUB,
    EXPRESSION,
    REPEAT,
    DUP,
    OR,             // |
    CHARSET,
    BOL,            // ^
    EOL,            // $
    MACRO,
    OPENPAREN,      // (
    CLOSEPAREN,     // )
    OPT,            // ?
    AOPT,           // ??
    ZEROORMORE,     // *
    AZEROORMORE,    // *?
    ONEORMORE,      // +
    AONEORMORE,     // +?
    REPEATN,        // {n[,[m]]}
    AREPEATN,       // {n[,[m]]}?
    END
};
// A single token produced while tokenising a regex.
//   _type  - which kind of token this is (see token_type).
//   _extra - text attached to the token by the tokeniser (the digits/comma
//            of a {n,m} repeat or the name of a {MACRO}).
//   _str   - the character set carried by CHARSET tokens.
template<typename input_char_type, typename char_type>
struct basic_re_token
{
typedef basic_string_token<char_type> string_token;
typedef std::basic_string<input_char_type> string;
token_type _type;
string _extra;
string_token _str;
// Default constructs a BEGIN token with no extra text and an empty charset.
basic_re_token (const token_type type_ = BEGIN) :
_type (type_),
_extra (),
_str ()
{
}
// Reset to the default constructed state.
void clear ()
{
_type = BEGIN;
_extra.clear ();
_str.clear ();
}
// Member-wise assignment.
basic_re_token &operator = (const basic_re_token &rhs_)
{
_type = rhs_._type;
_extra = rhs_._extra;
_str = rhs_._str;
return *this;
}
// Look up the table entry for this token (row) against type_ (column).
// Entries are '<', '=', '>' or ' '.
// NOTE(review): presumably these are operator-precedence relations consumed
// by the regex parser, with ' ' meaning "no relation / error" - confirm
// against the parser.
char precedence (const token_type type_) const
{
// Moved in here for Solaris compiler.
static const char precedence_table_[END + 1][END + 1] = {
// Columns follow token_type order (REPEATN "{n}" comes before AREPEATN "{n}?"):
// BEG, REG, ORE, SEQ, SUB, EXP, RPT, DUP, | , CHR, BOL, EOL, MCR, ( , ) , ? , ?? , * , *? , + , +?, {n}, {n}?, END
/*BEGIN*/{' ', '<', '<', '<', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/*REGEX*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/*OREXP*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', '>', '>', '>', '>', ' ', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* SEQ */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', ' ', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* SUB */{' ', ' ', ' ', ' ', ' ', '=', '<', ' ', '>', '<', '<', '<', '<', '<', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/*EXPRE*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* RPT */{' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', '>', '>', '>', '>', '>', '>', '>', '<', '<', '<', '<', '<', '<', '<', '<', '>'},
/*DUPLI*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* | */{' ', ' ', ' ', '=', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '},
/*CHARA*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
/* BOL */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
/* EOL */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
/*MACRO*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
/* ( */{' ', '=', '<', '<', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '},
/* ) */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
/* ? */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* ?? */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* * */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* *? */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* + */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* +? */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/*{n,m}*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/*{nm}?*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* END */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '}
};
return precedence_table_[_type][type_];
}
// Human readable name of this token's type; entries are in token_type order.
const char *precedence_string () const
{
// Moved in here for Solaris compiler.
static const char *precedence_strings_[END + 1] =
{"BEGIN", "REGEX", "OREXP", "SEQUENCE", "SUB", "EXPRESSION",
"REPEAT", "DUPLICATE", "|", "CHARSET", "^", "$", "MACRO", "(", ")",
"?", "??", "*", "*?", "+", "+?", "{n[,[m]]}", "{n[,[m]]}?", "END"};
return precedence_strings_[_type];
}
};
}
}
#endif

View File

@@ -0,0 +1,829 @@
// tokeniser.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_RE_TOKENISER_HPP
#define LEXERTL_RE_TOKENISER_HPP
#include <cstring>
#include "re_token.hpp"
#include "../../runtime_error.hpp"
#include "../../size_t.hpp"
#include <sstream>
#include "../../string_token.hpp"
#include "re_tokeniser_helper.hpp"
namespace lexertl
{
namespace detail
{
template<typename rules_char_type, typename char_type, typename id_type>
class basic_re_tokeniser
{
public:
typedef basic_re_token<rules_char_type, char_type> re_token;
typedef basic_re_tokeniser_helper<rules_char_type, char_type, id_type>
tokeniser_helper;
typedef typename tokeniser_helper::char_state char_state;
typedef typename tokeniser_helper::state state;
typedef basic_string_token<char_type> string_token;
// Read the next regex token from state_ into *token_.
//   lhs_   - the previously produced token; it is only forwarded to
//            open_curly(), where the {-} and {+} operators modify it.
//   state_ - cursor over the regex text plus the tokeniser flags.
//   token_ - cleared first, then filled in with the new token.
// Throws runtime_error for malformed input (unterminated string, unbalanced
// parentheses, unsupported '/' lookahead, ...).
static void next (re_token *lhs_, state &state_, re_token *token_)
{
rules_char_type ch_ = 0;
bool eos_ = state_.next (ch_);
bool skipped_ = false;
token_->clear ();
// Skip everything that does not produce a token. Repeat until a full pass
// skips nothing, as e.g. a quote may directly follow a comment.
do
{
// string begin/end
while (!eos_ && ch_ == '"')
{
state_._in_string ^= 1;
eos_ = state_.next (ch_);
}
// (?# ...)
skipped_ = comment (eos_, ch_, state_);
// skip_ws set
skipped_ |= skip (eos_, ch_, state_);
} while (skipped_);
if (eos_)
{
// The regex may only end outside a string and with all parentheses closed.
if (state_._in_string)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
ss_ << "Unexpected end of regex (missing '\"') in rule id " <<
state_._id << '.';
throw runtime_error (ss_.str ());
}
if (state_._paren_count)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
ss_ << "Unexpected end of regex (missing ')') in rule id " <<
state_._id << '.';
throw runtime_error (ss_.str ());
}
token_->_type = END;
}
else
{
if (ch_ == '\\')
{
// Even if we are in a string, respect escape sequences...
token_->_type = CHARSET;
escape (state_, token_->_str);
}
else if (state_._in_string)
{
// All other meta characters lose their special meaning
// inside a string.
token_->_type = CHARSET;
token_->_str.insert (typename string_token::range (ch_, ch_));
}
else
{
// Not an escape sequence and not inside a string, so
// check for meta characters.
switch (ch_)
{
case '(':
token_->_type = OPENPAREN;
++state_._paren_count;
// Handles the optional "?flags:" prefix and keeps the flags stack in step.
read_options (state_);
break;
case ')':
--state_._paren_count;
if (state_._paren_count < 0)
{
std::ostringstream ss_;
ss_ << "Number of open parenthesis < 0 "
"at index " << state_.index () - 1 <<
" in rule id " << state_._id << '.';
throw runtime_error (ss_.str ());
}
token_->_type = CLOSEPAREN;
// Restore the flags that were saved when a group was opened.
if (!state_._flags_stack.empty ())
{
state_._flags = state_._flags_stack.top ();
state_._flags_stack.pop ();
}
break;
case '?':
// A trailing '?' selects the second ("A...") variant of the operator;
// the same applies to '*' and '+' below.
if (!state_.eos () && *state_._curr == '?')
{
token_->_type = AOPT;
state_.increment ();
}
else
{
token_->_type = OPT;
}
break;
case '*':
if (!state_.eos () && *state_._curr == '?')
{
token_->_type = AZEROORMORE;
state_.increment ();
}
else
{
token_->_type = ZEROORMORE;
}
break;
case '+':
if (!state_.eos () && *state_._curr == '?')
{
token_->_type = AONEORMORE;
state_.increment ();
}
else
{
token_->_type = ONEORMORE;
}
break;
case '{':
// One of {-}, {+}, {n[,[m]]} or {MACRO}.
open_curly (lhs_, state_, token_);
break;
case '|':
token_->_type = OR;
break;
case '^':
// Only an anchor when it is the very first character of a non-macro
// regex; otherwise it is an ordinary character.
if (!state_._macro && state_._curr - 1 == state_._start)
{
token_->_type = BOL;
}
else
{
token_->_type = CHARSET;
token_->_str.insert (typename string_token::range
(ch_, ch_));
}
break;
case '$':
// Only an anchor when it is the very last character of a non-macro
// regex; otherwise it is an ordinary character.
if (!state_._macro && state_._curr == state_._end)
{
token_->_type = EOL;
}
else
{
token_->_type = CHARSET;
token_->_str.insert (typename string_token::range
(ch_, ch_));
}
break;
case '.':
{
// Built as the negation of either the empty set or, when dot_not_newline
// is set, the set holding just '\n'.
token_->_type = CHARSET;
if (state_._flags & dot_not_newline)
{
token_->_str.insert (typename string_token::range
('\n', '\n'));
}
token_->_str.negate ();
break;
}
case '[':
{
token_->_type = CHARSET;
tokeniser_helper::charset (state_, token_->_str);
break;
}
case '/':
{
std::ostringstream ss_;
ss_ << "Lookahead ('/') is not supported yet in " <<
"rule id " << state_._id << '.';
throw runtime_error (ss_.str ());
break;
}
default:
// Ordinary character. With icase set, a cased letter matches both its
// upper and lower case forms.
token_->_type = CHARSET;
if ((state_._flags & icase) &&
(std::isupper (ch_, state_._locale) ||
std::islower (ch_, state_._locale)))
{
char_type upper_ = std::toupper
(ch_, state_._locale);
char_type lower_ = std::tolower
(ch_, state_._locale);
token_->_str.insert (typename string_token::range
(upper_, upper_));
token_->_str.insert (typename string_token::range
(lower_, lower_));
}
else
{
token_->_str.insert (typename string_token::range
(ch_, ch_));
}
break;
}
}
}
}
private:
// Skip a "(?# ... )" comment if the current character starts one.
// ch_ is the character already read and state_._curr points just past it.
// Parentheses inside the comment are counted, so the comment ends at the ')'
// matching its opening '('. On success eos_/ch_ are advanced to the first
// character after the comment and true is returned; otherwise nothing is
// consumed and false is returned. Comments are not recognised inside strings.
// Throws runtime_error if the regex ends before the comment is closed.
static bool comment (bool &eos_, rules_char_type &ch_, state &state_)
{
    const bool starts_comment_ = !eos_ && !state_._in_string && ch_ == '(' &&
        !state_.eos () && *state_._curr == '?' &&
        state_._curr + 1 < state_._end && *(state_._curr + 1) == '#';

    if (!starts_comment_)
    {
        return false;
    }

    std::size_t depth_ = 1;

    // Step over "?#".
    state_.increment ();
    state_.increment ();

    for (;;)
    {
        eos_ = state_.next (ch_);

        if (ch_ == '(')
        {
            ++depth_;
        }
        else if (ch_ == ')')
        {
            --depth_;
        }

        if (eos_ || (ch_ == ')' && depth_ == 0))
        {
            break;
        }
    }

    if (eos_)
    {
        std::ostringstream ss_;

        // Pointless returning index if at end of string
        ss_ << "Unexpected end of regex (unterminated comment) " <<
            "in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    // Fetch the character following the closing ')'.
    eos_ = state_.next (ch_);
    return true;
}
// When the skip_ws flag is set and we are not inside a string, skip any run
// of whitespace characters and C style /* ... */ comments starting at ch_.
// eos_/ch_ are advanced to the first character that is not skipped.
// Returns true if anything was skipped.
// Throws runtime_error if a C style comment is not terminated.
static bool skip (bool &eos_, rules_char_type &ch_, state &state_)
{
bool skipped_ = false;
if (!eos_ && (state_._flags & skip_ws) && !state_._in_string)
{
bool c_comment_ = false;
bool skip_ws_ = false;
do
{
// ch_ has already been consumed, so *state_._curr is the lookahead.
c_comment_ = ch_ == '/' && !state_.eos () &&
*state_._curr == '*';
skip_ws_ = !c_comment_ && (ch_ == ' ' || ch_ == '\t' ||
ch_ == '\n' || ch_ == '\r' || ch_ == '\f' || ch_ == '\v');
if (c_comment_)
{
// Step over the '*' of the opening "/*".
state_.increment ();
eos_ = state_.next (ch_);
// Scan for the closing "*/".
while (!eos_ && !(ch_ == '*' && !state_.eos () &&
*state_._curr == '/'))
{
eos_ = state_.next (ch_);
}
if (eos_)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
ss_ << "Unexpected end of regex (unterminated " <<
"C style comment) in rule id " <<
state_._id << '.';
throw runtime_error (ss_.str ());
}
else
{
// Step over the '/' of the closing "*/" and read what follows.
state_.increment ();
eos_ = state_.next (ch_);
}
skipped_ = true;
}
else if (skip_ws_)
{
eos_ = state_.next (ch_);
skipped_ = true;
}
} while (c_comment_ || skip_ws_);
}
return skipped_;
}
// Called just after a '(' has been consumed. If it is followed by '?', the
// current flags are saved on the flags stack and the option letters up to
// the terminating ':' are applied:
//   i - set icase,            -i - clear icase
//   s - clear dot_not_newline, -s - set dot_not_newline
//   x - set skip_ws,          -x - clear skip_ws
// Any other character throws runtime_error ("Unknown option").
// For a plain '(' the flags are saved only when the stack is already
// non-empty, which keeps saves balanced with the restore performed on ')'.
static void read_options (state &state_)
{
if (!state_.eos () && *state_._curr == '?')
{
rules_char_type ch_ = 0;
bool eos_ = false;
bool negate_ = false;
// Step over the '?'.
state_.increment ();
eos_ = state_.next (ch_);
state_._flags_stack.push (state_._flags);
while (!eos_ && ch_ != ':')
{
switch (ch_)
{
case '-':
// Applies to the next option letter only; each letter resets it.
negate_ ^= 1;
break;
case 'i':
if (negate_)
{
state_._flags = state_._flags & ~icase;
}
else
{
state_._flags = state_._flags | icase;
}
negate_ = false;
break;
case 's':
// Note the inverted sense: the flag is dot_NOT_newline, so plain 's'
// clears it and '-s' sets it.
if (negate_)
{
state_._flags = state_._flags | dot_not_newline;
}
else
{
state_._flags = state_._flags & ~dot_not_newline;
}
negate_ = false;
break;
case 'x':
if (negate_)
{
state_._flags = state_._flags & ~skip_ws;
}
else
{
state_._flags = state_._flags | skip_ws;
}
negate_ = false;
break;
default:
{
std::ostringstream ss_;
ss_ << "Unknown option at index " <<
state_.index () - 1 << " in rule id " <<
state_._id << '.';
throw runtime_error (ss_.str ());
}
}
eos_ = state_.next (ch_);
}
// End of string handler will handle early termination
}
else if (!state_._flags_stack.empty ())
{
state_._flags_stack.push (state_._flags);
}
}
// Handle the escape sequence at the current position (the '\\' itself has
// already been consumed) and add the result to token_.
// tokeniser_helper::escape_sequence() either yields a single character in
// ch_ (null return) or returns a string of length str_len_, which is then
// parsed by tokeniser_helper::charset() starting at its second character.
// NOTE(review): presumably that string is a bracket expression such as
// "[0-9]" whose leading '[' is skipped here - confirm in the helper.
static void escape (state &state_, string_token &token_)
{
    char_type ch_ = 0;
    std::size_t str_len_ = 0;
    const char *str_ = tokeniser_helper::escape_sequence (state_,
        ch_, str_len_);

    if (!str_)
    {
        token_.insert (typename string_token::range (ch_, ch_));
        return;
    }

    char_state state2_ (str_ + 1, str_ + str_len_, state_._id,
        state_._flags, state_._locale, false);

    tokeniser_helper::charset (state2_, token_);
}
// Dispatch on the character following an already consumed '{':
//   '-'   -> {-} charset difference
//   '+'   -> {+} charset union
//   digit -> {n[,[m]]} repeat
//   other -> {MACRO} reference
// Throws runtime_error if the regex ends straight after the '{'.
static void open_curly (re_token *lhs_, state &state_,
    re_token *token_)
{
    if (state_.eos ())
    {
        std::ostringstream ss_;

        // Pointless returning index if at end of string
        ss_ << "Unexpected end of regex (missing '}') in rule id " <<
            state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    if (*state_._curr == '-')
    {
        charset_difference (lhs_, state_, token_);
        return;
    }

    if (*state_._curr == '+')
    {
        charset_union (lhs_, state_, token_);
        return;
    }

    if (*state_._curr >= '0' && *state_._curr <= '9')
    {
        repeat_n (state_, token_);
        return;
    }

    macro (state_, token_);
}
// Handle the {-} operator: remove the charset that follows it from the
// charset token that precedes it (*lhs_), then read the token after that
// into *token_. On entry the '{' has been consumed and state_ points at '-'.
// Throws runtime_error if either operand is not a CHARSET, if the closing
// '}' is missing, or if the resulting charset is empty.
static void charset_difference (re_token *lhs_, state &state_,
    re_token *token_)
{
    if (lhs_->_type != CHARSET)
    {
        std::ostringstream ss_;

        ss_ << "CHARSET must precede {-} at index " <<
            state_.index () - 1 << " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    rules_char_type ch_ = 0;

    // Consume the '-', then read what should be the closing '}'.
    state_.next (ch_);

    const bool at_end_ = state_.next (ch_);

    if (at_end_)
    {
        std::ostringstream ss_;

        // Pointless returning index if at end of string
        ss_ << "Unexpected end of regex (missing '}') in rule id " <<
            state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    if (ch_ != '}')
    {
        std::ostringstream ss_;

        ss_ << "Missing '}' at index " << state_.index () - 1 <<
            " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    // The right hand operand is simply the next token.
    re_token subtrahend_;

    next (lhs_, state_, &subtrahend_);

    if (subtrahend_._type != CHARSET)
    {
        std::ostringstream ss_;

        ss_ << "CHARSET must follow {-} at index " <<
            state_.index () - 1 << " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    lhs_->_str.remove (subtrahend_._str);

    if (lhs_->_str.empty ())
    {
        std::ostringstream ss_;

        ss_ << "Empty charset created by {-} at index " <<
            state_.index () - 1 << " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    // The operator itself yields no token, so fetch the one that follows.
    next (lhs_, state_, token_);
}
// Handle the {+} operator: add the charset that follows it to the charset
// token that precedes it (*lhs_), then read the token after that into
// *token_. On entry the '{' has been consumed and state_ points at '+'.
// Throws runtime_error if either operand is not a CHARSET or if the closing
// '}' is missing.
static void charset_union (re_token *lhs_, state &state_,
    re_token *token_)
{
    if (lhs_->_type != CHARSET)
    {
        std::ostringstream ss_;

        ss_ << "CHARSET must precede {+} at index " <<
            state_.index () - 1 << " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    rules_char_type ch_ = 0;

    // Consume the '+', then read what should be the closing '}'.
    state_.next (ch_);

    const bool at_end_ = state_.next (ch_);

    if (at_end_)
    {
        std::ostringstream ss_;

        // Pointless returning index if at end of string
        ss_ << "Unexpected end of regex (missing '}') in rule id " <<
            state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    if (ch_ != '}')
    {
        std::ostringstream ss_;

        ss_ << "Missing '}' at index " << state_.index () - 1 <<
            " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    // The right hand operand is simply the next token.
    re_token addend_;

    next (lhs_, state_, &addend_);

    if (addend_._type != CHARSET)
    {
        std::ostringstream ss_;

        ss_ << "CHARSET must follow {+} at index " <<
            state_.index () - 1 << " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    lhs_->_str.insert (addend_._str);

    // The operator itself yields no token, so fetch the one that follows.
    next (lhs_, state_, token_);
}
// Parse a repeat specification. On entry the '{' has been consumed and
// state_ points at the first digit.
// SYNTAX:
// {n[,[m]]}
// SEMANTIC RULES:
// {0} - INVALID (throw exception)
// {0,} = *
// {0,0} - INVALID (throw exception)
// {0,1} = ?
// {1,} = +
// {min,max} where min == max - {min}
// {min,max} where max < min - INVALID (throw exception)
// The digits (and comma) read are appended to token_->_extra. A trailing
// '?' after the closing '}' selects the "A..." variant of the token type.
static void repeat_n (state &state_, re_token *token_)
{
rules_char_type ch_ = 0;
bool eos_ = state_.next (ch_);
std::size_t min_ = 0;
std::size_t max_ = 0;
// Read the minimum.
while (!eos_ && ch_ >= '0' && ch_ <= '9')
{
min_ *= 10;
min_ += ch_ - '0';
token_->_extra += ch_;
eos_ = state_.next (ch_);
}
if (eos_)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
ss_ << "Unexpected end of regex (missing '}') in rule id " <<
state_._id << '.';
throw runtime_error (ss_.str ());
}
// min_max_: an explicit maximum different from the minimum was given.
// repeatn_: the token stays a (A)REPEATN rather than collapsing to ? * or +.
bool min_max_ = false;
bool repeatn_ = true;
if (ch_ == ',')
{
token_->_extra += ch_;
eos_ = state_.next (ch_);
if (eos_)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
ss_ << "Unexpected end of regex (missing '}') in rule id " <<
state_._id << '.';
throw runtime_error (ss_.str ());
}
if (ch_ == '}')
{
// Open ended "{n,}".
// Small optimisation: Check for '*' equivalency.
if (min_ == 0)
{
token_->_type = ZEROORMORE;
repeatn_ = false;
}
// Small optimisation: Check for '+' equivalency.
else if (min_ == 1)
{
token_->_type = ONEORMORE;
repeatn_ = false;
}
}
else
{
if (ch_ < '0' || ch_ > '9')
{
std::ostringstream ss_;
ss_ << "Missing '}' at index " << state_.index () - 1 <<
" in rule id " << state_._id << '.';
throw runtime_error (ss_.str ());
}
min_max_ = true;
// Read the maximum (at least one digit is known to be present).
do
{
max_ *= 10;
max_ += ch_ - '0';
token_->_extra += ch_;
eos_ = state_.next (ch_);
} while (!eos_ && ch_ >= '0' && ch_ <= '9');
if (eos_)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
ss_ << "Unexpected end of regex (missing '}') "
"in rule id " << state_._id << '.';
throw runtime_error (ss_.str ());
}
// Small optimisation: Check for '?' equivalency.
if (min_ == 0 && max_ == 1)
{
token_->_type = OPT;
repeatn_ = false;
}
// Small optimisation: if min == max, then min.
else if (min_ == max_)
{
// Drop ",max" from the recorded text so that only "min" remains.
token_->_extra.erase (token_->_extra.find (','));
min_max_ = false;
max_ = 0;
}
}
}
if (ch_ != '}')
{
std::ostringstream ss_;
ss_ << "Missing '}' at index " << state_.index () - 1 <<
" in rule id " << state_._id << '.';
throw runtime_error (ss_.str ());
}
if (repeatn_)
{
// SEMANTIC VALIDATION follows:
// NOTE: {0,} has already become *
// therefore we don't check for a comma.
if (min_ == 0 && max_ == 0)
{
std::ostringstream ss_;
ss_ << "Cannot have exactly zero repeats preceding index " <<
state_.index () << " in rule id " << state_._id << '.';
throw runtime_error (ss_.str ());
}
if (min_max_ && max_ < min_)
{
std::ostringstream ss_;
ss_ << "Max less than min preceding index " <<
state_.index () << " in rule id " << state_._id << '.';
throw runtime_error (ss_.str ());
}
if (!state_.eos () && *state_._curr == '?')
{
token_->_type = AREPEATN;
state_.increment ();
}
else
{
token_->_type = REPEATN;
}
}
// The collapsed forms honour a trailing '?' in the same way.
else if (token_->_type == ZEROORMORE)
{
if (!state_.eos () && *state_._curr == '?')
{
token_->_type = AZEROORMORE;
state_.increment ();
}
}
else if (token_->_type == ONEORMORE)
{
if (!state_.eos () && *state_._curr == '?')
{
token_->_type = AONEORMORE;
state_.increment ();
}
}
else if (token_->_type == OPT)
{
if (!state_.eos () && *state_._curr == '?')
{
token_->_type = AOPT;
state_.increment ();
}
}
}
// Parse a {MACRO} reference. On entry the '{' has been consumed and state_
// points at the first character of the name. The name must start with '_'
// or an ASCII letter and may continue with '_', '-', ASCII letters and
// digits; it is stored in token_->_extra and the token becomes a MACRO.
// Throws runtime_error for an invalid name or a missing closing '}'.
static void macro (state &state_, re_token *token_)
{
    rules_char_type ch_ = 0;
    bool eos_ = false;

    state_.next (ch_);

    const bool valid_start_ = ch_ == '_' || (ch_ >= 'A' && ch_ <= 'Z') ||
        (ch_ >= 'a' && ch_ <= 'z');

    if (!valid_start_)
    {
        std::ostringstream ss_;

        ss_ << "Invalid MACRO name at index " << state_.index () - 1 <<
            " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    for (;;)
    {
        token_->_extra += ch_;
        eos_ = state_.next (ch_);

        if (eos_)
        {
            std::ostringstream ss_;

            // Pointless returning index if at end of string
            ss_ << "Unexpected end of regex " <<
                "(missing '}') in rule id " << state_._id << '.';
            throw runtime_error (ss_.str ());
        }

        const bool name_char_ = ch_ == '_' || ch_ == '-' ||
            (ch_ >= 'A' && ch_ <= 'Z') || (ch_ >= 'a' && ch_ <= 'z') ||
            (ch_ >= '0' && ch_ <= '9');

        if (!name_char_)
        {
            break;
        }
    }

    if (ch_ != '}')
    {
        std::ostringstream ss_;

        ss_ << "Missing '}' at index " << state_.index () - 1 <<
            " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    token_->_type = MACRO;
}
};
}
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,115 @@
// tokeniser_state.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_RE_TOKENISER_STATE_HPP
#define LEXERTL_RE_TOKENISER_STATE_HPP
#include "../../char_traits.hpp"
#include "../../enums.hpp"
#include <locale>
#include "../../size_t.hpp"
#include <stack>
namespace lexertl
{
namespace detail
{
// Cursor over the text of one regex together with the tokeniser's mutable
// state (active flags, saved flags, parenthesis depth, in-string marker).
template<typename ch_type, typename id_type>
struct basic_re_tokeniser_state
{
    typedef ch_type char_type;
    typedef typename basic_char_traits<char_type>::index_type index_type;

    // [_start, _end) is the text being tokenised and _curr the read position.
    // The pointers themselves are deliberately not const: with const members
    // the copy constructor and assignment below could not be compiled once
    // instantiated.
    const char_type *_start;
    const char_type *_end;
    const char_type *_curr;
    // Identifier supplied by the caller for this regex.
    id_type _id;
    // Currently active flags and the values saved while inside groups.
    std::size_t _flags;
    std::stack<std::size_t> _flags_stack;
    std::locale _locale;
    // True when the text is a macro definition rather than a rule.
    bool _macro;
    // Number of currently open parentheses.
    long _paren_count;
    // True while between a pair of double quotes.
    bool _in_string;
    // Initialised to all bits set.
    id_type _nl_id;

    basic_re_tokeniser_state (const char_type *start_,
        const char_type * const end_, id_type id_, const std::size_t flags_,
        const std::locale locale_, const bool macro_) :
        _start (start_),
        _end (end_),
        _curr (start_),
        _id (id_),
        _flags (flags_),
        _flags_stack (),
        _locale (locale_),
        _macro (macro_),
        _paren_count (0),
        _in_string (false),
        _nl_id (static_cast<id_type>(~0))
    {
    }

    basic_re_tokeniser_state (const basic_re_tokeniser_state &rhs_) :
        _start (rhs_._start),
        _end (rhs_._end),
        _curr (rhs_._curr),
        _id (rhs_._id),
        _flags (rhs_._flags),
        _flags_stack (rhs_._flags_stack),
        _locale (rhs_._locale),
        _macro (rhs_._macro),
        _paren_count (rhs_._paren_count),
        _in_string (rhs_._in_string),
        _nl_id (rhs_._nl_id)
    {
    }

    // prevent VC++ 7.1 warning:
    const basic_re_tokeniser_state &operator =
        (const basic_re_tokeniser_state &rhs_)
    {
        assign (rhs_);
        return *this;
    }

    // Member-wise copy of rhs_ into *this.
    void assign (const basic_re_tokeniser_state &rhs_)
    {
        _start = rhs_._start;
        _end = rhs_._end;
        _curr = rhs_._curr;
        _id = rhs_._id;
        _flags = rhs_._flags;
        _flags_stack = rhs_._flags_stack;
        _locale = rhs_._locale;
        _macro = rhs_._macro;
        _paren_count = rhs_._paren_count;
        _in_string = rhs_._in_string;
        _nl_id = rhs_._nl_id;
    }

    // Read the next character into ch_ and advance.
    // Returns true (and sets ch_ to 0) when the end of the text was already
    // reached, false otherwise.
    inline bool next (char_type &ch_)
    {
        if (_curr >= _end)
        {
            ch_ = 0;
            return true;
        }
        else
        {
            ch_ = *_curr;
            increment ();
            return false;
        }
    }

    // Advance the read position by one character (no bounds check).
    inline void increment ()
    {
        ++_curr;
    }

    // Offset of the read position from the start of the text.
    inline std::size_t index () const
    {
        return _curr - _start;
    }

    // True once the read position has reached the end of the text.
    inline bool eos () const
    {
        return _curr >= _end;
    }
};
}
}
#endif

View File

@@ -0,0 +1,112 @@
// end_node.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_END_NODE_HPP
#define LEXERTL_END_NODE_HPP
#include "node.hpp"
#include "../../size_t.hpp"
namespace lexertl
{
namespace detail
{
// Syntax tree node marking the end of a rule. It carries the ids reported
// for a match (id, user id) and the DFA switching information (next, push,
// pop). Its firstpos and lastpos each contain only the node itself and its
// followpos is always empty.
template<typename id_type>
class basic_end_node : public basic_node<id_type>
{
public:
    typedef basic_node<id_type> node;
    typedef typename node::bool_stack bool_stack;
    typedef typename node::const_node_stack const_node_stack;
    typedef typename node::node_ptr_vector node_ptr_vector;
    typedef typename node::node_stack node_stack;
    typedef typename node::node_type node_type;
    typedef typename node::node_vector node_vector;

    basic_end_node (const id_type id_, const id_type user_id_,
        const id_type next_dfa_, const id_type push_dfa_,
        const bool pop_dfa_) :
        basic_node<id_type> (false),
        _id (id_),
        _user_id (user_id_),
        _next_dfa (next_dfa_),
        _push_dfa (push_dfa_),
        _pop_dfa (pop_dfa_),
        _followpos ()
    {
        node::_firstpos.push_back (this);
        node::_lastpos.push_back (this);
    }

    virtual ~basic_end_node ()
    {
    }

    // Node classification.

    virtual node_type what_type () const
    {
        return node::END;
    }

    virtual bool end_state () const
    {
        return true;
    }

    // An end node has no children, so there is nothing to push.
    virtual bool traverse (const_node_stack &/*node_stack_*/,
        bool_stack &/*perform_op_stack_*/) const
    {
        return false;
    }

    virtual const node_vector &followpos () const
    {
        // _followpos is always empty..!
        return _followpos;
    }

    // Accessors for the values supplied at construction.

    virtual id_type id () const
    {
        return _id;
    }

    virtual id_type user_id () const
    {
        return _user_id;
    }

    virtual id_type next_dfa () const
    {
        return _next_dfa;
    }

    virtual id_type push_dfa () const
    {
        return _push_dfa;
    }

    virtual bool pop_dfa () const
    {
        return _pop_dfa;
    }

private:
    id_type _id;
    id_type _user_id;
    id_type _next_dfa;
    id_type _push_dfa;
    bool _pop_dfa;
    node_vector _followpos;

    virtual void copy_node (node_ptr_vector &/*node_ptr_vector_*/,
        node_stack &/*new_node_stack_*/, bool_stack &/*perform_op_stack_*/,
        bool &/*down_*/) const
    {
        // Nothing to do, as end_nodes are not copied.
    }
};
}
}
#endif

View File

@@ -0,0 +1,103 @@
// iteration_node.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_ITERATION_NODE_HPP
#define LEXERTL_ITERATION_NODE_HPP
#include "node.hpp"
namespace lexertl
{
namespace detail
{
// Syntax tree node for an iteration over a single child node. firstpos and
// lastpos are taken from the child, every lastpos node gets the firstpos set
// appended to its followpos (the loop back edge), and the greedy setting is
// passed on to every firstpos node.
template<typename id_type>
class basic_iteration_node : public basic_node<id_type>
{
public:
    typedef basic_node<id_type> node;
    typedef typename node::bool_stack bool_stack;
    typedef typename node::const_node_stack const_node_stack;
    typedef typename node::node_ptr_vector node_ptr_vector;
    typedef typename node::node_stack node_stack;
    typedef typename node::node_type node_type;
    typedef typename node::node_vector node_vector;

    basic_iteration_node (basic_node<id_type> *next_, const bool greedy_) :
        basic_node<id_type> (true),
        _next (next_),
        _greedy (greedy_)
    {
        _next->append_firstpos (node::_firstpos);
        _next->append_lastpos (node::_lastpos);

        // Loop back: whatever can end the child may be followed by whatever
        // can start it.
        for (typename node_vector::iterator last_ = node::_lastpos.begin (),
            last_end_ = node::_lastpos.end (); last_ != last_end_; ++last_)
        {
            (*last_)->append_followpos (node::_firstpos);
        }

        for (typename node_vector::iterator first_ = node::_firstpos.begin (),
            first_end_ = node::_firstpos.end (); first_ != first_end_;
            ++first_)
        {
            (*first_)->greedy (greedy_);
        }
    }

    virtual ~basic_iteration_node ()
    {
    }

    virtual node_type what_type () const
    {
        return node::ITERATION;
    }

    // Schedule the child for traversal and request that the operation be
    // performed for it.
    virtual bool traverse (const_node_stack &node_stack_,
        bool_stack &perform_op_stack_) const
    {
        perform_op_stack_.push (true);
        node_stack_.push (_next);
        return true;
    }

private:
    // Not owner of this pointer...
    basic_node<id_type> *_next;
    bool _greedy;

    // Either wrap the node on top of new_node_stack_ in a fresh iteration
    // node, or signal through down_ that the traversal has to descend first.
    virtual void copy_node (node_ptr_vector &node_ptr_vector_,
        node_stack &new_node_stack_, bool_stack &perform_op_stack_,
        bool &down_) const
    {
        if (!perform_op_stack_.top ())
        {
            down_ = true;
        }
        else
        {
            basic_node<id_type> *child_ = new_node_stack_.top ();

            // Reserve the slot before allocating, so that the new node is
            // stored in the container as soon as it exists.
            node_ptr_vector_->push_back
                (static_cast<basic_iteration_node<id_type> *>(0));
            node_ptr_vector_->back () = new basic_iteration_node
                (child_, _greedy);
            new_node_stack_.top () = node_ptr_vector_->back ();
        }

        perform_op_stack_.pop ();
    }

    // No copy construction.
    basic_iteration_node (const basic_iteration_node &);
    // No assignment.
    const basic_iteration_node &operator = (const basic_iteration_node &);
};
}
}
#endif

View File

@@ -0,0 +1,114 @@
// leaf_node.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_LEAF_NODE_HPP
#define LEXERTL_LEAF_NODE_HPP
#include "../../enums.hpp" // null_token
#include "node.hpp"
#include "../../size_t.hpp"
namespace lexertl
{
namespace detail
{
// Leaf of the syntax tree (node::LEAF) holding a single token id. A leaf
// built from node::null_token () is nullable and has empty
// firstpos/lastpos; any other leaf is its own firstpos and lastpos.
template<typename id_type>
class basic_leaf_node : public basic_node<id_type>
{
public:
    typedef basic_node<id_type> node;
    typedef typename node::bool_stack bool_stack;
    typedef typename node::const_node_stack const_node_stack;
    typedef typename node::node_ptr_vector node_ptr_vector;
    typedef typename node::node_stack node_stack;
    typedef typename node::node_type node_type;
    typedef typename node::node_vector node_vector;

    // A non-greedy leaf is marked as already set, so a later greedy (bool)
    // call cannot overwrite it.
    basic_leaf_node (const id_type token_, const bool greedy_) :
        basic_node<id_type> (token_ == node::null_token ()),
        _token (token_),
        _set_greedy (!greedy_),
        _greedy (greedy_),
        _followpos ()
    {
        if (node::_nullable) return;

        node::_firstpos.push_back (this);
        node::_lastpos.push_back (this);
    }

    virtual ~basic_leaf_node ()
    {
    }

    // Appends every entry of followpos_ to this leaf's followpos, in order.
    virtual void append_followpos (const node_vector &followpos_)
    {
        _followpos.insert (_followpos.end (),
            followpos_.begin (), followpos_.end ());
    }

    // Identifies this node as node::LEAF.
    virtual node_type what_type () const
    {
        return node::LEAF;
    }

    // A leaf has no children: nothing is pushed and false is returned.
    virtual bool traverse (const_node_stack &/*node_stack_*/,
        bool_stack &/*perform_op_stack_*/) const
    {
        return false;
    }

    virtual id_type token () const
    {
        return _token;
    }

    // Only the first effective call changes the flag; later calls are
    // ignored.
    virtual void greedy (const bool greedy_)
    {
        if (_set_greedy) return;

        _greedy = greedy_;
        _set_greedy = true;
    }

    virtual bool greedy () const
    {
        return _greedy;
    }

    virtual const node_vector &followpos () const
    {
        return _followpos;
    }

    virtual node_vector &followpos ()
    {
        return _followpos;
    }

private:
    id_type _token;
    bool _set_greedy;
    bool _greedy;
    node_vector _followpos;

    // Creates a new leaf with the same token and greedy flag, hands
    // ownership to node_ptr_vector_ and pushes it on new_node_stack_.
    virtual void copy_node (node_ptr_vector &node_ptr_vector_,
        node_stack &new_node_stack_, bool_stack &/*perform_op_stack_*/,
        bool &/*down_*/) const
    {
        // Reserve the slot first so the new node is owned as soon as it
        // exists.
        node_ptr_vector_->push_back (static_cast<basic_leaf_node *>(0));
        node_ptr_vector_->back () = new basic_leaf_node<id_type>
            (_token, _greedy);
        new_node_stack_.push (node_ptr_vector_->back ());
    }
};
}
}
#endif

View File

@@ -0,0 +1,241 @@
// node.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_NODE_HPP
#define LEXERTL_NODE_HPP
#include <assert.h>
#include "../../containers/ptr_vector.hpp"
#include "../../runtime_error.hpp"
#include "../../size_t.hpp"
#include <stack>
#include <vector>
namespace lexertl
{
namespace detail
{
// Abstract base class of the syntax tree nodes. It stores the nullable
// flag plus the firstpos/lastpos sets and supplies default implementations
// of the node-specific accessors that throw runtime_error; derived classes
// override the ones that apply to them.
template<typename id_type>
class basic_node
{
public:
    enum node_type {LEAF, SEQUENCE, SELECTION, ITERATION, END};

    typedef std::stack<bool> bool_stack;
    typedef std::stack<basic_node<id_type> *> node_stack;
    // stack and vector not owner of node pointers
    typedef std::stack<const basic_node<id_type> *> const_node_stack;
    typedef std::vector<basic_node<id_type> *> node_vector;
    typedef ptr_vector<basic_node<id_type> > node_ptr_vector;

    // Creates a non-nullable node with empty firstpos/lastpos.
    basic_node () :
        _nullable (false),
        _firstpos (),
        _lastpos ()
    {
    }

    // Creates a node with the given nullable flag and empty
    // firstpos/lastpos.
    basic_node (const bool nullable_) :
        _nullable (nullable_),
        _firstpos (),
        _lastpos ()
    {
    }

    virtual ~basic_node ()
    {
    }

    // Token value used to mark a leaf that matches nothing (all bits set).
    static id_type null_token ()
    {
        return static_cast<id_type>(~0);
    }

    bool nullable () const
    {
        return _nullable;
    }

    // Appends this node's firstpos entries to firstpos_.
    void append_firstpos (node_vector &firstpos_) const
    {
        firstpos_.insert (firstpos_.end (),
            _firstpos.begin (), _firstpos.end ());
    }

    // Appends this node's lastpos entries to lastpos_.
    void append_lastpos (node_vector &lastpos_) const
    {
        lastpos_.insert (lastpos_.end (),
            _lastpos.begin (), _lastpos.end ());
    }

    // Default: not supported by this node type.
    virtual void append_followpos (const node_vector &/*followpos_*/)
    {
        throw runtime_error ("Internal error node::append_followpos().");
    }

    // Copies the tree rooted at this node without recursion. traverse ()
    // pushes children while descending and copy_node () builds the new
    // nodes on the way back up; the new nodes are owned by
    // node_ptr_vector_. Returns the root of the copy.
    basic_node *copy (node_ptr_vector &node_ptr_vector_) const
    {
        basic_node *new_root_ = 0;
        const_node_stack node_stack_;
        bool_stack perform_op_stack_;
        bool down_ = true;
        node_stack new_node_stack_;

        node_stack_.push (this);

        while (!node_stack_.empty ())
        {
            // Descend until a node reports that it pushed no children.
            while (down_)
            {
                down_ = node_stack_.top ()->traverse (node_stack_,
                    perform_op_stack_);
            }

            // Unwind; copy_node () may set down_ to request another
            // descent, in which case the current node stays on the stack.
            while (!down_ && !node_stack_.empty ())
            {
                const basic_node *top_ = node_stack_.top ();

                top_->copy_node (node_ptr_vector_, new_node_stack_,
                    perform_op_stack_, down_);

                if (!down_) node_stack_.pop ();
            }
        }

        assert (new_node_stack_.size () == 1);
        new_root_ = new_node_stack_.top ();
        new_node_stack_.pop ();
        return new_root_;
    }

    virtual node_type what_type () const = 0;

    // Pushes any children onto node_stack_ (and matching flags onto
    // perform_op_stack_); returns true if the caller should keep
    // descending.
    virtual bool traverse (const_node_stack &node_stack_,
        bool_stack &perform_op_stack_) const = 0;

    node_vector &firstpos ()
    {
        return _firstpos;
    }

    const node_vector &firstpos () const
    {
        return _firstpos;
    }

    // _lastpos modified externally, so not const &
    node_vector &lastpos ()
    {
        return _lastpos;
    }

    virtual bool end_state () const
    {
        return false;
    }

    // The accessors below are only meaningful for specific node types; the
    // base versions always throw.
    virtual id_type id () const
    {
        throw runtime_error ("Internal error node::id().");
#ifdef __SUNPRO_CC
        // Stop bogus Solaris compiler warning
        return id_type ();
#endif
    }

    virtual id_type user_id () const
    {
        throw runtime_error ("Internal error node::user_id().");
#ifdef __SUNPRO_CC
        // Stop bogus Solaris compiler warning
        return id_type ();
#endif
    }

    virtual id_type next_dfa () const
    {
        throw runtime_error ("Internal error node::next_dfa().");
#ifdef __SUNPRO_CC
        // Stop bogus Solaris compiler warning
        return id_type ();
#endif
    }

    virtual id_type push_dfa () const
    {
        throw runtime_error ("Internal error node::push_dfa().");
#ifdef __SUNPRO_CC
        // Stop bogus Solaris compiler warning
        return id_type ();
#endif
    }

    virtual bool pop_dfa () const
    {
        throw runtime_error ("Internal error node::pop_dfa().");
#ifdef __SUNPRO_CC
        // Stop bogus Solaris compiler warning
        return false;
#endif
    }

    virtual id_type token () const
    {
        throw runtime_error ("Internal error node::token().");
#ifdef __SUNPRO_CC
        // Stop bogus Solaris compiler warning
        return id_type ();
#endif
    }

    virtual void greedy (const bool /*greedy_*/)
    {
        throw runtime_error ("Internal error node::greedy(bool).");
    }

    virtual bool greedy () const
    {
        throw runtime_error ("Internal error node::greedy().");
#ifdef __SUNPRO_CC
        // Stop bogus Solaris compiler warning
        return false;
#endif
    }

    virtual const node_vector &followpos () const
    {
        throw runtime_error ("Internal error node::followpos().");
#ifdef __SUNPRO_CC
        // Stop bogus Solaris compiler warning. Never reached; must name
        // the data member, not the firstpos () member function.
        return _firstpos;
#endif
    }

    virtual node_vector &followpos ()
    {
        throw runtime_error ("Internal error node::followpos().");
#ifdef __SUNPRO_CC
        // Stop bogus Solaris compiler warning. Never reached; must name
        // the data member, not the firstpos () member function.
        return _firstpos;
#endif
    }

protected:
    const bool _nullable;
    node_vector _firstpos;
    node_vector _lastpos;

    // Builds the copy of this node during copy (); see copy () for how
    // the stacks and down_ are used.
    virtual void copy_node (node_ptr_vector &node_ptr_vector_,
        node_stack &new_node_stack_, bool_stack &perform_op_stack_,
        bool &down_) const = 0;

private:
    basic_node (const basic_node &); // No copy construction.
    const basic_node &operator = (const basic_node &); // No assignment.
};
}
}
#endif

View File

@@ -0,0 +1,106 @@
// selection_node.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_SELECTION_NODE_HPP
#define LEXERTL_SELECTION_NODE_HPP
#include "node.hpp"
namespace lexertl
{
namespace detail
{
// Tree node with two alternatives (node::SELECTION). It is nullable when
// either child is, and its firstpos/lastpos are the children's sets
// concatenated left then right.
template<typename id_type>
class basic_selection_node : public basic_node<id_type>
{
public:
    typedef basic_node<id_type> node;
    typedef typename node::bool_stack bool_stack;
    typedef typename node::const_node_stack const_node_stack;
    typedef typename node::node_ptr_vector node_ptr_vector;
    typedef typename node::node_stack node_stack;
    typedef typename node::node_type node_type;

    // Neither child is owned by this node.
    basic_selection_node (basic_node<id_type> *left_,
        basic_node<id_type> *right_) :
        basic_node<id_type> (left_->nullable () || right_->nullable ()),
        _left (left_),
        _right (right_)
    {
        _left->append_firstpos (node::_firstpos);
        _right->append_firstpos (node::_firstpos);

        _left->append_lastpos (node::_lastpos);
        _right->append_lastpos (node::_lastpos);
    }

    virtual ~basic_selection_node ()
    {
    }

    // Identifies this node as node::SELECTION.
    virtual node_type what_type () const
    {
        return node::SELECTION;
    }

    // Schedules both children (left on top). An extra "false" flag is
    // pushed when the right child is itself an operator node, so that the
    // first copy_node () call triggers a descent into it.
    virtual bool traverse (const_node_stack &node_stack_,
        bool_stack &perform_op_stack_) const
    {
        const node_type right_type_ = _right->what_type ();
        const bool right_is_operator_ = right_type_ == node::SEQUENCE ||
            right_type_ == node::SELECTION ||
            right_type_ == node::ITERATION;

        perform_op_stack_.push (true);

        if (right_is_operator_)
        {
            perform_op_stack_.push (false);
        }

        node_stack_.push (_right);
        node_stack_.push (_left);
        return true;
    }

private:
    // Not owner of these pointers...
    basic_node<id_type> *_left;
    basic_node<id_type> *_right;

    // When the top of perform_op_stack_ is set, replaces the two topmost
    // nodes of new_node_stack_ with a new selection node (owned by
    // node_ptr_vector_); otherwise asks the caller to descend again.
    virtual void copy_node (node_ptr_vector &node_ptr_vector_,
        node_stack &new_node_stack_, bool_stack &perform_op_stack_,
        bool &down_) const
    {
        if (!perform_op_stack_.top ())
        {
            down_ = true;
        }
        else
        {
            basic_node<id_type> *right_copy_ = new_node_stack_.top ();

            new_node_stack_.pop ();

            basic_node<id_type> *left_copy_ = new_node_stack_.top ();

            // Reserve the slot first so the new node is owned as soon as
            // it exists.
            node_ptr_vector_->push_back
                (static_cast<basic_selection_node *>(0));
            node_ptr_vector_->back () = new basic_selection_node
                (left_copy_, right_copy_);
            new_node_stack_.top () = node_ptr_vector_->back ();
        }

        perform_op_stack_.pop ();
    }

    // No copy construction.
    basic_selection_node (const basic_selection_node &);
    // No assignment.
    const basic_selection_node &operator = (const basic_selection_node &);
};
}
}
#endif

View File

@@ -0,0 +1,126 @@
// sequence_node.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_SEQUENCE_NODE_HPP
#define LEXERTL_SEQUENCE_NODE_HPP
#include "node.hpp"
namespace lexertl
{
namespace detail
{
// Tree node for "left followed by right" (node::SEQUENCE). It is nullable
// only when both children are.
template<typename id_type>
class basic_sequence_node : public basic_node<id_type>
{
public:
typedef basic_node<id_type> node;
typedef typename node::bool_stack bool_stack;
typedef typename node::const_node_stack const_node_stack;
typedef typename node::node_ptr_vector node_ptr_vector;
typedef typename node::node_stack node_stack;
typedef typename node::node_type node_type;
typedef typename node::node_vector node_vector;
// Computes firstpos/lastpos from the children and appends the right
// child's firstpos to the followpos of every node in the left child's
// lastpos. Neither child is owned by this node.
basic_sequence_node (basic_node<id_type> *left_,
basic_node<id_type> *right_) :
basic_node<id_type> (left_->nullable () && right_->nullable ()),
_left (left_),
_right (right_)
{
_left->append_firstpos (node::_firstpos);
// The right child can only start the sequence if the left can be skipped.
if (_left->nullable ())
{
_right->append_firstpos (node::_firstpos);
}
// The left child can only end the sequence if the right can be skipped.
if (_right->nullable ())
{
_left->append_lastpos (node::_lastpos);
}
_right->append_lastpos (node::_lastpos);
node_vector &lastpos_ = _left->lastpos ();
const node_vector &firstpos_ = _right->firstpos ();
for (typename node_vector::iterator iter_ = lastpos_.begin (),
end_ = lastpos_.end (); iter_ != end_; ++iter_)
{
(*iter_)->append_followpos (firstpos_);
}
}
virtual ~basic_sequence_node ()
{
}
// Identifies this node as node::SEQUENCE.
virtual node_type what_type () const
{
return node::SEQUENCE;
}
// Schedules both children (left on top of the stack). An extra "false"
// flag is pushed when the right child is itself an operator node.
virtual bool traverse (const_node_stack &node_stack_,
bool_stack &perform_op_stack_) const
{
perform_op_stack_.push (true);
switch (_right->what_type ())
{
case node::SEQUENCE:
case node::SELECTION:
case node::ITERATION:
perform_op_stack_.push (false);
break;
default:
break;
}
node_stack_.push (_right);
node_stack_.push (_left);
return true;
}
private:
// Not owner of these pointers...
basic_node<id_type> *_left;
basic_node<id_type> *_right;
// When the top of perform_op_stack_ is true, replaces the two topmost
// entries of new_node_stack_ with a new sequence node (stored in
// node_ptr_vector_); otherwise sets down_ so the caller descends again.
// The flag is popped in both cases.
virtual void copy_node (node_ptr_vector &node_ptr_vector_,
node_stack &new_node_stack_, bool_stack &perform_op_stack_,
bool &down_) const
{
if (perform_op_stack_.top ())
{
basic_node<id_type> *rhs_ = new_node_stack_.top ();
new_node_stack_.pop ();
basic_node<id_type> *lhs_ = new_node_stack_.top ();
// A null slot is pushed before allocating the node that fills it.
node_ptr_vector_->push_back
(static_cast<basic_sequence_node<id_type> *>(0));
node_ptr_vector_->back () = new basic_sequence_node<id_type>
(lhs_, rhs_);
new_node_stack_.top () = node_ptr_vector_->back ();
}
else
{
down_ = true;
}
perform_op_stack_.pop ();
}
// No copy construction.
basic_sequence_node (const basic_sequence_node &);
// No assignment.
const basic_sequence_node &operator = (const basic_sequence_node &);
};
}
}
#endif

View File

@@ -0,0 +1,73 @@
// charset.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_CHARSET_HPP
#define LEXERTL_CHARSET_HPP
#include <algorithm>
#include <iterator>
#include <set>
#include "../size_t.hpp"
#include "../string_token.hpp"
namespace lexertl
{
namespace detail
{
// Pairs a string token with the set of indexes that refer to it.
template<typename char_type, typename id_type>
struct basic_charset
{
    typedef basic_string_token<char_type> token;
    typedef std::set<id_type> index_set;

    token _token;
    index_set _index_set;

    // Creates an empty charset.
    basic_charset () :
        _token (),
        _index_set ()
    {
    }

    // Creates a charset for token_ that is referenced by index_ only.
    basic_charset (const token &token_, const std::size_t index_) :
        _token (token_),
        _index_set ()
    {
        _index_set.insert (index_);
    }

    // True when both the token and the index set are empty.
    bool empty () const
    {
        if (!_token.empty ()) return false;

        return _index_set.empty ();
    }

    // Intersects the two tokens, storing the common part in overlap_.
    // When there is an overlap, overlap_ receives the indexes of both
    // sides, and any side whose token became empty has its index set
    // cleared.
    void intersect (basic_charset &rhs_, basic_charset &overlap_)
    {
        _token.intersect (rhs_._token, overlap_._token);

        if (overlap_._token.empty ()) return;

        index_set &merged_ = overlap_._index_set;

        std::merge (_index_set.begin (), _index_set.end (),
            rhs_._index_set.begin (), rhs_._index_set.end (),
            std::inserter (merged_, merged_.end ()));

        if (_token.empty ())
        {
            _index_set.clear ();
        }

        if (rhs_._token.empty ())
        {
            rhs_._index_set.clear ();
        }
    }
};
}
}
#endif

View File

@@ -0,0 +1,134 @@
// equivset.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_EQUIVSET_HPP
#define LEXERTL_EQUIVSET_HPP
#include <algorithm>
#include "../parser/tree/node.hpp"
#include <set>
namespace lexertl
{
namespace detail
{
// Groups a sorted list of indexes with an id, a greedy flag and the
// followpos nodes associated with them.
template<typename id_type>
struct basic_equivset
{
typedef std::set<id_type> index_set;
typedef std::vector<id_type> index_vector;
// Not owner of nodes:
typedef basic_node<id_type> node;
typedef std::vector<node *> node_vector;
index_vector _index_vector;
id_type _id;
bool _greedy;
node_vector _followpos;
// Creates an empty, greedy equivset with id 0.
basic_equivset () :
_index_vector (),
_id (0),
_greedy (true),
_followpos ()
{
}
// Copies the indexes of index_set_ (in set order) and the given
// id, greedy flag and followpos.
basic_equivset (const index_set &index_set_, const id_type id_,
const bool greedy_, const node_vector &followpos_) :
_index_vector (index_set_.begin (), index_set_.end ()),
_id (id_),
_greedy (greedy_),
_followpos (followpos_)
{
}
// True when there are neither indexes nor followpos nodes.
bool empty () const
{
return _index_vector.empty () && _followpos.empty ();
}
// Moves the indexes common to *this and rhs_ into overlap_. If any were
// found, overlap_ takes id/greedy/followpos from *this, plus every
// followpos node of rhs_ not already present; a side left without
// indexes has its followpos cleared.
void intersect (basic_equivset &rhs_, basic_equivset &overlap_)
{
intersect_indexes (rhs_._index_vector, overlap_._index_vector);
if (!overlap_._index_vector.empty ())
{
// Note that the LHS takes priority in order to
// respect rule ordering priority in the lex spec.
overlap_._id = _id;
overlap_._greedy = _greedy;
overlap_._followpos = _followpos;
typename node_vector::const_iterator overlap_begin_ =
overlap_._followpos.begin ();
typename node_vector::const_iterator overlap_end_ =
overlap_._followpos.end ();
typename node_vector::const_iterator rhs_iter_ =
rhs_._followpos.begin ();
typename node_vector::const_iterator rhs_end_ =
rhs_._followpos.end ();
for (; rhs_iter_ != rhs_end_; ++rhs_iter_)
{
node *node_ = *rhs_iter_;
if (std::find (overlap_begin_, overlap_end_, node_) ==
overlap_end_)
{
overlap_._followpos.push_back (node_);
// push_back may reallocate, so the cached iterators are refreshed.
overlap_begin_ = overlap_._followpos.begin ();
overlap_end_ = overlap_._followpos.end ();
}
}
if (_index_vector.empty ())
{
_followpos.clear ();
}
if (rhs_._index_vector.empty ())
{
rhs_._followpos.clear ();
}
}
}
private:
// Walks both index vectors in step, advancing whichever side holds the
// smaller value; equal values are appended to overlap_ and erased from
// both vectors. NOTE(review): the stepping presumes both vectors are
// sorted ascending - confirm for callers that fill them directly.
void intersect_indexes (index_vector &rhs_, index_vector &overlap_)
{
typename index_vector::iterator iter_ = _index_vector.begin ();
typename index_vector::iterator end_ = _index_vector.end ();
typename index_vector::iterator rhs_iter_ = rhs_.begin ();
typename index_vector::iterator rhs_end_ = rhs_.end ();
while (iter_ != end_ && rhs_iter_ != rhs_end_)
{
const id_type index_ = *iter_;
const id_type rhs_index_ = *rhs_iter_;
if (index_ < rhs_index_)
{
++iter_;
}
else if (index_ > rhs_index_)
{
++rhs_iter_;
}
else
{
overlap_.push_back (index_);
// erase () invalidates iterators, so continue from its return value
// and re-read end ().
iter_ = _index_vector.erase (iter_);
end_ = _index_vector.end ();
rhs_iter_ = rhs_.erase (rhs_iter_);
rhs_end_ = rhs_.end ();
}
}
}
};
}
}
#endif

743
inc/lexertl/rules.hpp Normal file
View File

@@ -0,0 +1,743 @@
// rules.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_RULES_HPP
#define LEXERTL_RULES_HPP
#include "compile_assert.hpp"
#include <deque>
#include "enums.hpp"
#include "internals.hpp"
#include <locale>
#include <map>
#include "runtime_error.hpp"
#include <set>
#include "size_t.hpp"
#include <sstream>
#include <string>
#include <vector>
namespace lexertl
{
template<typename ch_type, typename id_ty = std::size_t>
class basic_rules
{
public:
typedef std::vector<bool> bool_vector;
typedef std::deque<bool_vector> bool_vector_deque;
typedef ch_type char_type;
typedef id_ty id_type;
typedef std::vector<id_type> id_vector;
typedef std::deque<id_vector> id_vector_deque;
typedef std::basic_string<char_type> string;
typedef std::deque<string> string_deque;
typedef std::deque<string_deque> string_deque_deque;
typedef std::set<string> string_set;
typedef std::pair<string, string> string_pair;
typedef std::deque<string_pair> string_pair_deque;
typedef std::map<string, id_type> string_id_type_map;
typedef std::pair<string, id_type> string_id_type_pair;
// If you get a compile error here you have
// failed to define an unsigned id type.
compile_assert<(static_cast<id_type>(~0) > 0)>
_valid_id_type;
// Creates a rule set with the given flags (dot_not_newline by default), a
// default-constructed locale and eoi id 0, then registers the INITIAL
// state so that every per-state container has an entry at index 0.
basic_rules (const std::size_t flags_ = dot_not_newline) :
_valid_id_type (),
_statemap (),
_macrodeque (),
_macroset (),
_regexes (),
_features (),
_ids (),
_user_ids (),
_next_dfas (),
_pushes (),
_pops (),
_flags (flags_),
_locale (),
_lexer_state_names (),
_eoi (0)
{
add_state (initial ());
}
// Returns the object to its freshly constructed state: no macros, no
// rules, default flags/locale/eoi and only the INITIAL state registered.
void clear ()
{
    // Settings.
    _flags = dot_not_newline;
    _locale = std::locale ();
    _eoi = 0;

    // Macros.
    _macrodeque.clear ();
    _macroset.clear ();

    // States and their per-state rule data.
    _statemap.clear ();
    _lexer_state_names.clear ();
    _regexes.clear ();
    _features.clear ();
    _ids.clear ();
    _user_ids.clear ();
    _next_dfas.clear ();
    _pushes.clear ();
    _pops.clear ();

    // Must come last: re-creates slot 0 in the containers emptied above.
    add_state (initial ());
}

// Removes every rule of state dfa_ but keeps the state itself. Unknown
// state indexes are ignored.
void clear (const id_type dfa_)
{
    if (dfa_ >= _regexes.size ()) return;

    _features[dfa_] = 0;
    _regexes[dfa_].clear ();
    _ids[dfa_].clear ();
    _user_ids[dfa_].clear ();
    _next_dfas[dfa_].clear ();
    _pushes[dfa_].clear ();
    _pops[dfa_].clear ();
}
// Replaces the stored flags value.
void flags (const std::size_t flags_)
{
_flags = flags_;
}
// Returns the stored flags value.
std::size_t flags () const
{
return _flags;
}
// Reserved id (all bits set except the lowest); rules added with this id
// set skip_bit in the features of their state.
static id_type skip ()
{
return static_cast<id_type>(~1);
}
// Sets the id treated as end of input.
void eoi (const id_type eoi_)
{
_eoi = eoi_;
}
// Returns the id treated as end of input.
id_type eoi () const
{
return _eoi;
}
// Installs locale_ and returns the locale that was stored before.
std::locale imbue (const std::locale &locale_)
{
std::locale loc_ = _locale;
_locale = locale_;
return loc_;
}
// Returns the stored locale.
const std::locale &locale () const
{
return _locale;
}
// Returns the name of the state with the given index, or 0 when the index
// is unknown. Index 0 is INITIAL, whose name is not stored in
// _lexer_state_names, hence the offset of one for all other states.
const char_type *state (const id_type index_) const
{
    if (index_ == 0) return initial ();

    const id_type slot_ = index_ - 1;

    return _lexer_state_names.size () > slot_ ?
        _lexer_state_names[slot_].c_str () : 0;
}
// Returns the index of the state called name_, or npos () when no such
// state has been added.
id_type state (const char_type *name_) const
{
    const typename string_id_type_map::const_iterator found_ =
        _statemap.find (name_);

    return found_ != _statemap.end () ? found_->second : npos ();
}
// Registers a new lexer state and returns its index. If the name is
// already known the existing index is returned and nothing changes.
// Throws runtime_error for an invalid name or when id_type cannot
// represent the number of states.
id_type add_state (const char_type *name_)
{
    validate (name_);

    const std::pair<typename string_id_type_map::iterator, bool> result_ =
        _statemap.insert (string_id_type_pair (name_, _statemap.size ()));

    if (!result_.second)
    {
        // Already registered.
        return result_.first->second;
    }

    // One slot per state in each of the per-state containers.
    _regexes.push_back (string_deque ());
    _features.push_back (0);
    _ids.push_back (id_vector ());
    _user_ids.push_back (id_vector ());
    _next_dfas.push_back (id_vector ());
    _pushes.push_back (id_vector ());
    _pops.push_back (bool_vector ());

    if (string (name_) != initial ())
    {
        _lexer_state_names.push_back (name_);
    }

    if (_next_dfas.size () > npos ())
    {
        // Overflow
        throw runtime_error ("The data type you have chosen cannot hold "
            "this many lexer start states.");
    }

    // Initial is not stored, so no need to - 1.
    return static_cast<id_type>(_lexer_state_names.size ());
}
void add_macro (const char_type *name_, const char_type *regex_)
{
add_macro (name_, string (regex_));
}
void add_macro (const char_type *name_, const char_type *regex_start_,
const char_type *regex_end_)
{
add_macro (name_, string (regex_start_, regex_end_));
}
void add_macro (const char_type *name_, const string &regex_)
{
validate (name_);
typename string_set::const_iterator iter_ = _macroset.find (name_);
if (iter_ == _macroset.end ())
{
_macrodeque.push_back (string_pair (name_, regex_));
_macroset.insert (name_);
}
else
{
std::basic_stringstream<char_type> ss_;
std::ostringstream os_;
os_ << "Attempt to redefine MACRO '";
while (*name_)
{
os_ << ss_.narrow (*name_++, static_cast<char_type> (' '));
}
os_ << "'.";
throw runtime_error (os_.str ());
}
}
// Copies every macro of rules_ into this object, in definition order.
// Throws runtime_error (from add_macro) as soon as a macro name is
// already defined here; macros copied before that point are kept.
void add_macros (const basic_rules &rules_)
{
    const string_pair_deque &macros_ = rules_.macrodeque ();
    typename string_pair_deque::const_iterator macro_iter_ =
        macros_.begin ();
    typename string_pair_deque::const_iterator macro_end_ =
        macros_.end ();

    for (; macro_iter_ != macro_end_; ++macro_iter_)
    {
        // Pass the regex as a string: going through c_str () would cut it
        // at the first embedded NUL, which the (start, end) overload of
        // add_macro allows, and would cost an extra copy.
        add_macro (macro_iter_->first.c_str (), macro_iter_->second);
    }
}
// Copies the macros of rules_ that are not yet defined in this object;
// macros whose name already exists here are left untouched.
void merge_macros (const basic_rules &rules_)
{
    const string_pair_deque &macros_ = rules_.macrodeque ();
    typename string_pair_deque::const_iterator macro_iter_ =
        macros_.begin ();
    typename string_pair_deque::const_iterator macro_end_ =
        macros_.end ();
    typename string_set::const_iterator macro_dest_iter_;
    typename string_set::const_iterator macro_dest_end_ = _macroset.end ();

    for (; macro_iter_ != macro_end_; ++macro_iter_)
    {
        macro_dest_iter_ = _macroset.find (macro_iter_->first);

        if (macro_dest_iter_ == macro_dest_end_)
        {
            // Pass the regex as a string: going through c_str () would
            // cut it at the first embedded NUL and cost an extra copy.
            add_macro (macro_iter_->first.c_str (), macro_iter_->second);
        }
    }
}
// Add rule to INITIAL
// Overload taking a NUL-terminated regex.
void add (const char_type *regex_, const id_type id_,
const id_type user_id_ = npos ())
{
add (string (regex_), id_, user_id_);
}
// Overload taking the regex as the range [regex_start_, regex_end_).
void add (const char_type *regex_start_, const char_type *regex_end_,
const id_type id_, const id_type user_id_ = npos ())
{
add (string (regex_start_, regex_end_), id_, user_id_);
}
// Appends the rule to state 0. The rule stays in state 0 (next dfa 0),
// pushes nothing (npos ()) and does not pop. Throws runtime_error if id_
// is rejected by check_for_invalid_id ().
void add (const string &regex_, const id_type id_,
const id_type user_id_ = npos ())
{
check_for_invalid_id (id_);
_regexes.front ().push_back (regex_);
// A leading '^' / trailing '$' flags the state as using bol / eol.
if (regex_[0] == '^')
{
_features.front () |= bol_bit;
}
if (regex_.size () > 0 && regex_[regex_.size () - 1] == '$')
{
_features.front () |= eol_bit;
}
if (id_ == skip ())
{
_features.front () |= skip_bit;
}
// NOTE(review): this branch looks unreachable here, because
// check_for_invalid_id () above already throws when id_ == _eoi.
else if (id_ == eoi ())
{
_features.front () |= again_bit;
}
_ids.front ().push_back (id_);
_user_ids.front ().push_back (user_id_);
_next_dfas.front ().push_back (0);
_pushes.front ().push_back (npos ());
_pops.front ().push_back (false);
}
// Add rule with no id
// Overload taking a NUL-terminated regex.
void add (const char_type *curr_dfa_,
const char_type *regex_, const char_type *new_dfa_)
{
add (curr_dfa_, string (regex_), new_dfa_);
}
// Overload taking the regex as the range [regex_start_, regex_end_).
void add (const char_type *curr_dfa_,
const char_type *regex_start_, const char_type *regex_end_,
const char_type *new_dfa_)
{
add (curr_dfa_, string (regex_start_, regex_end_), new_dfa_);
}
// Adds the rule with the current eoi id and with id checking disabled.
// The bool fifth argument is what selects the private overload that has
// a check_ parameter, so its type must stay bool.
void add (const char_type *curr_dfa_, const string &regex_,
const char_type *new_dfa_)
{
add (curr_dfa_, regex_, _eoi, new_dfa_, false);
}
// Add rule with id
// Overload taking a NUL-terminated regex.
void add (const char_type *curr_dfa_,
const char_type *regex_, const id_type id_,
const char_type *new_dfa_, const id_type user_id_ = npos ())
{
add (curr_dfa_, string (regex_), id_, new_dfa_, user_id_);
}
// Overload taking the regex as the range [regex_start_, regex_end_).
void add (const char_type *curr_dfa_, const char_type *regex_start_,
const char_type *regex_end_, const id_type id_,
const char_type *new_dfa_, const id_type user_id_ = npos ())
{
add (curr_dfa_, string (regex_start_, regex_end_),
id_, new_dfa_, user_id_);
}
// Adds the rule with id checking enabled by forwarding to the private
// overload (the one with a bool check_ parameter).
void add (const char_type *curr_dfa_, const string &regex_,
const id_type id_, const char_type *new_dfa_,
const id_type user_id_ = npos ())
{
add (curr_dfa_, regex_, id_, new_dfa_, true, user_id_);
}
// Read-only views of the stored data. The deques returned by regexes (),
// ids (), user_ids (), next_dfas (), pushes () and pops (), and the
// features () vector, each hold one entry per state, appended together in
// add_state ().

// State name -> state index.
const string_id_type_map &statemap () const
{
return _statemap;
}
// (name, regex) pairs in definition order.
const string_pair_deque &macrodeque () const
{
return _macrodeque;
}
// Regexes of each state, in the order they were added.
const string_deque_deque &regexes () const
{
return _regexes;
}
// Feature bits of each state.
const id_vector &features () const
{
return _features;
}
// Rule ids of each state.
const id_vector_deque &ids () const
{
return _ids;
}
// Rule user ids of each state.
const id_vector_deque &user_ids () const
{
return _user_ids;
}
// State entered after each rule matches.
const id_vector_deque &next_dfas () const
{
return _next_dfas;
}
// State pushed by each rule, or npos () for none.
const id_vector_deque &pushes () const
{
return _pushes;
}
// Whether each rule pops a state.
const bool_vector_deque &pops () const
{
return _pops;
}
bool empty () const
{
typename string_deque_deque::const_iterator iter_ = _regexes.begin ();
typename string_deque_deque::const_iterator end_ = _regexes.end ();
bool empty_ = true;
for (; iter_ != end_; ++iter_)
{
if (!iter_->empty ())
{
empty_ = false;
break;
}
}
return empty_;
}
// Name of the state that always exists at index 0. The name is spelled as
// a character array so it works for any char_type.
static const char_type *initial ()
{
static const char_type initial_[] =
{'I', 'N', 'I', 'T', 'I', 'A', 'L', 0};
return initial_;
}
// Returns the string ".". The private add () treats a new_dfa_ of exactly
// "." as "stay in the current state".
static const char_type *dot ()
{
static const char_type dot_[] = {'.', 0};
return dot_;
}
// Returns the string "*". The private add () treats a curr_dfa_ of
// exactly "*" as "every state".
static const char_type *all_states ()
{
static const char_type star_[] = {'*', 0};
return star_;
}
// Reserved id with all bits set; used as the "not found" / "none" value.
static id_type npos ()
{
return static_cast<id_type>(~0);
}
private:
string_id_type_map _statemap;
string_pair_deque _macrodeque;
string_set _macroset;
string_deque_deque _regexes;
id_vector _features;
id_vector_deque _ids;
id_vector_deque _user_ids;
id_vector_deque _next_dfas;
id_vector_deque _pushes;
bool_vector_deque _pops;
std::size_t _flags;
std::locale _locale;
string_deque _lexer_state_names;
id_type _eoi;
void add (const char_type *curr_dfa_, const string &regex_,
const id_type id_, const char_type *new_dfa_,
const bool check_, const id_type user_id_ = npos ())
{
const bool star_ = *curr_dfa_ == '*' && *(curr_dfa_ + 1) == 0;
const bool dot_ = *new_dfa_ == '.' && *(new_dfa_ + 1) == 0;
const bool push_ = *new_dfa_ == '>';
const char_type *push_dfa_ = 0;
const bool pop_ = *new_dfa_ == '<';
if (push_ || pop_)
{
++new_dfa_;
}
if (check_)
{
check_for_invalid_id (id_);
}
if (!dot_ && !pop_)
{
const char_type *temp_ = new_dfa_;
while (*temp_ && *temp_ != ':')
{
++temp_;
}
if (*temp_) push_dfa_ = temp_ + 1;
validate (new_dfa_, *temp_ ? temp_ : 0);
if (push_dfa_)
{
validate (push_dfa_);
}
}
// npos means pop here
id_type new_dfa_id_ = npos ();
id_type push_dfa_id_ = npos ();
typename string_id_type_map::const_iterator iter_;
typename string_id_type_map::const_iterator end_ = _statemap.end ();
id_vector next_dfas_;
if (!dot_ && !pop_)
{
if (push_dfa_)
{
iter_ = _statemap.find (string (new_dfa_, push_dfa_ - 1));
}
else
{
iter_ = _statemap.find (new_dfa_);
}
if (iter_ == end_)
{
std::basic_stringstream<char_type> ss_;
std::ostringstream os_;
os_ << "Unknown state name '";
while (*new_dfa_)
{
os_ << ss_.narrow (*new_dfa_++, ' ');
}
os_ << "'.";
throw runtime_error (os_.str ());
}
new_dfa_id_ = iter_->second;
if (push_dfa_)
{
iter_ = _statemap.find (push_dfa_);
if (iter_ == end_)
{
std::basic_stringstream<char_type> ss_;
std::ostringstream os_;
os_ << "Unknown state name '";
while (*push_dfa_)
{
os_ << ss_.narrow (*push_dfa_++, ' ');
}
os_ << "'.";
throw runtime_error (os_.str ());
}
push_dfa_id_ = iter_->second;
}
}
if (star_)
{
const std::size_t size_ = _statemap.size ();
for (id_type i_ = 0; i_ < size_; ++i_)
{
next_dfas_.push_back (i_);
}
}
else
{
const char_type *start_ = curr_dfa_;
string next_dfa_;
while (*curr_dfa_)
{
while (*curr_dfa_ && *curr_dfa_ != ',')
{
++curr_dfa_;
}
next_dfa_.assign (start_, curr_dfa_);
if (*curr_dfa_)
{
++curr_dfa_;
start_ = curr_dfa_;
}
validate (next_dfa_.c_str ());
iter_ = _statemap.find (next_dfa_.c_str ());
if (iter_ == end_)
{
std::basic_stringstream<char_type> ss_;
std::ostringstream os_;
os_ << "Unknown state name '";
curr_dfa_ = next_dfa_.c_str ();
while (*curr_dfa_)
{
os_ << ss_.narrow (*curr_dfa_++, ' ');
}
os_ << "'.";
throw runtime_error (os_.str ());
}
next_dfas_.push_back (iter_->second);
}
}
for (std::size_t i_ = 0, size_ = next_dfas_.size ();
i_ < size_; ++i_)
{
const id_type curr_ = next_dfas_[i_];
_regexes[curr_].push_back (regex_);
if (regex_[0] == '^')
{
_features[curr_] |= bol_bit;
}
if (regex_[regex_.size () - 1] == '$')
{
_features[curr_] |= eol_bit;
}
if (id_ == skip ())
{
_features[curr_] |= skip_bit;
}
else if (id_ == eoi ())
{
_features[curr_] |= again_bit;
}
if (push_ || pop_)
{
_features[curr_] |= recursive_bit;
}
_ids[curr_].push_back (id_);
_user_ids[curr_].push_back (user_id_);
_next_dfas[curr_].push_back (dot_ ? curr_ : new_dfa_id_);
_pushes[curr_].push_back (push_ ? (push_dfa_ ?
push_dfa_id_ : curr_) : npos ());
_pops[curr_].push_back (pop_);
}
}
// Checks that name_ is an identifier: the first character must be '_' or
// an ASCII letter, the following ones may also be '-' or an ASCII digit.
// Scanning stops at the terminating NUL or at end_ (when given),
// whichever comes first. Throws runtime_error on the first offending
// character.
void validate (const char_type *name_, const char_type *end_ = 0) const
{
const char_type *start_ = name_;
if (*name_ != '_' && !(*name_ >= 'A' && *name_ <= 'Z') &&
!(*name_ >= 'a' && *name_ <= 'z'))
{
std::basic_stringstream<char_type> ss_;
std::ostringstream os_;
os_ << "Invalid name '";
// The message prints up to the NUL; end_ is not consulted here.
while (*name_)
{
os_ << ss_.narrow (*name_++, ' ');
}
os_ << "'.";
throw runtime_error (os_.str ());
}
else if (*name_)
{
++name_;
}
while (*name_ && name_ != end_)
{
if (*name_ != '_' && *name_ != '-' &&
!(*name_ >= 'A' && *name_ <= 'Z') &&
!(*name_ >= 'a' && *name_ <= 'z') &&
!(*name_ >= '0' && *name_ <= '9'))
{
std::basic_stringstream<char_type> ss_;
std::ostringstream os_;
os_ << "Invalid name '";
// Rewind so the whole name is reported, not just the tail.
name_ = start_;
while (*name_)
{
os_ << ss_.narrow (*name_++, ' ');
}
os_ << "'.";
throw runtime_error (os_.str ());
}
++name_;
}
}
// Throws runtime_error when id_ equals the current eoi id or npos ().
// The eoi comparison comes first, so its message wins when both match.
void check_for_invalid_id (const id_type id_) const
{
if (id_ == _eoi)
{
// NOTE(review): "resuse" is a typo for "reuse"; the runtime message is
// left unchanged here in case callers compare against it.
throw runtime_error ("Cannot resuse the id for eoi.");
}
if (id_ == npos ())
{
throw runtime_error ("id npos is reserved for the "
"UNKNOWN token.");
}
}
};
typedef basic_rules<char> rules;
typedef basic_rules<wchar_t> wrules;
}
#endif

View File

@@ -0,0 +1,23 @@
// runtime_error.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_RUNTIME_ERROR_HPP
#define LEXERTL_RUNTIME_ERROR_HPP
#include <stdexcept>
namespace lexertl
{
// Library-specific exception type. It adds nothing to std::runtime_error
// beyond a distinct type, so handlers for std::runtime_error or
// std::exception still catch it.
class runtime_error : public std::runtime_error
{
    typedef std::runtime_error base_type;

public:
    // what_arg_ is passed through unchanged and is returned by what ().
    runtime_error (const std::string &what_arg_) :
        base_type (what_arg_)
    {
    }
};
}
#endif

28
inc/lexertl/serialise.hpp Normal file
View File

@@ -0,0 +1,28 @@
// serialise.hpp
// Copyright (c) 2007-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_SERIALISE_HPP
#define LEXERTL_SERIALISE_HPP
#include "state_machine.hpp"
#include <boost/serialization/vector.hpp>
namespace lexertl
{
// IMPORTANT! This won't work if you don't enable RTTI!
// Passes the internals of sm_ through ar_ using operator &, in this order:
// _eoi, the object obtained by dereferencing _lookup, _dfa_alphabet,
// _features and the object obtained by dereferencing _dfa.
// NOTE(review): ar_ is presumably a Boost.Serialization archive (this
// header includes <boost/serialization/vector.hpp>), in which case the
// archive type decides whether a call saves or loads - confirm with callers.
template<typename CharT, typename id_type, class Archive>
void serialise (basic_state_machine<CharT, id_type> &sm_, Archive &ar_)
{
// Operate directly on the state machine's underlying data.
detail::basic_internals<id_type> &internals_ = sm_.data ();
ar_ & internals_._eoi;
// _lookup is dereferenced, so the pointee (not the handle) is archived.
ar_ & *internals_._lookup;
ar_ & internals_._dfa_alphabet;
ar_ & internals_._features;
// Same for _dfa.
ar_ & *internals_._dfa;
}
}
#endif

12
inc/lexertl/size_t.hpp Normal file
View File

@@ -0,0 +1,12 @@
// size_t.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_SIZE_T_H
#define LEXERTL_SIZE_T_H
#include <stddef.h> // ptrdiff_t
#include <cstring>
#endif

44
inc/lexertl/sm_traits.hpp Normal file
View File

@@ -0,0 +1,44 @@
// sm_traits.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_SM_TRAITS_H
#define LEXERTL_SM_TRAITS_H
namespace lexertl
{
// Compile-time description of a state machine flavour.
//   ch_type  - character type of the input
//   sm_type  - unsigned integer type used for ids
//   comp     - value exposed as 'compressed'
//   look     - value exposed as 'lookup'
//   dfa_nfa  - value exposed as 'is_dfa'
// In this primary template char_type is the same as the input character
// type.
template<typename ch_type, typename sm_type, bool comp, bool look,
    bool dfa_nfa>
struct basic_sm_traits
{
    enum
    {
        // True when the input character type is wider than two bytes.
        char_24_bit = sizeof(ch_type) > 2,
        compressed = comp,
        lookup = look,
        is_dfa = dfa_nfa
    };

    typedef ch_type input_char_type;
    typedef ch_type char_type;
    typedef sm_type id_type;

    // Returns the id with every bit set.
    static id_type npos ()
    {
        const id_type all_bits_ = static_cast<id_type>(~0);

        return all_bits_;
    }
};

// Specialisation selected when comp is true: the input character type is
// kept in input_char_type, while char_type becomes unsigned char.
template<typename ch_type, typename sm_type, bool look, bool dfa_nfa>
struct basic_sm_traits<ch_type, sm_type, true, look, dfa_nfa>
{
    enum
    {
        // True when the input character type is wider than two bytes.
        char_24_bit = sizeof(ch_type) > 2,
        compressed = true,
        lookup = look,
        is_dfa = dfa_nfa
    };

    typedef ch_type input_char_type;
    typedef unsigned char char_type;
    typedef sm_type id_type;

    // Returns the id with every bit set.
    static id_type npos ()
    {
        const id_type all_bits_ = static_cast<id_type>(~0);

        return all_bits_;
    }
};
}
#endif

View File

@@ -0,0 +1,525 @@
// state_machine.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_STATE_MACHINE_HPP
#define LEXERTL_STATE_MACHINE_HPP
#include "compile_assert.hpp"
// memcmp()
#include <cstring>
#include <deque>
#include "internals.hpp"
#include <map>
#include <set>
#include "sm_traits.hpp"
#include "string_token.hpp"
namespace lexertl
{
// Table-driven state machine. All data lives in a
// detail::basic_internals<id_type> object; this class exposes that object
// and can shrink each DFA table by merging rows that are exact duplicates.
template<typename char_type, typename id_type = std::size_t>
class basic_state_machine
{
public:
// Traits: 'compressed' is enabled for any char_type wider than one byte;
// 'lookup' and 'is_dfa' are always enabled for this class.
typedef basic_sm_traits<char_type, id_type,
(sizeof (char_type) > 1), true, true> traits;
typedef detail::basic_internals<id_type> internals;
// If you get a compile error here you have
// failed to define an unsigned id type.
compile_assert<(static_cast<id_type>(~0) > 0)>
_valid_id_type;
basic_state_machine () :
_valid_id_type (),
_internals ()
{
}
// Forwards to internals::clear ().
void clear ()
{
_internals.clear ();
}
// Direct (mutable) access to the underlying tables.
internals &data ()
{
return _internals;
}
// Direct (read-only) access to the underlying tables.
const internals &data () const
{
return _internals;
}
// Forwards to internals::empty ().
bool empty () const
{
return _internals.empty ();
}
// Returns the end-of-input id stored in the internals.
id_type eoi () const
{
return _internals._eoi;
}
// Minimises every DFA that has a non-zero alphabet size. Each DFA is
// passed to minimise_dfa () repeatedly until a pass no longer changes
// the size of its table.
void minimise ()
{
const id_type dfas_ = static_cast<id_type>(_internals.
_dfa->size ());
for (id_type i_ = 0; i_ < dfas_; ++i_)
{
const id_type dfa_alphabet_ = _internals._dfa_alphabet[i_];
id_type_vector *dfa_ = _internals._dfa[i_];
if (dfa_alphabet_ != 0)
{
std::size_t size_ = 0;
do
{
// Remember the size before the pass so a change can be detected.
size_ = dfa_->size ();
minimise_dfa (dfa_alphabet_, *dfa_, size_);
} while (dfa_->size () != size_);
}
}
}
// Id with every bit set.
static id_type npos ()
{
return static_cast<id_type>(~0);
}
// Id with every bit set except the lowest.
static id_type skip ()
{
return static_cast<id_type>(~1);
}
// Exchanges the underlying tables with rhs_.
void swap (basic_state_machine &rhs_)
{
_internals.swap (rhs_._internals);
}
private:
typedef typename internals::id_type_vector id_type_vector;
typedef std::set<id_type> index_set;
internals _internals;
// One minimisation pass over a single DFA table.
// dfa_ is treated as consecutive rows of dfa_alphabet_ entries and size_
// is the number of entries in dfa_. Rows that compare byte-equal to an
// earlier row are dropped and every stored row index is rewritten to
// refer to the surviving row. dfa_ is only replaced if at least one
// duplicate row was found.
void minimise_dfa (const id_type dfa_alphabet_,
id_type_vector &dfa_, std::size_t size_)
{
const id_type *first_ = &dfa_.front ();
const id_type *end_ = first_ + size_;
id_type index_ = 1;
id_type new_index_ = 1;
// lookup_[old row] == new row; npos () means "not assigned yet".
id_type_vector lookup_ (size_ / dfa_alphabet_, npos ());
id_type *lookup_ptr_ = &lookup_.front ();
// Old row indexes that turned out to be duplicates of an earlier row.
index_set index_set_;
// The very first entry of the table is read as a row index (remapped below).
const id_type bol_index_ = dfa_.front ();
// Row 0 keeps its position.
*lookup_ptr_ = 0;
// Only one 'jam' state, so skip it.
first_ += dfa_alphabet_;
for (; first_ < end_; first_ += dfa_alphabet_, ++index_)
{
const id_type *second_ = first_ + dfa_alphabet_;
// Compare the current row with every later row.
for (id_type curr_index_ = index_ + 1; second_ < end_;
++curr_index_, second_ += dfa_alphabet_)
{
// Rows already marked as duplicates are not examined again.
if (index_set_.find (curr_index_) != index_set_.end ())
{
continue;
}
// Some systems have memcmp in namespace std.
using namespace std;
if (memcmp (first_, second_, sizeof (id_type) *
dfa_alphabet_) == 0)
{
index_set_.insert (curr_index_);
lookup_ptr_[curr_index_] = new_index_;
}
}
// A row that is not a duplicate of an earlier one gets the next new index.
if (lookup_ptr_[index_] == npos ())
{
lookup_ptr_[index_] = new_index_;
++new_index_;
}
}
if (!index_set_.empty ())
{
const id_type *front_ = &dfa_.front ();
// Start the new table with a copy of row 0.
id_type_vector new_dfa_ (front_, front_ + dfa_alphabet_);
typename index_set::const_iterator set_end_ = index_set_.end ();
const id_type *ptr_ = front_ + dfa_alphabet_;
id_type *new_ptr_ = 0;
// One row fewer for every duplicate found.
new_dfa_.resize (size_ - index_set_.size () * dfa_alphabet_, 0);
new_ptr_ = &new_dfa_.front () + dfa_alphabet_;
// From here on size_ is a row count rather than an entry count.
size_ /= dfa_alphabet_;
if (bol_index_)
{
new_dfa_.front () = lookup_ptr_[bol_index_];
}
for (index_ = 1; index_ < size_; ++index_)
{
// Duplicate rows are not copied.
if (index_set_.find (index_) != set_end_)
{
ptr_ += dfa_alphabet_;
continue;
}
// These leading entries are copied verbatim...
new_ptr_[end_state_index] = ptr_[end_state_index];
new_ptr_[id_index] = ptr_[id_index];
new_ptr_[user_id_index] = ptr_[user_id_index];
new_ptr_[push_dfa_index] = ptr_[push_dfa_index];
new_ptr_[next_dfa_index] = ptr_[next_dfa_index];
// ...whereas this one holds a row index and must be remapped.
new_ptr_[eol_index] = lookup_ptr_[ptr_[eol_index]];
new_ptr_ += transitions_index;
ptr_ += transitions_index;
// The remaining entries of the row are row indexes too: remap each one.
for (id_type i_ = transitions_index; i_ < dfa_alphabet_; ++i_)
{
*new_ptr_++ = lookup_ptr_[*ptr_++];
}
}
dfa_.swap (new_dfa_);
}
}
};
typedef basic_state_machine<char> state_machine;
typedef basic_state_machine<wchar_t> wstate_machine;
// State machine representation in which every state stores its outgoing
// transitions as a map from target state index to the set of characters
// (a string_token) leading there. It is filled from the table form held in
// detail::basic_internals via append () and can merge identical states via
// minimise ().
template<typename char_type, typename id_type = std::size_t,
    bool is_dfa = true>
struct basic_char_state_machine
{
    typedef basic_sm_traits<char_type, id_type, false, false, is_dfa> traits;
    typedef detail::basic_internals<id_type> internals;
    typedef typename internals::id_type_vector id_type_vector;

    // A single state of one DFA.
    struct state
    {
        typedef basic_string_token<char_type> string_token;
        typedef std::map<id_type, string_token> id_type_string_token_map;
        typedef std::pair<id_type, string_token> id_type_string_token_pair;
        enum push_pop_dfa {neither, push_dfa, pop_dfa};

        bool _end_state;
        push_pop_dfa _push_pop_dfa;
        id_type _id;
        id_type _user_id;
        id_type _push_dfa;
        id_type _next_dfa;
        id_type _eol_index;
        // Target state index -> characters that lead to that state.
        id_type_string_token_map _transitions;

        state () :
            _end_state (false),
            _push_pop_dfa (neither),
            _id (0),
            _user_id (traits::npos ()),
            _push_dfa (traits::npos ()),
            _next_dfa (0),
            _eol_index (traits::npos ()),
            _transitions ()
        {
        }

        // Member-wise equality. The argument is taken by const reference
        // (it used to be taken by value, which copied the whole transition
        // map for every comparison made during minimisation).
        bool operator == (const state &rhs_) const
        {
            return _end_state == rhs_._end_state &&
                _push_pop_dfa == rhs_._push_pop_dfa &&
                _id == rhs_._id &&
                _user_id == rhs_._user_id &&
                _push_dfa == rhs_._push_dfa &&
                _next_dfa == rhs_._next_dfa &&
                _eol_index == rhs_._eol_index &&
                _transitions == rhs_._transitions;
        }
    };

    typedef typename state::string_token string_token;
    typedef std::vector<state> state_vector;
    typedef std::vector<string_token> string_token_vector;
    typedef typename state::id_type_string_token_pair
        id_type_string_token_pair;

    // One DFA: its states plus an optional beginning-of-line state index
    // (npos () when absent).
    struct dfa
    {
        id_type _bol_index;
        state_vector _states;

        // Creates size_ default-constructed states.
        dfa (const std::size_t size_) :
            _bol_index (traits::npos ()),
            _states (state_vector (size_))
        {
        }

        std::size_t size () const
        {
            return _states.size ();
        }

        void swap (dfa &rhs_)
        {
            std::swap (_bol_index, rhs_._bol_index);
            _states.swap (rhs_._states);
        }
    };

    typedef std::deque<dfa> dfa_deque;

    dfa_deque _sm_deque;
    // If you get a compile error here you have
    // failed to define an unsigned id type.
    compile_assert<(static_cast<id_type>(~0) > 0)>
        _valid_id_type;

    basic_char_state_machine () :
        _sm_deque (),
        _valid_id_type ()
    {
    }

    // Converts DFA number dfa_index_ of internals_ from table form into a
    // new dfa appended to _sm_deque.
    // token_vector_[n] supplies the characters for transition column n.
    // The first row of the source table is not turned into a state, and
    // every non-zero row index read from the table is stored minus one.
    void append (const string_token_vector &token_vector_,
        const internals &internals_, const id_type dfa_index_)
    {
        const std::size_t dfa_alphabet_ = internals_._dfa_alphabet[dfa_index_];
        const std::size_t alphabet_ = dfa_alphabet_ - transitions_index;
        const id_type_vector &source_dfa_ = *internals_._dfa[dfa_index_];
        const id_type *ptr_ = &source_dfa_.front ();
        // Number of rows, not counting the first one.
        const std::size_t size_ = (source_dfa_.size () - dfa_alphabet_) /
            dfa_alphabet_;
        typename state::id_type_string_token_map::iterator trans_iter_;

        _sm_deque.push_back (dfa (size_));

        dfa &dest_dfa_ = _sm_deque.back ();

        // The first entry of the table holds the BOL row index (0 = none).
        if (*ptr_)
        {
            dest_dfa_._bol_index = *ptr_ - 1;
        }

        ptr_ += dfa_alphabet_;

        for (id_type i_ = 0; i_ < size_; ++i_)
        {
            state &state_ = dest_dfa_._states[i_];

            state_._end_state = ptr_[end_state_index] != 0;

            if (ptr_[push_dfa_index] != npos ())
            {
                state_._push_pop_dfa = state::push_dfa;
            }
            else if (ptr_[end_state_index] & pop_dfa_bit)
            {
                state_._push_pop_dfa = state::pop_dfa;
            }

            state_._id = ptr_[id_index];
            state_._user_id = ptr_[user_id_index];
            state_._push_dfa = ptr_[push_dfa_index];
            state_._next_dfa = ptr_[next_dfa_index];

            if (ptr_[eol_index])
            {
                state_._eol_index = ptr_[eol_index] - 1;
            }

            ptr_ += transitions_index;

            for (id_type col_index_ = 0; col_index_ < alphabet_;
                ++col_index_, ++ptr_)
            {
                const id_type next_ = *ptr_;

                // Zero means "no transition for this column".
                if (next_ > 0)
                {
                    trans_iter_ = state_._transitions.find (next_ - 1);

                    if (trans_iter_ == state_._transitions.end ())
                    {
                        trans_iter_ = state_._transitions.insert
                            (id_type_string_token_pair (next_ - 1,
                            token_vector_[col_index_])).first;
                    }
                    else
                    {
                        // Several columns lead to the same state:
                        // merge their character sets.
                        trans_iter_->second.insert (token_vector_[col_index_]);
                    }
                }
            }
        }
    }

    void clear ()
    {
        _sm_deque.clear ();
    }

    bool empty () const
    {
        return _sm_deque.empty ();
    }

    // Minimises every non-empty DFA, repeating minimise_dfa () on each one
    // until a pass no longer changes its number of states.
    void minimise ()
    {
        const id_type dfas_ = static_cast<id_type>(_sm_deque.size ());

        for (id_type i_ = 0; i_ < dfas_; ++i_)
        {
            dfa *dfa_ = &_sm_deque[i_];

            if (dfa_->size () > 0)
            {
                std::size_t size_ = 0;

                do
                {
                    size_ = dfa_->size ();
                    minimise_dfa (*dfa_, size_);
                } while (dfa_->size () != size_);
            }
        }
    }

    static id_type npos ()
    {
        return traits::npos ();
    }

    // Number of DFAs held.
    id_type size () const
    {
        return static_cast<id_type>(_sm_deque.size ());
    }

    // Id with every bit set except the lowest.
    static id_type skip ()
    {
        return static_cast<id_type>(~1);
    }

    void swap (basic_char_state_machine &csm_)
    {
        _sm_deque.swap (csm_._sm_deque);
    }

private:
    typedef std::set<id_type> index_set;

    // One minimisation pass: states equal (operator ==) to an earlier state
    // are dropped and all state indexes are rewritten to the survivors.
    // size_ is the number of states in dfa_ and must be non-zero.
    // dfa_ is only replaced if at least one duplicate was found.
    void minimise_dfa (dfa &dfa_, std::size_t size_)
    {
        const state *first_ = &dfa_._states.front ();
        const state *end_ = first_ + size_;
        id_type index_ = 0;
        id_type new_index_ = 0;
        // lookup_[old index] == new index; npos () means "not assigned yet".
        id_type_vector lookup_ (size_, npos ());
        id_type *lookup_ptr_ = &lookup_.front ();
        // Old indexes of states that duplicate an earlier state.
        index_set index_set_;

        for (; first_ != end_; ++first_, ++index_)
        {
            const state *second_ = first_ + 1;

            for (id_type curr_index_ = index_ + 1; second_ != end_;
                ++curr_index_, ++second_)
            {
                if (index_set_.find (curr_index_) != index_set_.end ())
                {
                    continue;
                }

                if (*first_ == *second_)
                {
                    index_set_.insert (curr_index_);
                    lookup_ptr_[curr_index_] = new_index_;
                }
            }

            if (lookup_ptr_[index_] == npos ())
            {
                lookup_ptr_[index_] = new_index_;
                ++new_index_;
            }
        }

        if (!index_set_.empty ())
        {
            const state *front_ = &dfa_._states.front ();
            dfa new_dfa_ (new_index_);
            typename index_set::const_iterator set_end_ = index_set_.end ();
            const state *ptr_ = front_;
            state *new_ptr_ = &new_dfa_._states.front ();

            if (dfa_._bol_index != npos ())
            {
                new_dfa_._bol_index = lookup_ptr_[dfa_._bol_index];
            }

            for (index_ = 0; index_ < size_; ++index_)
            {
                if (index_set_.find (index_) != set_end_)
                {
                    ++ptr_;
                    continue;
                }

                // Carry every non-index field over unchanged.
                // Fixes: _id used to be assigned from _end_state, and the
                // push/pop fields were not copied at all, so they silently
                // reverted to their defaults whenever a DFA was rebuilt.
                new_ptr_->_end_state = ptr_->_end_state;
                new_ptr_->_push_pop_dfa = ptr_->_push_pop_dfa;
                new_ptr_->_id = ptr_->_id;
                new_ptr_->_user_id = ptr_->_user_id;
                new_ptr_->_push_dfa = ptr_->_push_dfa;
                new_ptr_->_next_dfa = ptr_->_next_dfa;

                if (ptr_->_eol_index != npos ())
                {
                    new_ptr_->_eol_index = lookup_ptr_[ptr_->_eol_index];
                }

                typename state::id_type_string_token_map::const_iterator
                    iter_ = ptr_->_transitions.begin ();
                typename state::id_type_string_token_map::const_iterator
                    trans_end_ = ptr_->_transitions.end ();
                typename state::id_type_string_token_map::iterator find_;

                for (; iter_ != trans_end_; ++iter_)
                {
                    // Targets are remapped; two old targets may now be the
                    // same state, in which case their characters merge.
                    find_ = new_ptr_->_transitions.find
                        (lookup_ptr_[iter_->first]);

                    if (find_ == new_ptr_->_transitions.end ())
                    {
                        new_ptr_->_transitions.insert
                            (id_type_string_token_pair
                            (lookup_ptr_[iter_->first], iter_->second));
                    }
                    else
                    {
                        find_->second.insert (iter_->second);
                    }
                }

                ++ptr_;
                ++new_ptr_;
            }

            dfa_.swap (new_dfa_);
        }
    }
};
typedef basic_char_state_machine<char> char_state_machine;
typedef basic_char_state_machine<wchar_t> wchar_state_machine;
}
#endif

View File

@@ -0,0 +1,350 @@
// stream_shared_iterator.hpp
// Copyright (c) 2010-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_STREAM_SHARED_ITERATOR_H
#define LEXERTL_STREAM_SHARED_ITERATOR_H
#include <algorithm>
// memcpy
#include <cstring>
#include <iostream>
#include <list>
#include <math.h>
#include "runtime_error.hpp"
#include "size_t.hpp"
#include <vector>
namespace lexertl
{
// Forward iterator over an input stream. Every iterator created from the
// same stream (the "master" and all of its copies) refers to one
// reference-counted 'shared' object that owns the read buffer and a list
// of the registered iterators. The buffer keeps everything from the lowest
// registered position onwards, so earlier copies remain readable.
// A default-constructed iterator has no shared state and acts as the end
// iterator.
// NOTE(review): assignment assumes both sides refer to the same stream (or
// that one side has no stream at all); assigning between iterators that
// belong to different streams is not handled.
template<typename char_type>
class basic_stream_shared_iterator
{
public:
    typedef std::basic_istream<char_type> istream;
    typedef std::forward_iterator_tag iterator_category;
    typedef std::size_t difference_type;
    typedef char_type value_type;
    typedef char_type *pointer;
    typedef char_type &reference;

    // End iterator.
    basic_stream_shared_iterator () :
        _master (false),
        _live (false),
        _index (shared::npos ()),
        _shared (0)
    {
    }

    // Master iterator for stream_. buff_size_ is the initial buffer size
    // and increment_ the amount the buffer grows by when it is full.
    basic_stream_shared_iterator (istream &stream_,
        const std::size_t buff_size_ = 1024,
        const std::size_t increment_ = 1024) :
        _master (true),
        _live (false),
        _index (shared::npos ()),
        // For exception safety don't call new yet
        _shared (0)
    {
        // Safe to call potentially throwing new now.
        _shared = new shared (stream_, buff_size_, increment_);
        ++_shared->_ref_count;
        _iter = _shared->_clients.insert (_shared->_clients.end (), this);
    }

    basic_stream_shared_iterator (const basic_stream_shared_iterator &rhs_) :
        _master (false),
        _live (false),
        _index (rhs_._master ? rhs_._shared->lowest () : rhs_._index),
        _shared (rhs_._shared)
    {
        if (_shared)
        {
            // New copy of an iterator.
            // The assumption is that any copy must be live
            // even if the rhs is not (otherwise we will never
            // have a record of the start of the current range!)
            ++_shared->_ref_count;
            _iter = _shared->_clients.insert (_shared->_clients.end (), this);
            _live = true;
        }
    }

    ~basic_stream_shared_iterator ()
    {
        if (_shared)
        {
            --_shared->_ref_count;
            _shared->erase (this);

            // The last iterator to go releases the buffer.
            if (_shared->_ref_count == 0)
            {
                delete _shared;
                _shared = 0;
            }
        }
    }

    basic_stream_shared_iterator &operator =
        (const basic_stream_shared_iterator &rhs_)
    {
        if (this != &rhs_)
        {
            _master = false;
            _index = rhs_._master ? rhs_._shared->lowest () : rhs_._index;

            if (_live && !rhs_._live)
            {
                // Becoming non-live: stop being tracked.
                _shared->erase (this);

                if (!rhs_._shared)
                {
                    --_shared->_ref_count;
                }
            }
            else if (!_live && rhs_._live)
            {
                // Becoming live: register *this* with the shared state.
                // Fix: the list position used to be stored in rhs_._iter,
                // which overwrote the position of rhs_ and left _iter of
                // this object unset, so later erase () calls removed the
                // wrong entry.
                _iter = rhs_._shared->_clients.insert (rhs_._shared->
                    _clients.end (), this);

                if (!_shared)
                {
                    ++rhs_._shared->_ref_count;
                }
            }

            _live = rhs_._live;
            _shared = rhs_._shared;
        }

        return *this;
    }

    // Equal when the positions match and either both sides use the same
    // shared state, or at least one side is at npos () and at least one
    // side has no shared state (comparison against an end iterator).
    bool operator == (const basic_stream_shared_iterator &rhs_) const
    {
        return _index == rhs_._index &&
            (_shared == rhs_._shared ||
            ((_index == shared::npos () || rhs_._index == shared::npos ()) &&
            (!_shared || !rhs_._shared)));
    }

    bool operator != (const basic_stream_shared_iterator &rhs_) const
    {
        return !(*this == rhs_);
    }

    // Throws runtime_error when called on an end iterator without shared
    // state.
    const char_type &operator * ()
    {
        check_master ();
        return _shared->_buffer[_index];
    }

    basic_stream_shared_iterator &operator ++ ()
    {
        check_master ();
        ++_index;
        update_state ();
        return *this;
    }

    basic_stream_shared_iterator operator ++ (int)
    {
        basic_stream_shared_iterator iter_ = *this;

        check_master ();
        ++_index;
        update_state ();
        return iter_;
    }

private:
    // Buffer and bookkeeping shared by all iterators on one stream.
    class shared
    {
    public:
        std::size_t _ref_count;
        typedef std::vector<char_type> char_vector;
        typedef std::list<basic_stream_shared_iterator *> iter_list;
        istream &_stream;
        std::size_t _increment;
        // Number of valid characters in _buffer.
        std::size_t _len;
        char_vector _buffer;
        iter_list _clients;

        // Initialisers are listed in declaration order (which is the order
        // they run in) and _len is given a defined value up front.
        shared (istream &stream_, const std::size_t buff_size_,
            const std::size_t increment_) :
            _ref_count (0),
            _stream (stream_),
            _increment (increment_),
            _len (0)
        {
            _buffer.resize (buff_size_);
            _stream.read (&_buffer.front (), _buffer.size ());
            _len = static_cast<std::size_t>(_stream.gcount ());
        }

        // Makes room for more input and reads it.
        // If some iterator still sits at position 0 the buffer is grown by
        // _increment; otherwise the characters below the lowest position
        // are discarded, the rest is moved to the front and all registered
        // positions are shifted down accordingly.
        // Returns false when nothing more could be read.
        bool reload_buffer ()
        {
            const std::size_t lowest_ = lowest ();
            std::size_t read_ = 0;

            if (lowest_ == 0)
            {
                // Resize buffer
                const std::size_t old_size_ = _buffer.size ();
                const std::size_t new_size_ = old_size_ + _increment;

                _buffer.resize (new_size_);
                _stream.read (&_buffer.front () + old_size_, _increment);
                read_ = static_cast<std::size_t>(_stream.gcount ());

                if (read_)
                {
                    read_ += old_size_;
                    _len = read_;
                }
            }
            else
            {
                // Some systems have memmove in namespace std
                using namespace std;
                // start_: characters kept; len_: space freed at the end.
                const size_t start_ = _buffer.size () - lowest_;
                const size_t len_ = _buffer.size () - start_;

                // Source and destination overlap whenever less than half
                // of the buffer is discarded, so memmove is required here
                // (memcpy has undefined behaviour for overlapping ranges).
                memmove (&_buffer.front (), &_buffer[lowest_], start_ *
                    sizeof (char_type));
                _stream.read (&_buffer.front () + start_, len_);
                read_ = static_cast<size_t>(_stream.gcount ());
                subtract (lowest_);

                if (read_)
                {
                    read_ += start_;
                    _len = read_;
                }
                else
                {
                    _len = highest ();
                }
            }

            return read_ != 0;
        }

        // Removes ptr_ from the list of registered iterators (once).
        void erase (basic_stream_shared_iterator *ptr_)
        {
            if (ptr_->_iter != _clients.end ())
            {
                _clients.erase (ptr_->_iter);
                ptr_->_iter = _clients.end ();
            }
        }

        // Smallest position of any registered iterator, or 0 if there is
        // none (positions equal to npos () never win the comparison).
        std::size_t lowest () const
        {
            std::size_t lowest_ = npos ();
            typename iter_list::const_iterator iter_ = _clients.begin ();
            typename iter_list::const_iterator end_ = _clients.end ();

            for (; iter_ != end_; ++iter_)
            {
                const basic_stream_shared_iterator *ptr_ = *iter_;

                if (ptr_->_index < lowest_)
                {
                    lowest_ = ptr_->_index;
                }
            }

            if (lowest_ == npos ())
            {
                lowest_ = 0;
            }

            return lowest_;
        }

        // Largest position of any registered iterator, ignoring npos ().
        std::size_t highest () const
        {
            std::size_t highest_ = 0;
            typename iter_list::const_iterator iter_ = _clients.begin ();
            typename iter_list::const_iterator end_ = _clients.end ();

            for (; iter_ != end_; ++iter_)
            {
                const basic_stream_shared_iterator *ptr_ = *iter_;

                if (ptr_->_index != npos () && ptr_->_index > highest_)
                {
                    highest_ = ptr_->_index;
                }
            }

            return highest_;
        }

        // Shifts every registered position (except npos ()) down by lowest_.
        void subtract (const std::size_t lowest_)
        {
            typename iter_list::iterator iter_ = _clients.begin ();
            typename iter_list::iterator end_ = _clients.end ();

            for (; iter_ != end_; ++iter_)
            {
                basic_stream_shared_iterator *ptr_ = *iter_;

                if (ptr_->_index != npos ())
                {
                    ptr_->_index -= lowest_;
                }
            }
        }

        static std::size_t npos ()
        {
            return static_cast<std::size_t>(~0);
        }

    private:
        // Declared but not defined: shared objects are not assignable.
        shared &operator = (const shared &rhs_);
    };

    bool _master;
    bool _live;
    std::size_t _index;
    shared *_shared;
    // Position of this iterator inside _shared->_clients.
    mutable typename shared::iter_list::iterator _iter;

    // Rejects use of an end iterator and turns a not yet used master
    // iterator into an ordinary live iterator at the lowest position.
    void check_master ()
    {
        if (!_shared)
        {
            throw runtime_error ("Cannot manipulate null (end) "
                "stream_shared_iterators.");
        }

        if (_master)
        {
            _master = false;
            _live = true;
            _index = _shared->lowest ();
        }
    }

    // After advancing: fetch more input if the buffer is exhausted; if the
    // stream has nothing left this iterator becomes an end iterator.
    void update_state ()
    {
        if (_index >= _shared->_len)
        {
            if (!_shared->reload_buffer ())
            {
                _shared->erase (this);
                _index = shared::npos ();
                _live = false;
            }
        }
    }
};
typedef basic_stream_shared_iterator<char> stream_shared_iterator;
typedef basic_stream_shared_iterator<wchar_t> wstream_shared_iterator;
}
#endif

View File

@@ -0,0 +1,421 @@
// string_token.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_STRING_TOKEN_HPP
#define LEXERTL_STRING_TOKEN_HPP
#include "char_traits.hpp"
#include <ios> // Needed by GCC 4.4
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
namespace lexertl
{
// A set of characters stored as a vector of inclusive [first, second]
// ranges. insert () keeps the ranges ordered and merges ranges that
// overlap or touch.
template<typename ch_type>
struct basic_string_token
{
typedef ch_type char_type;
typedef basic_char_traits<char_type> char_traits;
typedef typename char_traits::index_type index_type;
typedef std::pair<index_type, index_type> range;
typedef std::vector<range> range_vector;
typedef std::basic_string<char_type> string;
typedef basic_string_token<char_type> string_token;
range_vector _ranges;
// Empty set.
basic_string_token () :
_ranges ()
{
}
// Set holding the single character ch_.
basic_string_token (char_type ch_) :
_ranges ()
{
insert (range (ch_, ch_));
}
// Set holding the inclusive range first_..second_.
basic_string_token (char_type first_, char_type second_) :
_ranges ()
{
insert (range (first_, second_));
}
void clear ()
{
_ranges.clear ();
}
bool empty () const
{
return _ranges.empty ();
}
// True when the set is exactly one range running from 0 to
// char_traits::max_val ().
bool any () const
{
return _ranges.size () == 1 && _ranges.front ().first == 0 &&
_ranges.front ().second == char_traits::max_val ();
}
// Lexicographic comparison of the range vectors.
bool operator < (const basic_string_token &rhs_) const
{
return _ranges < rhs_._ranges;
}
bool operator == (const basic_string_token &rhs_) const
{
return _ranges == rhs_._ranges;
}
// True when the set holds more than max_val () / 2 characters.
bool negatable () const
{
std::size_t size_ = 0;
typename range_vector::const_iterator iter_ = _ranges.begin ();
typename range_vector::const_iterator end_ = _ranges.end ();
for (; iter_ != end_; ++iter_)
{
// Ranges are inclusive, hence the + 1.
size_ += static_cast<std::size_t>(iter_->second) + 1 -
static_cast<std::size_t>(iter_->first);
}
return size_ > static_cast<std::size_t>(char_traits::max_val ()) / 2;
}
void swap (basic_string_token &rhs_)
{
_ranges.swap (rhs_._ranges);
}
// Adds every range of rhs_ to this set.
void insert (const basic_string_token &rhs_)
{
typename range_vector::const_iterator iter_ = rhs_._ranges.begin ();
typename range_vector::const_iterator end_ = rhs_._ranges.end ();
for (; iter_ != end_; ++iter_)
{
insert (*iter_);
}
}
// Adds rhs_ to the set. Existing ranges that overlap or directly adjoin
// rhs_ are absorbed into it (and erased) before it is inserted.
// Returns an iterator to the inserted range, or _ranges.end () when rhs_
// was already completely covered by an existing range.
// Deliberately pass by value - may modify
typename range_vector::iterator insert (range rhs_)
{
bool insert_ = true;
typename range_vector::iterator iter_ = _ranges.begin ();
typename range_vector::const_iterator end_ = _ranges.end ();
while (iter_ != end_)
{
// follows current item
if (rhs_.first > iter_->second)
{
if (rhs_.first == iter_->second + 1)
{
// Auto normalise
rhs_.first = iter_->first;
}
else
{
// No intersection, consider next
++iter_;
continue;
}
}
// Precedes current item
else if (rhs_.second < iter_->first)
{
if (rhs_.second == iter_->first - 1)
{
// Auto normalise
rhs_.second = iter_->second;
}
else
{
// insert here
break;
}
}
else
{
// overlap (under)
if (rhs_.first < iter_->first)
{
if (rhs_.second < iter_->second)
{
rhs_.second = iter_->second;
}
}
// overlap (over)
else if (rhs_.second > iter_->second)
{
if (rhs_.first > iter_->first)
{
rhs_.first = iter_->first;
}
}
// subset
else
{
// Nothing to add; report end () to the caller.
insert_ = false;
iter_ = _ranges.end ();
break;
}
}
// Code minimisation: this always applies unless we have already
// exited the loop, or "continue" executed.
// (The current range has been merged into rhs_, so drop it.)
iter_ = _ranges.erase (iter_);
end_ = _ranges.end ();
}
if (insert_)
{
iter_ = _ranges.insert(iter_, rhs_);
}
return iter_;
}
// Replaces the set with its complement within 0..char_traits::max_val ().
void negate ()
{
// First character not yet accounted for.
index_type next_ = 0;
const index_type max_ = char_traits::max_val ();
string_token temp_;
typename range_vector::iterator iter_ = _ranges.begin ();
typename range_vector::const_iterator end_ = _ranges.end ();
bool finished_ = false;
for (; iter_ != end_; ++iter_)
{
// Gap in front of the current range belongs to the complement.
if (next_ < iter_->first)
{
temp_.insert (range (next_, iter_->first - 1));
}
if (iter_->second < max_)
{
next_ = iter_->second + 1;
}
else
{
// The current range reaches max_: nothing can follow it.
finished_ = true;
break;
}
}
if (!finished_)
{
// Everything after the last range up to max_.
temp_.insert (range (next_, max_));
}
swap (temp_);
}
// Moves the characters common to this set and rhs_ into overlap_: each
// common stretch is removed from both sets and inserted into overlap_.
void intersect (basic_string_token &rhs_, basic_string_token &overlap_)
{
typename range_vector::iterator lhs_iter_ = _ranges.begin ();
typename range_vector::const_iterator lhs_end_ = _ranges.end ();
typename range_vector::iterator rhs_iter_ = rhs_._ranges.begin ();
typename range_vector::const_iterator rhs_end_ = rhs_._ranges.end ();
while (lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_)
{
// No overlap: advance whichever range lies lower.
if (rhs_iter_->first > lhs_iter_->second)
{
++lhs_iter_;
}
else if (rhs_iter_->second < lhs_iter_->first)
{
++rhs_iter_;
}
else
{
// range_ becomes the overlapping stretch of the two ranges.
range range_;
if (rhs_iter_->first > lhs_iter_->first)
{
range_.first = rhs_iter_->first;
}
else
{
range_.first = lhs_iter_->first;
}
if (rhs_iter_->second < lhs_iter_->second)
{
range_.second = rhs_iter_->second;
}
else
{
range_.second = lhs_iter_->second;
}
adjust (range_, *this, lhs_iter_, lhs_end_);
adjust (range_, rhs_, rhs_iter_, rhs_end_);
overlap_.insert (range_);
}
}
}
// Removes from this set every character that is also in rhs_.
// rhs_ itself is only read.
void remove (basic_string_token &rhs_)
{
typename range_vector::iterator lhs_iter_ = _ranges.begin ();
typename range_vector::const_iterator lhs_end_ = _ranges.end ();
typename range_vector::iterator rhs_iter_ = rhs_._ranges.begin ();
typename range_vector::const_iterator rhs_end_ = rhs_._ranges.end ();
while (lhs_iter_ != lhs_end_ && rhs_iter_ != rhs_end_)
{
// No overlap: advance whichever range lies lower.
if (rhs_iter_->first > lhs_iter_->second)
{
++lhs_iter_;
}
else if (rhs_iter_->second < lhs_iter_->first)
{
++rhs_iter_;
}
else
{
// range_ becomes the overlapping stretch of the two ranges.
range range_;
if (rhs_iter_->first > lhs_iter_->first)
{
range_.first = rhs_iter_->first;
}
else
{
range_.first = lhs_iter_->first;
}
if (rhs_iter_->second < lhs_iter_->second)
{
range_.second = rhs_iter_->second;
}
else
{
range_.second = lhs_iter_->second;
}
adjust (range_, *this, lhs_iter_, lhs_end_);
}
}
}
// Returns ch_ as text: a backslash escape for the characters listed
// below, "\x" followed by the hexadecimal value for anything else
// outside 32..126, and the character itself otherwise.
static string escape_char (const typename char_traits::index_type ch_)
{
string out_;
switch (ch_)
{
case '\0':
out_ += '\\';
out_ += '0';
break;
case '\a':
out_ += '\\';
out_ += 'a';
break;
case '\b':
out_ += '\\';
out_ += 'b';
break;
// Escape character, written as \x1b
case 27:
out_ += '\\';
out_ += 'x';
out_ += '1';
out_ += 'b';
break;
case '\f':
out_ += '\\';
out_ += 'f';
break;
case '\n':
out_ += '\\';
out_ += 'n';
break;
case '\r':
out_ += '\\';
out_ += 'r';
break;
case '\t':
out_ += '\\';
out_ += 't';
break;
case '\v':
out_ += '\\';
out_ += 'v';
break;
case '\\':
out_ += '\\';
out_ += '\\';
break;
case '"':
out_ += '\\';
out_ += '"';
break;
case '\'':
out_ += '\\';
out_ += '\'';
break;
default:
{
if (ch_ < 32 || ch_ > 126)
{
// The stream is only used to format the value in hexadecimal.
std::basic_stringstream<char_type> ss_;
out_ += '\\';
out_ += 'x';
ss_ << std::hex <<
static_cast<std::size_t> (ch_);
out_ += ss_.str ();
}
else
{
out_ += ch_;
}
break;
}
}
return out_;
}
private:
// Removes range_ from the range iter_ points at inside token_.
// range_ is expected to lie within that range: depending on where it
// sits, the range is cut at its end, split in two, cut at its start or
// erased completely. iter_ and end_ are refreshed whenever the vector of
// token_ is modified.
void adjust (const range &range_, basic_string_token &token_,
typename range_vector::iterator &iter_,
typename range_vector::const_iterator &end_)
{
if (range_.first > iter_->first)
{
// Keep the part in front of range_.
const index_type second_ = iter_->second;
iter_->second = range_.first - 1;
if (range_.second < second_)
{
// Something is left behind range_ too: add it as a new range.
range new_range_ (range_.second + 1, second_);
iter_ = token_.insert (new_range_);
end_ = token_._ranges.end ();
}
}
else if (range_.second < iter_->second)
{
// range_ starts at the front: keep only the part behind it.
iter_->first = range_.second + 1;
}
else
{
// range_ covers the whole range.
iter_ = token_._ranges.erase (iter_);
end_ = token_._ranges.end ();
}
}
};
}
#endif

View File

@@ -0,0 +1,380 @@
// utf_iterators.hpp
// Copyright (c) 2012 Ben Hanson (http://www.benhanson.net/)
// Inspired by http://utfcpp.sourceforge.net/
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_UTF_ITERATORS_HPP
#define LEXERTL_UTF_ITERATORS_HPP
#include <iterator>
namespace lexertl
{
// Input iterator adaptor that reads UTF-8 bytes from a char_iterator and
// yields one decoded value of type char_type per step.
// The adaptor always holds the most recently decoded value while the
// wrapped iterator already points behind it; constructing from an
// iterator therefore immediately reads from it.
template<typename char_iterator, typename char_type>
class basic_utf8_in_iterator :
    public std::iterator<std::input_iterator_tag, char_type>
{
public:
    basic_utf8_in_iterator () :
        _char (0)
    {
    }

    // Decodes the first value straight away.
    explicit basic_utf8_in_iterator (const char_iterator& it_) :
        _it (it_),
        _char (0)
    {
        next ();
    }

    // The value decoded by the last step.
    char_type operator * () const
    {
        return _char;
    }

    // Comparison looks at the wrapped iterators only.
    bool operator == (const basic_utf8_in_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator != (const basic_utf8_in_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    basic_utf8_in_iterator &operator ++ ()
    {
        next ();
        return *this;
    }

    basic_utf8_in_iterator operator ++ (int)
    {
        basic_utf8_in_iterator previous_ (*this);

        next ();
        return previous_;
    }

private:
    typedef typename std::iterator_traits<char_iterator>::
        difference_type difference_type;

    char_iterator _it;
    char_type _char;

    // Decodes the sequence starting at _it into _char and leaves _it on
    // the byte that follows it.
    void next ()
    {
        const char units_ = len (_it);
        char_type decoded_ = *_it & 0xff;

        if (units_ == 2)
        {
            ++_it;
            decoded_ = (decoded_ << 6 & 0x7ff) | (*_it & 0x3f);
        }
        else if (units_ == 3)
        {
            ++_it;
            decoded_ = (decoded_ << 12 & 0xffff) |
                ((*_it & 0xff) << 6 & 0xfff);
            ++_it;
            decoded_ |= *_it & 0x3f;
        }
        else if (units_ == 4)
        {
            ++_it;
            decoded_ = (decoded_ << 18 & 0x1fffff) |
                ((*_it & 0xff) << 12 & 0x3ffff);
            ++_it;
            decoded_ |= (*_it & 0xff) << 6 & 0xfff;
            ++_it;
            decoded_ |= *_it & 0x3f;
        }

        // For a length of 1 (and for an unrecognised lead byte, length 0)
        // the byte value itself is the result.
        ++_it;
        _char = decoded_;
    }

    // Sequence length implied by the lead byte at it_: 1 to 4, or 0 when
    // the byte is not a valid lead byte.
    char len (const char_iterator &it_) const
    {
        const unsigned char lead_ = *it_;

        if (lead_ < 0x80)
        {
            return 1;
        }

        if (lead_ >> 5 == 0x06)
        {
            return 2;
        }

        if (lead_ >> 4 == 0x0e)
        {
            return 3;
        }

        if (lead_ >> 3 == 0x1e)
        {
            return 4;
        }

        return 0;
    }
};
// Input iterator adaptor that reads code points from a char_iterator and
// yields their UTF-8 encoding one byte per step.
// The bytes of the current code point are buffered in _bytes while the
// wrapped iterator already points at the next code point; constructing
// from an iterator therefore immediately reads from it.
template<typename char_iterator>
class basic_utf8_out_iterator :
    public std::iterator<std::input_iterator_tag, char>
{
public:
    basic_utf8_out_iterator () :
        _count (0),
        _index (0)
    {
    }

    // Encodes the first code point straight away.
    explicit basic_utf8_out_iterator (const char_iterator& it_) :
        _it (it_),
        _count (0),
        _index (0)
    {
        next ();
    }

    // The current byte of the buffered sequence.
    char operator * () const
    {
        return _bytes[_index];
    }

    // Comparison looks at the wrapped iterators only.
    bool operator == (const basic_utf8_out_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator != (const basic_utf8_out_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    basic_utf8_out_iterator &operator ++ ()
    {
        ++_index;

        // Buffered bytes used up: encode the next code point.
        if (_index >= _count)
        {
            next ();
        }

        return *this;
    }

    basic_utf8_out_iterator operator ++ (int)
    {
        basic_utf8_out_iterator previous_ (*this);

        ++_index;

        if (_index >= _count)
        {
            next ();
        }

        return previous_;
    }

private:
    char_iterator _it;
    char _bytes[4];
    unsigned char _count;
    unsigned char _index;

    // Encodes the code point at _it into _bytes, resets the read position
    // and moves _it on by one.
    void next ()
    {
        const std::size_t code_point_ = *_it;

        _count = len (code_point_);
        _index = 0;

        if (_count == 1)
        {
            _bytes[0] = static_cast<char>(code_point_);
        }
        else
        {
            // Lead byte markers for sequences of 2, 3 and 4 bytes.
            const unsigned char markers_[3] = {0xc0, 0xe0, 0xf0};
            // Every continuation byte carries six payload bits.
            unsigned int shift_ = 6 * (_count - 1);

            _bytes[0] = static_cast<char>((code_point_ >> shift_) |
                markers_[_count - 2]);

            for (unsigned char pos_ = 1; pos_ < _count; ++pos_)
            {
                shift_ -= 6;
                _bytes[pos_] = ((code_point_ >> shift_) & 0x3f) | 0x80;
            }
        }

        ++_it;
    }

    // Number of UTF-8 bytes used for ch_ (1 to 4).
    char len (const std::size_t ch_) const
    {
        if (ch_ < 0x80)
        {
            return 1;
        }

        if (ch_ < 0x800)
        {
            return 2;
        }

        if (ch_ < 0x10000)
        {
            return 3;
        }

        return 4;
    }
};
// Input iterator adaptor that reads UTF-16 code units from a char_iterator
// and yields one decoded value of type char_type per step; a unit in the
// range 0xd800..0xdbff is combined with the unit that follows it.
// The adaptor holds the most recently decoded value while the wrapped
// iterator already points behind it; constructing from an iterator
// therefore immediately reads from it.
template<typename char_iterator, typename char_type>
class basic_utf16_in_iterator :
    public std::iterator<std::input_iterator_tag, char_type>
{
public:
    basic_utf16_in_iterator () :
        _char (0)
    {
    }

    // Decodes the first value straight away.
    explicit basic_utf16_in_iterator (const char_iterator &it_) :
        _it (it_),
        _char (0)
    {
        next ();
    }

    // The value decoded by the last step.
    char_type operator * () const
    {
        return _char;
    }

    // Comparison looks at the wrapped iterators only.
    bool operator == (const basic_utf16_in_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator != (const basic_utf16_in_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    basic_utf16_in_iterator &operator ++ ()
    {
        next ();
        return *this;
    }

    basic_utf16_in_iterator operator ++ (int)
    {
        basic_utf16_in_iterator previous_ (*this);

        next ();
        return previous_;
    }

private:
    typedef typename std::iterator_traits<char_iterator>::
        difference_type difference_type;

    char_iterator _it;
    char_type _char;

    // Decodes the unit (or pair of units) at _it into _char and leaves _it
    // on the unit that follows.
    void next ()
    {
        char_type unit_ = *_it & 0xffff;
        const bool is_lead_ = unit_ >= 0xd800 && unit_ <= 0xdbff;

        if (is_lead_)
        {
            // The following unit supplies the low ten bits.
            ++_it;

            const char_type trail_ = *_it & 0xffff;

            unit_ = (((unit_ - 0xd800) << 10) | (trail_ - 0xdc00)) + 0x10000;
        }

        ++_it;
        _char = unit_;
    }
};
// Input iterator adaptor that reads code points from a char_iterator and
// yields their UTF-16 encoding one wchar_t code unit per step. Code points
// above 0xffff are emitted as a surrogate pair.
// The units of the current code point are buffered in _chars while the
// wrapped iterator already points at the next code point; constructing
// from an iterator therefore immediately reads from it.
template<typename char_iterator>
class basic_utf16_out_iterator :
    public std::iterator<std::input_iterator_tag, wchar_t>
{
public:
    basic_utf16_out_iterator () :
        _count (0),
        _index (0)
    {
    }

    // Encodes the first code point straight away.
    explicit basic_utf16_out_iterator (const char_iterator& it_) :
        _it (it_),
        _count (0),
        _index (0)
    {
        next ();
    }

    // The current unit of the buffered sequence.
    wchar_t operator * () const
    {
        return _chars[_index];
    }

    // Comparison looks at the wrapped iterators only.
    bool operator == (const basic_utf16_out_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator != (const basic_utf16_out_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    basic_utf16_out_iterator &operator ++ ()
    {
        ++_index;

        // Buffered units used up: encode the next code point.
        if (_index >= _count)
        {
            next ();
        }

        return *this;
    }

    basic_utf16_out_iterator operator ++ (int)
    {
        basic_utf16_out_iterator temp_ = *this;

        ++_index;

        if (_index >= _count)
        {
            next ();
        }

        return temp_;
    }

private:
    char_iterator _it;
    wchar_t _chars[2];
    unsigned char _count;
    unsigned char _index;

    // Encodes the code point at _it into _chars, resets the read position
    // and moves _it on by one.
    void next ()
    {
        const std::size_t ch_ = *_it;

        _count = len (ch_);
        _index = 0;

        switch (_count)
        {
        case 1:
            _chars[0] = static_cast<wchar_t>(ch_);
            break;
        case 2:
            // High (lead) surrogate: 0xd800 + ((ch_ - 0x10000) >> 10).
            // Fix: the base used to be 0xdc00, which put the lead unit
            // into the low surrogate range (e.g. U+1F600 came out as
            // 0xdc3d 0xde00 instead of 0xd83d 0xde00).
            _chars[0] = static_cast<wchar_t>((ch_ >> 10) + 0xd800u -
                (0x10000 >> 10));
            // Low (trail) surrogate: 0xdc00 + the low ten bits.
            _chars[1] = static_cast<wchar_t>((ch_ & 0x3ff) + 0xdc00u);
            break;
        }

        ++_it;
    }

    // Number of UTF-16 units used for ch_ (1 or 2).
    char len (const std::size_t ch_) const
    {
        return ch_ > 0xffff ? 2 : 1;
    }
};
}
#endif

View File

@@ -4,6 +4,15 @@
* Licensed under the GNU GPL v2. * Licensed under the GNU GPL v2.
*/ */
// this file is auto generated from grammar/grammar.y
// but it does not work yet
// #include "grammar.h"
#include "lexertl/generator.hpp"
#include "lexertl/lookup.hpp"
#include "lexertl/rules.hpp"
#include "lexertl/state_machine.hpp"
int main() { int main() {
return 0; return 0;
} }