Files
scully/inc/lexertl/lookup.hpp

478 lines
12 KiB
C++
Raw Normal View History

2013-05-31 21:35:44 +02:00
// lookup.hpp
// Copyright (c) 2009-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_LOOKUP_HPP
#define LEXERTL_LOOKUP_HPP
#include <assert.h>
#include "bool.hpp"
#include "match_results.hpp"
#include "state_machine.hpp"
namespace lexertl
{
namespace detail
{
template<bool>
struct bol_state
{
bol_state (const bool)
{
}
};
template<>
struct bol_state<true>
{
bool _bol;
bool _end_bol;
bol_state (const bool bol_) :
_bol (bol_),
_end_bol (bol_)
{
}
};
template<typename id_type, bool>
struct eol_state
{
};
template<typename id_type>
struct eol_state<id_type, true>
{
id_type _EOL_state;
eol_state () :
_EOL_state (0)
{
}
};
template<typename id_type, bool>
struct multi_state_state
{
multi_state_state (const id_type)
{
}
};
template<typename id_type>
struct multi_state_state<id_type, true>
{
id_type _start_state;
multi_state_state (const id_type state_) :
_start_state (state_)
{
}
};
template<typename id_type, bool>
struct recursive_state
{
recursive_state (const id_type *)
{
}
};
template<typename id_type>
struct recursive_state<id_type, true>
{
bool _pop;
id_type _push_dfa;
recursive_state (const id_type *ptr_) :
_pop ((*ptr_ & pop_dfa_bit) != 0),
_push_dfa (*(ptr_ + push_dfa_index))
{
}
};
template<typename id_type, typename index_type, std::size_t flags>
struct lookup_state
{
typedef basic_internals<id_type> internals;
const id_type *_lookup;
id_type _dfa_alphabet;
const id_type *_dfa;
const id_type *_ptr;
bool _end_state;
id_type _id;
id_type _uid;
bol_state<(flags & bol_bit) != 0> _bol_state;
eol_state<id_type, (flags & eol_bit) != 0> _eol_state;
multi_state_state<id_type, (flags & multi_state_bit) != 0>
_multi_state_state;
recursive_state<id_type, (flags & recursive_bit) != 0> _recursive_state;
lookup_state (const internals &internals_, const bool bol_,
const id_type state_) :
_lookup (&internals_._lookup[state_]->front ()),
_dfa_alphabet (internals_._dfa_alphabet[state_]),
_dfa (&internals_._dfa[state_]->front ()),
_ptr (_dfa + _dfa_alphabet),
_end_state (*_ptr != 0),
_id (*(_ptr + id_index)),
_uid (*(_ptr + user_id_index)),
_bol_state (bol_),
_eol_state (),
_multi_state_state (state_),
_recursive_state (_ptr)
{
}
void reset_recursive (const false_ &)
{
// Do nothing
}
void reset_recursive (const true_ &)
{
_recursive_state._pop = (*_ptr & pop_dfa_bit) != 0;
_recursive_state._push_dfa = *(_ptr + push_dfa_index);
}
void bol_start_state (const false_ &)
{
// Do nothing
}
void bol_start_state (const true_ &)
{
if (_bol_state._bol)
{
const id_type state_ = *_dfa;
if (state_)
{
_ptr = &_dfa[state_ * _dfa_alphabet];
}
}
}
template<typename char_type>
bool eol (const char_type, const false_ &)
{
return false;
}
template<typename char_type>
bool eol (const char_type curr_, const true_ &)
{
bool ret_ = false;
_eol_state._EOL_state = _ptr[eol_index];
ret_ = _eol_state._EOL_state && curr_ == '\n';
if (ret_)
{
_ptr = &_dfa[_eol_state._EOL_state * _dfa_alphabet];
}
return ret_;
}
template<typename char_type>
id_type next_char (const char_type prev_char_, const false_ &)
{
const id_type state_= _ptr[_lookup
[static_cast<index_type>(prev_char_)]];
if (state_ != 0)
{
_ptr = &_dfa[state_ * _dfa_alphabet];
}
return state_;
}
template<typename char_type>
id_type next_char (const char_type prev_char_, const true_ &)
{
const std::size_t bytes_ = sizeof (char_type) < 3 ?
sizeof (char_type) : 3;
const std::size_t shift_[] = {0, 8, 16};
id_type state_= 0;
for (std::size_t i_ = 0; i_ < bytes_; ++i_)
{
state_ = _ptr[_lookup[static_cast<unsigned char>((prev_char_ >>
shift_[bytes_ - 1 - i_]) & 0xff)]];
if (state_ == 0)
{
break;
}
_ptr = &_dfa[state_ * _dfa_alphabet];
}
return state_;
}
template<typename char_type>
void bol (const char_type, const false_ &)
{
// Do nothing
}
template<typename char_type>
void bol (const char_type prev_char_, const true_ &)
{
_bol_state._bol = prev_char_ == '\n';
}
void eol (const id_type, const false_ &)
{
// Do nothing
}
void eol (const id_type err_val_, const true_ &)
{
_eol_state._EOL_state = err_val_;
}
void reset_start_state (const false_ &)
{
// Do nothing
}
void reset_start_state (const true_ &)
{
_multi_state_state._start_state = *(_ptr + next_dfa_index);
}
void reset_end_bol (const false_ &)
{
// Do nothing
}
void reset_end_bol (const true_ &)
{
_bol_state._end_bol = _bol_state._bol;
}
template<typename iter_type>
void end_state (iter_type &end_token_, iter_type &curr_)
{
if (*_ptr)
{
_end_state = true;
reset_end_bol (bool_<(flags & bol_bit) != 0> ());
_id = *(_ptr + id_index);
_uid = *(_ptr + user_id_index);
reset_recursive (bool_<(flags & recursive_bit) != 0> ());
reset_start_state (bool_<(flags & multi_state_bit) != 0> ());
end_token_ = curr_;
}
}
template<typename iter_type, typename char_type>
void check_eol (iter_type &, iter_type &, const id_type,
const char_type, const false_ &)
{
// Do nothing
}
template<typename iter_type, typename char_type>
void check_eol (iter_type &end_token_, iter_type &curr_,
const id_type npos, const char_type eoi_, const true_ &)
{
if (_eol_state._EOL_state != npos && curr_ == eoi_)
{
_eol_state._EOL_state = _ptr[eol_index];
if (_eol_state._EOL_state)
{
_ptr = &_dfa[_eol_state._EOL_state * _dfa_alphabet];
end_state (end_token_, curr_);
}
}
}
template<typename results>
void pop (results &, const false_ &)
{
// Nothing to do
}
template<typename results>
void pop (results &results_, const true_ &)
{
if (_recursive_state._pop)
{
_multi_state_state._start_state = results_.stack.top ().first;
results_.stack.pop ();
}
else if (_recursive_state._push_dfa != results::npos ())
{
results_.stack.push (typename results::id_type_pair
(_recursive_state._push_dfa, _id));
}
}
template<typename results>
bool id_eoi (const id_type eoi_, const results &, const false_ &)
{
return _id == eoi_;
}
template<typename results>
bool id_eoi (const id_type eoi_, const results &results_, const true_ &)
{
return _id == eoi_ || (_recursive_state._pop &&
!results_.stack.empty () && results_.stack.top ().second == eoi_);
}
void start_state (id_type &, const false_ &)
{
// Do nothing
}
void start_state (id_type &start_state_, const true_ &)
{
start_state_ = _multi_state_state._start_state;
}
void bol (bool &, const false_ &)
{
// Do nothing
}
void bol (bool &end_bol_, const true_ &)
{
end_bol_ = _bol_state._end_bol;
}
};
template<typename results>
void inc_end (results &, const false_ &)
{
// Do nothing
}
template<typename results>
void inc_end (results &results_, const true_ &)
{
++results_.end;
}
template<typename iter_type, std::size_t flags, typename id_type,
typename results, bool compressed, bool recursive>
void next (const basic_state_machine<typename std::iterator_traits
<iter_type>::value_type, id_type> &sm_,
results &results_, const bool_<compressed> &compressed_,
const bool_<recursive> &recursive_)
{
const basic_internals<id_type> &internals_ = sm_.data ();
typename results::iter_type end_token_ = results_.end;
skip:
typename results::iter_type curr_ = results_.end;
results_.start = curr_;
again:
if (curr_ == results_.eoi)
{
results_.id = internals_._eoi;
results_.user_id = results::npos ();
return;
}
lookup_state<id_type, typename results::index_type, flags> lu_state_
(internals_, results_.bol, results_.state);
lu_state_.bol_start_state (bool_<(flags & bol_bit) != 0> ());
while (curr_ != results_.eoi)
{
if (!lu_state_.eol (*curr_, bool_<(flags & eol_bit) != 0> ()))
{
const typename results::char_type prev_char_ = *curr_++;
const id_type state_ = lu_state_.next_char (prev_char_,
compressed_);
lu_state_.bol (prev_char_, bool_<(flags & bol_bit) != 0> ());
if (state_ == 0)
{
lu_state_.eol (results::npos (),
bool_<(flags & eol_bit) != 0> ());
break;
}
}
lu_state_.end_state (end_token_, curr_);
}
lu_state_.check_eol (end_token_, curr_, results::npos (), results_.eoi,
bool_<(flags & eol_bit) != 0> ());
if (lu_state_._end_state)
{
// Return longest match
lu_state_.pop (results_, recursive_);
lu_state_.start_state (results_.state,
bool_<(flags & multi_state_bit) != 0> ());
lu_state_.bol (results_.bol, bool_<(flags & bol_bit) != 0> ());
results_.end = end_token_;
if (lu_state_._id == sm_.skip ()) goto skip;
if (lu_state_.id_eoi (internals_._eoi, results_, recursive_))
{
curr_ = end_token_;
goto again;
}
}
else
{
results_.end = end_token_;
results_.bol = *results_.end == '\n';
results_.start = results_.end;
// No match causes char to be skipped
inc_end (results_, bool_<(flags & advance_bit) != 0> ());
lu_state_._id = results::npos ();
lu_state_._uid = results::npos ();
}
results_.id = lu_state_._id;
results_.user_id = lu_state_._uid;
}
}
template<typename iter_type, typename id_type, std::size_t flags>
void lookup (const basic_state_machine<typename std::iterator_traits
<iter_type>::value_type, id_type> &sm_,
match_results<iter_type, id_type, flags> &results_)
{
// If this asserts, you have either not defined all the correct
// flags, or you should be using recursive_match_results instead
// of match_results.
assert ((sm_.data ()._features & flags) == sm_.data ()._features);
detail::next<iter_type, flags, id_type> (sm_, results_, bool_<(sizeof
(typename std::iterator_traits<iter_type>::value_type) > 1)> (),
false_ ());
}
template<typename iter_type, typename id_type, std::size_t flags>
void lookup (const basic_state_machine<typename std::iterator_traits
<iter_type>::value_type, id_type> &sm_,
recursive_match_results<iter_type, id_type, flags> &results_)
{
// If this asserts, you have not defined all the correct flags
assert ((sm_.data ()._features & flags) == sm_.data ()._features);
detail::next<iter_type, flags | recursive_bit, id_type> (sm_, results_,
bool_<(sizeof(typename std::iterator_traits<iter_type>::
value_type) > 1)> (), true_ ());
}
}
#endif