Fix typo in grammar.

Add lexertl.
This commit is contained in:
Markus Hauschild
2013-05-31 21:35:44 +02:00
parent 7be9545e02
commit f3d3389f54
44 changed files with 12465 additions and 2 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,100 @@
// re_token.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_RE_TOKEN_HPP
#define LEXERTL_RE_TOKEN_HPP
#include "../../string_token.hpp"
namespace lexertl
{
namespace detail
{
// Categories of token handled while parsing a regex.
// The numeric values index the tables in basic_re_token::precedence() and
// basic_re_token::precedence_string(), so the order below must not change.
enum token_type
{
    BEGIN,
    REGEX,
    OREXP,
    SEQUENCE,
    SUB,
    EXPRESSION,
    REPEAT,
    DUP,
    OR,             // |
    CHARSET,
    BOL,            // ^
    EOL,            // $
    MACRO,
    OPENPAREN,      // (
    CLOSEPAREN,     // )
    OPT,            // ?
    AOPT,           // ??
    ZEROORMORE,     // *
    AZEROORMORE,    // *?
    ONEORMORE,      // +
    AONEORMORE,     // +?
    REPEATN,        // {n[,[m]]}
    AREPEATN,       // {n[,[m]]}?
    END
};
// A single regex token: its category, optional raw text and, for CHARSET
// tokens, the set of characters it stands for.
// input_char_type is the character type of the text kept in _extra;
// char_type is the character type of the string_token kept in _str.
template<typename input_char_type, typename char_type>
struct basic_re_token
{
typedef basic_string_token<char_type> string_token;
typedef std::basic_string<input_char_type> string;
// Category of this token (see token_type).
token_type _type;
// Raw text attached to the token. The tokeniser stores macro names and the
// digits/comma of {n[,m]} repeat counts here.
string _extra;
// Character set carried by CHARSET tokens.
string_token _str;
// Creates an empty token of the given type (BEGIN by default).
basic_re_token (const token_type type_ = BEGIN) :
_type (type_),
_extra (),
_str ()
{
}
// Resets the token to an empty BEGIN token.
void clear ()
{
_type = BEGIN;
_extra.clear ();
_str.clear ();
}
// Member-wise copy of rhs_; returns *this.
basic_re_token &operator = (const basic_re_token &rhs_)
{
_type = rhs_._type;
_extra = rhs_._extra;
_str = rhs_._str;
return *this;
}
// Looks up the table entry for (this token's type, type_): the row is
// _type, the column is type_. The result is one of '<', '=', '>' or ' '.
// NOTE(review): these look like operator-precedence relations consumed by
// the parser, with ' ' marking a pair that has no relation - confirm
// against the parser, which is not part of this file.
// Neither index is range checked.
char precedence (const token_type type_) const
{
// Moved in here for Solaris compiler.
static const char precedence_table_[END + 1][END + 1] = {
// Columns follow token_type order (REPEATN "{n}" comes before AREPEATN "{n}?").
// BEG, REG, ORE, SEQ, SUB, EXP, RPT, DUP, | , CHR, BOL, EOL, MCR, ( , ) , ? , ?? , * , *? , + , +?, {n}, {n}?, END
/*BEGIN*/{' ', '<', '<', '<', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/*REGEX*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/*OREXP*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', '>', '>', '>', '>', ' ', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* SEQ */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', ' ', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* SUB */{' ', ' ', ' ', ' ', ' ', '=', '<', ' ', '>', '<', '<', '<', '<', '<', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/*EXPRE*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* RPT */{' ', ' ', ' ', ' ', ' ', ' ', ' ', '=', '>', '>', '>', '>', '>', '>', '>', '<', '<', '<', '<', '<', '<', '<', '<', '>'},
/*DUPLI*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* | */{' ', ' ', ' ', '=', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '},
/*CHARA*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
/* BOL */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
/* EOL */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
/*MACRO*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
/* ( */{' ', '=', '<', '<', '<', '<', '<', ' ', ' ', '<', '<', '<', '<', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '},
/* ) */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>', '>'},
/* ? */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* ?? */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* * */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* *? */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* + */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* +? */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/*{n,m}*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', '<', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/*{nm}?*/{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>', '>', '>', '>', '>', '>', '>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '>'},
/* END */{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '}
};
return precedence_table_[_type][type_];
}
// Returns a printable name for this token's type; the array is indexed by
// _type and follows token_type order.
const char *precedence_string () const
{
// Moved in here for Solaris compiler.
static const char *precedence_strings_[END + 1] =
{"BEGIN", "REGEX", "OREXP", "SEQUENCE", "SUB", "EXPRESSION",
"REPEAT", "DUPLICATE", "|", "CHARSET", "^", "$", "MACRO", "(", ")",
"?", "??", "*", "*?", "+", "+?", "{n[,[m]]}", "{n[,[m]]}?", "END"};
return precedence_strings_[_type];
}
};
}
}
#endif

View File

@@ -0,0 +1,829 @@
// re_tokeniser.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_RE_TOKENISER_HPP
#define LEXERTL_RE_TOKENISER_HPP
#include <cstring>
#include "re_token.hpp"
#include "../../runtime_error.hpp"
#include "../../size_t.hpp"
#include <sstream>
#include "../../string_token.hpp"
#include "re_tokeniser_helper.hpp"
namespace lexertl
{
namespace detail
{
template<typename rules_char_type, typename char_type, typename id_type>
class basic_re_tokeniser
{
public:
typedef basic_re_token<rules_char_type, char_type> re_token;
typedef basic_re_tokeniser_helper<rules_char_type, char_type, id_type>
tokeniser_helper;
typedef typename tokeniser_helper::char_state char_state;
typedef typename tokeniser_helper::state state;
typedef basic_string_token<char_type> string_token;
// Reads the next token of the regex held by state_ into *token_.
//   lhs_   - previously read token; only forwarded to open_curly(), where
//            the {-} and {+} operators read and modify it.
//   state_ - read position, option flags, string/paren bookkeeping.
//   token_ - cleared, then filled in with the token that was read.
// Throws runtime_error on an unterminated string, unbalanced parentheses,
// the unsupported '/' operator, or any error raised by the helpers called.
static void next (re_token *lhs_, state &state_, re_token *token_)
{
rules_char_type ch_ = 0;
bool eos_ = state_.next (ch_);
bool skipped_ = false;
token_->clear ();
// Keep going until one full pass neither skipped a comment nor whitespace;
// each pass also consumes any run of '"' characters, toggling string mode.
do
{
// string begin/end
while (!eos_ && ch_ == '"')
{
state_._in_string ^= 1;
eos_ = state_.next (ch_);
}
// (?# ...)
skipped_ = comment (eos_, ch_, state_);
// skip_ws set
skipped_ |= skip (eos_, ch_, state_);
} while (skipped_);
if (eos_)
{
// The input ran out: an open string or an open parenthesis is an error,
// otherwise report END.
if (state_._in_string)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
ss_ << "Unexpected end of regex (missing '\"') in rule id " <<
state_._id << '.';
throw runtime_error (ss_.str ());
}
if (state_._paren_count)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
ss_ << "Unexpected end of regex (missing ')') in rule id " <<
state_._id << '.';
throw runtime_error (ss_.str ());
}
token_->_type = END;
}
else
{
if (ch_ == '\\')
{
// Even if we are in a string, respect escape sequences...
token_->_type = CHARSET;
escape (state_, token_->_str);
}
else if (state_._in_string)
{
// All other meta characters lose their special meaning
// inside a string.
// Note: the character is inserted as is; the icase flag is not
// consulted on this path.
token_->_type = CHARSET;
token_->_str.insert (typename string_token::range (ch_, ch_));
}
else
{
// Not an escape sequence and not inside a string, so
// check for meta characters.
switch (ch_)
{
case '(':
token_->_type = OPENPAREN;
++state_._paren_count;
// Handles an optional "?<options>:" prefix and the flags stack.
read_options (state_);
break;
case ')':
--state_._paren_count;
if (state_._paren_count < 0)
{
std::ostringstream ss_;
ss_ << "Number of open parenthesis < 0 "
"at index " << state_.index () - 1 <<
" in rule id " << state_._id << '.';
throw runtime_error (ss_.str ());
}
token_->_type = CLOSEPAREN;
// Restore the flags saved by read_options(), if any were saved.
if (!state_._flags_stack.empty ())
{
state_._flags = state_._flags_stack.top ();
state_._flags_stack.pop ();
}
break;
// For '?', '*' and '+': an immediately following '?' is consumed and
// selects the "A" variant of the token (AOPT, AZEROORMORE, AONEORMORE).
case '?':
if (!state_.eos () && *state_._curr == '?')
{
token_->_type = AOPT;
state_.increment ();
}
else
{
token_->_type = OPT;
}
break;
case '*':
if (!state_.eos () && *state_._curr == '?')
{
token_->_type = AZEROORMORE;
state_.increment ();
}
else
{
token_->_type = ZEROORMORE;
}
break;
case '+':
if (!state_.eos () && *state_._curr == '?')
{
token_->_type = AONEORMORE;
state_.increment ();
}
else
{
token_->_type = ONEORMORE;
}
break;
case '{':
// {-}, {+}, {n[,m]} or {MACRO}: decided by the character after '{'.
open_curly (lhs_, state_, token_);
break;
case '|':
token_->_type = OR;
break;
case '^':
// BOL only when this '^' was the very first character of a non-macro
// regex (the character just consumed sits at _start); otherwise it is
// treated as a literal '^'.
if (!state_._macro && state_._curr - 1 == state_._start)
{
token_->_type = BOL;
}
else
{
token_->_type = CHARSET;
token_->_str.insert (typename string_token::range
(ch_, ch_));
}
break;
case '$':
// EOL only when this '$' was the last character of a non-macro regex;
// otherwise it is treated as a literal '$'.
if (!state_._macro && state_._curr == state_._end)
{
token_->_type = EOL;
}
else
{
token_->_type = CHARSET;
token_->_str.insert (typename string_token::range
(ch_, ch_));
}
break;
case '.':
{
// Built by negating either the empty set or, when dot_not_newline is
// set, the set holding only '\n'.
// NOTE(review): assumes string_token::negate() complements the set -
// confirm in string_token.hpp.
token_->_type = CHARSET;
if (state_._flags & dot_not_newline)
{
token_->_str.insert (typename string_token::range
('\n', '\n'));
}
token_->_str.negate ();
break;
}
case '[':
{
token_->_type = CHARSET;
tokeniser_helper::charset (state_, token_->_str);
break;
}
case '/':
{
std::ostringstream ss_;
ss_ << "Lookahead ('/') is not supported yet in " <<
"rule id " << state_._id << '.';
throw runtime_error (ss_.str ());
// Not reached: the throw above always leaves the function.
break;
}
default:
// Ordinary character. With icase set and a cased letter, both the
// upper and lower case forms are inserted; otherwise just ch_.
token_->_type = CHARSET;
if ((state_._flags & icase) &&
(std::isupper (ch_, state_._locale) ||
std::islower (ch_, state_._locale)))
{
char_type upper_ = std::toupper
(ch_, state_._locale);
char_type lower_ = std::tolower
(ch_, state_._locale);
token_->_str.insert (typename string_token::range
(upper_, upper_));
token_->_str.insert (typename string_token::range
(lower_, lower_));
}
else
{
token_->_str.insert (typename string_token::range
(ch_, ch_));
}
break;
}
}
}
}
private:
// Skips a "(?# ... )" comment if one starts at the current character.
//   eos_   - in: whether ch_ is valid; out: end-of-input status after the skip.
//   ch_    - in: current character; out: first character after the comment.
//   state_ - read position; advanced past the comment.
// Parentheses inside the comment are counted, so the comment ends at the
// ')' that balances the opening '('.
// Returns true when a comment was skipped, false when nothing was consumed.
// Throws runtime_error if the input ends inside the comment.
static bool comment (bool &eos_, rules_char_type &ch_, state &state_)
{
    // ch_ already holds '(' ; "?#" must be the next two unread characters,
    // and we must not be inside a quoted string.
    const bool opens_comment_ = !eos_ && !state_._in_string && ch_ == '(' &&
        !state_.eos () && *state_._curr == '?' &&
        state_._curr + 1 < state_._end && *(state_._curr + 1) == '#';

    if (!opens_comment_)
    {
        return false;
    }

    std::size_t depth_ = 1;

    // Step over "?#".
    state_.increment ();
    state_.increment ();

    for (;;)
    {
        eos_ = state_.next (ch_);

        if (ch_ == '(')
        {
            ++depth_;
        }
        else if (ch_ == ')')
        {
            --depth_;
        }

        if (eos_ || (ch_ == ')' && depth_ == 0))
        {
            break;
        }
    }

    if (eos_)
    {
        std::ostringstream ss_;

        // Pointless returning index if at end of string
        ss_ << "Unexpected end of regex (unterminated comment) " <<
            "in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    // Load the character that follows the closing ')'.
    eos_ = state_.next (ch_);
    return true;
}
// When the skip_ws flag is set and we are not inside a quoted string, skips
// any run of whitespace characters and C style "/* ... */" comments that
// starts at the current character.
//   eos_   - in: whether ch_ is valid; out: end-of-input status after the skip.
//   ch_    - in: current character; out: first character that was not skipped.
//   state_ - read position; advanced past everything skipped.
// Returns true when at least one character was skipped.
// Throws runtime_error if the input ends inside a C style comment.
static bool skip (bool &eos_, rules_char_type &ch_, state &state_)
{
    if (eos_ || !(state_._flags & skip_ws) || state_._in_string)
    {
        return false;
    }

    bool consumed_ = false;

    for (;;)
    {
        // ch_ holds '/', so a comment needs '*' as the next unread character.
        const bool block_comment_ = ch_ == '/' && !state_.eos () &&
            *state_._curr == '*';

        if (block_comment_)
        {
            // Step over the '*' and scan for the closing "*/".
            state_.increment ();
            eos_ = state_.next (ch_);

            while (!eos_ && !(ch_ == '*' && !state_.eos () &&
                *state_._curr == '/'))
            {
                eos_ = state_.next (ch_);
            }

            if (eos_)
            {
                std::ostringstream ss_;

                // Pointless returning index if at end of string
                ss_ << "Unexpected end of regex (unterminated " <<
                    "C style comment) in rule id " <<
                    state_._id << '.';
                throw runtime_error (ss_.str ());
            }

            // Step over the closing '/' and load what follows it.
            state_.increment ();
            eos_ = state_.next (ch_);
            consumed_ = true;
            continue;
        }

        const bool blank_ = ch_ == ' ' || ch_ == '\t' ||
            ch_ == '\n' || ch_ == '\r' || ch_ == '\f' || ch_ == '\v';

        if (!blank_)
        {
            break;
        }

        eos_ = state_.next (ch_);
        consumed_ = true;
    }

    return consumed_;
}
// Called just after a '(' has been consumed. If the group starts with '?',
// saves the current flags on the flags stack and then applies the option
// letters up to (and including) the ':' terminator:
//   i  - sets icase            (-i clears it)
//   s  - clears dot_not_newline (-s sets it)
//   x  - sets skip_ws          (-x clears it)
//   -  - toggles negation for the next option letter
// If the group has no '?' prefix but the flags stack is already non-empty,
// the current flags are still pushed, so that every ')' seen while the stack
// is in use has a matching entry to pop.
// Throws runtime_error on an unknown option character.
static void read_options (state &state_)
{
    if (state_.eos () || *state_._curr != '?')
    {
        if (!state_._flags_stack.empty ())
        {
            state_._flags_stack.push (state_._flags);
        }

        return;
    }

    // Step over '?' and load the first option character.
    state_.increment ();

    rules_char_type opt_ = 0;
    bool finished_ = state_.next (opt_);
    bool invert_ = false;

    state_._flags_stack.push (state_._flags);

    for (; !finished_ && opt_ != ':'; finished_ = state_.next (opt_))
    {
        if (opt_ == '-')
        {
            invert_ ^= 1;
        }
        else if (opt_ == 'i')
        {
            state_._flags = invert_ ? (state_._flags & ~icase) :
                (state_._flags | icase);
            invert_ = false;
        }
        else if (opt_ == 's')
        {
            // 's' works the other way round: it switches dot_not_newline OFF.
            state_._flags = invert_ ? (state_._flags | dot_not_newline) :
                (state_._flags & ~dot_not_newline);
            invert_ = false;
        }
        else if (opt_ == 'x')
        {
            state_._flags = invert_ ? (state_._flags & ~skip_ws) :
                (state_._flags | skip_ws);
            invert_ = false;
        }
        else
        {
            std::ostringstream ss_;

            ss_ << "Unknown option at index " <<
                state_.index () - 1 << " in rule id " <<
                state_._id << '.';
            throw runtime_error (ss_.str ());
        }
    }

    // End of string handler will handle early termination
}
// Handles an escape sequence (the leading '\\' has already been consumed)
// and adds the characters it denotes to token_.
// tokeniser_helper::escape_sequence() either yields a single character or a
// pointer to charset text; in the latter case that text is parsed with
// tokeniser_helper::charset(), starting after its first character.
static void escape (state &state_, string_token &token_)
{
    char_type single_ = 0;
    std::size_t length_ = 0;
    const char *charset_text_ = tokeniser_helper::escape_sequence (state_,
        single_, length_);

    if (!charset_text_)
    {
        token_.insert (typename string_token::range (single_, single_));
        return;
    }

    char_state nested_ (charset_text_ + 1, charset_text_ + length_,
        state_._id, state_._flags, state_._locale, false);

    tokeniser_helper::charset (nested_, token_);
}
// Dispatches on the character that follows an already consumed '{':
//   '-'      -> charset_difference()   ({-})
//   '+'      -> charset_union()        ({+})
//   '0'..'9' -> repeat_n()             ({n[,[m]]})
//   other    -> macro()                ({NAME})
// The character itself is left unread for the handler.
// Throws runtime_error if the regex ends right after the '{'.
static void open_curly (re_token *lhs_, state &state_,
    re_token *token_)
{
    if (state_.eos ())
    {
        std::ostringstream ss_;

        // Pointless returning index if at end of string
        ss_ << "Unexpected end of regex (missing '}') in rule id " <<
            state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    if (*state_._curr == '-')
    {
        charset_difference (lhs_, state_, token_);
        return;
    }

    if (*state_._curr == '+')
    {
        charset_union (lhs_, state_, token_);
        return;
    }

    const bool digit_ = *state_._curr >= '0' && *state_._curr <= '9';

    if (digit_)
    {
        repeat_n (state_, token_);
    }
    else
    {
        macro (state_, token_);
    }
}
// Implements the {-} operator: removes the characters of the charset that
// follows "{-}" from the charset in *lhs_, then reads the token after that
// into *token_.
//   lhs_   - must be a CHARSET token; its _str is modified in place.
//   state_ - positioned on the '-' (the '{' has already been consumed).
//   token_ - receives the token following the right-hand charset.
// Throws runtime_error if lhs_ is not a CHARSET, "-}" is malformed, the
// following token is not a CHARSET, or the result is an empty charset.
static void charset_difference (re_token *lhs_, state &state_,
    re_token *token_)
{
    if (lhs_->_type != CHARSET)
    {
        std::ostringstream ss_;

        ss_ << "CHARSET must precede {-} at index " <<
            state_.index () - 1 << " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    rules_char_type closing_ = 0;

    // First read consumes the '-', the second should deliver the '}'.
    state_.next (closing_);

    const bool ran_out_ = state_.next (closing_);

    if (ran_out_)
    {
        std::ostringstream ss_;

        // Pointless returning index if at end of string
        ss_ << "Unexpected end of regex (missing '}') in rule id " <<
            state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    if (closing_ != '}')
    {
        std::ostringstream ss_;

        ss_ << "Missing '}' at index " << state_.index () - 1 <<
            " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    re_token subtrahend_;

    next (lhs_, state_, &subtrahend_);

    if (subtrahend_._type != CHARSET)
    {
        std::ostringstream ss_;

        ss_ << "CHARSET must follow {-} at index " <<
            state_.index () - 1 << " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    lhs_->_str.remove (subtrahend_._str);

    if (lhs_->_str.empty ())
    {
        std::ostringstream ss_;

        ss_ << "Empty charset created by {-} at index " <<
            state_.index () - 1 << " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    // The operator itself yields no token: hand back whatever comes next.
    next (lhs_, state_, token_);
}
// Implements the {+} operator: adds the characters of the charset that
// follows "{+}" to the charset in *lhs_, then reads the token after that
// into *token_.
//   lhs_   - must be a CHARSET token; its _str is modified in place.
//   state_ - positioned on the '+' (the '{' has already been consumed).
//   token_ - receives the token following the right-hand charset.
// Throws runtime_error if lhs_ is not a CHARSET, "+}" is malformed, or the
// following token is not a CHARSET.
static void charset_union (re_token *lhs_, state &state_,
    re_token *token_)
{
    if (lhs_->_type != CHARSET)
    {
        std::ostringstream ss_;

        ss_ << "CHARSET must precede {+} at index " <<
            state_.index () - 1 << " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    rules_char_type closing_ = 0;

    // First read consumes the '+', the second should deliver the '}'.
    state_.next (closing_);

    const bool ran_out_ = state_.next (closing_);

    if (ran_out_)
    {
        std::ostringstream ss_;

        // Pointless returning index if at end of string
        ss_ << "Unexpected end of regex (missing '}') in rule id " <<
            state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    if (closing_ != '}')
    {
        std::ostringstream ss_;

        ss_ << "Missing '}' at index " << state_.index () - 1 <<
            " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    re_token addend_;

    next (lhs_, state_, &addend_);

    if (addend_._type != CHARSET)
    {
        std::ostringstream ss_;

        ss_ << "CHARSET must follow {+} at index " <<
            state_.index () - 1 << " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    lhs_->_str.insert (addend_._str);

    // The operator itself yields no token: hand back whatever comes next.
    next (lhs_, state_, token_);
}
// Parses a counted repeat. The '{' has already been consumed and the next
// unread character is a digit (see open_curly()).
// SYNTAX:
// {n[,[n]]}
// SEMANTIC RULES:
// {0} - INVALID (throw exception)
// {0,} = *
// {0,0} - INVALID (throw exception)
// {0,1} = ?
// {1,} = +
// {min,max} where min == max - {min}
// {min,max} where max < min - INVALID (throw exception)
// On success token_->_type is REPEATN/AREPEATN, or one of the simpler
// equivalents listed above (or its "A" variant when a '?' follows the '}').
// token_->_extra collects the digits read (plus the ',' when one is kept).
// Throws runtime_error for the invalid forms above, for a missing '}' and
// when the regex ends before the '}'.
// NOTE(review): min_/max_ are accumulated without an overflow check.
static void repeat_n (state &state_, re_token *token_)
{
rules_char_type ch_ = 0;
bool eos_ = state_.next (ch_);
std::size_t min_ = 0;
std::size_t max_ = 0;
// Read the minimum count.
while (!eos_ && ch_ >= '0' && ch_ <= '9')
{
min_ *= 10;
min_ += ch_ - '0';
token_->_extra += ch_;
eos_ = state_.next (ch_);
}
if (eos_)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
ss_ << "Unexpected end of regex (missing '}') in rule id " <<
state_._id << '.';
throw runtime_error (ss_.str ());
}
// min_max_: an explicit, different maximum was given.
// repeatn_: the result stays a REPEATN/AREPEATN token (cleared when the
// repeat is rewritten as '*', '+' or '?').
bool min_max_ = false;
bool repeatn_ = true;
if (ch_ == ',')
{
token_->_extra += ch_;
eos_ = state_.next (ch_);
if (eos_)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
ss_ << "Unexpected end of regex (missing '}') in rule id " <<
state_._id << '.';
throw runtime_error (ss_.str ());
}
if (ch_ == '}')
{
// "{n,}": open ended; max_ stays 0 and min_max_ stays false.
// Small optimisation: Check for '*' equivalency.
if (min_ == 0)
{
token_->_type = ZEROORMORE;
repeatn_ = false;
}
// Small optimisation: Check for '+' equivalency.
else if (min_ == 1)
{
token_->_type = ONEORMORE;
repeatn_ = false;
}
}
else
{
if (ch_ < '0' || ch_ > '9')
{
std::ostringstream ss_;
ss_ << "Missing '}' at index " << state_.index () - 1 <<
" in rule id " << state_._id << '.';
throw runtime_error (ss_.str ());
}
min_max_ = true;
// Read the maximum count (ch_ is known to be a digit here).
do
{
max_ *= 10;
max_ += ch_ - '0';
token_->_extra += ch_;
eos_ = state_.next (ch_);
} while (!eos_ && ch_ >= '0' && ch_ <= '9');
if (eos_)
{
std::ostringstream ss_;
// Pointless returning index if at end of string
ss_ << "Unexpected end of regex (missing '}') "
"in rule id " << state_._id << '.';
throw runtime_error (ss_.str ());
}
// Small optimisation: Check for '?' equivalency.
if (min_ == 0 && max_ == 1)
{
token_->_type = OPT;
repeatn_ = false;
}
// Small optimisation: if min == max, then min.
else if (min_ == max_)
{
// Drop the ',' and everything after it from _extra, leaving just "min".
// Resetting max_ to 0 also makes {0,0} fail the zero-repeat check below.
token_->_extra.erase (token_->_extra.find (','));
min_max_ = false;
max_ = 0;
}
}
}
if (ch_ != '}')
{
std::ostringstream ss_;
ss_ << "Missing '}' at index " << state_.index () - 1 <<
" in rule id " << state_._id << '.';
throw runtime_error (ss_.str ());
}
if (repeatn_)
{
// SEMANTIC VALIDATION follows:
// NOTE: {0,} has already become *
// therefore we don't check for a comma.
if (min_ == 0 && max_ == 0)
{
std::ostringstream ss_;
ss_ << "Cannot have exactly zero repeats preceding index " <<
state_.index () << " in rule id " << state_._id << '.';
throw runtime_error (ss_.str ());
}
if (min_max_ && max_ < min_)
{
std::ostringstream ss_;
ss_ << "Max less than min preceding index " <<
state_.index () << " in rule id " << state_._id << '.';
throw runtime_error (ss_.str ());
}
// A '?' directly after the '}' is consumed and selects AREPEATN.
if (!state_.eos () && *state_._curr == '?')
{
token_->_type = AREPEATN;
state_.increment ();
}
else
{
token_->_type = REPEATN;
}
}
// The rewritten forms get the same treatment: a trailing '?' is consumed
// and switches the token to its "A" variant.
else if (token_->_type == ZEROORMORE)
{
if (!state_.eos () && *state_._curr == '?')
{
token_->_type = AZEROORMORE;
state_.increment ();
}
}
else if (token_->_type == ONEORMORE)
{
if (!state_.eos () && *state_._curr == '?')
{
token_->_type = AONEORMORE;
state_.increment ();
}
}
else if (token_->_type == OPT)
{
if (!state_.eos () && *state_._curr == '?')
{
token_->_type = AOPT;
state_.increment ();
}
}
}
// Parses a macro reference. The '{' has already been consumed; the name is
// read up to the closing '}' and stored in token_->_extra, and the token
// type is set to MACRO.
// A name starts with '_' or an ASCII letter and continues with '_', '-',
// ASCII letters or digits.
// Throws runtime_error on an invalid first character, a missing '}' or when
// the regex ends before the '}'.
static void macro (state &state_, re_token *token_)
{
    rules_char_type ch_ = 0;

    state_.next (ch_);

    const bool valid_first_ = ch_ == '_' || (ch_ >= 'A' && ch_ <= 'Z') ||
        (ch_ >= 'a' && ch_ <= 'z');

    if (!valid_first_)
    {
        std::ostringstream ss_;

        ss_ << "Invalid MACRO name at index " << state_.index () - 1 <<
            " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    for (;;)
    {
        token_->_extra += ch_;

        if (state_.next (ch_))
        {
            std::ostringstream ss_;

            // Pointless returning index if at end of string
            ss_ << "Unexpected end of regex " <<
                "(missing '}') in rule id " << state_._id << '.';
            throw runtime_error (ss_.str ());
        }

        const bool name_char_ = ch_ == '_' || ch_ == '-' ||
            (ch_ >= 'A' && ch_ <= 'Z') || (ch_ >= 'a' && ch_ <= 'z') ||
            (ch_ >= '0' && ch_ <= '9');

        if (!name_char_)
        {
            break;
        }
    }

    if (ch_ != '}')
    {
        std::ostringstream ss_;

        ss_ << "Missing '}' at index " << state_.index () - 1 <<
            " in rule id " << state_._id << '.';
        throw runtime_error (ss_.str ());
    }

    token_->_type = MACRO;
}
};
}
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,115 @@
// re_tokeniser_state.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_RE_TOKENISER_STATE_HPP
#define LEXERTL_RE_TOKENISER_STATE_HPP
#include "../../char_traits.hpp"
#include "../../enums.hpp"
#include <locale>
#include "../../size_t.hpp"
#include <stack>
namespace lexertl
{
namespace detail
{
// Read cursor over the text of one regex, together with the option flags
// and the nesting bookkeeping the tokeniser updates while reading it.
//
// Fixes relative to the previous revision (all were latent, because member
// functions of a class template are only compiled when used):
//  - the copy constructor now initialises every member in its initialiser
//    list instead of assigning to not-yet-initialised members;
//  - operator = now returns *this (it used to fall off the end of a
//    non-void function);
//  - assign() no longer tries to "return this" from a void function;
//  - _start and _end lost their top-level const so that assign() and
//    operator = can actually copy them (the pointed-to text is still const);
//  - index() and eos() are const, so they can be called on a const state.
template<typename ch_type, typename id_type>
struct basic_re_tokeniser_state
{
    typedef ch_type char_type;
    typedef typename basic_char_traits<char_type>::index_type index_type;

    // [_start, _end) is the regex text; it is not owned by this object.
    const char_type *_start;
    const char_type *_end;
    // Next character to be read; advanced by next() and increment().
    const char_type *_curr;
    // Rule id, used in the tokeniser's error messages.
    id_type _id;
    // Currently active option flags.
    std::size_t _flags;
    // Flags saved while option groups are open.
    std::stack<std::size_t> _flags_stack;
    // Locale used for case conversion.
    std::locale _locale;
    // True when the text being read is a macro definition.
    bool _macro;
    // Number of currently open parentheses.
    long _paren_count;
    // True while inside a double-quoted string.
    bool _in_string;
    // Initialised to all bits set; not otherwise touched in this struct.
    id_type _nl_id;

    // Creates a state positioned at start_ with no open parentheses, not
    // inside a string, and an empty flags stack.
    basic_re_tokeniser_state (const char_type *start_,
        const char_type * const end_, id_type id_, const std::size_t flags_,
        const std::locale locale_, const bool macro_) :
        _start (start_),
        _end (end_),
        _curr (start_),
        _id (id_),
        _flags (flags_),
        _flags_stack (),
        _locale (locale_),
        _macro (macro_),
        _paren_count (0),
        _in_string (false),
        _nl_id (static_cast<id_type>(~0))
    {
    }

    // Member-wise copy.
    basic_re_tokeniser_state (const basic_re_tokeniser_state &rhs_) :
        _start (rhs_._start),
        _end (rhs_._end),
        _curr (rhs_._curr),
        _id (rhs_._id),
        _flags (rhs_._flags),
        _flags_stack (rhs_._flags_stack),
        _locale (rhs_._locale),
        _macro (rhs_._macro),
        _paren_count (rhs_._paren_count),
        _in_string (rhs_._in_string),
        _nl_id (rhs_._nl_id)
    {
    }

    // prevent VC++ 7.1 warning:
    const basic_re_tokeniser_state &operator =
        (const basic_re_tokeniser_state &rhs_)
    {
        assign (rhs_);
        return *this;
    }

    // Copies every member of rhs_ into this object.
    void assign (const basic_re_tokeniser_state &rhs_)
    {
        _start = rhs_._start;
        _end = rhs_._end;
        _curr = rhs_._curr;
        _id = rhs_._id;
        _flags = rhs_._flags;
        _flags_stack = rhs_._flags_stack;
        _locale = rhs_._locale;
        _macro = rhs_._macro;
        _paren_count = rhs_._paren_count;
        _in_string = rhs_._in_string;
        _nl_id = rhs_._nl_id;
    }

    // Reads one character into ch_ and advances past it.
    // Returns true (and sets ch_ to 0) when there is nothing left to read,
    // false when a character was delivered.
    inline bool next (char_type &ch_)
    {
        if (_curr >= _end)
        {
            ch_ = 0;
            return true;
        }
        else
        {
            ch_ = *_curr;
            increment ();
            return false;
        }
    }

    // Advances the read position by one character without any bounds check.
    inline void increment ()
    {
        ++_curr;
    }

    // Offset of the read position from the start of the regex.
    inline std::size_t index () const
    {
        return static_cast<std::size_t>(_curr - _start);
    }

    // True when the read position has reached (or passed) the end.
    inline bool eos () const
    {
        return _curr >= _end;
    }
};
}
}
#endif

View File

@@ -0,0 +1,112 @@
// end_node.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_END_NODE_HPP
#define LEXERTL_END_NODE_HPP
#include "node.hpp"
#include "../../size_t.hpp"
namespace lexertl
{
namespace detail
{
// Syntax tree node marking the end of a rule. It is never nullable, is its
// own sole first and last position, has no follow positions, and carries
// the ids / DFA switching data handed to its constructor.
template<typename id_type>
class basic_end_node : public basic_node<id_type>
{
public:
    typedef basic_node<id_type> node;
    typedef typename node::bool_stack bool_stack;
    typedef typename node::const_node_stack const_node_stack;
    typedef typename node::node_ptr_vector node_ptr_vector;
    typedef typename node::node_stack node_stack;
    typedef typename node::node_type node_type;
    typedef typename node::node_vector node_vector;

    // Stores the supplied values and registers this node as its own
    // firstpos and lastpos entry.
    basic_end_node (const id_type id_, const id_type user_id_,
        const id_type next_dfa_, const id_type push_dfa_,
        const bool pop_dfa_) :
        basic_node<id_type> (false),
        _id (id_),
        _user_id (user_id_),
        _next_dfa (next_dfa_),
        _push_dfa (push_dfa_),
        _pop_dfa (pop_dfa_),
        _followpos ()
    {
        this->_firstpos.push_back (this);
        this->_lastpos.push_back (this);
    }

    virtual ~basic_end_node ()
    {
    }

    virtual node_type what_type () const
    {
        return node::END;
    }

    // An end node has no children, so there is nothing to descend into.
    virtual bool traverse (const_node_stack &/*node_stack_*/,
        bool_stack &/*perform_op_stack_*/) const
    {
        return false;
    }

    virtual const node_vector &followpos () const
    {
        // _followpos is always empty..!
        return _followpos;
    }

    virtual bool end_state () const
    {
        return true;
    }

    // Plain accessors for the values given to the constructor.
    virtual id_type id () const { return _id; }
    virtual id_type user_id () const { return _user_id; }
    virtual id_type next_dfa () const { return _next_dfa; }
    virtual id_type push_dfa () const { return _push_dfa; }
    virtual bool pop_dfa () const { return _pop_dfa; }

private:
    id_type _id;
    id_type _user_id;
    id_type _next_dfa;
    id_type _push_dfa;
    bool _pop_dfa;
    node_vector _followpos;

    virtual void copy_node (node_ptr_vector &/*node_ptr_vector_*/,
        node_stack &/*new_node_stack_*/, bool_stack &/*perform_op_stack_*/,
        bool &/*down_*/) const
    {
        // Nothing to do, as end_nodes are not copied.
    }
};
}
}
#endif

View File

@@ -0,0 +1,103 @@
// iteration_node.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_ITERATION_NODE_HPP
#define LEXERTL_ITERATION_NODE_HPP
#include "node.hpp"
namespace lexertl
{
namespace detail
{
// Syntax tree node that wraps a single child (next_) as an iteration. The
// node is always nullable and takes over the child's first and last
// positions.
template<typename id_type>
class basic_iteration_node : public basic_node<id_type>
{
public:
    typedef basic_node<id_type> node;
    typedef typename node::bool_stack bool_stack;
    typedef typename node::const_node_stack const_node_stack;
    typedef typename node::node_ptr_vector node_ptr_vector;
    typedef typename node::node_stack node_stack;
    typedef typename node::node_type node_type;
    typedef typename node::node_vector node_vector;

    // Builds the iteration over next_ (not owned):
    //  - firstpos / lastpos are copied from the child;
    //  - every lastpos node gets all firstpos nodes appended to its
    //    followpos, which is what lets the body repeat;
    //  - greedy_ is passed on to every firstpos node.
    basic_iteration_node (basic_node<id_type> *next_, const bool greedy_) :
        basic_node<id_type> (true),
        _next (next_),
        _greedy (greedy_)
    {
        node_vector &first_ = this->_firstpos;
        node_vector &last_ = this->_lastpos;

        _next->append_firstpos (first_);
        _next->append_lastpos (last_);

        typedef typename node_vector::size_type size_type;

        for (size_type i_ = 0, count_ = last_.size (); i_ < count_; ++i_)
        {
            last_[i_]->append_followpos (first_);
        }

        for (size_type i_ = 0, count_ = first_.size (); i_ < count_; ++i_)
        {
            first_[i_]->greedy (greedy_);
        }
    }

    virtual ~basic_iteration_node ()
    {
    }

    virtual node_type what_type () const
    {
        return node::ITERATION;
    }

    // Schedules the child for a visit and records that this node's own
    // operation is to be performed on the way back up.
    virtual bool traverse (const_node_stack &node_stack_,
        bool_stack &perform_op_stack_) const
    {
        perform_op_stack_.push (true);
        node_stack_.push (_next);
        return true;
    }

private:
    // Not owner of this pointer...
    basic_node<id_type> *_next;
    bool _greedy;

    // Copy step used by basic_node::copy(). When the top of
    // perform_op_stack_ is set, the already copied child on top of
    // new_node_stack_ is replaced by a new iteration node wrapping it;
    // otherwise the caller is told to descend again. The flag is popped in
    // both cases.
    virtual void copy_node (node_ptr_vector &node_ptr_vector_,
        node_stack &new_node_stack_, bool_stack &perform_op_stack_,
        bool &down_) const
    {
        if (!perform_op_stack_.top ())
        {
            down_ = true;
        }
        else
        {
            basic_node<id_type> *child_copy_ = new_node_stack_.top ();

            // Reserve the slot first, then fill it, so the vector already
            // has room for the pointer by the time the node is allocated.
            node_ptr_vector_->push_back
                (static_cast<basic_iteration_node<id_type> *>(0));
            node_ptr_vector_->back () = new basic_iteration_node
                (child_copy_, _greedy);
            new_node_stack_.top () = node_ptr_vector_->back ();
        }

        perform_op_stack_.pop ();
    }

    // No copy construction.
    basic_iteration_node (const basic_iteration_node &);
    // No assignment.
    const basic_iteration_node &operator = (const basic_iteration_node &);
};
}
}
#endif

View File

@@ -0,0 +1,114 @@
// leaf_node.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_LEAF_NODE_HPP
#define LEXERTL_LEAF_NODE_HPP
#include "../../enums.hpp" // null_token
#include "node.hpp"
#include "../../size_t.hpp"
namespace lexertl
{
namespace detail
{
// Syntax tree leaf holding one token id. A leaf built from
// node::null_token() is nullable and has no positions; any other leaf is
// its own sole first and last position.
template<typename id_type>
class basic_leaf_node : public basic_node<id_type>
{
public:
    typedef basic_node<id_type> node;
    typedef typename node::bool_stack bool_stack;
    typedef typename node::const_node_stack const_node_stack;
    typedef typename node::node_ptr_vector node_ptr_vector;
    typedef typename node::node_stack node_stack;
    typedef typename node::node_type node_type;
    typedef typename node::node_vector node_vector;

    // _set_greedy starts out true when greedy_ is false, so a leaf created
    // as non-greedy can never be switched back by greedy (bool).
    basic_leaf_node (const id_type token_, const bool greedy_) :
        basic_node<id_type> (token_ == node::null_token ()),
        _token (token_),
        _set_greedy (!greedy_),
        _greedy (greedy_),
        _followpos ()
    {
        if (this->_nullable)
        {
            return;
        }

        this->_firstpos.push_back (this);
        this->_lastpos.push_back (this);
    }

    virtual ~basic_leaf_node ()
    {
    }

    // Appends every entry of followpos_ to this leaf's follow positions.
    virtual void append_followpos (const node_vector &followpos_)
    {
        _followpos.insert (_followpos.end (),
            followpos_.begin (), followpos_.end ());
    }

    virtual node_type what_type () const
    {
        return node::LEAF;
    }

    // A leaf has no children, so there is nothing to descend into.
    virtual bool traverse (const_node_stack &/*node_stack_*/,
        bool_stack &/*perform_op_stack_*/) const
    {
        return false;
    }

    virtual id_type token () const
    {
        return _token;
    }

    // Sets the greedy flag, but only the first time it is called on a leaf
    // whose flag has not been fixed yet; later calls are ignored.
    virtual void greedy (const bool greedy_)
    {
        if (_set_greedy)
        {
            return;
        }

        _greedy = greedy_;
        _set_greedy = true;
    }

    virtual bool greedy () const
    {
        return _greedy;
    }

    virtual const node_vector &followpos () const
    {
        return _followpos;
    }

    virtual node_vector &followpos ()
    {
        return _followpos;
    }

private:
    id_type _token;
    bool _set_greedy;
    bool _greedy;
    node_vector _followpos;

    // Copy step used by basic_node::copy(): creates a fresh leaf with the
    // same token and greedy flag, registers it in node_ptr_vector_ and
    // pushes it onto new_node_stack_.
    virtual void copy_node (node_ptr_vector &node_ptr_vector_,
        node_stack &new_node_stack_, bool_stack &/*perform_op_stack_*/,
        bool &/*down_*/) const
    {
        // Reserve the slot first, then fill it, so the vector already has
        // room for the pointer by the time the node is allocated.
        node_ptr_vector_->push_back (static_cast<basic_leaf_node *>(0));
        node_ptr_vector_->back () = new basic_leaf_node (_token, _greedy);
        new_node_stack_.push (node_ptr_vector_->back ());
    }
};
}
}
#endif

View File

@@ -0,0 +1,241 @@
// node.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_NODE_HPP
#define LEXERTL_NODE_HPP
#include <assert.h>
#include "../../containers/ptr_vector.hpp"
#include "../../runtime_error.hpp"
#include "../../size_t.hpp"
#include <stack>
#include <vector>
namespace lexertl
{
namespace detail
{
template<typename id_type>
class basic_node
{
public:
enum node_type {LEAF, SEQUENCE, SELECTION, ITERATION, END};
typedef std::stack<bool> bool_stack;
typedef std::stack<basic_node<id_type> *> node_stack;
// stack and vector not owner of node pointers
typedef std::stack<const basic_node<id_type> *> const_node_stack;
typedef std::vector<basic_node<id_type> *> node_vector;
typedef ptr_vector<basic_node<id_type> > node_ptr_vector;
basic_node () :
_nullable (false),
_firstpos (),
_lastpos ()
{
}
basic_node (const bool nullable_) :
_nullable (nullable_),
_firstpos (),
_lastpos ()
{
}
virtual ~basic_node ()
{
}
static id_type null_token ()
{
return static_cast<id_type>(~0);
}
bool nullable () const
{
return _nullable;
}
void append_firstpos (node_vector &firstpos_) const
{
firstpos_.insert (firstpos_.end (),
_firstpos.begin (), _firstpos.end ());
}
void append_lastpos (node_vector &lastpos_) const
{
lastpos_.insert (lastpos_.end (),
_lastpos.begin (), _lastpos.end ());
}
virtual void append_followpos (const node_vector &/*followpos_*/)
{
throw runtime_error ("Internal error node::append_followpos().");
}
basic_node *copy (node_ptr_vector &node_ptr_vector_) const
{
basic_node *new_root_ = 0;
const_node_stack node_stack_;
bool_stack perform_op_stack_;
bool down_ = true;
node_stack new_node_stack_;
node_stack_.push (this);
while (!node_stack_.empty ())
{
while (down_)
{
down_ = node_stack_.top ()->traverse (node_stack_,
perform_op_stack_);
}
while (!down_ && !node_stack_.empty ())
{
const basic_node *top_ = node_stack_.top ();
top_->copy_node (node_ptr_vector_, new_node_stack_,
perform_op_stack_, down_);
if (!down_) node_stack_.pop ();
}
}
assert (new_node_stack_.size () == 1);
new_root_ = new_node_stack_.top ();
new_node_stack_.pop ();
return new_root_;
}
virtual node_type what_type () const = 0;
virtual bool traverse (const_node_stack &node_stack_,
bool_stack &perform_op_stack_) const = 0;
node_vector &firstpos ()
{
return _firstpos;
}
const node_vector &firstpos () const
{
return _firstpos;
}
// _lastpos modified externally, so not const &
node_vector &lastpos ()
{
return _lastpos;
}
virtual bool end_state () const
{
return false;
}
// Base-class default: always throws runtime_error. Virtual, so node types
// that carry an id can override it.
virtual id_type id () const
{
    throw runtime_error ("Internal error node::id().");
#if defined (__SUNPRO_CC)
    // Unreachable; only present to stop a bogus Solaris compiler warning.
    return id_type ();
#endif
}
// Base-class default: always throws runtime_error. Virtual, so node types
// that carry a user id can override it.
virtual id_type user_id () const
{
    throw runtime_error ("Internal error node::user_id().");
#if defined (__SUNPRO_CC)
    // Unreachable; only present to stop a bogus Solaris compiler warning.
    return id_type ();
#endif
}
// Base-class default: always throws runtime_error. Virtual, so node types
// that carry a next_dfa value can override it.
virtual id_type next_dfa () const
{
    throw runtime_error ("Internal error node::next_dfa().");
#if defined (__SUNPRO_CC)
    // Unreachable; only present to stop a bogus Solaris compiler warning.
    return id_type ();
#endif
}
// Base-class default: always throws runtime_error. Virtual, so node types
// that carry a push_dfa value can override it.
virtual id_type push_dfa () const
{
    throw runtime_error ("Internal error node::push_dfa().");
#if defined (__SUNPRO_CC)
    // Unreachable; only present to stop a bogus Solaris compiler warning.
    return id_type ();
#endif
}
// Base-class default: always throws runtime_error. Virtual, so node types
// that carry a pop_dfa flag can override it.
virtual bool pop_dfa () const
{
    throw runtime_error ("Internal error node::pop_dfa().");
#if defined (__SUNPRO_CC)
    // Unreachable; only present to stop a bogus Solaris compiler warning.
    return false;
#endif
}
// Base-class default: always throws runtime_error. Virtual, so node types
// that carry a token can override it.
virtual id_type token () const
{
    throw runtime_error ("Internal error node::token().");
#if defined (__SUNPRO_CC)
    // Unreachable; only present to stop a bogus Solaris compiler warning.
    return id_type ();
#endif
}
// Setter form of greedy. Base-class default: always throws runtime_error
// and ignores its argument. Virtual, so node types that support a greedy
// flag can override it.
virtual void greedy (const bool /*greedy_*/)
{
    throw runtime_error ("Internal error node::greedy(bool).");
}
// Getter form of greedy. Base-class default: always throws runtime_error.
// Virtual, so node types that support a greedy flag can override it.
virtual bool greedy () const
{
    throw runtime_error ("Internal error node::greedy().");
#if defined (__SUNPRO_CC)
    // Unreachable; only present to stop a bogus Solaris compiler warning.
    return false;
#endif
}
// Read-only followpos access. Base-class default: always throws
// runtime_error. Virtual, so node types that keep a followpos vector can
// override it.
virtual const node_vector &followpos () const
{
    throw runtime_error ("Internal error node::followpos().");
#ifdef __SUNPRO_CC
    // Unreachable; only present to stop a bogus Solaris compiler warning.
    // The dummy return must name the data member _firstpos: the bare name
    // firstpos refers to the overloaded member function and cannot be
    // bound to a node_vector reference.
    return _firstpos;
#endif
}
// Mutable followpos access. Base-class default: always throws
// runtime_error. Virtual, so node types that keep a followpos vector can
// override it.
virtual node_vector &followpos ()
{
    throw runtime_error ("Internal error node::followpos().");
#ifdef __SUNPRO_CC
    // Unreachable; only present to stop a bogus Solaris compiler warning.
    // The dummy return must name the data member _firstpos: the bare name
    // firstpos refers to the overloaded member function and cannot be
    // bound to a node_vector reference.
    return _firstpos;
#endif
}
protected:
// Nullable flag; fixed at construction and returned by nullable().
const bool _nullable;
// Non-owning node pointers; filled in by the derived class constructors and
// exposed through firstpos()/append_firstpos().
node_vector _firstpos;
// Non-owning node pointers; filled in by the derived class constructors and
// exposed through lastpos()/append_lastpos().
node_vector _lastpos;
// Copy hook invoked by copy() for the node on top of its work stack.
// Implementations consume the top of perform_op_stack_, may allocate a new
// node (stored in node_ptr_vector_ and left on new_node_stack_), and may
// set down_ to true to ask copy() to descend into this node first.
virtual void copy_node (node_ptr_vector &node_ptr_vector_,
node_stack &new_node_stack_, bool_stack &perform_op_stack_,
bool &down_) const = 0;
private:
// Copying is disabled by declaring these members private; no definition is
// visible in this file.
basic_node (const basic_node &); // No copy construction.
const basic_node &operator = (const basic_node &); // No assignment.
};
}
}
#endif

View File

@@ -0,0 +1,106 @@
// selection_node.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_SELECTION_NODE_HPP
#define LEXERTL_SELECTION_NODE_HPP
#include "node.hpp"
namespace lexertl
{
namespace detail
{
// Binary tree node of type node::SELECTION.
//
// The node is nullable when either child is nullable, and its
// firstpos/lastpos vectors are the left child's entries followed by the
// right child's entries.
// NOTE(review): presumably models the regex alternation "left|right" -
// confirm against the parser.
template<typename id_type>
class basic_selection_node : public basic_node<id_type>
{
public:
    typedef basic_node<id_type> node;
    typedef typename node::bool_stack bool_stack;
    typedef typename node::const_node_stack const_node_stack;
    typedef typename node::node_ptr_vector node_ptr_vector;
    typedef typename node::node_stack node_stack;
    typedef typename node::node_type node_type;

    // Builds the node from its two children. Neither pointer may be null:
    // both are dereferenced immediately. The children are not owned.
    basic_selection_node (basic_node<id_type> *left_,
        basic_node<id_type> *right_) :
        basic_node<id_type> (left_->nullable () || right_->nullable ()),
        _left (left_),
        _right (right_)
    {
        // firstpos: left entries, then right entries.
        _left->append_firstpos (this->_firstpos);
        _right->append_firstpos (this->_firstpos);

        // lastpos: left entries, then right entries.
        _left->append_lastpos (this->_lastpos);
        _right->append_lastpos (this->_lastpos);
    }

    virtual ~basic_selection_node ()
    {
        // Children are not owned, so there is nothing to delete.
    }

    // Identifies this node as a selection.
    virtual node_type what_type () const
    {
        return node::SELECTION;
    }

    // Traversal hook for node::copy(): schedules both children for a visit
    // (left on top, so it is processed first) and always asks the caller to
    // keep descending.
    virtual bool traverse (const_node_stack &node_stack_,
        bool_stack &perform_op_stack_) const
    {
        // Flag consumed by copy_node() once both children have been copied.
        perform_op_stack_.push (true);

        const node_type rhs_type_ = _right->what_type ();

        // A composite right child gets an extra 'false' flag: the first
        // time copy_node() reaches it, it requests a descent instead of
        // building a node.
        if (rhs_type_ == node::SEQUENCE || rhs_type_ == node::SELECTION ||
            rhs_type_ == node::ITERATION)
        {
            perform_op_stack_.push (false);
        }

        node_stack_.push (_right);
        node_stack_.push (_left);
        return true;
    }

private:
    // Not owner of these pointers...
    basic_node<id_type> *_left;
    basic_node<id_type> *_right;

    // Copy hook for node::copy(). When the top flag is true, the two most
    // recent copies on new_node_stack_ (right on top, left beneath) are
    // replaced by a new selection node joining them; otherwise down_ is set
    // so that copy() descends into this node first. The flag is popped in
    // both cases.
    virtual void copy_node (node_ptr_vector &node_ptr_vector_,
        node_stack &new_node_stack_, bool_stack &perform_op_stack_,
        bool &down_) const
    {
        if (!perform_op_stack_.top ())
        {
            down_ = true;
        }
        else
        {
            node *const rhs_copy_ = new_node_stack_.top ();

            new_node_stack_.pop ();

            node *const lhs_copy_ = new_node_stack_.top ();
            node *const empty_slot_ = 0;

            // Reserve the slot before allocating, so the new node always
            // has a place in node_ptr_vector_ as soon as it exists.
            node_ptr_vector_->push_back (empty_slot_);

            node *const joined_ =
                new basic_selection_node (lhs_copy_, rhs_copy_);

            node_ptr_vector_->back () = joined_;
            new_node_stack_.top () = joined_;
        }

        perform_op_stack_.pop ();
    }

    // No copy construction.
    basic_selection_node (const basic_selection_node &);
    // No assignment.
    const basic_selection_node &operator = (const basic_selection_node &);
};
}
}
#endif

View File

@@ -0,0 +1,126 @@
// sequence_node.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_SEQUENCE_NODE_HPP
#define LEXERTL_SEQUENCE_NODE_HPP
#include "node.hpp"
namespace lexertl
{
namespace detail
{
// Binary tree node of type node::SEQUENCE.
//
// The node is nullable only when both children are nullable. Its firstpos
// is the left child's firstpos, plus the right child's when the left child
// is nullable; its lastpos is the left child's lastpos when the right child
// is nullable, followed by the right child's lastpos. Construction also
// appends the right child's firstpos to the followpos of every node in the
// left child's lastpos.
// NOTE(review): presumably models the regex concatenation "left right" -
// confirm against the parser.
template<typename id_type>
class basic_sequence_node : public basic_node<id_type>
{
public:
    typedef basic_node<id_type> node;
    typedef typename node::bool_stack bool_stack;
    typedef typename node::const_node_stack const_node_stack;
    typedef typename node::node_ptr_vector node_ptr_vector;
    typedef typename node::node_stack node_stack;
    typedef typename node::node_type node_type;
    typedef typename node::node_vector node_vector;

    // Builds the node from its two children. Neither pointer may be null:
    // both are dereferenced immediately. The children are not owned.
    basic_sequence_node (basic_node<id_type> *left_,
        basic_node<id_type> *right_) :
        basic_node<id_type> (left_->nullable () && right_->nullable ()),
        _left (left_),
        _right (right_)
    {
        // firstpos: always the left entries; the right entries only when
        // the left child is nullable.
        _left->append_firstpos (this->_firstpos);

        if (_left->nullable ())
        {
            _right->append_firstpos (this->_firstpos);
        }

        // lastpos: the left entries only when the right child is nullable;
        // always the right entries.
        if (_right->nullable ())
        {
            _left->append_lastpos (this->_lastpos);
        }

        _right->append_lastpos (this->_lastpos);

        // Every node in the left child's lastpos gets the right child's
        // firstpos appended to its followpos.
        node_vector &left_last_ = _left->lastpos ();
        const node_vector &right_first_ = _right->firstpos ();
        typename node_vector::iterator iter_ = left_last_.begin ();
        typename node_vector::iterator end_ = left_last_.end ();

        while (iter_ != end_)
        {
            (*iter_)->append_followpos (right_first_);
            ++iter_;
        }
    }

    virtual ~basic_sequence_node ()
    {
        // Children are not owned, so there is nothing to delete.
    }

    // Identifies this node as a sequence.
    virtual node_type what_type () const
    {
        return node::SEQUENCE;
    }

    // Traversal hook for node::copy(): schedules both children for a visit
    // (left on top, so it is processed first) and always asks the caller to
    // keep descending.
    virtual bool traverse (const_node_stack &node_stack_,
        bool_stack &perform_op_stack_) const
    {
        // Flag consumed by copy_node() once both children have been copied.
        perform_op_stack_.push (true);

        const node_type rhs_type_ = _right->what_type ();

        // A composite right child gets an extra 'false' flag: the first
        // time copy_node() reaches it, it requests a descent instead of
        // building a node.
        if (rhs_type_ == node::SEQUENCE || rhs_type_ == node::SELECTION ||
            rhs_type_ == node::ITERATION)
        {
            perform_op_stack_.push (false);
        }

        node_stack_.push (_right);
        node_stack_.push (_left);
        return true;
    }

private:
    // Not owner of these pointers...
    basic_node<id_type> *_left;
    basic_node<id_type> *_right;

    // Copy hook for node::copy(). When the top flag is true, the two most
    // recent copies on new_node_stack_ (right on top, left beneath) are
    // replaced by a new sequence node joining them; otherwise down_ is set
    // so that copy() descends into this node first. The flag is popped in
    // both cases.
    virtual void copy_node (node_ptr_vector &node_ptr_vector_,
        node_stack &new_node_stack_, bool_stack &perform_op_stack_,
        bool &down_) const
    {
        if (!perform_op_stack_.top ())
        {
            down_ = true;
        }
        else
        {
            node *const rhs_copy_ = new_node_stack_.top ();

            new_node_stack_.pop ();

            node *const lhs_copy_ = new_node_stack_.top ();
            node *const empty_slot_ = 0;

            // Reserve the slot before allocating, so the new node always
            // has a place in node_ptr_vector_ as soon as it exists.
            node_ptr_vector_->push_back (empty_slot_);

            node *const joined_ =
                new basic_sequence_node<id_type> (lhs_copy_, rhs_copy_);

            node_ptr_vector_->back () = joined_;
            new_node_stack_.top () = joined_;
        }

        perform_op_stack_.pop ();
    }

    // No copy construction.
    basic_sequence_node (const basic_sequence_node &);
    // No assignment.
    const basic_sequence_node &operator = (const basic_sequence_node &);
};
}
}
#endif