scully/inc/lexertl/parser/tokeniser/re_tokeniser.hpp

// tokeniser.hpp
// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_RE_TOKENISER_HPP
#define LEXERTL_RE_TOKENISER_HPP

#include <cstring>
#include "re_token.hpp"
#include "../../runtime_error.hpp"
#include "../../size_t.hpp"
#include <sstream>
#include "../../string_token.hpp"
#include "re_tokeniser_helper.hpp"

namespace lexertl
{
namespace detail
{
template<typename rules_char_type, typename char_type, typename id_type>
class basic_re_tokeniser
{
public:
    typedef basic_re_token<rules_char_type, char_type> re_token;
    typedef basic_re_tokeniser_helper<rules_char_type, char_type, id_type>
        tokeniser_helper;
    typedef typename tokeniser_helper::char_state char_state;
    typedef typename tokeniser_helper::state state;
    typedef basic_string_token<char_type> string_token;

    // Reads the next token of the regex held in state_ into *token_.
    //
    // Parameters:
    //   lhs_   - token produced by the previous call. It is only forwarded
    //            to open_curly (), where the {-} and {+} charset operators
    //            use it as their left-hand operand.
    //   state_ - tokeniser state; its cursor, flags, flags stack, paren
    //            count and in-string marker are updated as input is read.
    //   token_ - output token. It is cleared first and then populated.
    //
    // Throws runtime_error when the regex ends inside a quoted string or an
    // open group, when a ')' has no matching '(', or when '/' (lookahead)
    // is encountered.
    static void next (re_token *lhs_, state &state_, re_token *token_)
    {
        rules_char_type ch_ = 0;
        bool eos_ = state_.next (ch_);
        bool skipped_ = false;

        token_->clear ();

        // Quotes, (?# ...) comments and (when skip_ws is set) whitespace or
        // C style comments may follow one another, so repeat until a pass
        // skips nothing.
        do
        {
            // string begin/end
            while (!eos_ && ch_ == '"')
            {
                state_._in_string ^= 1;
                eos_ = state_.next (ch_);
            }

            // (?# ...)
            skipped_ = comment (eos_, ch_, state_);
            // skip_ws set
            skipped_ |= skip (eos_, ch_, state_);
        } while (skipped_);

        if (eos_)
        {
            if (state_._in_string)
            {
                std::ostringstream ss_;

                // Pointless returning index if at end of string
                ss_ << "Unexpected end of regex (missing '\"') in rule id " <<
                    state_._id << '.';
                throw runtime_error (ss_.str ());
            }

            if (state_._paren_count)
            {
                std::ostringstream ss_;

                // Pointless returning index if at end of string
                ss_ << "Unexpected end of regex (missing ')') in rule id " <<
                    state_._id << '.';
                throw runtime_error (ss_.str ());
            }

            token_->_type = END;
            return;
        }

        if (ch_ == '\\')
        {
            // Even if we are in a string, respect escape sequences...
            token_->_type = CHARSET;
            escape (state_, token_->_str);
            return;
        }

        // Inside a string all meta characters lose their special meaning,
        // so every character there is a literal. Outside a string the
        // switch below decides: anything it does not recognise as a meta
        // character is a literal too.
        bool literal_ = state_._in_string;

        if (!literal_)
        {
            switch (ch_)
            {
                case '(':
                    token_->_type = OPENPAREN;
                    ++state_._paren_count;
                    read_options (state_);
                    break;
                case ')':
                    --state_._paren_count;

                    // NOTE(review): this check can only fire if
                    // _paren_count is a signed type - confirm against the
                    // state declaration.
                    if (state_._paren_count < 0)
                    {
                        std::ostringstream ss_;

                        ss_ << "Number of open parenthesis < 0 "
                            "at index " << state_.index () - 1 <<
                            " in rule id " << state_._id << '.';
                        throw runtime_error (ss_.str ());
                    }

                    token_->_type = CLOSEPAREN;

                    // Restore the flags that were in force before the
                    // group being closed was opened.
                    if (!state_._flags_stack.empty ())
                    {
                        state_._flags = state_._flags_stack.top ();
                        state_._flags_stack.pop ();
                    }

                    break;
                case '?':
                    // A trailing '?' selects the non-greedy variant.
                    if (!state_.eos () && *state_._curr == '?')
                    {
                        token_->_type = AOPT;
                        state_.increment ();
                    }
                    else
                    {
                        token_->_type = OPT;
                    }

                    break;
                case '*':
                    if (!state_.eos () && *state_._curr == '?')
                    {
                        token_->_type = AZEROORMORE;
                        state_.increment ();
                    }
                    else
                    {
                        token_->_type = ZEROORMORE;
                    }

                    break;
                case '+':
                    if (!state_.eos () && *state_._curr == '?')
                    {
                        token_->_type = AONEORMORE;
                        state_.increment ();
                    }
                    else
                    {
                        token_->_type = ONEORMORE;
                    }

                    break;
                case '{':
                    open_curly (lhs_, state_, token_);
                    break;
                case '|':
                    token_->_type = OR;
                    break;
                case '^':
                    // Only an anchor as the very first character of a
                    // non-macro regex; otherwise a plain character.
                    if (!state_._macro && state_._curr - 1 == state_._start)
                    {
                        token_->_type = BOL;
                    }
                    else
                    {
                        token_->_type = CHARSET;
                        token_->_str.insert (typename string_token::range
                            (ch_, ch_));
                    }

                    break;
                case '$':
                    // Only an anchor as the very last character of a
                    // non-macro regex; otherwise a plain character.
                    if (!state_._macro && state_._curr == state_._end)
                    {
                        token_->_type = EOL;
                    }
                    else
                    {
                        token_->_type = CHARSET;
                        token_->_str.insert (typename string_token::range
                            (ch_, ch_));
                    }

                    break;
                case '.':
                {
                    token_->_type = CHARSET;

                    // Built as the negation of either the empty set or
                    // the set containing just '\n'.
                    if (state_._flags & dot_not_newline)
                    {
                        token_->_str.insert (typename string_token::range
                            ('\n', '\n'));
                    }

                    token_->_str.negate ();
                    break;
                }
                case '[':
                {
                    token_->_type = CHARSET;
                    tokeniser_helper::charset (state_, token_->_str);
                    break;
                }
                case '/':
                {
                    std::ostringstream ss_;

                    ss_ << "Lookahead ('/') is not supported yet in " <<
                        "rule id " << state_._id << '.';
                    throw runtime_error (ss_.str ());
                    break;
                }
                default:
                    literal_ = true;
                    break;
            }
        }

        if (literal_)
        {
            token_->_type = CHARSET;

            // Case-insensitive matching applies to literal characters
            // whether or not they appear inside a quoted string.
            if ((state_._flags & icase) &&
                (std::isupper (ch_, state_._locale) ||
                std::islower (ch_, state_._locale)))
            {
                char_type upper_ = std::toupper
                    (ch_, state_._locale);
                char_type lower_ = std::tolower
                    (ch_, state_._locale);

                token_->_str.insert (typename string_token::range
                    (upper_, upper_));
                token_->_str.insert (typename string_token::range
                    (lower_, lower_));
            }
            else
            {
                token_->_str.insert (typename string_token::range
                    (ch_, ch_));
            }
        }
    }

private:
    // Skips a "(?# ... )" regex comment if the input is positioned on one.
    //
    // On entry ch_ holds the character most recently read and eos_ says
    // whether the input was already exhausted. When a comment is consumed
    // both are refreshed with the result of reading past its closing ')'.
    //
    // Returns true if a comment was skipped; otherwise returns false and
    // consumes nothing. Throws runtime_error if the comment is not closed
    // before the end of the regex.
    static bool comment (bool &eos_, rules_char_type &ch_, state &state_)
    {
        // Only "(?#" found outside of a quoted string opens a comment.
        if (eos_ || state_._in_string || ch_ != '(')
        {
            return false;
        }

        if (state_.eos () || *state_._curr != '?')
        {
            return false;
        }

        if (!(state_._curr + 1 < state_._end) ||
            *(state_._curr + 1) != '#')
        {
            return false;
        }

        // Parentheses inside the comment are balanced, so the comment
        // ends at the ')' that brings the depth back to zero.
        std::size_t depth_ = 1;

        // Step over "?#".
        state_.increment ();
        state_.increment ();

        for (;;)
        {
            eos_ = state_.next (ch_);

            if (ch_ == '(')
            {
                ++depth_;
            }
            else if (ch_ == ')')
            {
                --depth_;
            }

            if (eos_ || (ch_ == ')' && depth_ == 0))
            {
                break;
            }
        }

        if (eos_)
        {
            std::ostringstream msg_;

            // No point reporting an index at the end of the input.
            msg_ << "Unexpected end of regex (unterminated comment) " <<
                "in rule id " << state_._id << '.';
            throw runtime_error (msg_.str ());
        }

        // Hand the character following the comment back to the caller.
        eos_ = state_.next (ch_);
        return true;
    }

    // Skips whitespace and C style comments. This only happens when the
    // skip_ws flag is set and the tokeniser is not inside a quoted string.
    //
    // ch_ and eos_ describe the character most recently read and are
    // updated to the first character that is not skipped.
    //
    // Returns true if anything was skipped. Throws runtime_error if a
    // C style comment is still open at the end of the regex.
    static bool skip (bool &eos_, rules_char_type &ch_, state &state_)
    {
        if (eos_ || !(state_._flags & skip_ws) || state_._in_string)
        {
            return false;
        }

        bool skipped_ = false;

        for (;;)
        {
            const bool opens_comment_ = ch_ == '/' && !state_.eos () &&
                *state_._curr == '*';

            if (opens_comment_)
            {
                // Step over the '*' of the opening "/*".
                state_.increment ();
                eos_ = state_.next (ch_);

                // Read until ch_ is a '*' directly followed by '/'.
                while (!eos_ && !(ch_ == '*' && !state_.eos () &&
                    *state_._curr == '/'))
                {
                    eos_ = state_.next (ch_);
                }

                if (eos_)
                {
                    std::ostringstream msg_;

                    // No point reporting an index at the end of the input.
                    msg_ << "Unexpected end of regex (unterminated " <<
                        "C style comment) in rule id " <<
                        state_._id << '.';
                    throw runtime_error (msg_.str ());
                }

                // Step over the '/' of the closing "*/".
                state_.increment ();
                eos_ = state_.next (ch_);
            }
            else if (ch_ == ' ' || ch_ == '\t' || ch_ == '\n' ||
                ch_ == '\r' || ch_ == '\f' || ch_ == '\v')
            {
                eos_ = state_.next (ch_);
            }
            else
            {
                // Neither a comment nor whitespace: nothing more to skip.
                break;
            }

            skipped_ = true;
        }

        return skipped_;
    }

    // Handles what follows a '(' that has just been read.
    //
    // If the next character is '?', the option letters up to ':' are
    // parsed. The current flags are pushed on the flags stack first, so
    // that the matching ')' can restore them, and state_._flags is then
    // updated:
    //   i - case insensitive        (-i turns it off)
    //   s - '.' also matches '\n'   (-s turns that off)
    //   x - skip whitespace         (-x turns it off)
    // A '-' toggles negation for the next option letter only.
    //
    // If no '?' follows, the flags are pushed only when the stack is
    // already non-empty, which keeps the pop performed for each ')' paired
    // up while inside an option group.
    //
    // Throws runtime_error on an unrecognised option character.
    static void read_options (state &state_)
    {
        if (state_.eos () || *state_._curr != '?')
        {
            if (!state_._flags_stack.empty ())
            {
                state_._flags_stack.push (state_._flags);
            }

            return;
        }

        rules_char_type ch_ = 0;
        bool negate_ = false;

        // Step over the '?'.
        state_.increment ();

        bool eos_ = state_.next (ch_);

        state_._flags_stack.push (state_._flags);

        // Running out of input here is reported later by the end of
        // string handling in next ().
        for (; !eos_ && ch_ != ':'; eos_ = state_.next (ch_))
        {
            if (ch_ == '-')
            {
                negate_ = !negate_;
                continue;
            }

            switch (ch_)
            {
                case 'i':
                    if (negate_)
                    {
                        state_._flags &= ~icase;
                    }
                    else
                    {
                        state_._flags |= icase;
                    }

                    break;
                case 's':
                    // The stored flag has the opposite sense to 's'.
                    if (negate_)
                    {
                        state_._flags |= dot_not_newline;
                    }
                    else
                    {
                        state_._flags &= ~dot_not_newline;
                    }

                    break;
                case 'x':
                    if (negate_)
                    {
                        state_._flags &= ~skip_ws;
                    }
                    else
                    {
                        state_._flags |= skip_ws;
                    }

                    break;
                default:
                {
                    std::ostringstream msg_;

                    msg_ << "Unknown option at index " <<
                        state_.index () - 1 << " in rule id " <<
                        state_._id << '.';
                    throw runtime_error (msg_.str ());
                }
            }

            // Negation applies to a single option letter.
            negate_ = false;
        }
    }

    // Decodes the escape sequence following a '\' and adds the characters
    // it stands for to token_.
    //
    // tokeniser_helper::escape_sequence () either yields a single character
    // or returns a non-null string of length len_.
    // NOTE(review): the string form presumably holds a bracket expression
    // (its first character is skipped before it is handed to charset ()) -
    // confirm against the helper.
    static void escape (state &state_, string_token &token_)
    {
        char_type single_ = 0;
        std::size_t len_ = 0;
        const char *set_ = tokeniser_helper::escape_sequence (state_,
            single_, len_);

        if (!set_)
        {
            // Plain single character escape.
            token_.insert (typename string_token::range (single_, single_));
            return;
        }

        // Parse the returned string with a temporary state that shares
        // the id, flags and locale of the real one.
        char_state set_state_ (set_ + 1, set_ + len_, state_._id,
            state_._flags, state_._locale, false);

        tokeniser_helper::charset (set_state_, token_);
    }

    // Dispatches on the character following a '{' (which has already been
    // consumed; the following character is only peeked at here):
    //   '-'    - charset difference operator {-}
    //   '+'    - charset union operator {+}
    //   digit  - repeat specification {n[,[n]]}
    //   other  - macro reference {NAME}
    //
    // Throws runtime_error if the regex ends straight after the '{'.
    static void open_curly (re_token *lhs_, state &state_,
        re_token *token_)
    {
        if (state_.eos ())
        {
            std::ostringstream msg_;

            // No point reporting an index at the end of the input.
            msg_ << "Unexpected end of regex (missing '}') in rule id " <<
                state_._id << '.';
            throw runtime_error (msg_.str ());
        }

        switch (*state_._curr)
        {
            case '-':
                charset_difference (lhs_, state_, token_);
                break;
            case '+':
                charset_union (lhs_, state_, token_);
                break;
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                repeat_n (state_, token_);
                break;
            default:
                macro (state_, token_);
                break;
        }
    }

    // Implements the {-} operator: removes the characters of the charset
    // that follows the operator from the charset in *lhs_.
    //
    // Called with the cursor on the '-' directly after '{'. Once the
    // operator has been applied, the token following the right-hand
    // charset is read into *token_.
    //
    // Throws runtime_error if either operand is not a CHARSET, if the
    // operator is not closed with '}', or if the result is empty.
    static void charset_difference (re_token *lhs_, state &state_,
        re_token *token_)
    {
        if (lhs_->_type != CHARSET)
        {
            std::ostringstream msg_;

            msg_ << "CHARSET must precede {-} at index " <<
                state_.index () - 1 << " in rule id " << state_._id << '.';
            throw runtime_error (msg_.str ());
        }

        rules_char_type ch_ = 0;

        // Consume the '-' ...
        state_.next (ch_);

        // ... and then whatever follows it, which has to be '}'.
        const bool eos_ = state_.next (ch_);

        if (eos_)
        {
            std::ostringstream msg_;

            // No point reporting an index at the end of the input.
            msg_ << "Unexpected end of regex (missing '}') in rule id " <<
                state_._id << '.';
            throw runtime_error (msg_.str ());
        }

        if (ch_ != '}')
        {
            std::ostringstream msg_;

            msg_ << "Missing '}' at index " << state_.index () - 1 <<
                " in rule id " << state_._id << '.';
            throw runtime_error (msg_.str ());
        }

        // Right-hand operand.
        re_token operand_;

        next (lhs_, state_, &operand_);

        if (operand_._type != CHARSET)
        {
            std::ostringstream msg_;

            msg_ << "CHARSET must follow {-} at index " <<
                state_.index () - 1 << " in rule id " << state_._id << '.';
            throw runtime_error (msg_.str ());
        }

        lhs_->_str.remove (operand_._str);

        if (lhs_->_str.empty ())
        {
            std::ostringstream msg_;

            msg_ << "Empty charset created by {-} at index " <<
                state_.index () - 1 << " in rule id " << state_._id << '.';
            throw runtime_error (msg_.str ());
        }

        // The operator itself yields no token; fetch the one after it.
        next (lhs_, state_, token_);
    }

    // Implements the {+} operator: adds the characters of the charset that
    // follows the operator to the charset in *lhs_.
    //
    // Called with the cursor on the '+' directly after '{'. Once the
    // operator has been applied, the token following the right-hand
    // charset is read into *token_.
    //
    // Throws runtime_error if either operand is not a CHARSET or if the
    // operator is not closed with '}'.
    static void charset_union (re_token *lhs_, state &state_,
        re_token *token_)
    {
        if (lhs_->_type != CHARSET)
        {
            std::ostringstream msg_;

            msg_ << "CHARSET must precede {+} at index " <<
                state_.index () - 1 << " in rule id " << state_._id << '.';
            throw runtime_error (msg_.str ());
        }

        rules_char_type ch_ = 0;

        // Consume the '+' ...
        state_.next (ch_);

        // ... and then whatever follows it, which has to be '}'.
        const bool eos_ = state_.next (ch_);

        if (eos_)
        {
            std::ostringstream msg_;

            // No point reporting an index at the end of the input.
            msg_ << "Unexpected end of regex (missing '}') in rule id " <<
                state_._id << '.';
            throw runtime_error (msg_.str ());
        }

        if (ch_ != '}')
        {
            std::ostringstream msg_;

            msg_ << "Missing '}' at index " << state_.index () - 1 <<
                " in rule id " << state_._id << '.';
            throw runtime_error (msg_.str ());
        }

        // Right-hand operand.
        re_token operand_;

        next (lhs_, state_, &operand_);

        if (operand_._type != CHARSET)
        {
            std::ostringstream msg_;

            msg_ << "CHARSET must follow {+} at index " <<
                state_.index () - 1 << " in rule id " << state_._id << '.';
            throw runtime_error (msg_.str ());
        }

        lhs_->_str.insert (operand_._str);

        // The operator itself yields no token; fetch the one after it.
        next (lhs_, state_, token_);
    }

    // Parses a repeat specification. The '{' has already been consumed and
    // the cursor is on the first digit.
    //
    // SYNTAX:
    //   {n[,[n]]}
    // SEMANTIC RULES:
    //   {0} - INVALID (throw exception)
    //   {0,} = *
    //   {0,0} - INVALID (throw exception)
    //   {0,1} = ?
    //   {1,} = +
    //   {min,max} where min == max - {min}
    //   {min,max} where max < min - INVALID (throw exception)
    //
    // The text of the specification is accumulated in token_->_extra and
    // token_->_type is set to the matching quantifier. A '?' directly
    // after the '}' is consumed and selects the non-greedy token type.
    //
    // Throws runtime_error on premature end of input, a missing '}' or a
    // violation of the semantic rules above.
    static void repeat_n (state &state_, re_token *token_)
    {
        rules_char_type ch_ = 0;
        bool eos_ = state_.next (ch_);
        std::size_t min_ = 0;
        std::size_t max_ = 0;
        // True while an explicit upper bound differing from min_ is held.
        bool has_max_ = false;
        // Cleared once the specification has been reduced to *, + or ?.
        bool counted_ = true;

        // Lower bound.
        for (; !eos_ && ch_ >= '0' && ch_ <= '9'; eos_ = state_.next (ch_))
        {
            min_ = min_ * 10 + (ch_ - '0');
            token_->_extra += ch_;
        }

        if (eos_)
        {
            std::ostringstream msg_;

            // No point reporting an index at the end of the input.
            msg_ << "Unexpected end of regex (missing '}') in rule id " <<
                state_._id << '.';
            throw runtime_error (msg_.str ());
        }

        if (ch_ == ',')
        {
            token_->_extra += ch_;
            eos_ = state_.next (ch_);

            if (eos_)
            {
                std::ostringstream msg_;

                // No point reporting an index at the end of the input.
                msg_ << "Unexpected end of regex (missing '}') in rule id " <<
                    state_._id << '.';
                throw runtime_error (msg_.str ());
            }

            if (ch_ == '}')
            {
                // Open ended: {0,} and {1,} have dedicated token types.
                switch (min_)
                {
                    case 0:
                        token_->_type = ZEROORMORE;
                        counted_ = false;
                        break;
                    case 1:
                        token_->_type = ONEORMORE;
                        counted_ = false;
                        break;
                    default:
                        break;
                }
            }
            else
            {
                if (!(ch_ >= '0' && ch_ <= '9'))
                {
                    std::ostringstream msg_;

                    msg_ << "Missing '}' at index " << state_.index () - 1 <<
                        " in rule id " << state_._id << '.';
                    throw runtime_error (msg_.str ());
                }

                has_max_ = true;

                // Upper bound; ch_ is known to be a digit on entry.
                while (!eos_ && ch_ >= '0' && ch_ <= '9')
                {
                    max_ = max_ * 10 + (ch_ - '0');
                    token_->_extra += ch_;
                    eos_ = state_.next (ch_);
                }

                if (eos_)
                {
                    std::ostringstream msg_;

                    // No point reporting an index at the end of the input.
                    msg_ << "Unexpected end of regex (missing '}') "
                        "in rule id " << state_._id << '.';
                    throw runtime_error (msg_.str ());
                }

                if (min_ == 0 && max_ == 1)
                {
                    // {0,1} is the same as '?'.
                    token_->_type = OPT;
                    counted_ = false;
                }
                else if (min_ == max_)
                {
                    // {n,n} is the same as {n}: drop the ",n" part.
                    token_->_extra.erase (token_->_extra.find (','));
                    has_max_ = false;
                    max_ = 0;
                }
            }
        }

        if (ch_ != '}')
        {
            std::ostringstream msg_;

            msg_ << "Missing '}' at index " << state_.index () - 1 <<
                " in rule id " << state_._id << '.';
            throw runtime_error (msg_.str ());
        }

        if (counted_)
        {
            // {0,} has already become '*', so a zero lower bound with no
            // upper bound can only be {0} or {0,0} here.
            if (min_ == 0 && max_ == 0)
            {
                std::ostringstream msg_;

                msg_ << "Cannot have exactly zero repeats preceding index " <<
                    state_.index () << " in rule id " << state_._id << '.';
                throw runtime_error (msg_.str ());
            }

            if (has_max_ && max_ < min_)
            {
                std::ostringstream msg_;

                msg_ << "Max less than min preceding index " <<
                    state_.index () << " in rule id " << state_._id << '.';
                throw runtime_error (msg_.str ());
            }

            token_->_type = REPEATN;
        }

        // At this point the type is one of REPEATN, ZEROORMORE, ONEORMORE
        // or OPT. A following '?' switches it to the non-greedy form.
        if (!state_.eos () && *state_._curr == '?')
        {
            if (token_->_type == REPEATN)
            {
                token_->_type = AREPEATN;
            }
            else if (token_->_type == ZEROORMORE)
            {
                token_->_type = AZEROORMORE;
            }
            else if (token_->_type == ONEORMORE)
            {
                token_->_type = AONEORMORE;
            }
            else if (token_->_type == OPT)
            {
                token_->_type = AOPT;
            }

            state_.increment ();
        }
    }

    // Parses a macro reference. The '{' has already been consumed and the
    // cursor is on the first character of the name.
    //
    // A name starts with '_' or an ASCII letter and continues with '_',
    // '-', ASCII letters or digits. It is stored in token_->_extra and
    // token_->_type is set to MACRO.
    //
    // Throws runtime_error on an invalid first character, on premature end
    // of input, or if the name is not followed by '}'.
    static void macro (state &state_, re_token *token_)
    {
        rules_char_type ch_ = 0;

        state_.next (ch_);

        const bool valid_start_ = ch_ == '_' ||
            (ch_ >= 'A' && ch_ <= 'Z') || (ch_ >= 'a' && ch_ <= 'z');

        if (!valid_start_)
        {
            std::ostringstream msg_;

            msg_ << "Invalid MACRO name at index " << state_.index () - 1 <<
                " in rule id " << state_._id << '.';
            throw runtime_error (msg_.str ());
        }

        for (;;)
        {
            token_->_extra += ch_;

            if (state_.next (ch_))
            {
                std::ostringstream msg_;

                // No point reporting an index at the end of the input.
                msg_ << "Unexpected end of regex " <<
                    "(missing '}') in rule id " << state_._id << '.';
                throw runtime_error (msg_.str ());
            }

            const bool name_char_ = ch_ == '_' || ch_ == '-' ||
                (ch_ >= 'A' && ch_ <= 'Z') || (ch_ >= 'a' && ch_ <= 'z') ||
                (ch_ >= '0' && ch_ <= '9');

            if (!name_char_)
            {
                break;
            }
        }

        if (ch_ != '}')
        {
            std::ostringstream msg_;

            msg_ << "Missing '}' at index " << state_.index () - 1 <<
                " in rule id " << state_._id << '.';
            throw runtime_error (msg_.str ());
        }

        token_->_type = MACRO;
    }
};
}
}

#endif
Fix typo in grammar. Add lexertl. 2013-05-31 21:35:44 +02:00			`// tokeniser.hpp`
			`// Copyright (c) 2005-2012 Ben Hanson (http://www.benhanson.net/)`
			`//`
			`// Distributed under the Boost Software License, Version 1.0. (See accompanying`
			`// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)`
			`#ifndef LEXERTL_RE_TOKENISER_HPP`
			`#define LEXERTL_RE_TOKENISER_HPP`

			`#include <cstring>`
			`#include "re_token.hpp"`
			`#include "../../runtime_error.hpp"`
			`#include "../../size_t.hpp"`
			`#include <sstream>`
			`#include "../../string_token.hpp"`
			`#include "re_tokeniser_helper.hpp"`

			`namespace lexertl`
			`{`
			`namespace detail`
			`{`
			`template<typename rules_char_type, typename char_type, typename id_type>`
			`class basic_re_tokeniser`
			`{`
			`public:`
			`typedef basic_re_token<rules_char_type, char_type> re_token;`
			`typedef basic_re_tokeniser_helper<rules_char_type, char_type, id_type>`
			`tokeniser_helper;`
			`typedef typename tokeniser_helper::char_state char_state;`
			`typedef typename tokeniser_helper::state state;`
			`typedef basic_string_token<char_type> string_token;`

			`static void next (re_token lhs_, state &state_, re_token token_)`
			`{`
			`rules_char_type ch_ = 0;`
			`bool eos_ = state_.next (ch_);`
			`bool skipped_ = false;`

			`token_->clear ();`

			`do`
			`{`
			`// string begin/end`
			`while (!eos_ && ch_ == '"')`
			`{`
			`state_._in_string ^= 1;`
			`eos_ = state_.next (ch_);`
			`}`

			`// (?# ...)`
			`skipped_ = comment (eos_, ch_, state_);`
			`// skip_ws set`
			`skipped_ \|= skip (eos_, ch_, state_);`
			`} while (skipped_);`

			`if (eos_)`
			`{`
			`if (state_._in_string)`
			`{`
			`std::ostringstream ss_;`

			`// Pointless returning index if at end of string`
			`ss_ << "Unexpected end of regex (missing '\"') in rule id " <<`
			`state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`if (state_._paren_count)`
			`{`
			`std::ostringstream ss_;`

			`// Pointless returning index if at end of string`
			`ss_ << "Unexpected end of regex (missing ')') in rule id " <<`
			`state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`token_->_type = END;`
			`}`
			`else`
			`{`
			`if (ch_ == '\\')`
			`{`
			`// Even if we are in a string, respect escape sequences...`
			`token_->_type = CHARSET;`
			`escape (state_, token_->_str);`
			`}`
			`else if (state_._in_string)`
			`{`
			`// All other meta characters lose their special meaning`
			`// inside a string.`
			`token_->_type = CHARSET;`
			`token_->_str.insert (typename string_token::range (ch_, ch_));`
			`}`
			`else`
			`{`
			`// Not an escape sequence and not inside a string, so`
			`// check for meta characters.`
			`switch (ch_)`
			`{`
			`case '(':`
			`token_->_type = OPENPAREN;`
			`++state_._paren_count;`
			`read_options (state_);`
			`break;`
			`case ')':`
			`--state_._paren_count;`

			`if (state_._paren_count < 0)`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "Number of open parenthesis < 0 "`
			`"at index " << state_.index () - 1 <<`
			`" in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`token_->_type = CLOSEPAREN;`

			`if (!state_._flags_stack.empty ())`
			`{`
			`state_._flags = state_._flags_stack.top ();`
			`state_._flags_stack.pop ();`
			`}`

			`break;`
			`case '?':`
			`if (!state_.eos () && *state_._curr == '?')`
			`{`
			`token_->_type = AOPT;`
			`state_.increment ();`
			`}`
			`else`
			`{`
			`token_->_type = OPT;`
			`}`

			`break;`
			`case '*':`
			`if (!state_.eos () && *state_._curr == '?')`
			`{`
			`token_->_type = AZEROORMORE;`
			`state_.increment ();`
			`}`
			`else`
			`{`
			`token_->_type = ZEROORMORE;`
			`}`

			`break;`
			`case '+':`
			`if (!state_.eos () && *state_._curr == '?')`
			`{`
			`token_->_type = AONEORMORE;`
			`state_.increment ();`
			`}`
			`else`
			`{`
			`token_->_type = ONEORMORE;`
			`}`

			`break;`
			`case '{':`
			`open_curly (lhs_, state_, token_);`
			`break;`
			`case '\|':`
			`token_->_type = OR;`
			`break;`
			`case '^':`
			`if (!state_._macro && state_._curr - 1 == state_._start)`
			`{`
			`token_->_type = BOL;`
			`}`
			`else`
			`{`
			`token_->_type = CHARSET;`
			`token_->_str.insert (typename string_token::range`
			`(ch_, ch_));`
			`}`

			`break;`
			`case '$':`
			`if (!state_._macro && state_._curr == state_._end)`
			`{`
			`token_->_type = EOL;`
			`}`
			`else`
			`{`
			`token_->_type = CHARSET;`
			`token_->_str.insert (typename string_token::range`
			`(ch_, ch_));`
			`}`

			`break;`
			`case '.':`
			`{`
			`token_->_type = CHARSET;`

			`if (state_._flags & dot_not_newline)`
			`{`
			`token_->_str.insert (typename string_token::range`
			`('\n', '\n'));`
			`}`

			`token_->_str.negate ();`
			`break;`
			`}`
			`case '[':`
			`{`
			`token_->_type = CHARSET;`
			`tokeniser_helper::charset (state_, token_->_str);`
			`break;`
			`}`
			`case '/':`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "Lookahead ('/') is not supported yet in " <<`
			`"rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`break;`
			`}`
			`default:`
			`token_->_type = CHARSET;`

			`if ((state_._flags & icase) &&`
			`(std::isupper (ch_, state_._locale) \|\|`
			`std::islower (ch_, state_._locale)))`
			`{`
			`char_type upper_ = std::toupper`
			`(ch_, state_._locale);`
			`char_type lower_ = std::tolower`
			`(ch_, state_._locale);`

			`token_->_str.insert (typename string_token::range`
			`(upper_, upper_));`
			`token_->_str.insert (typename string_token::range`
			`(lower_, lower_));`
			`}`
			`else`
			`{`
			`token_->_str.insert (typename string_token::range`
			`(ch_, ch_));`
			`}`

			`break;`
			`}`
			`}`
			`}`
			`}`

			`private:`
			`static bool comment (bool &eos_, rules_char_type &ch_, state &state_)`
			`{`
			`bool skipped_ = false;`

			`if (!eos_ && !state_._in_string && ch_ == '(' &&`
			`!state_.eos () && *state_._curr == '?' &&`
			`state_._curr + 1 < state_._end && *(state_._curr + 1) == '#')`
			`{`
			`std::size_t paren_count_ = 1;`

			`state_.increment ();`
			`state_.increment ();`

			`do`
			`{`
			`eos_ = state_.next (ch_);`

			`if (ch_ == '(')`
			`{`
			`++paren_count_;`
			`}`
			`else if (ch_ == ')')`
			`{`
			`--paren_count_;`
			`}`
			`} while (!eos_ && !(ch_ == ')' && paren_count_ == 0));`

			`if (eos_)`
			`{`
			`std::ostringstream ss_;`

			`// Pointless returning index if at end of string`
			`ss_ << "Unexpected end of regex (unterminated comment) " <<`
			`"in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`
			`else`
			`{`
			`eos_ = state_.next (ch_);`
			`}`

			`skipped_ = true;`
			`}`

			`return skipped_;`
			`}`

			`static bool skip (bool &eos_, rules_char_type &ch_, state &state_)`
			`{`
			`bool skipped_ = false;`

			`if (!eos_ && (state_._flags & skip_ws) && !state_._in_string)`
			`{`
			`bool c_comment_ = false;`
			`bool skip_ws_ = false;`

			`do`
			`{`
			`c_comment_ = ch_ == '/' && !state_.eos () &&`
			`state_._curr == '';`
			`skip_ws_ = !c_comment_ && (ch_ == ' ' \|\| ch_ == '\t' \|\|`
			`ch_ == '\n' \|\| ch_ == '\r' \|\| ch_ == '\f' \|\| ch_ == '\v');`

			`if (c_comment_)`
			`{`
			`state_.increment ();`
			`eos_ = state_.next (ch_);`

			`while (!eos_ && !(ch_ == '*' && !state_.eos () &&`
			`*state_._curr == '/'))`
			`{`
			`eos_ = state_.next (ch_);`
			`}`

			`if (eos_)`
			`{`
			`std::ostringstream ss_;`

			`// Pointless returning index if at end of string`
			`ss_ << "Unexpected end of regex (unterminated " <<`
			`"C style comment) in rule id " <<`
			`state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`
			`else`
			`{`
			`state_.increment ();`
			`eos_ = state_.next (ch_);`
			`}`

			`skipped_ = true;`
			`}`
			`else if (skip_ws_)`
			`{`
			`eos_ = state_.next (ch_);`
			`skipped_ = true;`
			`}`
			`} while (c_comment_ \|\| skip_ws_);`
			`}`

			`return skipped_;`
			`}`

			`static void read_options (state &state_)`
			`{`
			`if (!state_.eos () && *state_._curr == '?')`
			`{`
			`rules_char_type ch_ = 0;`
			`bool eos_ = false;`
			`bool negate_ = false;`

			`state_.increment ();`
			`eos_ = state_.next (ch_);`
			`state_._flags_stack.push (state_._flags);`

			`while (!eos_ && ch_ != ':')`
			`{`
			`switch (ch_)`
			`{`
			`case '-':`
			`negate_ ^= 1;`
			`break;`
			`case 'i':`
			`if (negate_)`
			`{`
			`state_._flags = state_._flags & ~icase;`
			`}`
			`else`
			`{`
			`state_._flags = state_._flags \| icase;`
			`}`

			`negate_ = false;`
			`break;`
			`case 's':`
			`if (negate_)`
			`{`
			`state_._flags = state_._flags \| dot_not_newline;`
			`}`
			`else`
			`{`
			`state_._flags = state_._flags & ~dot_not_newline;`
			`}`

			`negate_ = false;`
			`break;`
			`case 'x':`
			`if (negate_)`
			`{`
			`state_._flags = state_._flags & ~skip_ws;`
			`}`
			`else`
			`{`
			`state_._flags = state_._flags \| skip_ws;`
			`}`

			`negate_ = false;`
			`break;`
			`default:`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "Unknown option at index " <<`
			`state_.index () - 1 << " in rule id " <<`
			`state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`
			`}`

			`eos_ = state_.next (ch_);`
			`}`

			`// End of string handler will handle early termination`
			`}`
			`else if (!state_._flags_stack.empty ())`
			`{`
			`state_._flags_stack.push (state_._flags);`
			`}`
			`}`

			// Adds the characters denoted by an escape sequence to token_.
			//
			// tokeniser_helper::escape_sequence() either yields a single
			// character (null pointer returned) or a pointer/length pair
			// describing a charset string. In the latter case the text from the
			// second character onwards is parsed by tokeniser_helper::charset()
			// (presumably the first character is the opening '[' - confirm
			// against escape_sequence).
			static void escape (state &state_, string_token &token_)
			{
			    char_type single_ = 0;
			    std::size_t set_len_ = 0;
			    const char *set_ = tokeniser_helper::escape_sequence (state_,
			        single_, set_len_);

			    if (!set_)
			    {
			        // Plain escaped character: insert it as a one element range.
			        token_.insert (typename string_token::range (single_, single_));
			        return;
			    }

			    char_state set_state_ (set_ + 1, set_ + set_len_, state_._id,
			        state_._flags, state_._locale, false);

			    tokeniser_helper::charset (set_state_, token_);
			}

			// Dispatches on the next unread character of a '{' construct
			// (presumably the caller has already consumed the '{' - confirm
			// against next()):
			//   '-'      : charset difference, {-}
			//   '+'      : charset union, {+}
			//   '0'..'9' : repeat count, {n[,[n]]}
			//   other    : macro reference, {NAME}
			//
			// lhs_   : token preceding the '{' (used by {-} and {+}).
			// state_ : tokeniser state.
			// token_ : receives the resulting token.
			//
			// Throws runtime_error if the regex ends here.
			static void open_curly (re_token *lhs_, state &state_,
			    re_token *token_)
			{
			    if (state_.eos ())
			    {
			        std::ostringstream ss_;

			        // Pointless returning index if at end of string
			        ss_ << "Unexpected end of regex (missing '}') in rule id " <<
			            state_._id << '.';
			        throw runtime_error (ss_.str ());
			    }
			    else if (*state_._curr == '-')
			    {
			        charset_difference (lhs_, state_, token_);
			    }
			    else if (*state_._curr == '+')
			    {
			        charset_union (lhs_, state_, token_);
			    }
			    // Test the character under the iterator, not the iterator itself.
			    else if (*state_._curr >= '0' && *state_._curr <= '9')
			    {
			        repeat_n (state_, token_);
			    }
			    else
			    {
			        macro (state_, token_);
			    }
			}

			`static void charset_difference (re_token *lhs_, state &state_,`
			`re_token *token_)`
			`{`
			`rules_char_type ch_ = 0;`

			`if (lhs_->_type != CHARSET)`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "CHARSET must precede {-} at index " <<`
			`state_.index () - 1 << " in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`state_.next (ch_);`

			`if (state_.next (ch_))`
			`{`
			`std::ostringstream ss_;`

			`// Pointless returning index if at end of string`
			`ss_ << "Unexpected end of regex (missing '}') in rule id " <<`
			`state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`if (ch_ != '}')`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "Missing '}' at index " << state_.index () - 1 <<`
			`" in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`re_token rhs_;`

			`next (lhs_, state_, &rhs_);`

			`if (rhs_._type != CHARSET)`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "CHARSET must follow {-} at index " <<`
			`state_.index () - 1 << " in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`lhs_->_str.remove (rhs_._str);`

			`if (lhs_->_str.empty ())`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "Empty charset created by {-} at index " <<`
			`state_.index () - 1 << " in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`next (lhs_, state_, token_);`
			`}`

			`static void charset_union (re_token *lhs_, state &state_,`
			`re_token *token_)`
			`{`
			`rules_char_type ch_ = 0;`

			`if (lhs_->_type != CHARSET)`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "CHARSET must precede {+} at index " <<`
			`state_.index () - 1 << " in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`state_.next (ch_);`

			`if (state_.next (ch_))`
			`{`
			`std::ostringstream ss_;`

			`// Pointless returning index if at end of string`
			`ss_ << "Unexpected end of regex (missing '}') in rule id " <<`
			`state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`if (ch_ != '}')`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "Missing '}' at index " << state_.index () - 1 <<`
			`" in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`re_token rhs_;`

			`next (lhs_, state_, &rhs_);`

			`if (rhs_._type != CHARSET)`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "CHARSET must follow {+} at index " <<`
			`state_.index () - 1 << " in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`lhs_->_str.insert (rhs_._str);`
			`next (lhs_, state_, token_);`
			`}`

			`// SYNTAX:`
			`// {n[,[n]]}`
			`// SEMANTIC RULES:`
			`// {0} - INVALID (throw exception)`
			`// {0,} = *`
			`// {0,0} - INVALID (throw exception)`
			`// {0,1} = ?`
			`// {1,} = +`
			`// {min,max} where min == max - {min}`
			`// {min,max} where max < min - INVALID (throw exception)`
			`static void repeat_n (state &state_, re_token *token_)`
			`{`
			`rules_char_type ch_ = 0;`
			`bool eos_ = state_.next (ch_);`
			`std::size_t min_ = 0;`
			`std::size_t max_ = 0;`

			`while (!eos_ && ch_ >= '0' && ch_ <= '9')`
			`{`
			`min_ *= 10;`
			`min_ += ch_ - '0';`
			`token_->_extra += ch_;`
			`eos_ = state_.next (ch_);`
			`}`

			`if (eos_)`
			`{`
			`std::ostringstream ss_;`

			`// Pointless returning index if at end of string`
			`ss_ << "Unexpected end of regex (missing '}') in rule id " <<`
			`state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`bool min_max_ = false;`
			`bool repeatn_ = true;`

			`if (ch_ == ',')`
			`{`
			`token_->_extra += ch_;`
			`eos_ = state_.next (ch_);`

			`if (eos_)`
			`{`
			`std::ostringstream ss_;`

			`// Pointless returning index if at end of string`
			`ss_ << "Unexpected end of regex (missing '}') in rule id " <<`
			`state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`if (ch_ == '}')`
			`{`
			`// Small optimisation: Check for '*' equivalency.`
			`if (min_ == 0)`
			`{`
			`token_->_type = ZEROORMORE;`
			`repeatn_ = false;`
			`}`
			`// Small optimisation: Check for '+' equivalency.`
			`else if (min_ == 1)`
			`{`
			`token_->_type = ONEORMORE;`
			`repeatn_ = false;`
			`}`
			`}`
			`else`
			`{`
			`if (ch_ < '0' \|\| ch_ > '9')`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "Missing '}' at index " << state_.index () - 1 <<`
			`" in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`min_max_ = true;`

			`do`
			`{`
			`max_ *= 10;`
			`max_ += ch_ - '0';`
			`token_->_extra += ch_;`
			`eos_ = state_.next (ch_);`
			`} while (!eos_ && ch_ >= '0' && ch_ <= '9');`

			`if (eos_)`
			`{`
			`std::ostringstream ss_;`

			`// Pointless returning index if at end of string`
			`ss_ << "Unexpected end of regex (missing '}') "`
			`"in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`// Small optimisation: Check for '?' equivalency.`
			`if (min_ == 0 && max_ == 1)`
			`{`
			`token_->_type = OPT;`
			`repeatn_ = false;`
			`}`
			`// Small optimisation: if min == max, then min.`
			`else if (min_ == max_)`
			`{`
			`token_->_extra.erase (token_->_extra.find (','));`
			`min_max_ = false;`
			`max_ = 0;`
			`}`
			`}`
			`}`

			`if (ch_ != '}')`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "Missing '}' at index " << state_.index () - 1 <<`
			`" in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`if (repeatn_)`
			`{`
			`// SEMANTIC VALIDATION follows:`
			`// NOTE: {0,} has already become *`
			`// therefore we don't check for a comma.`
			`if (min_ == 0 && max_ == 0)`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "Cannot have exactly zero repeats preceding index " <<`
			`state_.index () << " in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`if (min_max_ && max_ < min_)`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "Max less than min preceding index " <<`
			`state_.index () << " in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`if (!state_.eos () && *state_._curr == '?')`
			`{`
			`token_->_type = AREPEATN;`
			`state_.increment ();`
			`}`
			`else`
			`{`
			`token_->_type = REPEATN;`
			`}`
			`}`
			`else if (token_->_type == ZEROORMORE)`
			`{`
			`if (!state_.eos () && *state_._curr == '?')`
			`{`
			`token_->_type = AZEROORMORE;`
			`state_.increment ();`
			`}`
			`}`
			`else if (token_->_type == ONEORMORE)`
			`{`
			`if (!state_.eos () && *state_._curr == '?')`
			`{`
			`token_->_type = AONEORMORE;`
			`state_.increment ();`
			`}`
			`}`
			`else if (token_->_type == OPT)`
			`{`
			`if (!state_.eos () && *state_._curr == '?')`
			`{`
			`token_->_type = AOPT;`
			`state_.increment ();`
			`}`
			`}`
			`}`

			`static void macro (state &state_, re_token *token_)`
			`{`
			`rules_char_type ch_ = 0;`
			`bool eos_ = false;`

			`state_.next (ch_);`

			`if (ch_ != '_' && !(ch_ >= 'A' && ch_ <= 'Z') &&`
			`!(ch_ >= 'a' && ch_ <= 'z'))`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "Invalid MACRO name at index " << state_.index () - 1 <<`
			`" in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`do`
			`{`
			`token_->_extra += ch_;`
			`eos_ = state_.next (ch_);`

			`if (eos_)`
			`{`
			`std::ostringstream ss_;`

			`// Pointless returning index if at end of string`
			`ss_ << "Unexpected end of regex " <<`
			`"(missing '}') in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`
			`} while (ch_ == '_' \|\| ch_ == '-' \|\| (ch_ >= 'A' && ch_ <= 'Z') \|\|`
			`(ch_ >= 'a' && ch_ <= 'z') \|\| (ch_ >= '0' && ch_ <= '9'));`

			`if (ch_ != '}')`
			`{`
			`std::ostringstream ss_;`

			`ss_ << "Missing '}' at index " << state_.index () - 1 <<`
			`" in rule id " << state_._id << '.';`
			`throw runtime_error (ss_.str ());`
			`}`

			`token_->_type = MACRO;`
			`}`
			`};`
			`}`
			`}`

			`#endif`