Files
scully/inc/lexertl/utf_iterators.hpp
Markus Hauschild f3d3389f54 Fix typo in grammar.
Add lexertl.
2013-05-31 21:35:44 +02:00

381 lines
7.9 KiB
C++

// utf_iterators.hpp
// Copyright (c) 2012 Ben Hanson (http://www.benhanson.net/)
// Inspired by http://utfcpp.sourceforge.net/
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef LEXERTL_UTF_ITERATORS_HPP
#define LEXERTL_UTF_ITERATORS_HPP
#include <iterator>
namespace lexertl
{
// Input iterator adaptor that decodes UTF-8 code units, read through a
// char_iterator, into code points of type char_type.
//
// The decoded code point is cached. Construction from a char_iterator and
// every increment decode one whole sequence and leave the wrapped iterator
// just past the sequence that produced the cached value. Consequently the
// wrapped iterator is dereferenced on construction, including when the
// adaptor is built from the end position of a range.
//
// No validation is performed: a lead byte that does not announce a 1-4 byte
// sequence is returned unchanged, and continuation bytes are not checked.
//
// The iterator member types (difference_type etc.) are inherited from
// std::iterator and must not be hidden by private typedefs of the same name,
// otherwise std::iterator_traits cannot see them.
template<typename char_iterator, typename char_type>
class basic_utf8_in_iterator :
    public std::iterator<std::input_iterator_tag, char_type>
{
public:
    // Creates an iterator with a value-initialised position and a cached
    // code point of 0, so that default-constructed instances are well defined.
    basic_utf8_in_iterator () :
        _it (),
        _char (0)
    {
    }

    // Wraps it_ and immediately decodes the sequence it points at.
    explicit basic_utf8_in_iterator (const char_iterator& it_) :
        _it (it_),
        _char (0)
    {
        next ();
    }

    // Returns the most recently decoded code point.
    char_type operator * () const
    {
        return _char;
    }

    // Two adaptors are equal when their wrapped iterators are equal; the
    // cached code point does not take part in the comparison.
    bool operator == (const basic_utf8_in_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator != (const basic_utf8_in_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    // Decodes the next sequence.
    basic_utf8_in_iterator &operator ++ ()
    {
        next ();
        return *this;
    }

    // Decodes the next sequence and returns a copy holding the previous
    // state.
    basic_utf8_in_iterator operator ++ (int)
    {
        basic_utf8_in_iterator temp_ = *this;

        next ();
        return temp_;
    }

private:
    // Position in the underlying sequence (one sequence ahead of _char).
    char_iterator _it;
    // Code point produced by the last call to next ().
    char_type _char;

    // Decodes the sequence starting at _it into _char and advances _it past
    // it. Each step shifts the bits gathered so far into place and masks off
    // the UTF-8 marker bits of the byte that was just read.
    void next ()
    {
        const char len_ = len (_it);
        char_type ch_ = *_it & 0xff;

        switch (len_)
        {
        case 1:
            break;
        case 2:
            ++_it;
            ch_ = (ch_ << 6 & 0x7ff) | (*_it & 0x3f);
            break;
        case 3:
            ++_it;
            ch_ = (ch_ << 12 & 0xffff) | ((*_it & 0xff) << 6 & 0xfff);
            ++_it;
            ch_ |= *_it & 0x3f;
            break;
        case 4:
            ++_it;
            ch_ = (ch_ << 18 & 0x1fffff) | ((*_it & 0xff) << 12 & 0x3ffff);
            ++_it;
            ch_ |= (*_it & 0xff) << 6 & 0xfff;
            ++_it;
            ch_ |= *_it & 0x3f;
            break;
        }

        // Step over the last byte of the sequence (or over the single byte
        // when the lead byte was not recognised).
        ++_it;
        _char = ch_;
    }

    // Returns the sequence length (1-4) announced by the lead byte at it_,
    // or 0 when the byte is not a valid lead byte.
    char len (const char_iterator &it_) const
    {
        const unsigned char ch_ = *it_;

        return ch_ < 0x80 ? 1 :
            ch_ >> 5 == 0x06 ? 2 :
            ch_ >> 4 == 0x0e ? 3 :
            ch_ >> 3 == 0x1e ? 4 : 0;
    }
};
// Input iterator adaptor that reads code points through a char_iterator and
// yields their UTF-8 encoding one byte (char) at a time.
//
// Each code point is encoded into an internal buffer of up to four bytes.
// Construction from a char_iterator encodes the first code point at once,
// and the wrapped iterator always sits one code point ahead of the bytes
// currently being handed out. Consequently the wrapped iterator is
// dereferenced on construction, including when the adaptor is built from the
// end position of a range.
//
// No range checking is performed on the code points.
template<typename char_iterator>
class basic_utf8_out_iterator :
    public std::iterator<std::input_iterator_tag, char>
{
public:
    // Creates an iterator whose position and byte buffer are
    // value-initialised, so that default-constructed instances can be copied
    // and compared without touching indeterminate values.
    basic_utf8_out_iterator () :
        _it (),
        _bytes (),
        _count (0),
        _index (0)
    {
    }

    // Wraps it_ and immediately encodes the code point it points at.
    explicit basic_utf8_out_iterator (const char_iterator& it_) :
        _it (it_),
        _bytes (),
        _count (0),
        _index (0)
    {
        next ();
    }

    // Returns the current byte of the encoded code point.
    char operator * () const
    {
        return _bytes[_index];
    }

    // Two adaptors are equal when their wrapped iterators are equal; the
    // position inside the byte buffer does not take part in the comparison.
    bool operator == (const basic_utf8_out_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator != (const basic_utf8_out_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    // Moves to the next byte, encoding the next code point once the current
    // buffer is exhausted.
    basic_utf8_out_iterator &operator ++ ()
    {
        ++_index;

        if (_index >= _count)
        {
            next ();
        }

        return *this;
    }

    // As the prefix form, but returns a copy holding the previous state.
    basic_utf8_out_iterator operator ++ (int)
    {
        basic_utf8_out_iterator temp_ = *this;

        ++*this;
        return temp_;
    }

private:
    // Position in the underlying sequence (one code point ahead of _bytes).
    char_iterator _it;
    // UTF-8 encoding of the current code point.
    char _bytes[4];
    // Number of valid bytes in _bytes.
    unsigned char _count;
    // Index of the byte returned by operator *.
    unsigned char _index;

    // Encodes the code point at _it into _bytes, resets the read index and
    // advances _it.
    void next ()
    {
        const std::size_t ch_ = *_it;

        _count = len (ch_);
        _index = 0;

        switch (_count)
        {
        case 1:
            _bytes[0] = static_cast<char>(ch_);
            break;
        case 2:
            _bytes[0] = static_cast<char>((ch_ >> 6) | 0xc0);
            _bytes[1] = static_cast<char>((ch_ & 0x3f) | 0x80);
            break;
        case 3:
            _bytes[0] = static_cast<char>((ch_ >> 12) | 0xe0);
            _bytes[1] = static_cast<char>(((ch_ >> 6) & 0x3f) | 0x80);
            _bytes[2] = static_cast<char>((ch_ & 0x3f) | 0x80);
            break;
        case 4:
            _bytes[0] = static_cast<char>((ch_ >> 18) | 0xf0);
            _bytes[1] = static_cast<char>(((ch_ >> 12) & 0x3f) | 0x80);
            _bytes[2] = static_cast<char>(((ch_ >> 6) & 0x3f) | 0x80);
            _bytes[3] = static_cast<char>((ch_ & 0x3f) | 0x80);
            break;
        }

        ++_it;
    }

    // Returns the number of UTF-8 bytes used for code point ch_: 1 below
    // 0x80, 2 below 0x800, 3 below 0x10000 and 4 otherwise.
    char len (const std::size_t ch_) const
    {
        return ch_ < 0x80 ? 1 :
            ch_ < 0x800 ? 2 :
            ch_ < 0x10000 ? 3 :
            4;
    }
};
// Input iterator adaptor that decodes UTF-16 code units, read through a
// char_iterator, into code points of type char_type.
//
// The decoded code point is cached. Construction from a char_iterator and
// every increment decode one code point and leave the wrapped iterator just
// past the unit(s) that produced the cached value. Consequently the wrapped
// iterator is dereferenced on construction, including when the adaptor is
// built from the end position of a range.
//
// No validation is performed: a unit in the range 0xd800-0xdbff is always
// combined with the unit that follows it, whatever that unit is.
//
// The iterator member types (difference_type etc.) are inherited from
// std::iterator and must not be hidden by private typedefs of the same name,
// otherwise std::iterator_traits cannot see them.
template<typename char_iterator, typename char_type>
class basic_utf16_in_iterator :
    public std::iterator<std::input_iterator_tag, char_type>
{
public:
    // Creates an iterator with a value-initialised position and a cached
    // code point of 0, so that default-constructed instances are well defined.
    basic_utf16_in_iterator () :
        _it (),
        _char (0)
    {
    }

    // Wraps it_ and immediately decodes the code point it points at.
    explicit basic_utf16_in_iterator (const char_iterator &it_) :
        _it (it_),
        _char (0)
    {
        next ();
    }

    // Returns the most recently decoded code point.
    char_type operator * () const
    {
        return _char;
    }

    // Two adaptors are equal when their wrapped iterators are equal; the
    // cached code point does not take part in the comparison.
    bool operator == (const basic_utf16_in_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator != (const basic_utf16_in_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    // Decodes the next code point.
    basic_utf16_in_iterator &operator ++ ()
    {
        next ();
        return *this;
    }

    // Decodes the next code point and returns a copy holding the previous
    // state.
    basic_utf16_in_iterator operator ++ (int)
    {
        basic_utf16_in_iterator temp_ = *this;

        next ();
        return temp_;
    }

private:
    // Position in the underlying sequence (one code point ahead of _char).
    char_iterator _it;
    // Code point produced by the last call to next ().
    char_type _char;

    // Decodes the code point starting at _it into _char and advances _it
    // past it.
    void next ()
    {
        char_type ch_ = *_it & 0xffff;

        if (ch_ >= 0xd800 && ch_ <= 0xdbff)
        {
            // High surrogate: its low ten bits become the upper half and the
            // following unit supplies the lower half of the value that is
            // offset by 0x10000.
            const char_type surrogate_ = *++_it & 0xffff;

            ch_ = (((ch_ - 0xd800) << 10) | (surrogate_ - 0xdc00)) + 0x10000;
        }

        ++_it;
        _char = ch_;
    }
};
// Input iterator adaptor that reads code points through a char_iterator and
// yields their UTF-16 encoding one unit (wchar_t) at a time.
//
// Each code point is encoded into an internal buffer of one or two units.
// Construction from a char_iterator encodes the first code point at once,
// and the wrapped iterator always sits one code point ahead of the units
// currently being handed out. Consequently the wrapped iterator is
// dereferenced on construction, including when the adaptor is built from the
// end position of a range.
//
// No range checking is performed on the code points.
template<typename char_iterator>
class basic_utf16_out_iterator :
    public std::iterator<std::input_iterator_tag, wchar_t>
{
public:
    // Creates an iterator whose position and unit buffer are
    // value-initialised, so that default-constructed instances can be copied
    // and compared without touching indeterminate values.
    basic_utf16_out_iterator () :
        _it (),
        _chars (),
        _count (0),
        _index (0)
    {
    }

    // Wraps it_ and immediately encodes the code point it points at.
    explicit basic_utf16_out_iterator (const char_iterator& it_) :
        _it (it_),
        _chars (),
        _count (0),
        _index (0)
    {
        next ();
    }

    // Returns the current unit of the encoded code point.
    wchar_t operator * () const
    {
        return _chars[_index];
    }

    // Two adaptors are equal when their wrapped iterators are equal; the
    // position inside the unit buffer does not take part in the comparison.
    bool operator == (const basic_utf16_out_iterator &rhs_) const
    {
        return _it == rhs_._it;
    }

    bool operator != (const basic_utf16_out_iterator &rhs_) const
    {
        return _it != rhs_._it;
    }

    // Moves to the next unit, encoding the next code point once the current
    // buffer is exhausted.
    basic_utf16_out_iterator &operator ++ ()
    {
        ++_index;

        if (_index >= _count)
        {
            next ();
        }

        return *this;
    }

    // As the prefix form, but returns a copy holding the previous state.
    basic_utf16_out_iterator operator ++ (int)
    {
        basic_utf16_out_iterator temp_ = *this;

        ++*this;
        return temp_;
    }

private:
    // Position in the underlying sequence (one code point ahead of _chars).
    char_iterator _it;
    // UTF-16 encoding of the current code point.
    wchar_t _chars[2];
    // Number of valid units in _chars.
    unsigned char _count;
    // Index of the unit returned by operator *.
    unsigned char _index;

    // Encodes the code point at _it into _chars, resets the read index and
    // advances _it.
    void next ()
    {
        const std::size_t ch_ = *_it;

        _count = len (ch_);
        _index = 0;

        switch (_count)
        {
        case 1:
            _chars[0] = static_cast<wchar_t>(ch_);
            break;
        case 2:
            // High (lead) surrogate: 0xd800 + ((ch_ - 0x10000) >> 10).
            // The base must be 0xd800; 0xdc00 is the low surrogate base.
            _chars[0] = static_cast<wchar_t>((ch_ >> 10) + 0xd800u -
                (0x10000 >> 10));
            // Low (trail) surrogate: 0xdc00 + the low ten bits.
            _chars[1] = static_cast<wchar_t>((ch_ & 0x3ff) + 0xdc00u);
            break;
        }

        ++_it;
    }

    // Returns the number of UTF-16 units used for code point ch_: 2 above
    // 0xffff (surrogate pair), otherwise 1.
    char len (const std::size_t ch_) const
    {
        return ch_ > 0xffff ? 2 : 1;
    }
};
}
#endif