Author: grumbel Date: 2008-07-03 05:45:57 +0200 (Thu, 03 Jul 2008) New Revision: 3638
Added: trunk/pingus/src/utf8_iterator.cpp trunk/pingus/src/utf8_iterator.hpp Modified: trunk/pingus/SConstruct Log: Added UTF8Iterator class Modified: trunk/pingus/SConstruct =================================================================== --- trunk/pingus/SConstruct 2008-07-03 03:03:45 UTC (rev 3637) +++ trunk/pingus/SConstruct 2008-07-03 03:45:57 UTC (rev 3638) @@ -239,6 +239,7 @@ 'src/tinygettext/dictionary_manager.cpp', 'src/tinygettext/dictionary.cpp', 'src/tinygettext/language_def.cpp', +'src/utf8_iterator.cpp', 'src/math/vector2f.cpp', 'src/math/vector2i.cpp', 'src/math/vector3f.cpp', Added: trunk/pingus/src/utf8_iterator.cpp =================================================================== --- trunk/pingus/src/utf8_iterator.cpp 2008-07-03 03:03:45 UTC (rev 3637) +++ trunk/pingus/src/utf8_iterator.cpp 2008-07-03 03:45:57 UTC (rev 3638) @@ -0,0 +1,119 @@ +// Pingus - A free Lemmings clone +// Copyright (C) 2008 Matthias Braun <[EMAIL PROTECTED]>, +// Ingo Ruhnke <[EMAIL PROTECTED]> +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +#include <iostream> +#include <stdexcept> +#include "utf8_iterator.hpp" + +// FIXME: Get rid of exceptions in this code +UTF8Iterator::UTF8Iterator(const std::string& text_) + : text(text_), + pos(0) +{ + try { + chr = decode_utf8(text, pos); + } catch (std::exception) { + std::cout << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text.c_str() + pos)) << " found " << std::endl; + chr = 0; + } +} + +bool +UTF8Iterator::done() const +{ + return pos > text.size(); +} + +UTF8Iterator& +UTF8Iterator::operator++() { + try { + chr = decode_utf8(text, pos); + } catch (std::exception) { + std::cout << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text.c_str() + pos)) << " found " << std::endl; + chr = 0; + ++pos; + } + + return *this; +} + +uint32_t +UTF8Iterator::operator*() const { + return chr; +} + +/** + * returns true if this byte matches a bitmask of 10xx.xxxx, i.e. it is the 2nd, 3rd or 4th byte of a multibyte utf8 string + */ +bool +UTF8Iterator::has_multibyte_mark(unsigned char c) { + return ((c & 0300) == 0200); +} + +/** + * gets unicode character at byte position @a p of UTF-8 encoded @a + * text, then advances @a p to the next character. + * + * @throws std::runtime_error if decoding fails. + * See unicode standard section 3.10 table 3-5 and 3-6 for details. 
+ */ +uint32_t +UTF8Iterator::decode_utf8(const std::string& text, size_t& p) +{ + uint32_t c1 = (unsigned char) text[p+0]; + + if (has_multibyte_mark(c1)) std::runtime_error("Malformed utf-8 sequence"); + + if ((c1 & 0200) == 0000) { + // 0xxx.xxxx: 1 byte sequence + p+=1; + return c1; + } + else if ((c1 & 0340) == 0300) { + // 110x.xxxx: 2 byte sequence + if(p+1 >= text.size()) throw std::range_error("Malformed utf-8 sequence"); + uint32_t c2 = (unsigned char) text[p+1]; + if (!has_multibyte_mark(c2)) throw std::runtime_error("Malformed utf-8 sequence"); + p+=2; + return (c1 & 0037) << 6 | (c2 & 0077); + } + else if ((c1 & 0360) == 0340) { + // 1110.xxxx: 3 byte sequence + if(p+2 >= text.size()) throw std::range_error("Malformed utf-8 sequence"); + uint32_t c2 = (unsigned char) text[p+1]; + uint32_t c3 = (unsigned char) text[p+2]; + if (!has_multibyte_mark(c2)) throw std::runtime_error("Malformed utf-8 sequence"); + if (!has_multibyte_mark(c3)) throw std::runtime_error("Malformed utf-8 sequence"); + p+=3; + return (c1 & 0017) << 12 | (c2 & 0077) << 6 | (c3 & 0077); + } + else if ((c1 & 0370) == 0360) { + // 1111.0xxx: 4 byte sequence + if(p+3 >= text.size()) throw std::range_error("Malformed utf-8 sequence"); + uint32_t c2 = (unsigned char) text[p+1]; + uint32_t c3 = (unsigned char) text[p+2]; + uint32_t c4 = (unsigned char) text[p+4]; + if (!has_multibyte_mark(c2)) throw std::runtime_error("Malformed utf-8 sequence"); + if (!has_multibyte_mark(c3)) throw std::runtime_error("Malformed utf-8 sequence"); + if (!has_multibyte_mark(c4)) throw std::runtime_error("Malformed utf-8 sequence"); + p+=4; + return (c1 & 0007) << 18 | (c2 & 0077) << 12 | (c3 & 0077) << 6 | (c4 & 0077); + } + throw std::runtime_error("Malformed utf-8 sequence"); +} + +/* EOF */ Property changes on: trunk/pingus/src/utf8_iterator.cpp ___________________________________________________________________ Name: svn:keywords + Id Name: svn:eol-style + native Added: trunk/pingus/src/utf8_iterator.hpp 
// Diff header of the original commit mail, kept verbatim:
//
// ===================================================================
// --- trunk/pingus/src/utf8_iterator.hpp 2008-07-03 03:03:45 UTC (rev 3637)
// +++ trunk/pingus/src/utf8_iterator.hpp 2008-07-03 03:45:57 UTC (rev 3638)
// @@ -0,0 +1,53 @@

//  Pingus - A free Lemmings clone
//  Copyright (C) 2008 Matthias Braun <[EMAIL PROTECTED]>,
//                     Ingo Ruhnke <[EMAIL PROTECTED]>
//
//  This program is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program.  If not, see <http://www.gnu.org/licenses/>.

#ifndef HEADER_PINGUS_UTF8_ITERATOR_HPP
#define HEADER_PINGUS_UTF8_ITERATOR_HPP

// uint32_t and size_t are used below, so pull them in explicitly
// instead of relying on whatever <string> happens to include.
#include <stddef.h>
#include <stdint.h>
#include <string>

/**
 * Walks over a UTF-8 encoded std::string one code point at a time.
 *
 * The first character is decoded by the constructor, so the usual loop
 * looks like this:
 *
 *   for (UTF8Iterator it(str); !it.done(); ++it)
 *     use(*it);
 *
 * Malformed sequences are reported on std::cout and show up as the
 * value 0.
 *
 * The iterator keeps a reference to the string, not a copy: the string
 * has to outlive the iterator and must not be a temporary.
 */
class UTF8Iterator
{
private:
  /** The text that is iterated over (not owned) */
  const std::string& text;

  /** Byte offset of the next sequence to decode */
  std::string::size_type pos;

  /** The most recently decoded character, 0 after a decoding error */
  uint32_t chr;

  /**
   * returns true if this byte matches a bitmask of 10xx.xxxx, i.e. it
   * is the 2nd, 3rd or 4th byte of a multibyte utf8 string
   */
  static bool has_multibyte_mark(unsigned char c);

  /**
   * gets unicode character at byte position @a p of UTF-8 encoded @a
   * text, then advances @a p to the next character.
   *
   * @throws std::runtime_error if decoding fails.
   * See unicode standard section 3.10 table 3-5 and 3-6 for details.
   */
  static uint32_t decode_utf8(const std::string& text, size_t& p);

public:
  /** Starts iterating at the beginning of @a text_ */
  UTF8Iterator(const std::string& text_);

  /** Returns true once the iterator has moved past the end of the text */
  bool done() const;

  /** Moves on to the next character */
  UTF8Iterator& operator++();

  /** Returns the current character */
  uint32_t operator*() const;
};

#endif

/* EOF */

// Remainder of the original commit mail, kept verbatim:
//
// Property changes on: trunk/pingus/src/utf8_iterator.hpp
// ___________________________________________________________________
// Name: svn:keywords
//    + Id
// Name: svn:eol-style
//    + native
//
// _______________________________________________
// pingus-cvs mailing list
// [email protected]
// http://lists.nongnu.org/mailman/listinfo/pingus-cvs
