Modified: incubator/stdcxx/trunk/util/codecvt.cpp URL: http://svn.apache.org/viewvc/incubator/stdcxx/trunk/util/codecvt.cpp?view=diff&rev=448754&r1=448753&r2=448754 ============================================================================== --- incubator/stdcxx/trunk/util/codecvt.cpp (original) +++ incubator/stdcxx/trunk/util/codecvt.cpp Thu Sep 21 17:42:16 2006 @@ -2,20 +2,27 @@ * * codecvt.cpp * - * $Id: //stdlib/dev/source/stdlib/util/codecvt.cpp#4 $ + * $Id$ * *************************************************************************** * - * Copyright (c) 1994-2005 Quovadx, Inc., acting through its Rogue Wave - * Software division. Licensed under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance with the - * License. You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0. Unless required by - * applicable law or agreed to in writing, software distributed under - * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR - * CONDITIONS OF ANY KIND, either express or implied. See the License - * for the specific language governing permissions and limitations under - * the License. + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
See the License for the specific language governing + * permissions and limitations under the License. + * + * Copyright 2001-2006 Rogue Wave Software. * **************************************************************************/ @@ -26,264 +33,347 @@ #include "scanner.h" // for scanner #include <cassert> // for assert() +#include <climits> // for UCHAR_MAX +#include <cstring> // for memset() #include <fstream> // for ifstream, ofstream -// all characters should go into the codecvt_mb_set -void Def:: -gen_valid_codecvt_mb_set () +typedef std::map<std::string, wchar_t>::const_iterator n_cmap_citer2; + + +std::size_t Def:: +gen_mbchar_tables (codecvt_offsets_map_t &tab, + std::map<std::string, unsigned> &off_map, + const std::string &charp /* = "" */, + unsigned tabno /* = 0 */) { - if (!valid_codecvt_mb_set_.empty()) - return; - - for (n_cmap_iter2 n_cmap_it = charmap_.get_n_cmap2().begin(); - n_cmap_it != charmap_.get_n_cmap2().end(); n_cmap_it++) { - std::string valid = n_cmap_it->first.substr - (0, n_cmap_it->first.size() - 1); - while (valid.size() > 0){ - valid_codecvt_mb_set_.insert (valid); - valid = valid.substr(0, valid.size() - 1); + // upon the first call (but not during subsequent recursive calls) + // generate a set of multibyte prefixes from the set of all known + // multibyte characters + static unsigned ntabs = 0; + static std::set<std::string>* pfx_set = 0; + + const n_cmap_citer2 mb_map_end = charmap_.get_mb_cmap ().end (); + + if (0 == pfx_set) { + pfx_set = new std::set<std::string>; + + // iterate over the range of valid multibyte characters + // obtained from the charmap and generate a complete + // subset of non-empty multibyte prefixes from each + unsigned off = 0; + + const n_cmap_citer2 mb_map_begin = charmap_.get_mb_cmap ().begin (); + + for (n_cmap_citer2 it = mb_map_begin; it != mb_map_end; ++it, ++off) { + + // insert the ordinal number of each multibyte character + // into a map for fast lookup later + off_map.insert (std::make_pair 
(it->first, off)); + + // generate non-empty prefixes up to one byte less + // in length than the complete multibyte character + for (std::string prefix = it->first; 1 < prefix.size (); ) { + prefix = prefix.substr (0, prefix.size () - 1); + pfx_set->insert (prefix); + } } } -} + // number of valid characters inserted into the tables + std::size_t nchars = 0; -void Def:: -create_wchar_utf8_table () -{ - if (!wchar_utf8_to_ext_.empty()) - return; + // an array of offsets to the multibyte character or to the next + // array containing such offsets (defined recursively for up to + // MB_CUR_MAX levels of nesting) + codecvt_offset_tab_t* const offsets = new codecvt_offset_tab_t; - n_cmap_iter2 n_cmap_it; - for (n_cmap_it = charmap_.get_n_cmap2().begin(); - n_cmap_it != charmap_.get_n_cmap2().end(); - n_cmap_it ++) { - std::string wchar_utf8 = utf8_encode (n_cmap_it->second); - wchar_utf8_to_ext_.insert (std::make_pair (wchar_utf8, - n_cmap_it->first)); - } -} + std::string mb_char (charp + '\0'); + for (unsigned i = 0; i <= UCHAR_MAX; ++i) { -void Def:: -gen_valid_codecvt_wchar_set () { + unsigned char cur_char = (unsigned char)i; - if (!valid_codecvt_wchar_set_.empty()) - return; + mb_char [mb_char.size () - 1] = char (cur_char); - create_wchar_utf8_table(); + if (mb_map_end == charmap_.get_mb_cmap ().find (mb_char)) { + // mb_char is not a complete, valid multibyte character + // check to see if it's a prefix of one + + if (pfx_set->find (mb_char) == pfx_set->end ()) { + // mb_char is not a prefix of a valid multibyte + // character, mark it invalid + offsets->off [cur_char] = UINT_MAX; + } + else { + // mb_char is a prefix of a valid multibyte character, + // set the MSB to denote that it "continues" in the + // table at the next higher offset + offsets->off [cur_char] = ++ntabs | 0x80000000; - for (wchar_utf8_iter it = wchar_utf8_to_ext_.begin(); - it != wchar_utf8_to_ext_.end(); it++) { - std::string str = it->first.substr (0, it->first.size () - 1); - while 
(str.size() > 0) { - valid_codecvt_wchar_set_.insert (str); - str = str.substr (0, str.size() - 1); + // generate that table + nchars += gen_mbchar_tables (tab, off_map, mb_char, ntabs); + } + } + else { + // mb_char is a complete, valid multibyte character + // insert its ordinal number (offset) into the array + offsets->off [cur_char] = off_map.find (mb_char)->second; + ++nchars; } } -} + // insert the completely populated table into the map + tab.insert (std::make_pair (tabno, offsets)); -void Def:: -gen_valid_codecvt_utf8_set () { - - if (!valid_codecvt_utf8_set_.empty()) - return; - - for (ucs4_cmap_iter it = charmap_.get_ucs4_cmap().begin(); - it != charmap_.get_ucs4_cmap().end(); it++) { - std::string str = utf8_encode(it->second); - str = str.substr (0, str.size () - 1); - while (str.size() > 0) { - valid_codecvt_utf8_set_.insert (str); - str = str.substr (0, str.size() - 1); - } + if (0 == ntabs) { + // clean up on return from the topmost (non-recursive) call + delete pfx_set; + pfx_set = 0; } + + return nchars; } -void Def:: -generate_codecvt_table (const std::string &charp, - unsigned int tab_num) +std::size_t Def:: +gen_wchar_tables (codecvt_offsets_map_t &tab, + const std::string &charp /* = "" */, + unsigned int tabno /* = 0 */) { - gen_valid_codecvt_mb_set(); - ctype_offset_tab_t tab; - n_cmap_iter2 n_cmap_it; + // upon the first call (but not during subsequent recursive calls) + // generate a set of multibyte prefixes from the set of all known + // multibyte characters + static unsigned ntabs = 0; + static std::set<std::string> *pfx_set = 0; + static std::map<std::string, unsigned> *off_map = 0; + static std::map<std::string, std::string> *utf_map = 0; - for (unsigned int i = 0; i <= UCHAR_MAX; i++) { + if (0 == utf_map) { + pfx_set = new std::set<std::string>; + off_map = new std::map<std::string, unsigned>; + utf_map = new std::map<std::string, std::string>; - unsigned char cur_char = (unsigned char)i; + const n_cmap_citer2 first = 
charmap_.get_mb_cmap ().begin (); + const n_cmap_citer2 last = charmap_.get_mb_cmap ().end (); - std::string mb_char = charp; - mb_char += char (cur_char); + unsigned off = 0; - n_cmap_it = charmap_.get_n_cmap2 ().find (mb_char); + for (n_cmap_citer2 it = first; it != last; ++it) { - if (n_cmap_it == charmap_.get_n_cmap2 ().end ()) { + off_map->insert (std::make_pair (it->first, off)); - if ( valid_codecvt_mb_set_.find (mb_char) - != valid_codecvt_mb_set_.end ()) { + off += it->first.size () + 1; - ++next_codecvt_tab_num_; - tab.off [cur_char] = next_codecvt_tab_num_ | 0x80000000; - generate_codecvt_table (mb_char, next_codecvt_tab_num_); - } - else { - tab.off [cur_char] = UINT_MAX; + std::string utf = utf8_encode (it->second); + + utf_map->insert (std::make_pair (utf, it->first)); + + while (1 < utf.size ()) { + utf = utf.substr (0, utf.size () - 1); + pfx_set->insert (utf); } } - else { - // get the offset for this character and put it in the table - tab.off[cur_char] = wchar_off_map_.find (mb_char)->second; - } } - mb_char_offs_.insert (std::make_pair (tab_num, tab)); -} + codecvt_offset_tab_t* const offsets = new codecvt_offset_tab_t; + // number of valid characters inserted into the tables + std::size_t nchars = 0; -void Def:: -generate_wchar_codecvt_table (const std::string &charp, - unsigned int tab_num) -{ - gen_valid_codecvt_wchar_set(); + std::string mb_char (charp + '\0'); - ctype_offset_tab_t tab; - wchar_utf8_iter wu_it; + for (unsigned i = 0; i <= UCHAR_MAX; ++i) { - for (unsigned int i = 0; i <= UCHAR_MAX; i++){ unsigned char cur_char = (unsigned char)i; - std::string mb_char (charp); - mb_char += (char)cur_char; - wu_it = wchar_utf8_to_ext_.find (mb_char); - if (wu_it != wchar_utf8_to_ext_.end()) { - tab.off[cur_char] = (mb_char_off_map_.find - (wu_it->second))->second; - } - else { - valid_codecvt_wchar_set_iter wit = valid_codecvt_wchar_set_.find (mb_char); - if (wit != valid_codecvt_wchar_set_.end()) { - ++next_wchar_codecvt_tab_num_; - 
tab.off[cur_char] = next_wchar_codecvt_tab_num_ | 0x80000000; - generate_wchar_codecvt_table (mb_char, - next_wchar_codecvt_tab_num_); + mb_char [mb_char.size () - 1] = char (cur_char); + + const wchar_utf8_iter it = utf_map->find (mb_char); + if (it == utf_map->end ()) { + if (pfx_set->find (mb_char) == pfx_set->end ()) { + offsets->off [cur_char] = UINT_MAX; } else { - tab.off[cur_char] = UINT_MAX; + offsets->off [cur_char] = ++ntabs | 0x80000000; + + nchars += gen_wchar_tables (tab, mb_char, ntabs); } } + else { + offsets->off [cur_char] = off_map->find (it->second)->second; + + ++nchars; + } + } + + tab.insert (std::make_pair (tabno, offsets)); + + if (0 == ntabs) { + // clean up + delete pfx_set; + delete utf_map; + + pfx_set = 0; + utf_map = 0; } - wchar_offs_.insert (std::make_pair (tab_num, tab)); + return nchars; } -void Def:: -gen_utf8_map() +std::size_t Def:: +gen_utf8_tables (codecvt_offsets_map_t &tab, + std::map<std::string, unsigned> &off_map, + const std::string &charp /* = "" */, + unsigned tabno /* = 0 */) { - if (!utf8_map_.empty()) - return; + static unsigned ntabs = 0; + static std::set<std::string> *pfx_set = 0; + static std::map<std::string, wchar_t> *utf_map = 0; + + if (0 == pfx_set) { + pfx_set = new std::set<std::string>; + + const ucs4_cmap_iter first = charmap_.get_ucs4_cmap ().begin (); + const ucs4_cmap_iter last = charmap_.get_ucs4_cmap ().end (); + + for (ucs4_cmap_iter it = first; it != last; ++it) { + + for (std::string prefix = utf8_encode (it->second); + 1 < prefix.size (); ) { + prefix = prefix.substr (0, prefix.size () - 1); + pfx_set->insert (prefix); + } + } + } + + // the set of complete utf8 strings in the current character map + typedef std::map<std::string, wchar_t>::iterator utf8_map_iter; + + if (0 == utf_map) { + utf_map = new std::map<std::string, wchar_t>; - for (ucs4_cmap_iter it = charmap_.get_ucs4_cmap().begin(); - it != charmap_.get_ucs4_cmap().end(); it++) { - utf8_map_.insert (std::make_pair(utf8_encode 
(it->second), - it->second)); + const ucs4_cmap_iter first = charmap_.get_ucs4_cmap ().begin (); + const ucs4_cmap_iter last = charmap_.get_ucs4_cmap ().end (); + + for (ucs4_cmap_iter it = first; it != last; ++it) { + const std::string utf = utf8_encode (it->second); + utf_map->insert (std::make_pair (utf, it->second)); + } } -} + codecvt_offset_tab_t* const offsets = new codecvt_offset_tab_t; -void Def:: -generate_utf8_codecvt_table (const std::string &charp, - unsigned int tab_num) -{ - gen_valid_codecvt_utf8_set(); - gen_utf8_map(); + // number of valid characters inserted into the tables + std::size_t nchars = 0; - ctype_offset_tab_t tab; - utf8_map_iter utf8_it; + std::string mb_char = charp + '\0'; + + for (unsigned int i = 0; i <= UCHAR_MAX; ++i) { - for (unsigned int i = 0; i <= UCHAR_MAX; i++){ unsigned char cur_char = (unsigned char)i; - std::string mb_char = charp; - mb_char += (char)cur_char; - if ((utf8_it = utf8_map_.find (mb_char)) - != utf8_map_.end()) { + + mb_char [mb_char.size () - 1] = char (cur_char); + + const utf8_map_iter where = utf_map->find (mb_char); + + if (where == utf_map->end ()) { + if (pfx_set->find (mb_char) == pfx_set->end ()) { + offsets->off [cur_char] = UINT_MAX; + } + else { + offsets->off [cur_char] = ++ntabs | 0x80000000; + nchars += gen_utf8_tables (tab, off_map, mb_char, ntabs); + } + } + else { // first get the symbolic name std::string str - = charmap_.get_rucs4_cmap().find(utf8_it->second)->second; + = charmap_.get_rucs4_cmap ().find (where->second)->second; + // then get the internal encoding of the character - wchar_t int_enc = charmap_.get_w_cmap().find (str)->second; + const wchar_t int_enc = charmap_.get_w_cmap().find (str)->second; + // then get the external encoding to use in a lookup in // mb_char_off_map - str = charmap_.get_rn_cmap2().find (int_enc)->second; - tab.off[cur_char] = (mb_char_off_map_.find - (str))->second; - } - else { - if (valid_codecvt_utf8_set_.find (mb_char) - != 
valid_codecvt_utf8_set_.end()) { - ++next_utf8_codecvt_tab_num_; - tab.off[cur_char] = next_utf8_codecvt_tab_num_ | 0x80000000; - generate_utf8_codecvt_table (mb_char, - next_utf8_codecvt_tab_num_); - } - else { - tab.off[cur_char] = UINT_MAX; - } + str = charmap_.get_rmb_cmap ().find (int_enc)->second; + + offsets->off [cur_char] = off_map.find (str)->second; + + ++nchars; } } - utf8_offs_.insert (std::make_pair (tab_num, tab)); + + tab.insert (std::make_pair (tabno, offsets)); + + if (0 == ntabs) { + // clean up + delete pfx_set; + delete utf_map; + + pfx_set = 0; + utf_map = 0; + } + return nchars; } void Def:: -generate_xliteration_data () +gen_xlit_data () { // data offset points to the beginning of the data containing // the narrow strings character encodings unsigned int data_offset = 0; // traverse the map and construct the map of offsets - xlit_map_t::iterator it = xlit_map_.begin (); - for (; it != xlit_map_.end (); it++) { + xlit_map_t::const_iterator it = xlit_map_.begin (); + for (; it != xlit_map_.end (); ++it) { // insert pair(wchar_t value, offset of first string in data block) xlit_data_offset_map_.insert ( std::make_pair (it->first,data_offset)); // advance the data_offset value to the next "first" string - std::list<std::string>::iterator sit = + std::list<std::string>::const_iterator sit = it->second.begin (); - for (; sit != it->second.end (); sit++) { + for (; sit != it->second.end (); ++sit) { data_offset += sit->size () + 1; } - data_offset++; + ++data_offset; } // create a new table (first), populate it with default values // and insert it in the tables map xlit_offset_table_t table0; unsigned int k; - for (k = 0; k < _RWSTD_UCHAR_MAX + 1; k++) - table0.offset_table [k] = _RWSTD_UINT_MAX; + for (k = 0; k < UCHAR_MAX + 1; ++k) + table0.offset_table [k] = UINT_MAX; // insert it into the map xlit_table_map_.insert (std::make_pair(0, table0)); + const xlit_map_t::const_iterator xlit_map_end = xlit_map_.end (); + // traverse the map again and 
build the tables - for (it = xlit_map_.begin (); it != xlit_map_.end (); it++) { + for (it = xlit_map_.begin (); it != xlit_map_end; ++it) { + // encode the wchar_t value to UTF-8 - std::string utf8_rep (utf8_encode (it->first)); + const std::string utf8_rep (utf8_encode (it->first)); data_offset = xlit_data_offset_map_.find (it->first)->second; // traverse the utf8 representation string and create the // necessary tables and populate the indexes unsigned int table_idx = 0; - std::string::iterator string_it = utf8_rep.begin (); - for (; string_it != utf8_rep.end (); string_it++) { + + const std::string::const_iterator utf8_rep_end = utf8_rep.end (); + std::string::const_iterator string_it = utf8_rep.begin (); + + for (; string_it != utf8_rep_end; ++string_it) { // get the table corresponding to the current index and locate // the value at that index - xlit_table_map_t::iterator res = xlit_table_map_.find (table_idx); + const xlit_table_map_t::iterator res = + xlit_table_map_.find (table_idx); + assert (res != xlit_table_map_.end ()); // offset in table @@ -291,12 +381,12 @@ // res is the iterator pointing to the correct table in the map // check the index and if not populated, create a new table - if (res->second.offset_table [off_idx] == _RWSTD_UINT_MAX) { + if (res->second.offset_table [off_idx] == UINT_MAX) { // if this is the last position in the string, then // fill the table position with the offset of the string data if ((string_it + 1) == utf8_rep.end ()) { - xlit_data_offset_map_t::iterator data_it = + xlit_data_offset_map_t::const_iterator data_it = xlit_data_offset_map_.find (it->first); assert (data_it != xlit_data_offset_map_.end ()); @@ -307,8 +397,8 @@ // create a new table and append it to the map xlit_offset_table_t table; - for (unsigned int i = 0; i < _RWSTD_UCHAR_MAX + 1; i++) - table.offset_table [i] = _RWSTD_UINT_MAX; + for (unsigned int i = 0; i < UCHAR_MAX + 1; ++i) + table.offset_table [i] = UINT_MAX; // insert it into the map unsigned int 
tmp = xlit_table_map_.size (); @@ -329,96 +419,98 @@ void Def:: write_codecvt (std::string dir_name) { - next_wchar_codecvt_tab_num_ = 0; - next_utf8_codecvt_tab_num_ = 0; - // if it has been already written if (codecvt_written_) return; // compose the directory name ((dir_name += _RWSTD_PATH_SEP) += "..") += _RWSTD_PATH_SEP; - dir_name += charmap_.get_code_set_name(); + dir_name += charmap_.get_code_set_name (); - // check for its existence - std::ifstream in (dir_name.c_str(), std::ios::in); - - if (in) { + // check to see if the codecvt database already exists and + // avoid recreating it if it does (as an optimization) + if (std::ifstream (dir_name.c_str ())) { issue_diag (I_OPENWR, false, 0, "%s exists, skipping\n", dir_name.c_str ()); return; } - issue_diag (I_OPENWR, false, 0, "writing %s\n", dir_name.c_str ()); + ////////////////////////////////////////////////////////////////// + // generate multibyte conversion tables + issue_diag (I_STAGE, false, 0, "generating multibyte tables\n"); - // create the stream with exceptions enabled - std::ofstream out (dir_name.c_str(), std::ios::binary); - out.exceptions (std::ios::failbit | std::ios::badbit); - - std::size_t temp_off = 0; - std::size_t count_off = 0; - n_cmap_iter2 iter; - for (iter = charmap_.get_n_cmap2().begin(); - iter != charmap_.get_n_cmap2().end(); iter++, count_off ++) { - mb_char_off_map_.insert (std::make_pair (iter->first, - temp_off)); - wchar_off_map_.insert (std::make_pair (iter->first, - count_off)); - temp_off += iter->first.size() + 1; - - } + codecvt_offsets_map_t mbchar_offs; + std::map<std::string, unsigned> off_map; + const std::size_t n_mbchars = gen_mbchar_tables (mbchar_offs, off_map); - next_codecvt_tab_num_ = 0; - next_wchar_codecvt_tab_num_ = 0; - - generate_codecvt_table ("", 0); - generate_wchar_codecvt_table ("", 0); - generate_utf8_codecvt_table ("", 0); + // generate wchar_t conversion tables + issue_diag (I_STAGE, false, 0, "generating wchar_t tables\n"); + + 
codecvt_offsets_map_t wchar_offs; + const std::size_t n_wchars = gen_wchar_tables (wchar_offs); + + // generate UTF-8 conversion tables + issue_diag (I_STAGE, false, 0, "generating UTF-8 tables\n"); + + codecvt_offsets_map_t uchar_offs; + const std::size_t n_uchars = gen_utf8_tables (uchar_offs, off_map); + + // not needed beyond this point, clear it out + off_map.clear (); // generate the transliteration tables and the transliteration data - generate_xliteration_data (); + issue_diag (I_STAGE, false, 0, "generating transliteration tables\n"); + gen_xlit_data (); + + ////////////////////////////////////////////////////////////////// + // populate the codecvt structure before writing it out + // in binary form to the file (the codecvt database) + _RW::__rw_codecvt_t codecvt_out; + std::memset (&codecvt_out, 0, sizeof codecvt_out); + + // calculate byte offsets within the structure + codecvt_out.n_to_w_tab_off = 0; + codecvt_out.w_to_n_tab_off = codecvt_out.n_to_w_tab_off + + mbchar_offs.size () * (UCHAR_MAX + 1) * sizeof (unsigned); - // calculate all offsets - codecvt_out_.n_to_w_tab_off = 0; - codecvt_out_.w_to_n_tab_off = codecvt_out_.n_to_w_tab_off - + mb_char_offs_.size() * (UCHAR_MAX + 1) - * sizeof (unsigned int); - - codecvt_out_.utf8_to_ext_tab_off = codecvt_out_.w_to_n_tab_off - + wchar_offs_.size() * (UCHAR_MAX + 1) - * sizeof (unsigned int); + codecvt_out.utf8_to_ext_tab_off = codecvt_out.w_to_n_tab_off + + wchar_offs.size () * (UCHAR_MAX + 1) * sizeof (unsigned); // insert the transliteration tables here - codecvt_out_.xliteration_off = codecvt_out_.utf8_to_ext_tab_off - + utf8_offs_.size() * (UCHAR_MAX + 1) - * sizeof (unsigned int); - - codecvt_out_.wchar_off = codecvt_out_.xliteration_off + - xlit_table_map_.size () * (_RWSTD_UCHAR_MAX + 1) * - sizeof (unsigned int); + codecvt_out.xliteration_off = codecvt_out.utf8_to_ext_tab_off + + uchar_offs.size () * (UCHAR_MAX + 1) * sizeof (unsigned); - codecvt_out_.codeset_off = 
codecvt_out_.wchar_off - + charmap_.get_n_cmap2().size() * 2 * sizeof (wchar_t); + codecvt_out.wchar_off = codecvt_out.xliteration_off + + xlit_table_map_.size () * (UCHAR_MAX + 1) * sizeof (unsigned); - codecvt_out_.charmap_off = codecvt_out_.codeset_off - + charmap_.get_code_set_name().size() + 1; + codecvt_out.codeset_off = codecvt_out.wchar_off + + charmap_.get_mb_cmap ().size () * 2 * sizeof (wchar_t); + + codecvt_out.charmap_off = codecvt_out.codeset_off + + charmap_.get_code_set_name ().size () + 1 /* NUL */; - std::size_t mb_offset = codecvt_out_.charmap_off - + charmap_.get_charmap_name().size() + 1; + const std::size_t mb_offset = codecvt_out.charmap_off + + charmap_.get_charmap_name ().size () + 1 /* NUL */; // compute the size of narrow strings map which added to // mb_offset will give the start of the transliteration data std::size_t xlit_data_offset = mb_offset; - for (iter = charmap_.get_n_cmap2().begin(); - iter != charmap_.get_n_cmap2().end(); iter++) { + + mb_cmap_iter iter; + + for (iter = charmap_.get_mb_cmap ().begin(); + iter != charmap_.get_mb_cmap().end(); ++iter) { xlit_data_offset += iter->first.size() + 1; } // now traverse again the utf8 tables for transliteration data // and recompute the offsets: + const xlit_table_map_t::const_iterator xlit_table_map_end = + xlit_table_map_.end (); + xlit_table_map_t::iterator xit = xlit_table_map_.begin (); - for (; xit != xlit_table_map_.end (); xit++) { - for (unsigned int i = 0; i < _RWSTD_UCHAR_MAX + 1; i++) { + for (; xit != xlit_table_map_end; ++xit) { + for (unsigned int i = 0; i < UCHAR_MAX + 1; ++i) { if (xit->second.offset_table [i] & 0x80000000) continue; // add the offset for xliteration data @@ -426,78 +518,111 @@ } } - wchar_offs_iter wchar_offs_it; - for (wchar_offs_it = wchar_offs_.begin(); - wchar_offs_it != wchar_offs_.end(); wchar_offs_it ++) { - for (unsigned int i = 0; i <= UCHAR_MAX; i++) { - if (!((wchar_offs_it->second).off[i] & 0x80000000)) - (wchar_offs_it->second).off[i] += 
mb_offset; - } - } - - utf8_offs_iter utf8_offs_it; - for (utf8_offs_it = utf8_offs_.begin(); - utf8_offs_it != utf8_offs_.end(); utf8_offs_it ++) { - for (unsigned int i = 0; i <= UCHAR_MAX; i++) { - if (!((utf8_offs_it->second).off[i] & 0x80000000)) - (utf8_offs_it->second).off[i] += mb_offset; - } - } - codecvt_out_.mb_cur_max = charmap_.get_mb_cur_max(); + codecvt_out.mb_cur_max = charmap_.get_mb_cur_max(); + issue_diag (I_OPENWR, false, 0, "writing %s\n", dir_name.c_str ()); + // create the stream with exceptions enabled + std::ofstream out (dir_name.c_str(), std::ios::binary); + out.exceptions (std::ios::failbit | std::ios::badbit); + // write the codecvt_out structure - out.write ((char*)&codecvt_out_, sizeof(codecvt_out_)); + out.write ((char*)&codecvt_out, sizeof codecvt_out); - issue_diag (I_WRITE, false, 0, "writing char to wchar_t table\n"); + typedef codecvt_offsets_map_t::iterator off_iter_t; - // write the narrow_to_wide tables - mb_char_offs_iter mb_char_offs_it; - for (mb_char_offs_it = mb_char_offs_.begin(); - mb_char_offs_it != mb_char_offs_.end(); mb_char_offs_it++) { - for (unsigned int c = 0; c <= UCHAR_MAX; c++) { - out.write ((const char*)&mb_char_offs_it->second.off[c], - sizeof (mb_char_offs_it->second.off[c])); + ////////////////////////////////////////////////////////////////// + // write out the multibyte to wchar_t tables + issue_diag (I_WRITE, false, 0, + "writing %lu multibyte tables (%lu characters)\n", + mbchar_offs.size (), n_mbchars); + + for (off_iter_t it = mbchar_offs.begin (); it != mbchar_offs.end (); ++it) { + for (unsigned i = 0; i <= UCHAR_MAX; ++i) { + + const unsigned off = it->second->off [i]; + + out.write ((const char*)&off, sizeof off); } + + delete it->second->off; } - issue_diag (I_WRITE, false, 0, "writing wchar_t to char table\n"); + // not needed beyond this point, clear it out + mbchar_offs.clear (); + + ////////////////////////////////////////////////////////////////// + // write out the wchar_t to multibyte 
conversion tables + issue_diag (I_WRITE, false, 0, + "writing %lu wchar_t tables (%lu characters)\n", + wchar_offs.size (), n_wchars); + + for (off_iter_t it = wchar_offs.begin (); it != wchar_offs.end (); ++it) { + for (unsigned i = 0; i <= UCHAR_MAX; ++i) { + + // adjust offsets to multibyte characters (but not those + // to other tables or invalid encodings) + unsigned off = it->second->off [i]; + + if (!(off & 0x80000000)) + off += mb_offset; - // now write the wide_to_narrow tables - for (wchar_offs_it = wchar_offs_.begin(); - wchar_offs_it != wchar_offs_.end(); wchar_offs_it++) { - for (unsigned int c = 0; c <= UCHAR_MAX; c++) { - out.write ((const char*)&wchar_offs_it->second.off[c], - sizeof (wchar_offs_it->second.off[c])); + out.write ((const char*)&off, sizeof off); } + + delete it->second->off; } - issue_diag (I_WRITE, false, 0, "writing UTF-8 to char table\n"); + // not needed beyond this point, clear it out + wchar_offs.clear (); + + ////////////////////////////////////////////////////////////////// + // write out the UTF-8 to (libc) multibyte tables + issue_diag (I_WRITE, false, 0, + "writing %lu UTF-8 tables (%lu characters)\n", + uchar_offs.size (), n_uchars); + + for (off_iter_t it = uchar_offs.begin (); it != uchar_offs.end (); ++it) { + for (unsigned i = 0; i <= UCHAR_MAX; ++i) { + + // adjust offsets to multibyte characters (but not those + // to other tables or invalid encodings) + unsigned off = it->second->off [i]; - // write the utf8_to_external tables - for (utf8_offs_it = utf8_offs_it = utf8_offs_.begin(); - utf8_offs_it != utf8_offs_.end(); utf8_offs_it++) { - for (unsigned int c = 0; c <= UCHAR_MAX; c++) { - out.write ((const char*)&utf8_offs_it->second.off[c], - sizeof (utf8_offs_it->second.off[c])); + if (!(off & 0x80000000)) + off += mb_offset; + + out.write ((const char*)&off, sizeof off); } + + delete it->second->off; } - issue_diag (I_WRITE, false, 0, "writing transliteration lookup table\n"); + // not needed beyond this point, 
clear it out + uchar_offs.clear (); + + ////////////////////////////////////////////////////////////////// + // write out the transliteration UTF-8 lookup tables + issue_diag (I_WRITE, false, 0, + "writing transliteration table (size %lu)\n", + xlit_table_map_.size ()); - // write the transliteration UTF-8 lookup tables xit = xlit_table_map_.begin (); - for (; xit != xlit_table_map_.end (); xit++) { - unsigned int* ptable = &xit->second.offset_table [0]; - for (unsigned int i = 0; i < _RWSTD_UCHAR_MAX + 1; i++, ptable++) + for (; xit != xlit_table_map_end; ++xit) { + const unsigned int* ptable = &xit->second.offset_table [0]; + for (unsigned int i = 0; i < UCHAR_MAX + 1; ++i, ++ptable) out.write ((const char*)ptable, sizeof (unsigned int)); } - issue_diag (I_WRITE, false, 0, "writing UCS to wchar_t table\n"); + issue_diag (I_WRITE, false, 0, + "writing the UCS table (%lu characters)\n", + charmap_.get_mb_cmap ().size ()); + + const mb_cmap_iter n_cmap2_end = charmap_.get_mb_cmap ().end (); // write the locale-encoded wchar_t and the UCS4 wchar_t - for (iter = charmap_.get_n_cmap2().begin(); - iter != charmap_.get_n_cmap2().end(); iter++) { + for (iter = charmap_.get_mb_cmap ().begin(); + iter != n_cmap2_end; ++iter) { out.write ((const char*)&iter->second, sizeof (iter->second)); out.write ((const char*)& (charmap_.get_ucs4_cmap().find (charmap_.get_rw_cmap().find @@ -511,19 +636,21 @@ // write out the narrow character strings - for (iter = charmap_.get_n_cmap2().begin(); - iter != charmap_.get_n_cmap2().end(); iter++) { + for (iter = charmap_.get_mb_cmap().begin(); + iter != n_cmap2_end; ++iter) { out.write (iter->first.c_str(), iter->first.size() + 1); } - issue_diag (I_WRITE, false, 0, "writing transliteration table\n"); + issue_diag (I_WRITE, false, 0, + "writing transliteration data (size %lu)\n", + xlit_map_.size ()); // write out the transliteration data - xlit_map_t::iterator xlit_data_it = xlit_map_.begin (); - for (; xlit_data_it != xlit_map_.end (); 
xlit_data_it++) { - std::list<std::string>::iterator sit = + xlit_map_t::const_iterator xlit_data_it = xlit_map_.begin (); + for (; xlit_data_it != xlit_map_.end (); ++xlit_data_it) { + std::list<std::string>::const_iterator sit = xlit_data_it->second.begin (); - for (; sit != xlit_data_it->second.end (); sit++) { + for (; sit != xlit_data_it->second.end (); ++sit) { out.write (sit->c_str (), sit->size () + 1); } out.write ("\0", 1);
