Paul Boddie: > Do we have a volunteer? ;-)
I won't volunteer to do a real implementation - the Unicode type in Python is currently around 7000 lines long and there is other code to change in, for example, regular expressions. Here's a demonstration C++ implementation that stores an array of surrogate positions for indexing. For text in which every character is a surrogate, this could lead to requiring 3 times as much storage (with a size_t requiring 64 bits for each 32 bit surrogate pair). Indexing is reasonably efficient, using a binary search through the surrogate positions so is proportional to log(number of surrogates). Code (not thoroughly tested): /** @file surstr.cxx ** A simple Unicode string class that stores in UTF-16 ** but indexes by character. **/ // Copyright 2007 by Neil Hodgson <[EMAIL PROTECTED]> // This source code is public domain. #include <string.h> #include <stdio.h> class surstr { public: typedef wchar_t codePoint; enum { SURROGATE_LEAD_FIRST = 0xD800 }; enum { SURROGATE_LEAD_LAST = 0xDBFF }; enum { measure_length=0xffffffffU}; codePoint *text; size_t length; size_t *surrogates; // Memory use would be decreased by allocating text and // surrogates as one block but the code is clearer this way size_t lengthSurrogates; static bool IsLead(codePoint cp) { return cp >= SURROGATE_LEAD_FIRST && cp <= SURROGATE_LEAD_LAST; } void FindSurrogates() { lengthSurrogates = 0; for (size_t i=0; i < length; i++) { if (IsLead(text[i])) { lengthSurrogates++; } } surrogates = new size_t[lengthSurrogates]; size_t surr = 0; for (size_t i=0; i < length; i++) { if (IsLead(text[i])) { surrogates[surr] = i - surr; surr++; } } } size_t LinearIndexFromPosition(size_t position) const { // For checking that the binary search version works for (size_t i=0; i<lengthSurrogates; i++) { if (surrogates[i] >= position) { return position + i; } } return position + lengthSurrogates; } size_t IndexFromPosition(size_t position) const { // Use a binary search to find index in log(lengthSurrogates) if (lengthSurrogates == 0) return position; if (position > surrogates[lengthSurrogates - 1]) return position + lengthSurrogates; size_t lower = 0; size_t upper = lengthSurrogates-1; do { size_t middle = (upper + lower + 1) / 2; // Round high size_t posMiddle = surrogates[middle]; if (position < posMiddle) { upper = middle - 1; } else { lower = middle; } } while (lower < upper); if (surrogates[lower] >= position) return position + lower; else return position + lower + 1; } size_t Length() const { return length - lengthSurrogates; } surstr() : text(0), length(0), surrogates(0), lengthSurrogates(0) {} // start and end are in code points surstr(codePoint *text_, size_t start=0, size_t end=measure_length) : text(0), length(0), surrogates(0), lengthSurrogates(0) { // Assert text_[start:end] only contains whole surrogate pairs if (end == measure_length) { end = 0; while (text_[end]) end++; } length = end - start; text = new codePoint[length]; memcpy(text, text_, sizeof(codePoint) * length); FindSurrogates(); } // start and end are in characters surstr(const surstr &source, size_t start=0, size_t end=measure_length) { size_t startIndex = source.IndexFromPosition(start); size_t endIndex; if (end == measure_length) endIndex = source.IndexFromPosition(source.Length()); else endIndex = source.IndexFromPosition(end); length = endIndex - startIndex; text = new codePoint[length]; memcpy(text, source.text + startIndex, sizeof(codePoint) * length); if (start == 0 && end == measure_length) { surrogates = new size_t[source.lengthSurrogates]; memcpy(surrogates, source.surrogates, sizeof(size_t) * source.lengthSurrogates); lengthSurrogates = source.lengthSurrogates; } else { FindSurrogates(); } } ~surstr() { delete []text; text = 0; delete []surrogates; surrogates = 0; } void print() { for (size_t i=0;i<length;i++) { if (text[i] < 0x7f) { printf("%c", text[i]); } else { printf("\\u%04x", text[i]); } } } }; void slicer(surstr &a) { printf("Length in characters = %d, code units = %d ==> ", a.Length(), a.length); a.print(); printf("\n"); for (size_t pos = 0; pos < a.Length(); pos++) { if (a.IndexFromPosition(pos) != a.LinearIndexFromPosition(pos)) { printf(" Failed at position %d -> %d", pos, a.IndexFromPosition(pos)); } printf(" [%0d] ", pos); surstr b(a, pos, pos+1); b.print(); printf("\n"); } } int main() { surstr n(L""); slicer(n); surstr a(L"a"); slicer(a); surstr b(L"a\u6C348\U0001D11E-\U00010338!"); slicer(b); printf("\n"); surstr c(L"a\u6C348\U0001D11E\U0001D11E-\U00010338!" L"\U0001D11E\U0001D11Ea\u6C348\U0001D11E-\U00010338!"); slicer(c); printf("\n"); } Test run: Length in characters = 0, code units = 0 ==> Length in characters = 1, code units = 1 ==> a [0] a Length in characters = 7, code units = 9 ==> a\u6c348\ud834\udd1e-\ud800\udf38! [0] a [1] \u6c34 [2] 8 [3] \ud834\udd1e [4] - [5] \ud800\udf38 [6] ! Length in characters = 17, code units = 24 ==> a\u6c348\ud834\udd1e\ud834\udd1e-\ud800\udf38!\ud834\udd1e\ud834\udd1ea\u6c348\ud834\udd1e-\ud800\udf38! [0] a [1] \u6c34 [2] 8 [3] \ud834\udd1e [4] \ud834\udd1e [5] - [6] \ud800\udf38 [7] ! [8] \ud834\udd1e [9] \ud834\udd1e [10] a [11] \u6c34 [12] 8 [13] \ud834\udd1e [14] - [15] \ud800\udf38 [16] ! Neil -- http://mail.python.org/mailman/listinfo/python-list