[pypy-commit] pypy unicode-utf8: copy source code from github repo (pypy/fast-utf8-methods, 0a7e7ba813), add rpython wrapper to access c api

plan_rich Tue, 14 Mar 2017 01:12:58 -0700

Author: Richard Plangger <[email protected]>
Branch: unicode-utf8
Changeset: r90673:5ca17ebc466d
Date: 2017-03-14 09:10 +0100
http://bitbucket.org/pypy/pypy/changeset/5ca17ebc466d/


Log:    copy source code from github repo (pypy/fast-utf8-methods,
        0a7e7ba813), add rpython wrapper to access c api

diff --git a/rpython/rlib/rutf8/capi.py b/rpython/rlib/rutf8/capi.py
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rutf8/capi.py
@@ -0,0 +1,56 @@
+import py
+import sys
+from rpython.tool.version import rpythonroot
+from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.translator.tool.cbuild import ExternalCompilationInfo
+from rpython.rtyper.tool import rffi_platform as platform
+
+ROOT = py.path.local(rpythonroot).join('rpython', 'rlib', 'rutf8')
+SRC = ROOT.join('src')
+
+if sys.platform.startswith('linux'):
+    _libs = ['dl']
+else:
+    _libs = []
+eci_kwds = dict(
+    include_dirs = [SRC],
+    includes = ['utf8.h'],
+    libraries = _libs,
+    separate_module_files = [SRC.join('utf8.c')],)
+global_eci = ExternalCompilationInfo(**eci_kwds)
+
+IDXTAB = lltype.ForwardReference()
+IDXTAB.become(rffi.CStruct("fu8_idxtab",
+                           ('character_step', rffi.INT),
+                           ('byte_positions', lltype.Ptr(rffi.SIZE_T)),
+                           ('bytepos_table_length', rffi.SIZE_T)))
+IDXTABPP = lltype.Ptr(lltype.Ptr(IDXTAB))
+
+def setup():
+    compile_extra = ['-DRPYTHON_LL2CTYPES']
+    platform.verify_eci(ExternalCompilationInfo(
+        compile_extra=compile_extra,
+        **eci_kwds))
+
+    eci = global_eci
+    count_utf8_code_points = rffi.llexternal("fu8_count_utf8_codepoints",
+                                  [rffi.CCHARP, rffi.SIZE_T],
+                                  rffi.SSIZE_T, compilation_info=eci,
+                                  _nowrapper=True)
+    index2byteposition = rffi.llexternal("fu8_idx2bytepos",
+                                  [rffi.SIZE_T, rffi.CCHARP, rffi.SIZE_T, 
IDXTABPP],
+                                  rffi.SSIZE_T, compilation_info=eci,
+                                  _nowrapper=True)
+
+    return CInterface(locals())
+
+
+class CInterface(object):
+    def __init__(self, namespace):
+        for k, v in namespace.iteritems():
+            setattr(self, k, v)
+
+    def _freeze_(self):
+        return True
+
+
diff --git a/rpython/rlib/rutf8/src/utf8-avx.c 
b/rpython/rlib/rutf8/src/utf8-avx.c
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rutf8/src/utf8-avx.c
@@ -0,0 +1,254 @@
+#include "utf8.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <immintrin.h>
+
+#define BIT(B,P) ((B >> (P-1)) & 0x1)
+
+void _print_mmy(const char * msg, __m256i chunk)
+{
+    printf("%s:", msg);
+    // split the vector into four 64-bit lanes and print every byte
+    uint64_t a = _mm256_extract_epi64(chunk, 0);
+    uint64_t b = _mm256_extract_epi64(chunk, 1);
+    uint64_t c = _mm256_extract_epi64(chunk, 2);
+    uint64_t d = _mm256_extract_epi64(chunk, 3);
+    printf("%.2x%.2x%.2x%.2x %.2x%.2x%.2x%.2x  %.2x%.2x%.2x%.2x 
%.2x%.2x%.2x%.2x    "
+           "%.2x%.2x%.2x%.2x %.2x%.2x%.2x%.2x  %.2x%.2x%.2x%.2x 
%.2x%.2x%.2x%.2x",
+            (unsigned char)((a >> 0) & 0xff),
+            (unsigned char)((a >> 8) & 0xff),
+            (unsigned char)((a >> 16) & 0xff),
+            (unsigned char)((a >> 24) & 0xff),
+
+            (unsigned char)((a >> 32) & 0xff),
+            (unsigned char)((a >> 40) & 0xff),
+            (unsigned char)((a >> 48) & 0xff),
+            (unsigned char)((a >> 56) & 0xff),
+
+            (unsigned char)((b >> 0) & 0xff),
+            (unsigned char)((b >> 8) & 0xff),
+            (unsigned char)((b >> 16) & 0xff),
+            (unsigned char)((b >> 24) & 0xff),
+
+            (unsigned char)((b >> 32) & 0xff),
+            (unsigned char)((b >> 40) & 0xff),
+            (unsigned char)((b >> 48) & 0xff),
+            (unsigned char)((b >> 56) & 0xff),
+
+            (unsigned char)((c >> 0) & 0xff),
+            (unsigned char)((c >> 8) & 0xff),
+            (unsigned char)((c >> 16) & 0xff),
+            (unsigned char)((c >> 24) & 0xff),
+
+            (unsigned char)((c >> 32) & 0xff),
+            (unsigned char)((c >> 40) & 0xff),
+            (unsigned char)((c >> 48) & 0xff),
+            (unsigned char)((c >> 56) & 0xff),
+
+            (unsigned char)((d >> 0) & 0xff),
+            (unsigned char)((d >> 8) & 0xff),
+            (unsigned char)((d >> 16) & 0xff),
+            (unsigned char)((d >> 24) & 0xff),
+
+            (unsigned char)((d >> 32) & 0xff),
+            (unsigned char)((d >> 40) & 0xff),
+            (unsigned char)((d >> 48) & 0xff),
+            (unsigned char)((d >> 56) & 0xff)
+     );
+
+    printf("\n");
+}
+
+ssize_t count_utf8_codepoints_avx(const uint8_t * encoded, size_t len)
+{
+    __builtin_prefetch(encoded, 0, 0);
+    size_t num_codepoints = 0;
+    __m256i chunk;
+
+    if (len == 0) {
+        return 0;
+    }
+    __m256i zero = _mm256_set1_epi8(0x00);
+    while (len >= 32) {
+        chunk = _mm256_loadu_si256((__m256i*)encoded);
+        if (_mm256_movemask_epi8(chunk) == 0) {
+            // valid ascii chars!
+            len -= 32;
+            encoded += 32;
+            num_codepoints += 32;
+            continue;
+        }
+        __builtin_prefetch(encoded+32, 0, 0);
+
+        __m256i count = _mm256_set1_epi8(0x1);
+        //_print_mm256x("chunk", chunk);
+        // work around the lack of unsigned byte comparisons (bias by 0x80)
+        __m256i chunk_signed = _mm256_add_epi8(chunk, _mm256_set1_epi8(0x80));
+        //_print_mm256x("shunk", chunk_signed);
+
+        // ERROR checking
+        // checking procedure works the following way:
+        //
+        // 1) mark all continuation bytes with either 0x1, 0x3, 0x7 (one, two 
or three bytes continuation)
+        // 2) then check that there is no byte that has an invalid continuation
+        __m256i twobytemarker = _mm256_cmpgt_epi8(  chunk_signed, 
_mm256_set1_epi8(0xc0-1-0x80));
+        __m256i threebytemarker = _mm256_cmpgt_epi8(chunk_signed, 
_mm256_set1_epi8(0xe0-1-0x80));
+        __m256i fourbytemarker = _mm256_cmpgt_epi8( chunk_signed, 
_mm256_set1_epi8(0xf0-1-0x80));
+
+        // the general idea of the following code collects 0xff for each byte 
position
+        // in the variable contbytes.
+        // at the end check if each position in contbytes set to 0xff is a 
valid continuation byte
+
+        // reject lead bytes 0xc0 and 0xc1 (only overlong forms start there)
+        __m256i validtwobm = _mm256_cmpgt_epi8(chunk_signed, 
_mm256_set1_epi8(0xc2-1-0x80));
+        if (_mm256_movemask_epi8(_mm256_xor_si256(validtwobm, twobytemarker)) 
!= 0) {
+            // two byte marker should not be in range [0xc0-0xc2)
+            return -1;
+        }
+
+        __m256i state2 = _mm256_andnot_si256(threebytemarker, twobytemarker);
+        __m256i contbytes = _mm256_slli_si256(_mm256_blendv_epi8(state2, 
_mm256_set1_epi8(0x1), twobytemarker), 1);
+
+        if (_mm256_movemask_epi8(threebytemarker) != 0) {
+            // contains at least one 3 byte marker
+            __m256i istate3 = _mm256_andnot_si256(fourbytemarker, 
threebytemarker);
+            __m256i state3 = _mm256_slli_si256(_mm256_blendv_epi8(zero, 
_mm256_set1_epi8(0x3), istate3), 1);
+            state3 = _mm256_or_si256(state3, _mm256_slli_si256(state3, 1));
+
+            contbytes = _mm256_or_si256(contbytes, state3);
+
+            // range check
+            __m256i equal_e0 = _mm256_cmpeq_epi8(_mm256_blendv_epi8(zero, 
chunk_signed, istate3),
+                                              _mm256_set1_epi8(0xe0-0x80));
+            if (_mm256_movemask_epi8(equal_e0) != 0) {
+                __m256i mask = _mm256_blendv_epi8(_mm256_set1_epi8(0x7f), 
chunk_signed, _mm256_slli_si256(equal_e0, 1));
+                __m256i check_surrogate = 
_mm256_cmpgt_epi8(_mm256_set1_epi8(0xa0-0x80), mask); // lt
+                if (_mm256_movemask_epi8(check_surrogate) != 0) {
+                    // overlong form: 0xe0 must be followed by >= 0xa0
+                    return -1;
+                }
+            }
+
+            // verify that there are no surrogates
+            if (!ALLOW_SURROGATES) {
+                __m256i equal_ed = _mm256_cmpeq_epi8(_mm256_blendv_epi8(zero, 
chunk_signed, istate3),
+                                                  _mm256_set1_epi8(0xed-0x80));
+                if (_mm256_movemask_epi8(equal_ed) != 0) {
+                    __m256i mask = _mm256_blendv_epi8(_mm256_set1_epi8(0x80), 
chunk_signed, _mm256_slli_si256(equal_ed, 1));
+                    __m256i check_surrogate = _mm256_cmpgt_epi8(mask, 
_mm256_set1_epi8(0xa0-1-0x80));
+                    if (_mm256_movemask_epi8(check_surrogate) != 0) {
+                        // invalid surrogate character!!!
+                        return -1;
+                    }
+                }
+            }
+        }
+
+        if (_mm256_movemask_epi8(fourbytemarker) != 0) {
+            // contain a 4 byte marker
+            __m256i istate4 = _mm256_slli_si256(_mm256_blendv_epi8(zero, 
_mm256_set1_epi8(0x7), fourbytemarker), 1);
+            __m256i state4 =_mm256_or_si256(istate4, 
_mm256_slli_si256(istate4, 1));
+            state4 =_mm256_or_si256(state4, _mm256_slli_si256(istate4, 2));
+
+            contbytes = _mm256_or_si256(contbytes, state4);
+
+            // range check: byte after 0xf0 must be >= 0x90, after 0xf4 <= 0x8f
+            __m256i equal_f0 = _mm256_cmpeq_epi8(_mm256_blendv_epi8(zero, 
chunk_signed, fourbytemarker),
+                                              _mm256_set1_epi8(0xf0-0x80));
+            if (_mm256_movemask_epi8(equal_f0) != 0) {
+                __m256i mask = _mm256_blendv_epi8(_mm256_set1_epi8(0x7f), 
chunk_signed, _mm256_slli_si256(equal_f0, 1));
+                __m256i check_surrogate = 
_mm256_cmpgt_epi8(_mm256_set1_epi8(0x90-0x80), mask);
+                if (_mm256_movemask_epi8(check_surrogate) != 0) {
+                    return -1;
+                }
+            }
+
+            __m256i equal_f4 = _mm256_cmpeq_epi8(_mm256_blendv_epi8(zero, 
chunk_signed, fourbytemarker),
+                                              _mm256_set1_epi8(0xf4-0x80));
+            if (_mm256_movemask_epi8(equal_f4) != 0) {
+                __m256i mask = _mm256_blendv_epi8(_mm256_set1_epi8(0x80), 
chunk_signed, _mm256_slli_si256(equal_f4, 1));
+                __m256i check_surrogate = _mm256_cmpgt_epi8(mask, 
_mm256_set1_epi8(0x90-1-0x80));
+                if (_mm256_movemask_epi8(check_surrogate) != 0) {
+                    return -1;
+                }
+            }
+
+            __m256i equal_f5_gt = _mm256_cmpgt_epi8(_mm256_blendv_epi8(zero, 
chunk_signed, fourbytemarker),
+                                              _mm256_set1_epi8(0xf4-0x80));
+            if (_mm256_movemask_epi8(equal_f5_gt) != 0) {
+                return -1;
+            }
+        }
+
+        // now check that contbytes and the actual byte values have a valid
+        // continuation at each position the marker indicates to have one
+        __m256i check_cont = _mm256_cmpgt_epi8(contbytes, zero);
+        __m256i contpos = _mm256_and_si256(_mm256_set1_epi8(0xc0), chunk);
+        contpos = _mm256_cmpeq_epi8(_mm256_set1_epi8(0x80), contpos);
+        __m256i validcont = _mm256_xor_si256(check_cont, contpos);
+        if (_mm256_movemask_epi8(validcont) != 0) {
+            // uff, nope, that is really not utf8
+            return -1;
+        }
+
+        // CORRECT, calculate the length
+        // copy 0x00 over to each place which is a continuation byte
+        count = _mm256_blendv_epi8(count, zero, contpos);
+
+        // count the code points using 2x 32 bit hadd and one last 16 hadd
+        // the result will end up at the lowest position
+        count = _mm256_hadd_epi32(count, zero);
+        count = _mm256_hadd_epi32(count, zero);
+        count = _mm256_hadd_epi16(count, zero);
+        uint16_t c = _mm256_extract_epi16(count, 0);
+        uint16_t c2 = _mm256_extract_epi16(count, 8);
+        uint16_t points = (c & 0xff) + ((c >> 8) & 0xff) + (c2 & 0xff) + ((c2 
>> 8) & 0xff);
+
+        // these cases need to be handled:
+        //                      16 byte boundary -> | <- 16 byte boundary
+        // -----------------------------------------+--------------------
+        // 1) 2 byte code point. e.g. ...  c2       | 80 ...
+        // 2) 3 byte code point. e.g. ...  e6       | 80 80 ...
+        // 3) 3 byte code point. e.g. ...  e6 80    | 80 ...
+        // 4) 4 byte code point. e.g. ...  f2       | 80 80 80 ...
+        // 5) 4 byte code point. e.g. ...  f2 80    | 80 80 ...
+        // 6) 4 byte code point. e.g. ...  f2 80 80 | 80 ...
+        //
+        int mask_chunk = _mm256_movemask_epi8(chunk);
+        int mask_conti = _mm256_movemask_epi8(contpos);
+
+        // little endian
+        int lenoff = 32;
+        int minus_codepoints = 0;
+        if (BIT(mask_chunk, 32) != 0 && BIT(mask_conti, 32) == 0) { // 1), 2), 
4)
+            minus_codepoints = 1;
+            lenoff -= 1;
+        } else if (BIT(mask_chunk, 31) != 0 && BIT(mask_conti, 31) == 0 &&
+                   BIT(mask_conti, 32) == 1) { // 3), 5)
+            minus_codepoints = 1;
+            lenoff -= 2;
+        } else if (BIT(mask_chunk, 30) != 0 && BIT(mask_conti, 30) == 0 &&
+                   BIT(mask_conti, 31) == 1 && BIT(mask_conti, 32) == 1) { // 
6)
+            minus_codepoints = 1;
+            lenoff -= 3;
+        }
+
+        num_codepoints += points - minus_codepoints;
+        len -= lenoff;
+        encoded += lenoff;
+    }
+
+    if (len == 0) {
+        return num_codepoints;
+    }
+
+    ssize_t result = count_utf8_codepoints_seq(encoded, len);
+    if (result == -1) {
+        return -1;
+    }
+
+    return num_codepoints + result;
+    return -1;
+}
diff --git a/rpython/rlib/rutf8/src/utf8-scalar.c 
b/rpython/rlib/rutf8/src/utf8-scalar.c
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rutf8/src/utf8-scalar.c
@@ -0,0 +1,70 @@
+#include "utf8.h"
+
+int _check_continuation(const uint8_t ** encoded, const uint8_t * endptr, int 
count) {
+    ssize_t size = endptr - *encoded;
+
+    if (size < count) {
+        // not enough bytes left for `count` continuation bytes
+        return -1;
+    }
+    for (int i = 0; i < count; i++) {
+        uint8_t byte = *(*encoded)++;
+        if ((byte & 0xc0) != 0x80) { 
+            // continuation byte does NOT match 0b10xxxxxx
+            return -1;
+        }
+    }
+    return 0;
+}
+
+ssize_t count_utf8_codepoints_seq(const uint8_t * encoded, size_t len) {
+    size_t num_codepoints = 0;
+    uint8_t byte = 0;
+    const uint8_t * endptr = encoded + len;
+
+    while (encoded < endptr) {
+        byte = *encoded++;
+        if (byte < 0x80) {
+            num_codepoints += 1;
+            continue;
+        } else {
+                //asm("int $3");
+            if ((byte & 0xe0) == 0xc0) {
+                // one continuation byte
+                if (byte < 0xc2) {
+                    return -1;
+                }
+                if (_check_continuation(&encoded, endptr, 1) != 0) {
+                    return -1;
+                }
+            } else if ((byte & 0xf0) == 0xe0) {
+                // two continuation byte
+                if (_check_continuation(&encoded, endptr, 2) != 0) {
+                    return -1;
+                }
+                uint8_t byte1 = encoded[-2];
+                //surrogates shouldn't be valid UTF-8!
+                if ((byte == 0xe0 && byte1 < 0xa0) ||
+                    (byte == 0xed && byte1 > 0x9f && !ALLOW_SURROGATES)) {
+                    return -1;
+                }
+            } else if ((byte & 0xf8) == 0xf0) {
+                // three continuation byte
+                if (_check_continuation(&encoded, endptr, 3) != 0) {
+                    return -1;
+                }
+                uint8_t byte1 = encoded[-3];
+                if ((byte == 0xf0 && byte1 < 0x90) ||
+                    (byte == 0xf4 && byte1 > 0x8f) ||
+                    (byte >= 0xf5)) {
+                    return -1;
+                }
+            } else {
+                // TODO
+                return -1;
+            }
+            num_codepoints += 1;
+        }
+    }
+    return num_codepoints;
+}
diff --git a/rpython/rlib/rutf8/src/utf8-sse4.c 
b/rpython/rlib/rutf8/src/utf8-sse4.c
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rutf8/src/utf8-sse4.c
@@ -0,0 +1,238 @@
+#include "utf8.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <xmmintrin.h>
+#include <smmintrin.h>
+
+#define BIT(B,P) ((B >> (P-1)) & 0x1)
+
+void _print_mmx(const char * msg, __m128i chunk)
+{
+    printf("%s:", msg);
+    // split the vector into two 64-bit lanes and print every byte
+    uint64_t a = _mm_extract_epi64(chunk, 0);
+    uint64_t b = _mm_extract_epi64(chunk, 1);
+    printf("%.2x%.2x%.2x%.2x %.2x%.2x%.2x%.2x  %.2x%.2x%.2x%.2x 
%.2x%.2x%.2x%.2x",
+            (unsigned char)((a >> 0) & 0xff),
+            (unsigned char)((a >> 8) & 0xff),
+            (unsigned char)((a >> 16) & 0xff),
+            (unsigned char)((a >> 24) & 0xff),
+
+            (unsigned char)((a >> 32) & 0xff),
+            (unsigned char)((a >> 40) & 0xff),
+            (unsigned char)((a >> 48) & 0xff),
+            (unsigned char)((a >> 56) & 0xff),
+
+            (unsigned char)((b >> 0) & 0xff),
+            (unsigned char)((b >> 8) & 0xff),
+            (unsigned char)((b >> 16) & 0xff),
+            (unsigned char)((b >> 24) & 0xff),
+
+            (unsigned char)((b >> 32) & 0xff),
+            (unsigned char)((b >> 40) & 0xff),
+            (unsigned char)((b >> 48) & 0xff),
+            (unsigned char)((b >> 56) & 0xff)
+     );
+
+    printf("\n");
+}
+
+
+ssize_t count_utf8_codepoints_sse4(const uint8_t * encoded, size_t len)
+{
+    __builtin_prefetch(encoded, 0, 0);
+    size_t num_codepoints = 0;
+    __m128i chunk;
+
+    if (len == 0) {
+        return 0;
+    }
+    __m128i zero = _mm_set1_epi8(0x00);
+
+    while (len >= 16) {
+        chunk = _mm_loadu_si128((__m128i*)encoded);
+        if (_mm_movemask_epi8(chunk) == 0) {
+            // valid ascii chars!
+            len -= 16;
+            encoded += 16;
+            num_codepoints += 16;
+            continue;
+        }
+        __builtin_prefetch(encoded+16, 0, 0);
+
+        __m128i count = _mm_set1_epi8(0x1);
+        //_print_mmx("chunk", chunk);
+        // work around the lack of unsigned byte comparisons (bias by 0x80)
+        __m128i chunk_signed = _mm_add_epi8(chunk, _mm_set1_epi8(0x80));
+        //_print_mmx("shunk", chunk_signed);
+
+        // ERROR checking
+        // checking procedure works the following way:
+        //
+        // 1) mark all continuation bytes with either 0x1, 0x3, 0x7 (one, two 
or three bytes continuation)
+        // 2) then check that there is no byte that has an invalid continuation
+        __m128i twobytemarker = _mm_cmplt_epi8(_mm_set1_epi8(0xc0-1-0x80), 
chunk_signed);
+        __m128i threebytemarker = _mm_cmplt_epi8(_mm_set1_epi8(0xe0-1-0x80), 
chunk_signed);
+        __m128i fourbytemarker = _mm_cmplt_epi8(_mm_set1_epi8(0xf0-1-0x80), 
chunk_signed);
+
+        // the general idea of the following code collects 0xff for each byte 
position
+        // in the variable contbytes.
+        // at the end check if each position in contbytes set to 0xff is a 
valid continuation byte
+
+        // reject lead bytes 0xc0 and 0xc1 (only overlong forms start there)
+        __m128i validtwobm = _mm_cmplt_epi8(_mm_set1_epi8(0xc2-1-0x80), 
chunk_signed);
+        if (_mm_movemask_epi8(_mm_xor_si128(validtwobm, twobytemarker)) != 0) {
+            // two byte marker should not be in range [0xc0-0xc2)
+            return -1;
+        }
+
+        __m128i state2 = _mm_andnot_si128(threebytemarker, twobytemarker);
+        __m128i contbytes = _mm_slli_si128(_mm_blendv_epi8(state2, 
_mm_set1_epi8(0x1), twobytemarker), 1);
+
+        if (_mm_movemask_epi8(threebytemarker) != 0) {
+            // contains at least one 3 byte marker
+            __m128i istate3 = _mm_andnot_si128(fourbytemarker, 
threebytemarker);
+            __m128i state3 = _mm_slli_si128(_mm_blendv_epi8(zero, 
_mm_set1_epi8(0x3), istate3), 1);
+            state3 = _mm_or_si128(state3, _mm_slli_si128(state3, 1));
+
+            contbytes = _mm_or_si128(contbytes, state3);
+
+            // range check
+            __m128i equal_e0 = _mm_cmpeq_epi8(_mm_blendv_epi8(zero, 
chunk_signed, istate3),
+                                              _mm_set1_epi8(0xe0-0x80));
+            if (_mm_movemask_epi8(equal_e0) != 0) {
+                __m128i mask = _mm_blendv_epi8(_mm_set1_epi8(0x7f), 
chunk_signed, _mm_slli_si128(equal_e0, 1));
+                __m128i check_surrogate = _mm_cmplt_epi8(mask, 
_mm_set1_epi8(0xa0-0x80));
+                if (_mm_movemask_epi8(check_surrogate) != 0) {
+                    // overlong form: 0xe0 must be followed by >= 0xa0
+                    return -1;
+                }
+            }
+
+            // verify that there are no surrogates
+            if (!ALLOW_SURROGATES) {
+                __m128i equal_ed = _mm_cmpeq_epi8(_mm_blendv_epi8(zero, 
chunk_signed, istate3),
+                                                  _mm_set1_epi8(0xed-0x80));
+                if (_mm_movemask_epi8(equal_ed) != 0) {
+                    __m128i mask = _mm_blendv_epi8(_mm_set1_epi8(0x80), 
chunk_signed, _mm_slli_si128(equal_ed, 1));
+                    __m128i check_surrogate = _mm_cmpgt_epi8(mask, 
_mm_set1_epi8(0xa0-1-0x80));
+                    if (_mm_movemask_epi8(check_surrogate) != 0) {
+                        // invalid surrogate character!!!
+                        return -1;
+                    }
+                }
+            }
+        }
+
+        if (_mm_movemask_epi8(fourbytemarker) != 0) {
+            // contain a 4 byte marker
+            __m128i istate4 = _mm_slli_si128(_mm_blendv_epi8(zero, 
_mm_set1_epi8(0x7), fourbytemarker), 1);
+            __m128i state4 =_mm_or_si128(istate4, _mm_slli_si128(istate4, 1));
+            state4 =_mm_or_si128(state4, _mm_slli_si128(istate4, 2));
+
+            contbytes = _mm_or_si128(contbytes, state4);
+
+            // range check: byte after 0xf0 must be >= 0x90, after 0xf4 <= 0x8f
+            __m128i equal_f0 = _mm_cmpeq_epi8(_mm_blendv_epi8(zero, 
chunk_signed, fourbytemarker),
+                                              _mm_set1_epi8(0xf0-0x80));
+            if (_mm_movemask_epi8(equal_f0) != 0) {
+                __m128i mask = _mm_blendv_epi8(_mm_set1_epi8(0x7f), 
chunk_signed, _mm_slli_si128(equal_f0, 1));
+                __m128i check_surrogate = _mm_cmplt_epi8(mask, 
_mm_set1_epi8(0x90-0x80));
+                if (_mm_movemask_epi8(check_surrogate) != 0) {
+                    return -1;
+                }
+            }
+
+            __m128i equal_f4 = _mm_cmpeq_epi8(_mm_blendv_epi8(zero, 
chunk_signed, fourbytemarker),
+                                              _mm_set1_epi8(0xf4-0x80));
+            if (_mm_movemask_epi8(equal_f4) != 0) {
+                __m128i mask = _mm_blendv_epi8(_mm_set1_epi8(0x80), 
chunk_signed, _mm_slli_si128(equal_f4, 1));
+                __m128i check_surrogate = _mm_cmpgt_epi8(mask, 
_mm_set1_epi8(0x90-1-0x80));
+                if (_mm_movemask_epi8(check_surrogate) != 0) {
+                    return -1;
+                }
+            }
+
+            __m128i equal_f5_gt = _mm_cmpgt_epi8(_mm_blendv_epi8(zero, 
chunk_signed, fourbytemarker),
+                                              _mm_set1_epi8(0xf4-0x80));
+            if (_mm_movemask_epi8(equal_f5_gt) != 0) {
+                return -1;
+            }
+        }
+
+        // now check that contbytes and the actual byte values have a valid
+        // continuation at each position the marker indicates to have one
+        __m128i check_cont = _mm_cmpgt_epi8(contbytes, zero);
+        __m128i contpos = _mm_and_si128(_mm_set1_epi8(0xc0), chunk);
+        contpos = _mm_cmpeq_epi8(_mm_set1_epi8(0x80), contpos);
+        __m128i validcont = _mm_xor_si128(check_cont, contpos);
+        if (_mm_movemask_epi8(validcont) != 0) {
+            // uff, nope, that is really not utf8
+            return -1;
+        }
+
+        // CORRECT, calculate the length
+        // copy 0x00 over to each place which is a continuation byte
+        count = _mm_blendv_epi8(count, zero, contpos);
+
+        // count the code points using 2x 32 bit hadd and one last 16 hadd
+        // the result will end up at the lowest position
+        count = _mm_hadd_epi32(count, count);
+        count = _mm_hadd_epi32(count, count);
+        count = _mm_hadd_epi16(count, count);
+        uint16_t c = _mm_extract_epi16(count, 0);
+
+        // these cases need to be handled:
+        //                      16 byte boundary -> | <- 16 byte boundary
+        // -----------------------------------------+--------------------
+        // 1) 2 byte code point. e.g. ...  c2       | 80 ...
+        // 2) 3 byte code point. e.g. ...  e6       | 80 80 ...
+        // 3) 3 byte code point. e.g. ...  e6 80    | 80 ...
+        // 4) 4 byte code point. e.g. ...  f2       | 80 80 80 ...
+        // 5) 4 byte code point. e.g. ...  f2 80    | 80 80 ...
+        // 6) 4 byte code point. e.g. ...  f2 80 80 | 80 ...
+        //
+        int mask_chunk = _mm_movemask_epi8(chunk);
+        int mask_conti = _mm_movemask_epi8(contpos);
+
+        // little endian
+        int lenoff = 16;
+        int minus_codepoints = 0;
+        if (BIT(mask_chunk, 16) != 0 && BIT(mask_conti, 16) == 0) { // 1), 2), 
4)
+            minus_codepoints = 1;
+            lenoff -= 1;
+        } else if (BIT(mask_chunk, 15) != 0 && BIT(mask_conti, 15) == 0 &&
+                   BIT(mask_conti, 16) == 1) { // 3), 5)
+            minus_codepoints = 1;
+            lenoff -= 2;
+        } else if (BIT(mask_chunk, 14) != 0 && BIT(mask_conti, 14) == 0 &&
+                   BIT(mask_conti, 15) == 1 && BIT(mask_conti, 16) == 1) { // 
6)
+            minus_codepoints = 1;
+            lenoff -= 3;
+        }
+
+        num_codepoints += (c & 0xff) + ((c >> 8) & 0xff) - minus_codepoints;
+        len -= lenoff;
+        encoded += lenoff;
+    }
+
+    if (len == 0) {
+        return num_codepoints;
+    }
+
+    ssize_t result = count_utf8_codepoints_seq(encoded, len);
+    if (result == -1) {
+        return -1;
+    }
+
+    return num_codepoints + result;
+}
+
+ssize_t fu8_idx2bytepos_sse4(size_t index,
+                             const uint8_t * utf8, size_t len,
+                             struct fu8_idxtab * t)
+{
+    return 0;
+}
diff --git a/rpython/rlib/rutf8/src/utf8.c b/rpython/rlib/rutf8/src/utf8.c
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rutf8/src/utf8.c
@@ -0,0 +1,240 @@
+#include "utf8.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "utf8-scalar.c" // copy code for scalar operations
+
+
+int instruction_set = -1;
+#define ISET_SSE4 0x1
+#define ISET_AVX 0x2
+#define ISET_AVX2 0x4
+
+void detect_instructionset(void)
+{
+    long eax;
+    long ebx;
+    long ecx;
+    long edx;
+    long op = 1;
+    asm ("cpuid"
+            : "=a" (eax),
+              "=b" (ebx),
+              "=c" (ecx),
+              "=d" (edx)
+            : "a" (op));
+
+    instruction_set = 0;
+    if (ecx & (1<<19)) { // sse4.1
+        instruction_set |= ISET_SSE4;
+    }
+    if(__builtin_cpu_supports("avx")) {
+        instruction_set |= ISET_AVX;
+    }
+    if(__builtin_cpu_supports("avx2")) {
+        instruction_set |= ISET_AVX2;
+    }
+}
+
+ssize_t count_utf8_codepoints(const uint8_t * encoded, size_t len)
+{   // Dispatch to the widest implementation the CPU offers; -1 = invalid UTF-8.
+    if (instruction_set == -1) {
+        detect_instructionset();  // probe the CPU lazily on first use
+    }
+
+    if (len >= 32 && (instruction_set & ISET_AVX2) != 0) {
+        // at least one full 32-byte chunk and AVX2 is available
+        return count_utf8_codepoints_avx(encoded, len);
+    }
+    if (len >= 16 && (instruction_set & ISET_SSE4) != 0) {
+        // bit test, not ==: other ISET_* flags may be set next to SSE4
+        return count_utf8_codepoints_sse4(encoded, len);
+    }
+
+    // short input or no usable vector unit: scalar fallback
+    return count_utf8_codepoints_seq(encoded, len);
+}
+
+typedef struct fu8_idxtab {
+    int character_step;
+    size_t * byte_positions;
+    size_t bytepos_table_length;
+} fu8_idxtab_t;
+
+#include <stdlib.h>
+
+fu8_idxtab_t * _fu8_alloc_idxtab(int cp_count, int character_step)
+{   // One zeroed block: the header followed by cp_count/character_step slots.
+    if (character_step <= 0 || cp_count <= character_step) {
+        return NULL;  // unusable step, or too few code points for a table
+    }
+    size_t slots = (size_t)(cp_count / character_step);
+    fu8_idxtab_t * i = calloc(1, sizeof(fu8_idxtab_t) + slots * sizeof(size_t));
+    if (i == NULL) { return NULL; }  // _fu8_build_idxtab copes with no table
+    i->character_step = character_step;
+    i->byte_positions = (size_t*)(i + 1);  // slots start right after header
+    i->bytepos_table_length = slots;
+    return i;
+}
+
+void fu8_free_idxtab(struct fu8_idxtab * t)
+{
+    // Freeing is done in C so the table layout can change later without
+    // touching the API. Header and slots come from a single calloc block,
+    // so one free() is enough; the caller's own pointer is NOT reset.
+    free(t);
+}
+
+void _fu8_itab_set_bucket(struct fu8_idxtab * tab, int bucket,
+                          size_t off, size_t cpidx)
+{   // store byte offset `off` in slot `bucket` (cpidx is currently unused)
+    assert(bucket >= 0 && "index out of bounds");  // check before any access
+    assert((size_t)bucket < tab->bytepos_table_length && "index out of bounds");
+    size_t oldval = tab->byte_positions[bucket];  // 0 marks an empty slot
+    assert((oldval == 0 || oldval == off) && "table mismatch");
+    tab->byte_positions[bucket] = off;
+}
+
+ssize_t _fu8_build_idxtab(size_t cpidx, size_t cpidx_off, size_t cplen,
+                          const uint8_t * utf8, size_t bytelen, size_t byteoff,
+                          struct fu8_idxtab ** tab) {
+    size_t code_point_index = cpidx_off;
+    const uint8_t * utf8_start_position = utf8 + byteoff;
+    const uint8_t * utf8_end_position = utf8 + bytelen - byteoff;
+
+    struct fu8_idxtab * itab = tab[0];
+    if (itab == NULL) {
+        tab[0] = itab = _fu8_alloc_idxtab(cplen, 1000);
+    }
+
+    int bucket_step = -1;
+    int bucket = -1;
+    if (itab) {
+        bucket_step = itab->character_step;
+        bucket = cpidx_off / bucket_step;
+        //printf("bucket %d step %d iindex_off %ld\n", bucket, bucket_step, 
cpidx_off);
+    }
+
+    while (utf8 < utf8_end_position) {
+        //printf("%d %llx ok\n", code_point_index, utf8);
+        if (code_point_index == cpidx) {
+            //printf("return %llx %llx %llx\n", utf8_start_position, utf8, 
utf8_end_position);
+            return utf8 - utf8_start_position;
+        }
+
+        if (bucket_step != -1 && code_point_index != 0 && (code_point_index % 
bucket_step) == 0) {
+            _fu8_itab_set_bucket(itab, bucket++, byteoff + utf8 - 
utf8_start_position, code_point_index);
+        }
+
+        uint8_t c = *utf8++;
+        //printf("%x\n", c);
+        code_point_index += 1;
+        if ((c & 0xc0) == 0) {
+            continue;
+        }
+        if ((c & 0xe0) == 0xc0) {
+            utf8 += 1;
+            continue;
+        }
+        if ((c & 0xf0) == 0xe0) {
+            utf8 += 2;
+            continue;
+        }
+        if ((c & 0xf8) == 0xf0) {
+            utf8 += 3;
+            continue;
+        }
+    }
+
+    return -1; // out of bounds!!
+}
+
+size_t _fu8_idxtab_lookup_bytepos_i(struct fu8_idxtab * tab, size_t cpidx);
+
+ssize_t _fu8_idx2bytepos(size_t index,
+                        const uint8_t * utf8, size_t bytelen, size_t cplen,
+                        struct fu8_idxtab ** tab)
+{
+
+    assert(index != 0 && "index must not be 0");
+    // note that itab STILL can be NULL
+
+}
+
+size_t _fu8_idxtab_lookup_bytepos_i(struct fu8_idxtab * tab, size_t cpidx)
+{
+    if (cpidx == 0 || tab == NULL) {
+        return 0;
+    }
+    int step = tab->character_step;
+    int tidx = cpidx / step;
+    size_t val = tab->byte_positions[tidx];
+    while (tidx > 0) {
+        if (val != 0) {
+            //printf("%llx at %d %d/%d\n", val, tidx, cpidx, step);
+            return val;
+        }
+        tidx--;
+        val = tab->byte_positions[tidx];
+    }
+    // no clue, start at the beginning!
+    return 0;
+
+    //int lp, rp; // left position, right position
+    //int mp; // middle position
+    //int count;
+    //lp = 0;
+    //rp = 16;
+
+    //if (cpidx == 0) {
+    //    return -1;
+    //}
+
+    //size_t valid_left = -1;
+
+    //do {
+    //    count = (rp - lp);
+    //    mp = lp + count / 2;
+
+    //    size_t lval = tab->codepoint_positions[lp];
+    //    size_t mval = tab->codepoint_positions[mp];
+    //    size_t rval = tab->codepoint_positions[rp];
+    //    printf("l %d m %d r %d\nlv %d mv %d rv %d\n", lp, mp, rp, lval, 
mval, rval);
+    //    if (lval != 0 && lval <= cpidx) {
+    //        valid_left = lp;
+    //    } else if (lval == 0) {
+    //        // nothing is known about the left most value
+    //        break;
+    //    }
+
+    //    if (mval == cpidx) {
+    //        return mp;
+    //    }
+
+    //    if (mval == 0 || mval < cpidx) {
+    //        // nothing is known about the middle value,
+    //        // or mval is smaller the searched code point index
+    //        rp = mp;
+    //        continue;
+    //    } else {
+    //        lp = mp;
+    //        continue;
+    //    }
+
+    //} while (count > 1);
+
+    //return valid_left;
+}
+
+ssize_t fu8_idx2bytepos(size_t index,
+                        const uint8_t * utf8, size_t bytelen,
+                        size_t cplen,
+                        struct fu8_idxtab ** tab)
+{
+    if (index == 0) { return 0; }
+    if (index >= cplen) { return -1; }
+    size_t off = _fu8_idxtab_lookup_bytepos_i(tab[0], index);
+    //printf("found %llx\n", off);
+    return _fu8_build_idxtab(index, 0, cplen, utf8, bytelen, 0, tab);
+}
diff --git a/rpython/rlib/rutf8/src/utf8.h b/rpython/rlib/rutf8/src/utf8.h
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rutf8/src/utf8.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <unistd.h>
+#include <stdint.h>
+#include <stddef.h>
+
+/**
+ * Returns -1 if the given string is not a valid utf8 encoded string.
+ * Otherwise returns the number of code points in the given string.
+ * len: length in bytes (8-bit)
+ *
+ * The above documentation also applies to the several vectorized
+ * implementations found below.
+ *
+ * fu8_count_utf8_codepoints dispatches amongst several
+ * implementations (e.g. seq, SSE4, AVX)
+ */
+// TODO rename (fu8 prefix)
+ssize_t fu8_count_utf8_codepoints(const uint8_t * encoded, size_t len);
+ssize_t fu8_count_utf8_codepoints_seq(const uint8_t * encoded, size_t len);
+ssize_t fu8_count_utf8_codepoints_sse4(const uint8_t * encoded, size_t len);
+ssize_t fu8_count_utf8_codepoints_avx(const uint8_t * encoded, size_t len);
+
+
+struct fu8_idxtab;
+
+/**
+ * Looks up the byte position of the utf8 code point at the index.
+ *
+ * Parameters of fu8_idx2bytepos:
+ *
+ *  * index: index of the code point to look up
+ *  * utf8: the encoded string
+ *  * bytelen: presumably the length of utf8 in bytes -- TODO confirm
+ *  * cplen: presumably the number of code points in utf8 -- TODO confirm
+ *  * tab: pointer to the lookup table pointer
+ *
+ * Assumptions:
+ *
+ *  * utf8 parameter is utf8 encoded, otherwise the result is undefined.
+ *  * passing one struct fu8_idxtab instance to several different utf8 strings
+ *    yields undefined behaviour
+ *
+ * Return values:
+ *
+ * -1, if the index is out of bounds of utf8
+ *  X, where X >= 0. X is the byte position for the code point at index
+ *
+ * If table is not NULL, this routine builds up a lookup
+ * table to speed up indexing.
+ *
+ * fu8_free_idxtab presumably releases a table built up by these lookup
+ * routines -- TODO confirm against its definition.
+ *
+ * NOTE(review): fu8_idx2bytepso_sse4 is spelled "bytepso", which looks like
+ * a typo of "bytepos"; check the definition and all callers before renaming.
+ * Unlike fu8_idx2bytepos it takes no cplen argument.
+ *
+ */
+ssize_t fu8_idx2bytepos(size_t index,
+                        const uint8_t * utf8, size_t bytelen,
+                        size_t cplen,
+                        struct fu8_idxtab ** tab);
+void fu8_free_idxtab(struct fu8_idxtab * t);
+ssize_t fu8_idx2bytepso_sse4(size_t index,
+                             const uint8_t * utf8, size_t len,
+                             struct fu8_idxtab ** t);
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: copy source code from github repo (pypy/fast-utf8-methods, 0a7e7ba813), add rpython wrapper to access c api

Reply via email to