Author: Richard Plangger <[email protected]>
Branch: unicode-utf8
Changeset: r90673:5ca17ebc466d
Date: 2017-03-14 09:10 +0100
http://bitbucket.org/pypy/pypy/changeset/5ca17ebc466d/
Log: copy source code from github repo (pypy/fast-utf8-methods,
0a7e7ba813), add rpython wrapper to access c api
diff --git a/rpython/rlib/rutf8/capi.py b/rpython/rlib/rutf8/capi.py
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rutf8/capi.py
@@ -0,0 +1,56 @@
+import py
+import sys
+from rpython.tool.version import rpythonroot
+from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.translator.tool.cbuild import ExternalCompilationInfo
+from rpython.rtyper.tool import rffi_platform as platform
+
+ROOT = py.path.local(rpythonroot).join('rpython', 'rlib', 'rutf8')
+SRC = ROOT.join('src')
+
+if sys.platform.startswith('linux'):
+ _libs = ['dl']
+else:
+ _libs = []
+eci_kwds = dict(
+ include_dirs = [SRC],
+ includes = ['utf8.h'],
+ libraries = _libs,
+ separate_module_files = [SRC.join('utf8.c')],)
+global_eci = ExternalCompilationInfo(**eci_kwds)
+
+IDXTAB = lltype.ForwardReference()
+IDXTAB.become(rffi.CStruct("fu8_idxtab",
+ ('character_step', rffi.INT),
+ ('byte_positions', lltype.Ptr(rffi.SIZE_T)),
+ ('bytepos_table_length', rffi.SIZE_T)))
+IDXTABPP = lltype.Ptr(lltype.Ptr(IDXTAB))
+
+def setup():
+ compile_extra = ['-DRPYTHON_LL2CTYPES']
+ platform.verify_eci(ExternalCompilationInfo(
+ compile_extra=compile_extra,
+ **eci_kwds))
+
+ eci = global_eci
+ count_utf8_code_points = rffi.llexternal("fu8_count_utf8_codepoints",
+ [rffi.CCHARP, rffi.SIZE_T],
+ rffi.SSIZE_T, compilation_info=eci,
+ _nowrapper=True)
+ index2byteposition = rffi.llexternal("fu8_idx2bytepos",
+ [rffi.SIZE_T, rffi.CCHARP, rffi.SIZE_T,
IDXTABPP],
+ rffi.SSIZE_T, compilation_info=eci,
+ _nowrapper=True)
+
+ return CInterface(locals())
+
+
+class CInterface(object):
+ def __init__(self, namespace):
+ for k, v in namespace.iteritems():
+ setattr(self, k, v)
+
+ def _freeze_(self):
+ return True
+
+
diff --git a/rpython/rlib/rutf8/src/utf8-avx.c
b/rpython/rlib/rutf8/src/utf8-avx.c
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rutf8/src/utf8-avx.c
@@ -0,0 +1,254 @@
+#include "utf8.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <immintrin.h>
+
+#define BIT(B,P) ((B >> (P-1)) & 0x1)
+
+void _print_mmy(const char * msg, __m256i chunk)
+{
+ printf("%s:", msg);
+ // unpack the first 8 bytes, padding with zeros
+ uint64_t a = _mm256_extract_epi64(chunk, 0);
+ uint64_t b = _mm256_extract_epi64(chunk, 1);
+ uint64_t c = _mm256_extract_epi64(chunk, 2);
+ uint64_t d = _mm256_extract_epi64(chunk, 3);
+ printf("%.2x%.2x%.2x%.2x %.2x%.2x%.2x%.2x %.2x%.2x%.2x%.2x
%.2x%.2x%.2x%.2x "
+ "%.2x%.2x%.2x%.2x %.2x%.2x%.2x%.2x %.2x%.2x%.2x%.2x
%.2x%.2x%.2x%.2x",
+ (unsigned char)((a >> 0) & 0xff),
+ (unsigned char)((a >> 8) & 0xff),
+ (unsigned char)((a >> 16) & 0xff),
+ (unsigned char)((a >> 24) & 0xff),
+
+ (unsigned char)((a >> 32) & 0xff),
+ (unsigned char)((a >> 40) & 0xff),
+ (unsigned char)((a >> 48) & 0xff),
+ (unsigned char)((a >> 56) & 0xff),
+
+ (unsigned char)((b >> 0) & 0xff),
+ (unsigned char)((b >> 8) & 0xff),
+ (unsigned char)((b >> 16) & 0xff),
+ (unsigned char)((b >> 24) & 0xff),
+
+ (unsigned char)((b >> 32) & 0xff),
+ (unsigned char)((b >> 40) & 0xff),
+ (unsigned char)((b >> 48) & 0xff),
+ (unsigned char)((b >> 56) & 0xff),
+
+ (unsigned char)((c >> 0) & 0xff),
+ (unsigned char)((c >> 8) & 0xff),
+ (unsigned char)((c >> 16) & 0xff),
+ (unsigned char)((c >> 24) & 0xff),
+
+ (unsigned char)((c >> 32) & 0xff),
+ (unsigned char)((c >> 40) & 0xff),
+ (unsigned char)((c >> 48) & 0xff),
+ (unsigned char)((c >> 56) & 0xff),
+
+ (unsigned char)((d >> 0) & 0xff),
+ (unsigned char)((d >> 8) & 0xff),
+ (unsigned char)((d >> 16) & 0xff),
+ (unsigned char)((d >> 24) & 0xff),
+
+ (unsigned char)((d >> 32) & 0xff),
+ (unsigned char)((d >> 40) & 0xff),
+ (unsigned char)((d >> 48) & 0xff),
+ (unsigned char)((d >> 56) & 0xff)
+ );
+
+ printf("\n");
+}
+
+ssize_t count_utf8_codepoints_avx(const uint8_t * encoded, size_t len)
+{
+ __builtin_prefetch(encoded, 0, 0);
+ size_t num_codepoints = 0;
+ __m256i chunk;
+
+ if (len == 0) {
+ return 0;
+ }
+ __m256i zero = _mm256_set1_epi8(0x00);
+ while (len >= 32) {
+ chunk = _mm256_loadu_si256((__m256i*)encoded);
+ if (_mm256_movemask_epi8(chunk) == 0) {
+ // valid ascii chars!
+ len -= 32;
+ encoded += 32;
+ num_codepoints += 32;
+ continue;
+ }
+ __builtin_prefetch(encoded+32, 0, 0);
+
+ __m256i count = _mm256_set1_epi8(0x1);
+ //_print_mm256x("chunk", chunk);
+ // fight against the fact that there is no comparison on unsigned
values
+ __m256i chunk_signed = _mm256_add_epi8(chunk, _mm256_set1_epi8(0x80));
+ //_print_mm256x("shunk", chunk_signed);
+
+ // ERROR checking
+ // checking procedure works the following way:
+ //
+ // 1) mark all continuation bytes with either 0x1, 0x3, 0x7 (one, two
or three bytes continuation)
+ // 2) then check that there is no byte that has an invalid continuation
+ __m256i twobytemarker = _mm256_cmpgt_epi8( chunk_signed,
_mm256_set1_epi8(0xc0-1-0x80));
+ __m256i threebytemarker = _mm256_cmpgt_epi8(chunk_signed,
_mm256_set1_epi8(0xe0-1-0x80));
+ __m256i fourbytemarker = _mm256_cmpgt_epi8( chunk_signed,
_mm256_set1_epi8(0xf0-1-0x80));
+
+ // the general idea of the following code collects 0xff for each byte
position
+ // in the variable contbytes.
+ // at the end check if each position in contbytes set to 0xff is a
valid continuation byte
+
+ // check that 0xc0 > 0xc2
+ __m256i validtwobm = _mm256_cmpgt_epi8(chunk_signed,
_mm256_set1_epi8(0xc2-1-0x80));
+ if (_mm256_movemask_epi8(_mm256_xor_si256(validtwobm, twobytemarker))
!= 0) {
+ // two byte marker should not be in range [0xc0-0xc2)
+ return -1;
+ }
+
+ __m256i state2 = _mm256_andnot_si256(threebytemarker, twobytemarker);
+ __m256i contbytes = _mm256_slli_si256(_mm256_blendv_epi8(state2,
_mm256_set1_epi8(0x1), twobytemarker), 1);
+
+ if (_mm256_movemask_epi8(threebytemarker) != 0) {
+ // contains at least one 3 byte marker
+ __m256i istate3 = _mm256_andnot_si256(fourbytemarker,
threebytemarker);
+ __m256i state3 = _mm256_slli_si256(_mm256_blendv_epi8(zero,
_mm256_set1_epi8(0x3), istate3), 1);
+ state3 = _mm256_or_si256(state3, _mm256_slli_si256(state3, 1));
+
+ contbytes = _mm256_or_si256(contbytes, state3);
+
+ // range check
+ __m256i equal_e0 = _mm256_cmpeq_epi8(_mm256_blendv_epi8(zero,
chunk_signed, istate3),
+ _mm256_set1_epi8(0xe0-0x80));
+ if (_mm256_movemask_epi8(equal_e0) != 0) {
+ __m256i mask = _mm256_blendv_epi8(_mm256_set1_epi8(0x7f),
chunk_signed, _mm256_slli_si256(equal_e0, 1));
+ __m256i check_surrogate =
_mm256_cmpgt_epi8(_mm256_set1_epi8(0xa0-0x80), mask); // lt
+ if (_mm256_movemask_epi8(check_surrogate) != 0) {
+ // invalid surrograte character!!!
+ return -1;
+ }
+ }
+
+ // verify that there are now surrogates
+ if (!ALLOW_SURROGATES) {
+ __m256i equal_ed = _mm256_cmpeq_epi8(_mm256_blendv_epi8(zero,
chunk_signed, istate3),
+ _mm256_set1_epi8(0xed-0x80));
+ if (_mm256_movemask_epi8(equal_ed) != 0) {
+ __m256i mask = _mm256_blendv_epi8(_mm256_set1_epi8(0x80),
chunk_signed, _mm256_slli_si256(equal_ed, 1));
+ __m256i check_surrogate = _mm256_cmpgt_epi8(mask,
_mm256_set1_epi8(0xa0-1-0x80));
+ if (_mm256_movemask_epi8(check_surrogate) != 0) {
+ // invalid surrograte character!!!
+ return -1;
+ }
+ }
+ }
+ }
+
+ if (_mm256_movemask_epi8(fourbytemarker) != 0) {
+ // contain a 4 byte marker
+ __m256i istate4 = _mm256_slli_si256(_mm256_blendv_epi8(zero,
_mm256_set1_epi8(0x7), fourbytemarker), 1);
+ __m256i state4 =_mm256_or_si256(istate4,
_mm256_slli_si256(istate4, 1));
+ state4 =_mm256_or_si256(state4, _mm256_slli_si256(istate4, 2));
+
+ contbytes = _mm256_or_si256(contbytes, state4);
+
+ // range check, filter out f0 and
+ __m256i equal_f0 = _mm256_cmpeq_epi8(_mm256_blendv_epi8(zero,
chunk_signed, fourbytemarker),
+ _mm256_set1_epi8(0xf0-0x80));
+ if (_mm256_movemask_epi8(equal_f0) != 0) {
+ __m256i mask = _mm256_blendv_epi8(_mm256_set1_epi8(0x7f),
chunk_signed, _mm256_slli_si256(equal_f0, 1));
+ __m256i check_surrogate =
_mm256_cmpgt_epi8(_mm256_set1_epi8(0x90-0x80), mask);
+ if (_mm256_movemask_epi8(check_surrogate) != 0) {
+ return -1;
+ }
+ }
+
+ __m256i equal_f4 = _mm256_cmpeq_epi8(_mm256_blendv_epi8(zero,
chunk_signed, fourbytemarker),
+ _mm256_set1_epi8(0xf4-0x80));
+ if (_mm256_movemask_epi8(equal_f4) != 0) {
+ __m256i mask = _mm256_blendv_epi8(_mm256_set1_epi8(0x80),
chunk_signed, _mm256_slli_si256(equal_f4, 1));
+ __m256i check_surrogate = _mm256_cmpgt_epi8(mask,
_mm256_set1_epi8(0x90-1-0x80));
+ if (_mm256_movemask_epi8(check_surrogate) != 0) {
+ return -1;
+ }
+ }
+
+ __m256i equal_f5_gt = _mm256_cmpgt_epi8(_mm256_blendv_epi8(zero,
chunk_signed, fourbytemarker),
+ _mm256_set1_epi8(0xf4-0x80));
+ if (_mm256_movemask_epi8(equal_f5_gt) != 0) {
+ return -1;
+ }
+ }
+
+ // now check that contbytes and the actual byte values have a valid
+ // continuation at each position the marker indicates to have one
+ __m256i check_cont = _mm256_cmpgt_epi8(contbytes, zero);
+ __m256i contpos = _mm256_and_si256(_mm256_set1_epi8(0xc0), chunk);
+ contpos = _mm256_cmpeq_epi8(_mm256_set1_epi8(0x80), contpos);
+ __m256i validcont = _mm256_xor_si256(check_cont, contpos);
+ if (_mm256_movemask_epi8(validcont) != 0) {
+ // uff, nope, that is really not utf8
+ return -1;
+ }
+
+ // CORRECT, calculate the length
+ // copy 0x00 over to each place which is a continuation byte
+ count = _mm256_blendv_epi8(count, zero, contpos);
+
+ // count the code points using 2x 32 bit hadd and one last 16 hadd
+ // the result will end up at the lowest position
+ count = _mm256_hadd_epi32(count, zero);
+ count = _mm256_hadd_epi32(count, zero);
+ count = _mm256_hadd_epi16(count, zero);
+ uint16_t c = _mm256_extract_epi16(count, 0);
+ uint16_t c2 = _mm256_extract_epi16(count, 8);
+ uint16_t points = (c & 0xff) + ((c >> 8) & 0xff) + (c2 & 0xff) + ((c2
>> 8) & 0xff);
+
+ // these cases need to be handled:
+ // 16 byte boundary -> | <- 16 byte boundary
+ // -----------------------------------------+--------------------
+ // 1) 2 byte code point. e.g. ... c2 | 80 ...
+ // 2) 3 byte code point. e.g. ... e6 | 80 80 ...
+ // 3) 3 byte code point. e.g. ... e6 80 | 80 ...
+ // 4) 4 byte code point. e.g. ... f2 | 80 80 80 ...
+ // 5) 4 byte code point. e.g. ... f2 80 | 80 80 ...
+ // 6) 4 byte code point. e.g. ... f2 80 80 | 80 ...
+ //
+ int mask_chunk = _mm256_movemask_epi8(chunk);
+ int mask_conti = _mm256_movemask_epi8(contpos);
+
+ // little endian
+ int lenoff = 32;
+ int minus_codepoints = 0;
+ if (BIT(mask_chunk, 32) != 0 && BIT(mask_conti, 32) == 0) { // 1), 2),
4)
+ minus_codepoints = 1;
+ lenoff -= 1;
+ } else if (BIT(mask_chunk, 31) != 0 && BIT(mask_conti, 31) == 0 &&
+ BIT(mask_conti, 32) == 1) { // 3), 5)
+ minus_codepoints = 1;
+ lenoff -= 2;
+ } else if (BIT(mask_chunk, 30) != 0 && BIT(mask_conti, 30) == 0 &&
+ BIT(mask_conti, 31) == 1 && BIT(mask_conti, 32) == 1) { //
6)
+ minus_codepoints = 1;
+ lenoff -= 3;
+ }
+
+ num_codepoints += points - minus_codepoints;
+ len -= lenoff;
+ encoded += lenoff;
+ }
+
+ if (len == 0) {
+ return num_codepoints;
+ }
+
+ ssize_t result = count_utf8_codepoints_seq(encoded, len);
+ if (result == -1) {
+ return -1;
+ }
+
+ return num_codepoints + result;
+ return -1;
+}
diff --git a/rpython/rlib/rutf8/src/utf8-scalar.c
b/rpython/rlib/rutf8/src/utf8-scalar.c
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rutf8/src/utf8-scalar.c
@@ -0,0 +1,70 @@
+#include "utf8.h"
+
+int _check_continuation(const uint8_t ** encoded, const uint8_t * endptr, int
count) {
+ ssize_t size = endptr - *encoded;
+
+ if (size < count) {
+ // not enough bytes to be a valid 2 byte utf8 code point
+ return -1;
+ }
+ for (int i = 0; i < count; i++) {
+ uint8_t byte = *(*encoded)++;
+ if ((byte & 0xc0) != 0x80) {
+ // continuation byte does NOT match 0x10xxxxxx
+ return -1;
+ }
+ }
+ return 0;
+}
+
+ssize_t count_utf8_codepoints_seq(const uint8_t * encoded, size_t len) {
+ size_t num_codepoints = 0;
+ uint8_t byte = 0;
+ const uint8_t * endptr = encoded + len;
+
+ while (encoded < endptr) {
+ byte = *encoded++;
+ if (byte < 0x80) {
+ num_codepoints += 1;
+ continue;
+ } else {
+ //asm("int $3");
+ if ((byte & 0xe0) == 0xc0) {
+ // one continuation byte
+ if (byte < 0xc2) {
+ return -1;
+ }
+ if (_check_continuation(&encoded, endptr, 1) != 0) {
+ return -1;
+ }
+ } else if ((byte & 0xf0) == 0xe0) {
+ // two continuation byte
+ if (_check_continuation(&encoded, endptr, 2) != 0) {
+ return -1;
+ }
+ uint8_t byte1 = encoded[-2];
+ //surrogates shouldn't be valid UTF-8!
+ if ((byte == 0xe0 && byte1 < 0xa0) ||
+ (byte == 0xed && byte1 > 0x9f && !ALLOW_SURROGATES)) {
+ return -1;
+ }
+ } else if ((byte & 0xf8) == 0xf0) {
+ // three continuation byte
+ if (_check_continuation(&encoded, endptr, 3) != 0) {
+ return -1;
+ }
+ uint8_t byte1 = encoded[-3];
+ if ((byte == 0xf0 && byte1 < 0x90) ||
+ (byte == 0xf4 && byte1 > 0x8f) ||
+ (byte >= 0xf5)) {
+ return -1;
+ }
+ } else {
+ // TODO
+ return -1;
+ }
+ num_codepoints += 1;
+ }
+ }
+ return num_codepoints;
+}
diff --git a/rpython/rlib/rutf8/src/utf8-sse4.c
b/rpython/rlib/rutf8/src/utf8-sse4.c
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rutf8/src/utf8-sse4.c
@@ -0,0 +1,238 @@
+#include "utf8.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <xmmintrin.h>
+#include <smmintrin.h>
+
+#define BIT(B,P) ((B >> (P-1)) & 0x1)
+
+void _print_mmx(const char * msg, __m128i chunk)
+{
+ printf("%s:", msg);
+ // unpack the first 8 bytes, padding with zeros
+ uint64_t a = _mm_extract_epi64(chunk, 0);
+ uint64_t b = _mm_extract_epi64(chunk, 1);
+ printf("%.2x%.2x%.2x%.2x %.2x%.2x%.2x%.2x %.2x%.2x%.2x%.2x
%.2x%.2x%.2x%.2x",
+ (unsigned char)((a >> 0) & 0xff),
+ (unsigned char)((a >> 8) & 0xff),
+ (unsigned char)((a >> 16) & 0xff),
+ (unsigned char)((a >> 24) & 0xff),
+
+ (unsigned char)((a >> 32) & 0xff),
+ (unsigned char)((a >> 40) & 0xff),
+ (unsigned char)((a >> 48) & 0xff),
+ (unsigned char)((a >> 56) & 0xff),
+
+ (unsigned char)((b >> 0) & 0xff),
+ (unsigned char)((b >> 8) & 0xff),
+ (unsigned char)((b >> 16) & 0xff),
+ (unsigned char)((b >> 24) & 0xff),
+
+ (unsigned char)((b >> 32) & 0xff),
+ (unsigned char)((b >> 40) & 0xff),
+ (unsigned char)((b >> 48) & 0xff),
+ (unsigned char)((b >> 56) & 0xff)
+ );
+
+ printf("\n");
+}
+
+
+ssize_t count_utf8_codepoints_sse4(const uint8_t * encoded, size_t len)
+{
+ __builtin_prefetch(encoded, 0, 0);
+ size_t num_codepoints = 0;
+ __m128i chunk;
+
+ if (len == 0) {
+ return 0;
+ }
+ __m128i zero = _mm_set1_epi8(0x00);
+
+ while (len >= 16) {
+ chunk = _mm_loadu_si128((__m128i*)encoded);
+ if (_mm_movemask_epi8(chunk) == 0) {
+ // valid ascii chars!
+ len -= 16;
+ encoded += 16;
+ num_codepoints += 16;
+ continue;
+ }
+ __builtin_prefetch(encoded+16, 0, 0);
+
+ __m128i count = _mm_set1_epi8(0x1);
+ //_print_mmx("chunk", chunk);
+ // fight against the fact that there is no comparison on unsigned
values
+ __m128i chunk_signed = _mm_add_epi8(chunk, _mm_set1_epi8(0x80));
+ //_print_mmx("shunk", chunk_signed);
+
+ // ERROR checking
+ // checking procedure works the following way:
+ //
+ // 1) mark all continuation bytes with either 0x1, 0x3, 0x7 (one, two
or three bytes continuation)
+ // 2) then check that there is no byte that has an invalid continuation
+ __m128i twobytemarker = _mm_cmplt_epi8(_mm_set1_epi8(0xc0-1-0x80),
chunk_signed);
+ __m128i threebytemarker = _mm_cmplt_epi8(_mm_set1_epi8(0xe0-1-0x80),
chunk_signed);
+ __m128i fourbytemarker = _mm_cmplt_epi8(_mm_set1_epi8(0xf0-1-0x80),
chunk_signed);
+
+ // the general idea of the following code collects 0xff for each byte
position
+ // in the variable contbytes.
+ // at the end check if each position in contbytes set to 0xff is a
valid continuation byte
+
+ // check that 0xc0 > 0xc2
+ __m128i validtwobm = _mm_cmplt_epi8(_mm_set1_epi8(0xc2-1-0x80),
chunk_signed);
+ if (_mm_movemask_epi8(_mm_xor_si128(validtwobm, twobytemarker)) != 0) {
+ // two byte marker should not be in range [0xc0-0xc2)
+ return -1;
+ }
+
+ __m128i state2 = _mm_andnot_si128(threebytemarker, twobytemarker);
+ __m128i contbytes = _mm_slli_si128(_mm_blendv_epi8(state2,
_mm_set1_epi8(0x1), twobytemarker), 1);
+
+ if (_mm_movemask_epi8(threebytemarker) != 0) {
+ // contains at least one 3 byte marker
+ __m128i istate3 = _mm_andnot_si128(fourbytemarker,
threebytemarker);
+ __m128i state3 = _mm_slli_si128(_mm_blendv_epi8(zero,
_mm_set1_epi8(0x3), istate3), 1);
+ state3 = _mm_or_si128(state3, _mm_slli_si128(state3, 1));
+
+ contbytes = _mm_or_si128(contbytes, state3);
+
+ // range check
+ __m128i equal_e0 = _mm_cmpeq_epi8(_mm_blendv_epi8(zero,
chunk_signed, istate3),
+ _mm_set1_epi8(0xe0-0x80));
+ if (_mm_movemask_epi8(equal_e0) != 0) {
+ __m128i mask = _mm_blendv_epi8(_mm_set1_epi8(0x7f),
chunk_signed, _mm_slli_si128(equal_e0, 1));
+ __m128i check_surrogate = _mm_cmplt_epi8(mask,
_mm_set1_epi8(0xa0-0x80));
+ if (_mm_movemask_epi8(check_surrogate) != 0) {
+ // invalid surrograte character!!!
+ return -1;
+ }
+ }
+
+ // verify that there are now surrogates
+ if (!ALLOW_SURROGATES) {
+ __m128i equal_ed = _mm_cmpeq_epi8(_mm_blendv_epi8(zero,
chunk_signed, istate3),
+ _mm_set1_epi8(0xed-0x80));
+ if (_mm_movemask_epi8(equal_ed) != 0) {
+ __m128i mask = _mm_blendv_epi8(_mm_set1_epi8(0x80),
chunk_signed, _mm_slli_si128(equal_ed, 1));
+ __m128i check_surrogate = _mm_cmpgt_epi8(mask,
_mm_set1_epi8(0xa0-1-0x80));
+ if (_mm_movemask_epi8(check_surrogate) != 0) {
+ // invalid surrograte character!!!
+ return -1;
+ }
+ }
+ }
+ }
+
+ if (_mm_movemask_epi8(fourbytemarker) != 0) {
+ // contain a 4 byte marker
+ __m128i istate4 = _mm_slli_si128(_mm_blendv_epi8(zero,
_mm_set1_epi8(0x7), fourbytemarker), 1);
+ __m128i state4 =_mm_or_si128(istate4, _mm_slli_si128(istate4, 1));
+ state4 =_mm_or_si128(state4, _mm_slli_si128(istate4, 2));
+
+ contbytes = _mm_or_si128(contbytes, state4);
+
+ // range check, filter out f0 and
+ __m128i equal_f0 = _mm_cmpeq_epi8(_mm_blendv_epi8(zero,
chunk_signed, fourbytemarker),
+ _mm_set1_epi8(0xf0-0x80));
+ if (_mm_movemask_epi8(equal_f0) != 0) {
+ __m128i mask = _mm_blendv_epi8(_mm_set1_epi8(0x7f),
chunk_signed, _mm_slli_si128(equal_f0, 1));
+ __m128i check_surrogate = _mm_cmplt_epi8(mask,
_mm_set1_epi8(0x90-0x80));
+ if (_mm_movemask_epi8(check_surrogate) != 0) {
+ return -1;
+ }
+ }
+
+ __m128i equal_f4 = _mm_cmpeq_epi8(_mm_blendv_epi8(zero,
chunk_signed, fourbytemarker),
+ _mm_set1_epi8(0xf4-0x80));
+ if (_mm_movemask_epi8(equal_f4) != 0) {
+ __m128i mask = _mm_blendv_epi8(_mm_set1_epi8(0x80),
chunk_signed, _mm_slli_si128(equal_f4, 1));
+ __m128i check_surrogate = _mm_cmpgt_epi8(mask,
_mm_set1_epi8(0x90-1-0x80));
+ if (_mm_movemask_epi8(check_surrogate) != 0) {
+ return -1;
+ }
+ }
+
+ __m128i equal_f5_gt = _mm_cmpgt_epi8(_mm_blendv_epi8(zero,
chunk_signed, fourbytemarker),
+ _mm_set1_epi8(0xf4-0x80));
+ if (_mm_movemask_epi8(equal_f5_gt) != 0) {
+ return -1;
+ }
+ }
+
+ // now check that contbytes and the actual byte values have a valid
+ // continuation at each position the marker indicates to have one
+ __m128i check_cont = _mm_cmpgt_epi8(contbytes, zero);
+ __m128i contpos = _mm_and_si128(_mm_set1_epi8(0xc0), chunk);
+ contpos = _mm_cmpeq_epi8(_mm_set1_epi8(0x80), contpos);
+ __m128i validcont = _mm_xor_si128(check_cont, contpos);
+ if (_mm_movemask_epi8(validcont) != 0) {
+ // uff, nope, that is really not utf8
+ return -1;
+ }
+
+ // CORRECT, calculate the length
+ // copy 0x00 over to each place which is a continuation byte
+ count = _mm_blendv_epi8(count, zero, contpos);
+
+ // count the code points using 2x 32 bit hadd and one last 16 hadd
+ // the result will end up at the lowest position
+ count = _mm_hadd_epi32(count, count);
+ count = _mm_hadd_epi32(count, count);
+ count = _mm_hadd_epi16(count, count);
+ uint16_t c = _mm_extract_epi16(count, 0);
+
+ // these cases need to be handled:
+ // 16 byte boundary -> | <- 16 byte boundary
+ // -----------------------------------------+--------------------
+ // 1) 2 byte code point. e.g. ... c2 | 80 ...
+ // 2) 3 byte code point. e.g. ... e6 | 80 80 ...
+ // 3) 3 byte code point. e.g. ... e6 80 | 80 ...
+ // 4) 4 byte code point. e.g. ... f2 | 80 80 80 ...
+ // 5) 4 byte code point. e.g. ... f2 80 | 80 80 ...
+ // 6) 4 byte code point. e.g. ... f2 80 80 | 80 ...
+ //
+ int mask_chunk = _mm_movemask_epi8(chunk);
+ int mask_conti = _mm_movemask_epi8(contpos);
+
+ // little endian
+ int lenoff = 16;
+ int minus_codepoints = 0;
+ if (BIT(mask_chunk, 16) != 0 && BIT(mask_conti, 16) == 0) { // 1), 2),
4)
+ minus_codepoints = 1;
+ lenoff -= 1;
+ } else if (BIT(mask_chunk, 15) != 0 && BIT(mask_conti, 15) == 0 &&
+ BIT(mask_conti, 16) == 1) { // 3), 5)
+ minus_codepoints = 1;
+ lenoff -= 2;
+ } else if (BIT(mask_chunk, 14) != 0 && BIT(mask_conti, 14) == 0 &&
+ BIT(mask_conti, 15) == 1 && BIT(mask_conti, 16) == 1) { //
6)
+ minus_codepoints = 1;
+ lenoff -= 3;
+ }
+
+ num_codepoints += (c & 0xff) + ((c >> 8) & 0xff) - minus_codepoints;
+ len -= lenoff;
+ encoded += lenoff;
+ }
+
+ if (len == 0) {
+ return num_codepoints;
+ }
+
+ ssize_t result = count_utf8_codepoints_seq(encoded, len);
+ if (result == -1) {
+ return -1;
+ }
+
+ return num_codepoints + result;
+}
+
+ssize_t fu8_idx2bytepos_sse4(size_t index,
+ const uint8_t * utf8, size_t len,
+ struct fu8_idxtab * t)
+{
+ return 0;
+}
diff --git a/rpython/rlib/rutf8/src/utf8.c b/rpython/rlib/rutf8/src/utf8.c
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rutf8/src/utf8.c
@@ -0,0 +1,240 @@
+#include "utf8.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "utf8-scalar.c" // copy code for scalar operations
+
+
+int instruction_set = -1;
+#define ISET_SSE4 0x1
+#define ISET_AVX 0x2
+#define ISET_AVX2 0x4
+
+void detect_instructionset(void)
+{
+ long eax;
+ long ebx;
+ long ecx;
+ long edx;
+ long op = 1;
+ asm ("cpuid"
+ : "=a" (eax),
+ "=b" (ebx),
+ "=c" (ecx),
+ "=d" (edx)
+ : "a" (op));
+
+ instruction_set = 0;
+ if (ecx & (1<<19)) { // sse4.1
+ instruction_set |= ISET_SSE4;
+ }
+ if(__builtin_cpu_supports("avx")) {
+ instruction_set |= ISET_AVX;
+ }
+ if(__builtin_cpu_supports("avx2")) {
+ instruction_set |= ISET_AVX2;
+ }
+}
+
+ssize_t count_utf8_codepoints(const uint8_t * encoded, size_t len)
+{
+ if (instruction_set == -1) {
+ detect_instructionset();
+ }
+
+ if (len >= 32 && (instruction_set & ISET_AVX2) != 0) {
+ // to the MOON!
+ return count_utf8_codepoints_avx(encoded, len);
+ }
+ if (len >= 16 && (instruction_set == ISET_SSE4) != 0) {
+ // speed!!
+ return count_utf8_codepoints_sse4(encoded, len);
+ }
+
+ // oh no, just do it sequentially!
+ return count_utf8_codepoints_seq(encoded, len);
+}
+
+typedef struct fu8_idxtab {
+ int character_step;
+ size_t * byte_positions;
+ size_t bytepos_table_length;
+} fu8_idxtab_t;
+
+#include <stdlib.h>
+
+fu8_idxtab_t * _fu8_alloc_idxtab(int cp_count, int character_step)
+{
+ if (cp_count <= character_step) {
+ return NULL;
+ }
+ long s = (cp_count/character_step) * sizeof(size_t);
+ char * c = calloc(1, sizeof(fu8_idxtab_t)+s);
+ fu8_idxtab_t * i = (fu8_idxtab_t*)c;
+ i->character_step = character_step;
+ i->byte_positions = (size_t*)(c + sizeof(fu8_idxtab_t));
+ i->bytepos_table_length = cp_count/character_step;
+ return i;
+}
+
+void fu8_free_idxtab(struct fu8_idxtab * t)
+{
+ // why manage this in C?
+ // it might at some point have a different data structure,
+ // then we can handle this easily here without modifying the API
+ free(t); t = NULL;
+}
+
+void _fu8_itab_set_bucket(struct fu8_idxtab * tab, int bucket, size_t off,
size_t cpidx)
+{
+ size_t oldval = tab->byte_positions[bucket];
+ if (oldval != 0) {
+ assert(oldval != off && "table mismatch");
+ }
+ assert(bucket >= 0 && bucket < tab->bytepos_table_length && "index out of
bounds");
+ tab->byte_positions[bucket] = off;
+}
+
+ssize_t _fu8_build_idxtab(size_t cpidx, size_t cpidx_off, size_t cplen,
+ const uint8_t * utf8, size_t bytelen, size_t byteoff,
+ struct fu8_idxtab ** tab) {
+ size_t code_point_index = cpidx_off;
+ const uint8_t * utf8_start_position = utf8 + byteoff;
+ const uint8_t * utf8_end_position = utf8 + bytelen - byteoff;
+
+ struct fu8_idxtab * itab = tab[0];
+ if (itab == NULL) {
+ tab[0] = itab = _fu8_alloc_idxtab(cplen, 1000);
+ }
+
+ int bucket_step = -1;
+ int bucket = -1;
+ if (itab) {
+ bucket_step = itab->character_step;
+ bucket = cpidx_off / bucket_step;
+ //printf("bucket %d step %d iindex_off %ld\n", bucket, bucket_step,
cpidx_off);
+ }
+
+ while (utf8 < utf8_end_position) {
+ //printf("%d %llx ok\n", code_point_index, utf8);
+ if (code_point_index == cpidx) {
+ //printf("return %llx %llx %llx\n", utf8_start_position, utf8,
utf8_end_position);
+ return utf8 - utf8_start_position;
+ }
+
+ if (bucket_step != -1 && code_point_index != 0 && (code_point_index %
bucket_step) == 0) {
+ _fu8_itab_set_bucket(itab, bucket++, byteoff + utf8 -
utf8_start_position, code_point_index);
+ }
+
+ uint8_t c = *utf8++;
+ //printf("%x\n", c);
+ code_point_index += 1;
+ if ((c & 0xc0) == 0) {
+ continue;
+ }
+ if ((c & 0xe0) == 0xc0) {
+ utf8 += 1;
+ continue;
+ }
+ if ((c & 0xf0) == 0xe0) {
+ utf8 += 2;
+ continue;
+ }
+ if ((c & 0xf8) == 0xf0) {
+ utf8 += 3;
+ continue;
+ }
+ }
+
+ return -1; // out of bounds!!
+}
+
+size_t _fu8_idxtab_lookup_bytepos_i(struct fu8_idxtab * tab, size_t cpidx);
+
+ssize_t _fu8_idx2bytepos(size_t index,
+ const uint8_t * utf8, size_t bytelen, size_t cplen,
+ struct fu8_idxtab ** tab)
+{
+
+ assert(index != 0 && "index must not be 0");
+ // note that itab STILL can be NULL
+
+}
+
+size_t _fu8_idxtab_lookup_bytepos_i(struct fu8_idxtab * tab, size_t cpidx)
+{
+ if (cpidx == 0 || tab == NULL) {
+ return 0;
+ }
+ int step = tab->character_step;
+ int tidx = cpidx / step;
+ size_t val = tab->byte_positions[tidx];
+ while (tidx > 0) {
+ if (val != 0) {
+ //printf("%llx at %d %d/%d\n", val, tidx, cpidx, step);
+ return val;
+ }
+ tidx--;
+ val = tab->byte_positions[tidx];
+ }
+ // no clue, start at the beginning!
+ return 0;
+
+ //int lp, rp; // left position, right position
+ //int mp; // middle position
+ //int count;
+ //lp = 0;
+ //rp = 16;
+
+ //if (cpidx == 0) {
+ // return -1;
+ //}
+
+ //size_t valid_left = -1;
+
+ //do {
+ // count = (rp - lp);
+ // mp = lp + count / 2;
+
+ // size_t lval = tab->codepoint_positions[lp];
+ // size_t mval = tab->codepoint_positions[mp];
+ // size_t rval = tab->codepoint_positions[rp];
+ // printf("l %d m %d r %d\nlv %d mv %d rv %d\n", lp, mp, rp, lval,
mval, rval);
+ // if (lval != 0 && lval <= cpidx) {
+ // valid_left = lp;
+ // } else if (lval == 0) {
+ // // nothing is known about the left most value
+ // break;
+ // }
+
+ // if (mval == cpidx) {
+ // return mp;
+ // }
+
+ // if (mval == 0 || mval < cpidx) {
+ // // nothing is known about the middle value,
+ // // or mval is smaller the searched code point index
+ // rp = mp;
+ // continue;
+ // } else {
+ // lp = mp;
+ // continue;
+ // }
+
+ //} while (count > 1);
+
+ //return valid_left;
+}
+
+ssize_t fu8_idx2bytepos(size_t index,
+ const uint8_t * utf8, size_t bytelen,
+ size_t cplen,
+ struct fu8_idxtab ** tab)
+{
+ if (index == 0) { return 0; }
+ if (index >= cplen) { return -1; }
+ size_t off = _fu8_idxtab_lookup_bytepos_i(tab[0], index);
+ //printf("found %llx\n", off);
+ return _fu8_build_idxtab(index, 0, cplen, utf8, bytelen, 0, tab);
+}
diff --git a/rpython/rlib/rutf8/src/utf8.h b/rpython/rlib/rutf8/src/utf8.h
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rutf8/src/utf8.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <unistd.h>
+#include <stdint.h>
+#include <stddef.h>
+
+/**
+ * Returns -1 if the given string is not a valid utf8 encoded string.
+ * Otherwise returns the amount code point in the given string.
+ * len: length in bytes (8-bit)
+ *
+ * The above documentation also applies for several vectorized implementations
+ * found below.
+ *
+ * count_utf8_codepoints dispatches amongst several
+ * implementations (e.g. seq, SSE4, AVX)
+ */
+// TODO rename (fu8 prefix)
+ssize_t fu8_count_utf8_codepoints(const uint8_t * encoded, size_t len);
+ssize_t fu8_count_utf8_codepoints_seq(const uint8_t * encoded, size_t len);
+ssize_t fu8_count_utf8_codepoints_sse4(const uint8_t * encoded, size_t len);
+ssize_t fu8_count_utf8_codepoints_avx(const uint8_t * encoded, size_t len);
+
+
+struct fu8_idxtab;
+
+/**
+ * Looks up the byte position of the utf8 code point at the index.
+ * Assumptions:
+ *
+ * * utf8 parameter is utf8 encoded, otherwise the result is undefined.
+ * * passing one struct fu8_idxtab instance to several different utf8 strings
+ * yields undefined behaviour
+ *
+ * Return values:
+ *
+ * -1, if the index is out of bounds of utf8
+ * X, where X >= 0. X is the byte postion for the code point at index
+ *
+ * If table is not NULL, this routine builds up a lookup
+ * table to speed up indexing.
+ *
+ */
+ssize_t fu8_idx2bytepos(size_t index,
+ const uint8_t * utf8, size_t bytelen,
+ size_t cplen,
+ struct fu8_idxtab ** tab);
+void fu8_free_idxtab(struct fu8_idxtab * t);
+ssize_t fu8_idx2bytepso_sse4(size_t index,
+ const uint8_t * utf8, size_t len,
+ struct fu8_idxtab ** t);
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit