This is an automated email from the ASF dual-hosted git repository. andy pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/jena.git
commit 2c113c655bd5293fb8f95164d357073ee6fb777d Author: Andy Seaborne <[email protected]> AuthorDate: Wed Oct 30 13:55:40 2024 +0000 Support decoding 5 and 6 byte UTF-8 --- .../java/org/apache/jena/atlas/io/BlockUTF8.java | 29 +++- .../org/apache/jena/atlas/io/TestBlockUTF8.java | 183 ++++++++++----------- 2 files changed, 112 insertions(+), 100 deletions(-) diff --git a/jena-base/src/main/java/org/apache/jena/atlas/io/BlockUTF8.java b/jena-base/src/main/java/org/apache/jena/atlas/io/BlockUTF8.java index 356297be59..d13b4cf468 100644 --- a/jena-base/src/main/java/org/apache/jena/atlas/io/BlockUTF8.java +++ b/jena-base/src/main/java/org/apache/jena/atlas/io/BlockUTF8.java @@ -28,7 +28,7 @@ import java.nio.CharBuffer; * This code is just the UTF-8 encoding rules - it does not check for legality * of the Unicode data. The standard codecs do, so do not round-trip with binary * compatibility. (Example: a single element of a surrogate pair will - * be encoded/decoded without lose.) + * be encoded/decoded without loss.) * * The usual Charset encoders/decoders can be expensive to start up - they are also * not thread safe. Sometimes we want to convert 10's of chars and UTF-8 can be @@ -112,8 +112,7 @@ public class BlockUTF8 continue; } if ( (x & 0xE0) == 0xC0 ) { - // 10 => extension byte - // 110..... => 2 bytes + // 110zzzzz => 2 bytes // Unroll common path //int ch = readMultiBytes(bb, x & 0x1F, 2); int x2 = bb.get(); @@ -126,22 +125,36 @@ public class BlockUTF8 continue; } if ( (x & 0xF0) == 0xE0 ) { - // 1110.... => 3 bytes : 16 bits : not outside 16bit chars + // 1110zzzz => 3 bytes : 16 bits : not outside 16bit chars int ch = readMultiBytes(bb, x & 0x0F, 3); cb.put((char)ch); idx += 3; continue; } if ( (x & 0xF8) == 0xF0 ) { - // Looking like 4 byte character. // 11110zzz => 4 bytes. - int ch = readMultiBytes(bb, x & 0x08, 4); - + int ch = readMultiBytes(bb, x & 0x07, 4); char chars[] = Character.toChars(ch); cb.put(chars); idx += 4; continue; } + if ( (x & 0xFC) == 0xF8 ) { + // 111110zz => 5 bytes. + int ch = readMultiBytes(bb, x & 0x03, 5); + char chars[] = Character.toChars(ch); + cb.put(chars); + idx += 5; + continue; + } + if ( (x & 0xFE) == 0xFC ) { + // 1111110z => 6 bytes. + int ch = readMultiBytes(bb, x & 0x01, 6); + char chars[] = Character.toChars(ch); + cb.put(chars); + idx += 6; + continue; + } exception("Illegal UTF-8: 0x%04X",x); } } @@ -212,7 +225,7 @@ public class BlockUTF8 } if ( ch <= 0x7FFFFFFF ) { // 32 bits : 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - int x1 = (((ch >> (32 - 1)) & 0x1) | 0xFC); + int x1 = (((ch >> (31 - 1)) & 0x1) | 0xFC); outputBytes(bb, x1, 6, ch); continue; } diff --git a/jena-base/src/test/java/org/apache/jena/atlas/io/TestBlockUTF8.java b/jena-base/src/test/java/org/apache/jena/atlas/io/TestBlockUTF8.java index fdf9ec13cd..6f709a63e6 100644 --- a/jena-base/src/test/java/org/apache/jena/atlas/io/TestBlockUTF8.java +++ b/jena-base/src/test/java/org/apache/jena/atlas/io/TestBlockUTF8.java @@ -101,14 +101,12 @@ public class TestBlockUTF8 @Test public void binary_10() { testBinary(binaryBytes2, CharBuffer.wrap(binaryStr3)); } @Test public void binary_11() { testBinary(binaryBytes3, CharBuffer.wrap(binaryStr3)); } - - static void testIn(String x) - { + static void testIn(String x) { testIn(x, allocByteBufferArray, allocCharBufferArray); testIn(x, allocByteBufferDirect, allocCharBufferDirect); } - static void testIn(String x, Alloc<ByteBuffer> allocBB, Alloc<CharBuffer> allocCB) - { + + static void testIn(String x, Alloc<ByteBuffer> allocBB, Alloc<CharBuffer> allocCB) { // Test as binary. testInOutBinary(x); @@ -118,7 +116,7 @@ public class TestBlockUTF8 // To bytes.stringAsBytes int N = x.length(); CharBuffer cb = CharBuffer.wrap(x.toCharArray()); - ByteBuffer bb = allocBB.allocate(4*N); + ByteBuffer bb = allocBB.allocate(4 * N); BlockUTF8.fromChars(cb, bb); bb.flip(); @@ -131,12 +129,12 @@ public class TestBlockUTF8 assertEquals(x, str); } - // Testing, but not against what Java would do (it replaces bad chars, we want binary). - static void testInOutBinary(String x) - { + // Testing, but not against what Java would do (it replaces bad chars, we want + // binary). + static void testInOutBinary(String x) { int N = x.length(); CharBuffer cb = CharBuffer.wrap(x.toCharArray()); - ByteBuffer bb = ByteBuffer.allocate(4*N); + ByteBuffer bb = ByteBuffer.allocate(4 * N); BlockUTF8.fromChars(cb, bb); bb.flip(); CharBuffer cb2 = CharBuffer.allocate(N); @@ -147,14 +145,13 @@ public class TestBlockUTF8 // And re-code as bytes. CharBuffer cb3 = CharBuffer.wrap(x.toCharArray()); - ByteBuffer bb3 = ByteBuffer.allocate(4*N); + ByteBuffer bb3 = ByteBuffer.allocate(4 * N); BlockUTF8.fromChars(cb3, bb3); bb3.flip(); assertArrayEquals(bb.array(), bb3.array()); } - static void testOut(String x) - { + static void testOut(String x) { testOut(x, allocByteBufferArray, allocCharBufferArray); testOut(x, allocByteBufferDirect, allocCharBufferDirect); } @@ -173,82 +170,84 @@ public class TestBlockUTF8 @Override public CharBuffer allocate(int len) { return ByteBuffer.allocateDirect(2*len).asCharBuffer(); } }; - static void testOut(String x, Alloc<ByteBuffer> allocBB, Alloc<CharBuffer> allocCB) - { - testBinary(stringAsBytes(x)); - - int N = x.length(); - // First - get bytes the Java way. - ByteBuffer bytes = ByteBuffer.wrap(stringAsBytes(x)); - CharBuffer cb = allocCB.allocate(N); - - BlockUTF8.toChars(bytes, cb); - cb.flip(); - bytes.flip(); - - ByteBuffer bytes2 = allocBB.allocate(bytes.capacity()); - BlockUTF8.fromChars(cb, bytes2); - bytes2.flip(); - assertTrue("Chars", sameBytes(bytes, bytes2)); - } - - static void testBinary(byte[] binary, CharBuffer chars) - { - int N = binary.length; - ByteBuffer bytes = ByteBuffer.wrap(binary); - CharBuffer cb = CharBuffer.allocate(N); - BlockUTF8.toChars(bytes, cb); - cb.flip(); - assertTrue("Binary", sameChars(chars, cb)); - } - - static void testBinary(byte[] binary) - { - testBinary(binary, binary); - } - - static void testBinary(byte[] binary, byte[] expected) - { - int N = binary.length; - ByteBuffer bytes = ByteBuffer.wrap(binary); - CharBuffer cb = CharBuffer.allocate(N); - BlockUTF8.toChars(bytes, cb); - cb.flip(); - bytes.position(0); - ByteBuffer bytes2 = ByteBuffer.allocate(2*N); // Null bytes get expanded. - BlockUTF8.fromChars(cb, bytes2); - bytes2.flip(); - sameBytes(bytes, bytes2); - assertTrue("Binary", sameBytes(ByteBuffer.wrap(expected), bytes2)); - } - - // Does not move position. - static boolean sameBytes(ByteBuffer bb1, ByteBuffer bb2) - { - if ( bb1.remaining() != bb2.remaining() ) return false; - - for ( int i = 0; i < bb1.remaining(); i++ ) - if ( bb1.get(i+bb1.position()) != bb2.get(i+bb2.position()) ) return false; - return true; - } - // Does not move position. - static boolean sameChars(CharBuffer cb1, CharBuffer cb2) - { - if ( cb1.remaining() != cb2.remaining() ) return false; - - for ( int i = 0; i < cb1.remaining(); i++ ) - if ( cb1.get(i+cb1.position()) != cb2.get(i+cb2.position()) ) return false; - return true; - } - static byte[] stringAsBytes(String x) - { - try { - ByteArrayOutputStream bout = new ByteArrayOutputStream(); - try(Writer out = new OutputStreamWriter(bout, utf8)) { - out.write(x); - } - byte[] bytes = bout.toByteArray(); - return bytes; - } catch (IOException ex) { throw new RuntimeException(ex); } - } -} + static void testOut(String x, Alloc<ByteBuffer> allocBB, Alloc<CharBuffer> allocCB) { + testBinary(stringAsBytes(x)); + + int N = x.length(); + // First - get bytes the Java way. + ByteBuffer bytes = ByteBuffer.wrap(stringAsBytes(x)); + CharBuffer cb = allocCB.allocate(N); + + BlockUTF8.toChars(bytes, cb); + cb.flip(); + bytes.flip(); + + ByteBuffer bytes2 = allocBB.allocate(bytes.capacity()); + BlockUTF8.fromChars(cb, bytes2); + bytes2.flip(); + assertTrue("Chars", sameBytes(bytes, bytes2)); + } + + static void testBinary(byte[] binary, CharBuffer chars) { + int N = binary.length; + ByteBuffer bytes = ByteBuffer.wrap(binary); + CharBuffer cb = CharBuffer.allocate(N); + BlockUTF8.toChars(bytes, cb); + cb.flip(); + assertTrue("Binary", sameChars(chars, cb)); + } + + static void testBinary(byte[] binary) { + testBinary(binary, binary); + } + + static void testBinary(byte[] binary, byte[] expected) { + int N = binary.length; + ByteBuffer bytes = ByteBuffer.wrap(binary); + CharBuffer cb = CharBuffer.allocate(N); + BlockUTF8.toChars(bytes, cb); + cb.flip(); + bytes.position(0); + ByteBuffer bytes2 = ByteBuffer.allocate(2 * N); // Null bytes get + // expanded. + BlockUTF8.fromChars(cb, bytes2); + bytes2.flip(); + sameBytes(bytes, bytes2); + assertTrue("Binary", sameBytes(ByteBuffer.wrap(expected), bytes2)); + } + + // Does not move position. + static boolean sameBytes(ByteBuffer bb1, ByteBuffer bb2) { + if ( bb1.remaining() != bb2.remaining() ) + return false; + + for ( int i = 0 ; i < bb1.remaining() ; i++ ) + if ( bb1.get(i + bb1.position()) != bb2.get(i + bb2.position()) ) + return false; + return true; + } + + // Does not move position. + static boolean sameChars(CharBuffer cb1, CharBuffer cb2) { + if ( cb1.remaining() != cb2.remaining() ) + return false; + + for ( int i = 0 ; i < cb1.remaining() ; i++ ) + if ( cb1.get(i + cb1.position()) != cb2.get(i + cb2.position()) ) + return false; + return true; + } + + static byte[] stringAsBytes(String x) { + try { + ByteArrayOutputStream bout = new ByteArrayOutputStream(); + try (Writer out = new OutputStreamWriter(bout, utf8)) { + out.write(x); + } + byte[] bytes = bout.toByteArray(); + return bytes; + } catch (IOException ex) { + throw new RuntimeException(ex); + } + } + }
