This is an automated email from the ASF dual-hosted git repository.

andy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/jena.git

commit 2c113c655bd5293fb8f95164d357073ee6fb777d
Author: Andy Seaborne <[email protected]>
AuthorDate: Wed Oct 30 13:55:40 2024 +0000

    Support decoding 5 and 6 byte UTF-8
---
 .../java/org/apache/jena/atlas/io/BlockUTF8.java   |  29 +++-
 .../org/apache/jena/atlas/io/TestBlockUTF8.java    | 183 ++++++++++-----------
 2 files changed, 112 insertions(+), 100 deletions(-)

diff --git a/jena-base/src/main/java/org/apache/jena/atlas/io/BlockUTF8.java b/jena-base/src/main/java/org/apache/jena/atlas/io/BlockUTF8.java
index 356297be59..d13b4cf468 100644
--- a/jena-base/src/main/java/org/apache/jena/atlas/io/BlockUTF8.java
+++ b/jena-base/src/main/java/org/apache/jena/atlas/io/BlockUTF8.java
@@ -28,7 +28,7 @@ import java.nio.CharBuffer;
  * This code is just the UTF-8 encoding rules - it does not check for legality
  * of the Unicode data.  The standard codecs do, so do not round-trip with binary
  * compatibility. (Example: a single element of a surrogate pair will
- * be encoded/decoded without lose.)
+ * be encoded/decoded without loss.)
  *
  * The usual Charset encoders/decoders can be expensive to start up - they are also
  * not thread safe. Sometimes we want to convert 10's of chars and UTF-8 can be
@@ -112,8 +112,7 @@ public class BlockUTF8
                 continue;
             }
             if ( (x & 0xE0) == 0xC0 ) {
-                // 10 => extension byte
-                // 110..... => 2 bytes
+                // 110zzzzz => 2 bytes
                 // Unroll common path
                 //int ch = readMultiBytes(bb, x & 0x1F, 2);
                 int x2 = bb.get();
@@ -126,22 +125,36 @@ public class BlockUTF8
                 continue;
             }
             if ( (x & 0xF0) == 0xE0 ) {
-                //  1110.... => 3 bytes : 16 bits : not outside 16bit chars
+                //  1110zzzz => 3 bytes : 16 bits : not outside 16bit chars
                 int ch = readMultiBytes(bb, x & 0x0F, 3);
                 cb.put((char)ch);
                 idx += 3;
                 continue;
             }
             if ( (x & 0xF8) == 0xF0 ) {
-                // Looking like 4 byte character.
                 // 11110zzz => 4 bytes.
-                int ch = readMultiBytes(bb, x & 0x08, 4);
-
+                int ch = readMultiBytes(bb, x & 0x07, 4);
                 char chars[] = Character.toChars(ch);
                 cb.put(chars);
                 idx += 4;
                 continue;
             }
+            if ( (x & 0xFC) == 0xF8 ) {
+                // 111110zz => 5 bytes.
+                int ch = readMultiBytes(bb, x & 0x03, 5);
+                char chars[] = Character.toChars(ch);
+                cb.put(chars);
+                idx += 5;
+                continue;
+            }
+            if ( (x & 0xFE) == 0xFC ) {
+                // 1111110z => 6 bytes.
+                int ch = readMultiBytes(bb, x & 0x01, 6);
+                char chars[] = Character.toChars(ch);
+                cb.put(chars);
+                idx += 6;
+                continue;
+            }
             exception("Illegal UTF-8: 0x%04X",x);
         }
     }
@@ -212,7 +225,7 @@ public class BlockUTF8
             }
             if ( ch <= 0x7FFFFFFF ) {
                 // 32 bits : 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-                int x1 = (((ch >> (32 - 1)) & 0x1) | 0xFC);
+                int x1 = (((ch >> (31 - 1)) & 0x1) | 0xFC);
                 outputBytes(bb, x1, 6, ch);
                 continue;
             }
diff --git a/jena-base/src/test/java/org/apache/jena/atlas/io/TestBlockUTF8.java b/jena-base/src/test/java/org/apache/jena/atlas/io/TestBlockUTF8.java
index fdf9ec13cd..6f709a63e6 100644
--- a/jena-base/src/test/java/org/apache/jena/atlas/io/TestBlockUTF8.java
+++ b/jena-base/src/test/java/org/apache/jena/atlas/io/TestBlockUTF8.java
@@ -101,14 +101,12 @@ public class TestBlockUTF8
     @Test public void binary_10() { testBinary(binaryBytes2, CharBuffer.wrap(binaryStr3)); }
     @Test public void binary_11() { testBinary(binaryBytes3, CharBuffer.wrap(binaryStr3)); }
 
-
-    static void testIn(String x)
-    {
+    static void testIn(String x) {
         testIn(x, allocByteBufferArray, allocCharBufferArray);
         testIn(x, allocByteBufferDirect, allocCharBufferDirect);
     }
-    static void testIn(String x, Alloc<ByteBuffer> allocBB, Alloc<CharBuffer> allocCB)
-    {
+
+    static void testIn(String x, Alloc<ByteBuffer> allocBB, Alloc<CharBuffer> allocCB) {
         // Test as binary.
         testInOutBinary(x);
 
@@ -118,7 +116,7 @@ public class TestBlockUTF8
         // To bytes.stringAsBytes
         int N = x.length();
         CharBuffer cb = CharBuffer.wrap(x.toCharArray());
-        ByteBuffer bb = allocBB.allocate(4*N);
+        ByteBuffer bb = allocBB.allocate(4 * N);
         BlockUTF8.fromChars(cb, bb);
         bb.flip();
 
@@ -131,12 +129,12 @@ public class TestBlockUTF8
         assertEquals(x, str);
     }
 
-    // Testing, but not against what Java would do (it replaces bad chars, we want binary).
-    static void testInOutBinary(String x)
-    {
+    // Testing, but not against what Java would do (it replaces bad chars, we want
+    // binary).
+    static void testInOutBinary(String x) {
         int N = x.length();
         CharBuffer cb = CharBuffer.wrap(x.toCharArray());
-        ByteBuffer bb = ByteBuffer.allocate(4*N);
+        ByteBuffer bb = ByteBuffer.allocate(4 * N);
         BlockUTF8.fromChars(cb, bb);
         bb.flip();
         CharBuffer cb2 = CharBuffer.allocate(N);
@@ -147,14 +145,13 @@ public class TestBlockUTF8
 
         // And re-code as bytes.
         CharBuffer cb3 = CharBuffer.wrap(x.toCharArray());
-        ByteBuffer bb3 = ByteBuffer.allocate(4*N);
+        ByteBuffer bb3 = ByteBuffer.allocate(4 * N);
         BlockUTF8.fromChars(cb3, bb3);
         bb3.flip();
         assertArrayEquals(bb.array(), bb3.array());
     }
 
-    static void testOut(String x)
-    {
+    static void testOut(String x) {
         testOut(x, allocByteBufferArray, allocCharBufferArray);
         testOut(x, allocByteBufferDirect, allocCharBufferDirect);
     }
@@ -173,82 +170,84 @@ public class TestBlockUTF8
           @Override public CharBuffer allocate(int len) { return ByteBuffer.allocateDirect(2*len).asCharBuffer(); }
     };
 
-    static void testOut(String x, Alloc<ByteBuffer> allocBB, Alloc<CharBuffer> allocCB)
-    {
-        testBinary(stringAsBytes(x));
-
-        int N = x.length();
-        // First - get bytes the Java way.
-        ByteBuffer bytes = ByteBuffer.wrap(stringAsBytes(x));
-        CharBuffer cb = allocCB.allocate(N);
-
-        BlockUTF8.toChars(bytes, cb);
-        cb.flip();
-        bytes.flip();
-
-        ByteBuffer bytes2 = allocBB.allocate(bytes.capacity());
-        BlockUTF8.fromChars(cb, bytes2);
-        bytes2.flip();
-        assertTrue("Chars", sameBytes(bytes, bytes2));
-    }
-
-    static void testBinary(byte[] binary, CharBuffer chars)
-    {
-        int N = binary.length;
-        ByteBuffer bytes = ByteBuffer.wrap(binary);
-        CharBuffer cb = CharBuffer.allocate(N);
-        BlockUTF8.toChars(bytes, cb);
-        cb.flip();
-        assertTrue("Binary", sameChars(chars, cb));
-    }
-
-    static void testBinary(byte[] binary)
-    {
-        testBinary(binary, binary);
-    }
-
-    static void testBinary(byte[] binary, byte[] expected)
-    {
-        int N = binary.length;
-        ByteBuffer bytes = ByteBuffer.wrap(binary);
-        CharBuffer cb = CharBuffer.allocate(N);
-        BlockUTF8.toChars(bytes, cb);
-        cb.flip();
-        bytes.position(0);
-        ByteBuffer bytes2 = ByteBuffer.allocate(2*N);  // Null bytes get expanded.
-        BlockUTF8.fromChars(cb, bytes2);
-        bytes2.flip();
-        sameBytes(bytes, bytes2);
-        assertTrue("Binary", sameBytes(ByteBuffer.wrap(expected), bytes2));
-    }
-
-    // Does not move position.
-    static boolean sameBytes(ByteBuffer bb1, ByteBuffer bb2)
-    {
-        if ( bb1.remaining() != bb2.remaining() ) return false;
-
-        for ( int i = 0; i < bb1.remaining(); i++ )
-            if ( bb1.get(i+bb1.position()) != bb2.get(i+bb2.position()) ) return false;
-        return true;
-    }
-    // Does not move position.
-    static boolean sameChars(CharBuffer cb1, CharBuffer cb2)
-    {
-        if ( cb1.remaining() != cb2.remaining() ) return false;
-
-        for ( int i = 0; i < cb1.remaining(); i++ )
-            if ( cb1.get(i+cb1.position()) != cb2.get(i+cb2.position()) ) return false;
-        return true;
-    }
-    static byte[] stringAsBytes(String x)
-    {
-        try {
-            ByteArrayOutputStream bout = new ByteArrayOutputStream();
-            try(Writer out = new OutputStreamWriter(bout, utf8)) {
-                out.write(x);
-            }
-            byte[] bytes = bout.toByteArray();
-            return bytes;
-        } catch (IOException ex) { throw new RuntimeException(ex); }
-    }
-}
+      static void testOut(String x, Alloc<ByteBuffer> allocBB, Alloc<CharBuffer> allocCB) {
+          testBinary(stringAsBytes(x));
+
+          int N = x.length();
+          // First - get bytes the Java way.
+          ByteBuffer bytes = ByteBuffer.wrap(stringAsBytes(x));
+          CharBuffer cb = allocCB.allocate(N);
+
+          BlockUTF8.toChars(bytes, cb);
+          cb.flip();
+          bytes.flip();
+
+          ByteBuffer bytes2 = allocBB.allocate(bytes.capacity());
+          BlockUTF8.fromChars(cb, bytes2);
+          bytes2.flip();
+          assertTrue("Chars", sameBytes(bytes, bytes2));
+      }
+
+      static void testBinary(byte[] binary, CharBuffer chars) {
+          int N = binary.length;
+          ByteBuffer bytes = ByteBuffer.wrap(binary);
+          CharBuffer cb = CharBuffer.allocate(N);
+          BlockUTF8.toChars(bytes, cb);
+          cb.flip();
+          assertTrue("Binary", sameChars(chars, cb));
+      }
+
+      static void testBinary(byte[] binary) {
+          testBinary(binary, binary);
+      }
+
+      static void testBinary(byte[] binary, byte[] expected) {
+          int N = binary.length;
+          ByteBuffer bytes = ByteBuffer.wrap(binary);
+          CharBuffer cb = CharBuffer.allocate(N);
+          BlockUTF8.toChars(bytes, cb);
+          cb.flip();
+          bytes.position(0);
+          ByteBuffer bytes2 = ByteBuffer.allocate(2 * N);  // Null bytes get
+                                                           // expanded.
+          BlockUTF8.fromChars(cb, bytes2);
+          bytes2.flip();
+          sameBytes(bytes, bytes2);
+          assertTrue("Binary", sameBytes(ByteBuffer.wrap(expected), bytes2));
+      }
+
+      // Does not move position.
+      static boolean sameBytes(ByteBuffer bb1, ByteBuffer bb2) {
+          if ( bb1.remaining() != bb2.remaining() )
+              return false;
+
+          for ( int i = 0 ; i < bb1.remaining() ; i++ )
+              if ( bb1.get(i + bb1.position()) != bb2.get(i + bb2.position()) )
+                  return false;
+          return true;
+      }
+
+      // Does not move position.
+      static boolean sameChars(CharBuffer cb1, CharBuffer cb2) {
+          if ( cb1.remaining() != cb2.remaining() )
+              return false;
+
+          for ( int i = 0 ; i < cb1.remaining() ; i++ )
+              if ( cb1.get(i + cb1.position()) != cb2.get(i + cb2.position()) )
+                  return false;
+          return true;
+      }
+
+      static byte[] stringAsBytes(String x) {
+          try {
+              ByteArrayOutputStream bout = new ByteArrayOutputStream();
+              try (Writer out = new OutputStreamWriter(bout, utf8)) {
+                  out.write(x);
+              }
+              byte[] bytes = bout.toByteArray();
+              return bytes;
+          } catch (IOException ex) {
+              throw new RuntimeException(ex);
+          }
+      }
+  }

Reply via email to