(sis) 02/04: Add a `readNullTerminatedString(Charset)` method in `ChannelDataInput`.

desruisseaux Wed, 22 Jan 2025 01:40:45 -0800

This is an automated email from the ASF dual-hosted git repository.

desruisseaux pushed a commit to branch geoapi-4.0
in repository https://gitbox.apache.org/repos/asf/sis.git


commit 8c31b5d538b00a733c804850850bb0ba9f8c9b32
Author: Martin Desruisseaux <[email protected]>
AuthorDate: Tue Dec 31 19:56:17 2024 +0100

    Add a `readNullTerminatedString(Charset)` method in `ChannelDataInput`.
---
 .../org/apache/sis/io/stream/ChannelDataInput.java | 135 ++++++++++++++++++++-
 .../apache/sis/io/stream/ChannelDataInputTest.java | 111 ++++++++++++++---
 2 files changed, 226 insertions(+), 20 deletions(-)

diff --git 
a/endorsed/src/org.apache.sis.storage/main/org/apache/sis/io/stream/ChannelDataInput.java
 
b/endorsed/src/org.apache.sis.storage/main/org/apache/sis/io/stream/ChannelDataInput.java
index b3c64cb9e7..61b30d9567 100644
--- 
a/endorsed/src/org.apache.sis.storage/main/org/apache/sis/io/stream/ChannelDataInput.java
+++ 
b/endorsed/src/org.apache.sis.storage/main/org/apache/sis/io/stream/ChannelDataInput.java
@@ -20,6 +20,7 @@ import java.io.DataInput;
 import java.io.DataInputStream;
 import java.io.IOException;
 import java.io.EOFException;
+import java.io.UnsupportedEncodingException;
 import java.nio.Buffer;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
@@ -30,6 +31,7 @@ import java.nio.LongBuffer;
 import java.nio.FloatBuffer;
 import java.nio.DoubleBuffer;
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.nio.channels.Channel;
 import java.nio.channels.ReadableByteChannel;
 import java.nio.channels.SeekableByteChannel;
@@ -941,12 +943,13 @@ public class ChannelDataInput extends ChannelData 
implements DataInput {
     }
 
     /**
-     * Decodes a string from a sequence of bytes in the given encoding. This 
method tries to avoid the creation
-     * of a temporary {@code byte[]} array when possible.
+     * Decodes a string from a sequence of bytes in the given encoding.
+     * This method tries to avoid the creation of a temporary {@code byte[]} 
array when possible.
      *
-     * <p>This convenience method shall be used only for relatively small 
number of {@link String} instances
-     * to decode, for example attribute values in the file header. For large 
amount of data, consider using
-     * {@link java.nio.charset.CharsetDecoder} instead.</p>
+     * <h4>Performance note</h4>
+     * This convenience method should be used only for a small number of short 
{@link String} instances
+     * to decode, for example attribute values in the file header. For large 
amounts of data, consider
+     * using {@link java.nio.charset.CharsetDecoder} instead.
      *
      * @param  length    number of bytes to read.
      * @param  encoding  the character encoding.
@@ -973,10 +976,117 @@ public class ChannelDataInput extends ChannelData 
implements DataInput {
     }
 
     /**
-     * Reads in a string that has been encoded using a UTF-8 string.
+     * Reads a null-terminated US-ASCII, ISO-LATIN-1, UTF-8, UTF-16 or UTF-32 
string.
+     * Note that {@code 0x00} is always the one-byte NUL character in UTF-8.
+     * It cannot be part of a multi-byte character's representation by design.
+     *
+     * <p>The character encoding should be specified in argument. If {@code 
null}, this method infers
+     * the encoding with the following rules specified by ISO 14496-12 (Base 
Media File Format):</p>
+     *
+     * <ul>
+     *   <li>If the string starts with a Byte Order Mark (<abbr>BOM</abbr>), 
then UTF-16 encoding is assumed.</li>
+     *   <li>Otherwise, UTF-8 encoding is assumed. This method does not test 
whether the string is well-formed.</li>
+     * </ul>
+     *
+     * <h4>Limitations</h4>
+     * This convenience method should be used only for a small number of short 
{@link String} instances to decode
+     * using one of the encodings specified in {@link StandardCharsets}. For 
large amounts of data, or for support
+     * of any encoding other than the standard ones, use {@link 
java.nio.charset.CharsetDecoder} instead.
+     *
+     * @param  encoding  the character encoding, or {@code null} for UTF-8 or 
UTF-16 depending on whether a <abbr>BOM</abbr> is present.
+     * @return the character string, possibly empty.
+     * @throws UnsupportedEncodingException if the encoding is not one of the 
{@link StandardCharsets}.
+     * @throws IOException if an error occurred while reading the string.
+     */
+    public final String readNullTerminatedString(Charset encoding) throws 
IOException {
+        long start = position();
+        if (encoding == null) {
+            /*
+             * If the string may be UTF-16, check for the Byte Order Mark 
(BOM).
+             * If none, UTF-8 is assumed. This convention is used by ISO 14496-12
+             * (Base Media File Format).
+             */
+            switch (readByte()) {
+                case (byte) 0x00: return "";
+                case (byte) 0xFE: if (readByte() == (byte) 0xFF) encoding = 
StandardCharsets.UTF_16BE; break;
+                case (byte) 0xFF: if (readByte() == (byte) 0xFE) encoding = 
StandardCharsets.UTF_16LE; break;
+            }
+            if (encoding == null) {
+                encoding = StandardCharsets.UTF_8;
+                buffer.position(buffer.position() - Byte.BYTES);
+                // Only the last byte read is pushed back. An earlier byte, if any, 
does not need to be because it is known to be non-zero.
+            } else {
+                start += Short.BYTES;
+            }
+        }
+        /*
+         * Get the number of bytes per code unit. This number determines the 
size of the NUL terminator.
+         * This information is not provided in the `Charset` API (as of Java 
23), which is the reason why
+         * this method supports only `StandardCharsets` values.
+         */
+        final int charSize;
+        final String name = encoding.name();
+        if (name.equals("US-ASCII") || name.equals("ISO-8859-1") || 
name.equals("UTF-8")) {
+            charSize = Byte.BYTES;
+        } else if (name.startsWith("UTF-16")) {
+            charSize = Short.BYTES;
+        } else if (name.startsWith("UTF-32")) {
+            charSize = Integer.BYTES;
+        } else {
+            throw new UnsupportedEncodingException(name);
+        }
+        /*
+         * Search for the NUL terminator directly in the buffer. If we need to 
read more bytes,
+         * we will try to do that without discarding the first characters of 
the string.
+         */
+        int base = buffer.position();
+search: for (;;) {
+            if (charSize == 1) {
+                // Optimization for the most common cases: US-ASCII, 
ISO-LATIN-1, UTF-8.
+                while (buffer.hasRemaining()) {
+                    if (buffer.get() == 0) {
+                        break search;
+                    }
+                }
+            } else {
+                while (buffer.remaining() >= charSize) {
+                    int c = (charSize <= Short.BYTES) ? buffer.getShort() : 
buffer.getInt();
+                    if (c == 0) break search;
+                }
+            }
+            /*
+             * Need more bytes. If there is enough room in the buffer either 
at the beginning (base > 0)
+             * or at the end (limit < capacity), temporarily move the position 
back to the base before
+             * invoking `ensureBufferContains(…)`, in order to avoid discarding the 
bytes that we will need.
+             */
+            final int count = buffer.position() - base;
+            final int need  = count + charSize;
+            if (buffer.capacity() - need >= 0) {
+                buffer.position(base);
+                ensureBufferContains(need);
+                base = buffer.position();
+                buffer.position(base + count);
+            } else {
+                // Cannot avoid discarding what we have read before.
+                ensureBufferContains(charSize);
+            }
+        }
+        int size = Math.toIntExact(position() - start - charSize);
+        if (size <= 0) return "";   // Shortcut for a common case.
+        seek(start);
+        String value = readString(size, encoding);
+        skipNBytes(charSize);       // Skip the NUL terminal character.
+        return value;
+    }
+
+    /**
+     * Reads in a string that has been encoded using the Java modified UTF-8 
format.
+     * The number of bytes to read is encoded in the next unsigned short 
integer of the stream.
      *
      * @return the string reads from the stream.
      * @throws IOException if an error (including EOF) occurred while reading 
the stream.
+     *
+     * @see DataInput#readUTF()
      */
     @Override
     public final String readUTF() throws IOException {
@@ -1040,6 +1150,19 @@ loop:   while (hasRemaining()) {
         return n;
     }
 
+    /**
+     * Skips over and discards exactly <var>n</var> bytes of data from this 
input stream.
+     *
+     * @param  n  number of bytes to skip. Can be negative.
+     * @throws IOException if an error occurred while reading.
+     */
+    public final void skipNBytes(int n) throws IOException {
+        n -= skipBytes(n);
+        if (n != 0) {
+            seek(Math.addExact(position(), n));
+        }
+    }
+
     /**
      * Moves to the given position in this stream.
      *
diff --git 
a/endorsed/src/org.apache.sis.storage/test/org/apache/sis/io/stream/ChannelDataInputTest.java
 
b/endorsed/src/org.apache.sis.storage/test/org/apache/sis/io/stream/ChannelDataInputTest.java
index 24834c0220..e2099102f3 100644
--- 
a/endorsed/src/org.apache.sis.storage/test/org/apache/sis/io/stream/ChannelDataInputTest.java
+++ 
b/endorsed/src/org.apache.sis.storage/test/org/apache/sis/io/stream/ChannelDataInputTest.java
@@ -21,6 +21,7 @@ import java.io.DataInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 
 // Test dependencies
@@ -147,15 +148,96 @@ public final class ChannelDataInputTest extends 
ChannelDataTestCase {
     @Test
     public void testReadString() throws IOException {
         final String expected = "お元気ですか";
-        final byte[] array    = expected.getBytes("UTF-8");
-        assertEquals(expected.length()*3, array.length); // Sanity check.
-        final ChannelDataInput input = new ChannelDataInput("testReadString",
-                new DripByteChannel(array, random, 1, 32),
-                ByteBuffer.allocate(array.length + 4), false);
+        final byte[] array = expected.getBytes(StandardCharsets.UTF_8);
+        assertEquals(expected.length()*3, array.length);    // Sanity check.
+        final var input = new ChannelDataInput("testReadString",
+                              new DripByteChannel(array, random, 1, 32),
+                              ByteBuffer.allocate(array.length + 4), false);
         assertEquals(expected, input.readString(array.length, 
StandardCharsets.UTF_8));
         assertFalse(input.buffer.hasRemaining());
     }
 
+    /**
+     * Tests the {@link ChannelDataInput#readNullTerminatedString(Charset)} 
method.
+     *
+     * @throws IOException should never happen since we read and write in 
memory only.
+     */
+    public void testReadNullTerminatedString() throws IOException {
+        for (int i=0; i<=8; i++) {
+            String  expected = "theatre théâtre 劇場";
+            Charset encoding;
+            int     charSize;
+            char    bom = 0;
+            switch (i) {
+                case 0: {
+                    encoding = StandardCharsets.US_ASCII;
+                    expected = expected.substring(0, expected.indexOf(' '));
+                    charSize = Byte.BYTES;
+                    break;
+                }
+                case 1: {
+                    encoding = StandardCharsets.ISO_8859_1;
+                    expected = expected.substring(0, expected.lastIndexOf(' 
'));
+                    charSize = Byte.BYTES;
+                    break;
+                }
+                case 2: {
+                    encoding = StandardCharsets.UTF_8;
+                    charSize = Byte.BYTES;
+                    break;
+                }
+                case 3: {
+                    encoding = StandardCharsets.UTF_16;
+                    charSize = Short.BYTES;
+                    break;
+                }
+                case 4: {
+                    encoding = StandardCharsets.UTF_16BE;
+                    charSize = Short.BYTES;
+                    break;
+                }
+                case 5: {
+                    encoding = StandardCharsets.UTF_16LE;
+                    charSize = Short.BYTES;
+                    break;
+                }
+                case 6: {
+                    encoding = StandardCharsets.UTF_8;
+                    charSize = Byte.BYTES;
+                    bom      = ' ';             // Arbitrary value for meaning 
"do not write BOM".
+                    break;
+                }
+                case 7: {
+                    encoding = StandardCharsets.UTF_16BE;
+                    charSize = Short.BYTES;
+                    bom      = '\uFEFF';
+                    break;
+                }
+                case 8: {
+                    encoding = StandardCharsets.UTF_16LE;
+                    charSize = Short.BYTES;
+                    bom      = '\uFFFE';        // BOM with swapped bytes.
+                    break;
+                }
+                default: throw new AssertionError(i);
+            }
+            int base = random.nextInt(5) + 3;
+            final byte[] bytes = expected.getBytes(encoding);
+            final byte[] array = new byte[base + bytes.length + charSize];
+            System.arraycopy(bytes, 0, array, base, bytes.length);
+            if (bom > ' ') {
+                array[--base] = (byte) (bom & 0xFF);
+                array[--base] = (byte) (bom >>> Byte.SIZE);
+            }
+            final var input = new 
ChannelDataInput("testReadNullTerminatedString",
+                                  new DripByteChannel(array, random, charSize, 
24),
+                                  ByteBuffer.allocate(array.length + 4), 
false);
+            input.seek(base);
+            assertEquals(expected, input.readNullTerminatedString(bom == 0 ? 
encoding : null));
+            assertFalse(input.buffer.hasRemaining());
+        }
+    }
+
     /**
      * Tests {@link ChannelDataInput#seek(long)} on a channel that do not 
implement
      * {@link java.nio.channels.SeekableByteChannel}.
@@ -166,11 +248,11 @@ public final class ChannelDataInputTest extends 
ChannelDataTestCase {
     public void testSeekOnForwardOnlyChannel() throws IOException {
         int length = random.nextInt(2048) + 1024;
         final byte[] array = createRandomArray(length);
-        length -= Long.BYTES; // Safety against buffer underflow.
+        length -= Long.BYTES;   // Safety against buffer underflow.
         final ByteBuffer buffer = ByteBuffer.wrap(array);
-        final ChannelDataInput input = new 
ChannelDataInput("testSeekOnForwardOnlyChannel",
-                new DripByteChannel(array, random, 1, 2048),
-                ByteBuffer.allocate(random.nextInt(64) + 16), false);
+        final var input = new ChannelDataInput("testSeekOnForwardOnlyChannel",
+                              new DripByteChannel(array, random, 1, 2048),
+                              ByteBuffer.allocate(random.nextInt(64) + 16), 
false);
         int position = 0;
         while (position < length) {
             input.seek(position);
@@ -187,11 +269,12 @@ public final class ChannelDataInputTest extends 
ChannelDataTestCase {
      */
     @Test
     public void testPrefetch() throws IOException {
-        final int        length = random.nextInt(256) + 128;
-        final byte[]     array  = createRandomArray(length);
-        final ByteBuffer buffer = ByteBuffer.allocate(random.nextInt(64) + 16);
-        final ChannelDataInput input = new ChannelDataInput("testPrefetch",
-                new DripByteChannel(array, random, 1, 64), buffer, false);
+        final int    length = random.nextInt(256) + 128;
+        final byte[] array  = createRandomArray(length);
+        final var    buffer = ByteBuffer.allocate(random.nextInt(64) + 16);
+        final var    input  = new ChannelDataInput("testPrefetch",
+                                  new DripByteChannel(array, random, 1, 64),
+                                  buffer, false);
         int position = 0;
         while (position != length) {
             if (random.nextBoolean()) {

(sis) 02/04: Add a `readNullTerminatedString(Charset)` method in `ChannelDataInput`.

Reply via email to