rdblue commented on code in PR #3197: URL: https://github.com/apache/parquet-java/pull/3197#discussion_r2056637116
########## parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java: ########## @@ -0,0 +1,659 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.parquet.variant; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; + +/** + * This class defines constants related to the Variant format and provides functions for + * manipulating Variant binaries. + * + * A Variant is made up of 2 binaries: value and metadata. A Variant value consists of a one-byte + * header and a number of content bytes (can be zero). The header byte is divided into upper 6 bits + * (called "type info") and lower 2 bits (called "basic type"). The content format is explained in + * the below constants for all possible basic type and type info values. + * + * The Variant metadata includes a version id and a dictionary of distinct strings (case-sensitive). + * Its binary format is: + * - Version: 1-byte unsigned integer. The only acceptable value is 1 currently. + * - Dictionary size: 4-byte little-endian unsigned integer. The number of keys in the + * dictionary. 
+ * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. `offsets[i]` represents the + * starting position of string i, counting starting from the address of `offsets[0]`. Strings + * must be stored contiguously, so we don’t need to store the string size, instead, we compute it + * with `offset[i + 1] - offset[i]`. + * - UTF-8 string data. + */ +class VariantUtil { + static final int BASIC_TYPE_BITS = 2; + static final int BASIC_TYPE_MASK = 0b00000011; + static final int PRIMITIVE_TYPE_MASK = 0b00111111; + /** The inclusive maximum value of the type info value. It is the size limit of `SHORT_STR`. */ + static final int MAX_SHORT_STR_SIZE = 0b00111111; + + // The basic types + + /** + * Primitive value. + * The type info value must be one of the values in the "Primitive" section below. + */ + static final int PRIMITIVE = 0; + /** + * Short string value. + * The type info value is the string size, which must be in `[0, MAX_SHORT_STR_SIZE]`. + * The string content bytes directly follow the header byte. + */ + static final int SHORT_STR = 1; + /** + * Object value. + * The content contains a size, a list of field ids, a list of field offsets, and + * the actual field values. The list of field ids has `size` ids, while the list of field offsets + * has `size + 1` offsets, where the last offset represents the total size of the field values + * data. The list of fields ids must be sorted by the field name in alphabetical order. + * Duplicate field names within one object are not allowed. + * 5 bits in the type info are used to specify the integer type of the object header. It is + * 0_b4_b3b2_b1b0 (MSB is 0), where: + * - b4: the integer type of size. When it is 0/1, `size` is a little-endian 1/4-byte + * unsigned integer. + * - b3b2: the integer type of ids. When the 2 bits are 0/1/2, the id list contains + * 1/2/3-byte little-endian unsigned integers. + * - b1b0: the integer type of offset. 
When the 2 bits are 0/1/2, the offset list contains + * 1/2/3-byte little-endian unsigned integers. + */ + static final int OBJECT = 2; + /** + * Array value. + * The content contains a size, a list of field offsets, and the actual element values. + * It is similar to an object without the id list. The length of the offset list + * is `size + 1`, where the last offset represents the total size of the element data. + * Its type info is: 000_b2_b1b0: + * - b2: the type of size. + * - b1b0: the integer type of offset. + */ + static final int ARRAY = 3; + + // The primitive types + + /** JSON Null value. Empty content. */ + static final int NULL = 0; + /** True value. Empty content. */ + static final int TRUE = 1; + /** False value. Empty content. */ + static final int FALSE = 2; + /** 1-byte little-endian signed integer. */ + static final int INT8 = 3; + /** 2-byte little-endian signed integer. */ + static final int INT16 = 4; + /** 4-byte little-endian signed integer. */ + static final int INT32 = 5; + /** 8-byte little-endian signed integer. */ + static final int INT64 = 6; + /** 8-byte IEEE double. */ + static final int DOUBLE = 7; + /** 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed integer. */ + static final int DECIMAL4 = 8; + /** 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed integer. */ + static final int DECIMAL8 = 9; + /** 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed integer. */ + static final int DECIMAL16 = 10; + /** + * Date value. Content is 4-byte little-endian signed integer that represents the + * number of days from the Unix epoch. + */ + static final int DATE = 11; + /** + * Timestamp value. Content is 8-byte little-endian signed integer that represents the number of + * microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. It is displayed to users in + * their local time zones and may be displayed differently depending on the execution environment. 
+ */ + static final int TIMESTAMP_TZ = 12; + /** + * Timestamp_ntz value. It has the same content as `TIMESTAMP_TZ` but should always be interpreted + * as if the local time zone is UTC. + */ + static final int TIMESTAMP_NTZ = 13; + /** 4-byte IEEE float. */ + static final int FLOAT = 14; + /** + * Binary value. The content is (4-byte little-endian unsigned integer representing the binary + * size) + (size bytes of binary content). + */ + static final int BINARY = 15; + /** + * Long string value. The content is (4-byte little-endian unsigned integer representing the + * string size) + (size bytes of string content). + */ + static final int LONG_STR = 16; + /** + * Time value. Values can be from 00:00:00 to 23:59:59.999999. + * Content is 8-byte little-endian unsigned integer that represents the number of microseconds + * since midnight. + */ + static final int TIME = 17; + /** + * Timestamp nanos value. Similar to `TIMESTAMP_TZ`, but represents the number of nanoseconds + * elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. + */ + static final int TIMESTAMP_NANOS = 18; + /** + * Timestamp nanos (without time zone) value. It has the same content as `TIMESTAMP_NANOS` but + * should always be interpreted as if the local time zone is UTC. + */ + static final int TIMESTAMP_NANOS_NTZ = 19; + /** + * UUID value. The content is a 16-byte binary, encoded using big-endian. + * For example, UUID 00112233-4455-6677-8899-aabbccddeeff is encoded as the bytes + * 00 11 22 33 44 55 66 77 88 99 aa bb cc dd ee ff. + */ + static final int UUID = 20; + + // The metadata version. + static final byte VERSION = 1; + // The lower 4 bits of the first metadata byte contain the version. + static final byte VERSION_MASK = 0x0F; + + // Constants for various unsigned integer sizes. 
+ static final int U8_MAX = 0xFF; + static final int U16_MAX = 0xFFFF; + static final int U24_MAX = 0xFFFFFF; + static final int U8_SIZE = 1; + static final int U16_SIZE = 2; + static final int U24_SIZE = 3; + static final int U32_SIZE = 4; + + // Max decimal precision for each decimal type. + static final int MAX_DECIMAL4_PRECISION = 9; + static final int MAX_DECIMAL8_PRECISION = 18; + static final int MAX_DECIMAL16_PRECISION = 38; + + // The size (in bytes) of a UUID. + static final int UUID_SIZE = 16; + + static byte primitiveHeader(int type) { + return (byte) (type << 2 | PRIMITIVE); + } + + static byte shortStrHeader(int size) { + return (byte) (size << 2 | SHORT_STR); + } + + static byte objectHeader(boolean largeSize, int idSize, int offsetSize) { + return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 4)) + | ((idSize - 1) << (BASIC_TYPE_BITS + 2)) + | ((offsetSize - 1) << BASIC_TYPE_BITS) + | OBJECT); + } + + static byte arrayHeader(boolean largeSize, int offsetSize) { + return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 2)) | ((offsetSize - 1) << BASIC_TYPE_BITS) | ARRAY); + } + + /** + * Check the validity of an array index `pos`. + * @param pos The index to check + * @param length The length of the array + * @throws IllegalArgumentException if the index is out of bound + */ + static void checkIndex(int pos, int length) { + if (pos < 0 || pos >= length) { + throw new IllegalArgumentException( + String.format("Invalid byte-array offset (%d). length: %d", pos, length)); + } + } + + /** + * Reads a little-endian signed long value from `buffer[pos, pos + numBytes)`. 
+ * @param buffer The ByteBuffer to read from + * @param pos The starting index of the buffer to read from + * @param numBytes The number of bytes to read + * @return The long value + */ + static long readLong(ByteBuffer buffer, int pos, int numBytes) { + checkIndex(pos, buffer.limit()); + checkIndex(pos + numBytes - 1, buffer.limit()); + long result = 0; + // All bytes except the most significant byte should be unsigned-extended and shifted + // (so we need `& 0xFF`). The most significant byte should be sign-extended and is handled + // after the loop. + for (int i = 0; i < numBytes - 1; ++i) { + long unsignedByteValue = buffer.get(pos + i) & 0xFF; + result |= unsignedByteValue << (8 * i); + } + long signedByteValue = buffer.get(pos + numBytes - 1); + result |= signedByteValue << (8 * (numBytes - 1)); + return result; + } + + /** + * Read a little-endian unsigned int value from `bytes[pos, pos + numBytes)`. The value must fit + * into a non-negative int (`[0, Integer.MAX_VALUE]`). + */ + static int readUnsigned(ByteBuffer bytes, int pos, int numBytes) { + checkIndex(pos, bytes.limit()); + checkIndex(pos + numBytes - 1, bytes.limit()); + int result = 0; + // Similar to the `readLong` loop, but all bytes should be unsigned-extended. + for (int i = 0; i < numBytes; ++i) { + int unsignedByteValue = bytes.get(pos + i) & 0xFF; + result |= unsignedByteValue << (8 * i); + } + if (result < 0) { + throw new IllegalArgumentException(String.format("Failed to read unsigned int. numBytes: %d", numBytes)); + } + return result; + } + + /** + * Returns the value type of Variant value `value[pos...]`. It is only legal to call `get*` if + * `getType` returns the corresponding type. For example, it is only legal to call + * `getLong` if this method returns `Type.LONG`. 
+ * @param value The Variant value to get the type from + * @return The type of the Variant value + */ + static Variant.Type getType(ByteBuffer value) { + checkIndex(value.position(), value.limit()); + int basicType = value.get(value.position()) & BASIC_TYPE_MASK; + int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK; + switch (basicType) { + case SHORT_STR: + return Variant.Type.STRING; + case OBJECT: + return Variant.Type.OBJECT; + case ARRAY: + return Variant.Type.ARRAY; + default: + switch (typeInfo) { + case NULL: + return Variant.Type.NULL; + case TRUE: + case FALSE: + return Variant.Type.BOOLEAN; + case INT8: + return Variant.Type.BYTE; + case INT16: + return Variant.Type.SHORT; + case INT32: + return Variant.Type.INT; + case INT64: + return Variant.Type.LONG; + case DOUBLE: + return Variant.Type.DOUBLE; + case DECIMAL4: + return Variant.Type.DECIMAL4; + case DECIMAL8: + return Variant.Type.DECIMAL8; + case DECIMAL16: + return Variant.Type.DECIMAL16; + case DATE: + return Variant.Type.DATE; + case TIMESTAMP_TZ: + return Variant.Type.TIMESTAMP_TZ; + case TIMESTAMP_NTZ: + return Variant.Type.TIMESTAMP_NTZ; + case FLOAT: + return Variant.Type.FLOAT; + case BINARY: + return Variant.Type.BINARY; + case LONG_STR: + return Variant.Type.STRING; + case TIME: + return Variant.Type.TIME; + case TIMESTAMP_NANOS: + return Variant.Type.TIMESTAMP_NANOS; + case TIMESTAMP_NANOS_NTZ: + return Variant.Type.TIMESTAMP_NANOS_NTZ; + case UUID: + return Variant.Type.UUID; + default: + throw new UnsupportedOperationException( + String.format("Unknown type in Variant. primitive type: %d", typeInfo)); + } + } + } + + private static IllegalArgumentException unexpectedType(Variant.Type type) { + return new IllegalArgumentException("Expected type to be " + type); Review Comment: I think this may be phrased incorrectly. Isn't the problem that a method like `getUUID` was called for a value that is not a UUID? 
In that case do we trust the user called the right method and the expectation is valid? Instead, I would say something more direct that doesn't make that assumption: `"Cannot read %s value as a %s", actualType, expectedType` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
