aherbert commented on a change in pull request #46: URL: https://github.com/apache/commons-codec/pull/46#discussion_r446271719
########## File path: src/main/java/org/apache/commons/codec/binary/Base16.java ########## @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.binary; + +import org.apache.commons.codec.CodecPolicy; + +/** + * Provides Base16 encoding and decoding. + * + * <p> + * This class is thread-safe. + * </p> + * + * @see <a href="https://tools.ietf.org/html/rfc4648#section-8">RFC 4648 - 8. Base 16 Encoding</a> + * + * @since 1.15 + */ +public class Base16 extends BaseNCodec { + + /** + * BASE16 characters are 4 bits in length. + * They are formed by taking an 8-bit group, + * which is converted into two BASE16 characters. + */ + private static final int BITS_PER_ENCODED_BYTE = 4; + private static final int BYTES_PER_ENCODED_BLOCK = 2; + private static final int BYTES_PER_UNENCODED_BLOCK = 1; + + /** + * This array is a lookup table that translates Unicode characters drawn from the "Base16 Alphabet" (as specified + * in Table 5 of RFC 4648) into their 4-bit positive integer equivalents. Characters that are not in the Base16 + * alphabet but fall within the bounds of the array are translated to -1. 
+ */ + private static final byte[] UPPER_CASE_DECODE_TABLE = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 00-0f + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 10-1f + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 20-2f + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, // 30-3f 0-9 + -1, 10, 11, 12, 13, 14, 15 // 40-46 A-F + }; + + /** + * This array is a lookup table that translates 4-bit positive integer index values into their "Base16 Alphabet" + * equivalents as specified in Table 5 of RFC 4648. + */ + private static final byte[] UPPER_CASE_ENCODE_TABLE = { + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', + 'A', 'B', 'C', 'D', 'E', 'F' + }; + + /** + * This array is a lookup table that translates Unicode characters drawn from the a lower-case "Base16 Alphabet" + * into their 4-bit positive integer equivalents. Characters that are not in the Base16 + * alphabet but fall within the bounds of the array are translated to -1. + */ + private static final byte[] LOWER_CASE_DECODE_TABLE = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 00-0f + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 10-1f + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 20-2f + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, // 30-3f 0-9 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 40-4f + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 50-5f + -1, 10, 11, 12, 13, 14, 15 // 60-66 a-f + }; + + /** + * This array is a lookup table that translates 4-bit positive integer index values into their "Base16 Alphabet" + * lower-case equivalents. 
+ */ + private static final byte[] LOWER_CASE_ENCODE_TABLE = { + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', + 'a', 'b', 'c', 'd', 'e', 'f' + }; + + /** Mask used to extract 4 bits, used when decoding character. */ + private static final int MASK_4BITS = 0x0f; + + /** + * Decode table to use. + */ + private final byte[] decodeTable; + + /** + * Encode table to use. + */ + private final byte[] encodeTable; + + /** + * Creates a Base16 codec used for decoding and encoding. + */ + public Base16() { + this(false); + } + + /** + * Creates a Base16 codec used for decoding and encoding. + * + * @param lowerCase if {@code true} then use a lower-case Base16 alphabet. + */ + public Base16(final boolean lowerCase) { + this(lowerCase, DECODING_POLICY_DEFAULT); + } + + /** + * Creates a Base16 codec used for decoding and encoding. + * + * @param lowerCase if {@code true} then use a lower-case Base16 alphabet. + * @param decodingPolicy Decoding policy. + */ + public Base16(final boolean lowerCase, final CodecPolicy decodingPolicy) { + super(BYTES_PER_UNENCODED_BLOCK, BYTES_PER_ENCODED_BLOCK, 0, 0, + PAD_DEFAULT, decodingPolicy); + if (lowerCase) { + this.encodeTable = LOWER_CASE_ENCODE_TABLE; + this.decodeTable = LOWER_CASE_DECODE_TABLE; + } else { + this.encodeTable = UPPER_CASE_ENCODE_TABLE; + this.decodeTable = UPPER_CASE_DECODE_TABLE; + } + } + + @Override + void decode(final byte[] data, int offset, final int length, final Context context) { + if (context.eof || length < 0) { + context.eof = true; + if (context.ibitWorkArea > 0) { + validateTrailingCharacter(); + } + return; + } + + final int dataLen = Math.min(data.length - offset, length); + final int availableChars = (context.ibitWorkArea > 0 ? 
1 : 0) + dataLen; + + // small optimisation to short-cut the rest of this method when it is fed byte-by-byte + if (availableChars == 1 && availableChars == dataLen) { + context.ibitWorkArea = decodeOctet(data[offset]) + 1; // store 1/2 byte for next invocation of decode, we offset by +1 as empty-value is 0 + return; + } + + // we must have an even number of chars to decode + final int charsToProcess = availableChars % BYTES_PER_ENCODED_BLOCK == 0 ? availableChars : availableChars - 1; + + final byte[] buffer = ensureBufferSize(charsToProcess / BYTES_PER_ENCODED_BLOCK, context); + + int result; + int i = 0; + if (dataLen < availableChars) { + // we have 1/2 byte from previous invocation to decode + result = (context.ibitWorkArea - 1) << BITS_PER_ENCODED_BYTE; + result |= decodeOctet(data[offset++]); + i = 2; + + buffer[context.pos++] = (byte)result; + + // reset to empty-value for next invocation! + context.ibitWorkArea = 0; + } + + while (i < charsToProcess) { + result = decodeOctet(data[offset++]) << BITS_PER_ENCODED_BYTE; + result |= decodeOctet(data[offset++]); + i += 2; + buffer[context.pos++] = (byte)result; + } + + // we have one char of a hex-pair left over + if (i < dataLen) { + context.ibitWorkArea = decodeOctet(data[i]) + 1; // store 1/2 byte for next invocation of decode, we offset by +1 as empty-value is 0 + } + } + + private int decodeOctet(final byte octet) { + int decoded = -1; + if (octet >= 0 && octet < decodeTable.length) { + decoded = decodeTable[octet]; + } + + if (decoded == -1) { + throw new IllegalArgumentException("Invalid octet in encoded value: " + (int)octet); + } + + return decoded; + } + + @Override + void encode(final byte[] data, final int offset, final int length, final Context context) { + if (context.eof) { + return; + } + + if (length < 0) { + context.eof = true; + return; + } + + final byte[] buffer = ensureBufferSize(length * BYTES_PER_ENCODED_BLOCK, context); + + final int end = offset + length; + for (int i = offset; i < end; 
i++) { + final int value = data[i]; + final int high = (value >> BITS_PER_ENCODED_BYTE) & MASK_4BITS; + final int low = value & MASK_4BITS; + buffer[context.pos++] = encodeTable[high]; + buffer[context.pos++] = encodeTable[low]; + } + } + + /** + * Returns whether or not the {@code octet} is in the Base16 alphabet. + * + * @param octet The value to test. + * + * @return {@code true} if the value is defined in the the Base16 alphabet {@code false} otherwise. + */ + @Override + public boolean isInAlphabet(final byte octet) { Review comment: I think there are two issues here: - Not knowing if the hex is upper or lower case, but it must use exclusively one or the other. - Not allowing mixed-case hex. Ideally the most flexible decoder would have a 3rd mode for case-insensitive decoding. This would apply to decoding only. It would work using a 3rd decoding table that combines the upper and lower case tables. However, allowing mixed case in the same input is not really valid. So the decoder would have to detect the first A-F character and ensure the same case is used for the rest. An exception can be raised if not. This can even be covered by the strict/lenient decoding policy. Tracking the first A-F character is a lot of extra code, so I don't like this idea. What do you think of using the decoding policy to cover this? The constructor accepts the upper/lower case flag for encoding and the strict/lenient decoding flag. If using strict decoding, the case must match. If using lenient mode, then decoding is case insensitive. After all, if we are allowing a trailing character to be discarded, then allowing upper or lower case hex seems no worse a violation of a true encoding. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected]
