aherbert commented on a change in pull request #46: URL: https://github.com/apache/commons-codec/pull/46#discussion_r446943093
########## File path: src/main/java/org/apache/commons/codec/binary/Base16.java ########## @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.binary; + +import org.apache.commons.codec.CodecPolicy; + +/** + * Provides Base16 encoding and decoding. + * + * <p> + * This class is thread-safe. + * </p> + * + * @see <a href="https://tools.ietf.org/html/rfc4648#section-8">RFC 4648 - 8. Base 16 Encoding</a> + * + * @since 1.15 + */ +public class Base16 extends BaseNCodec { + + /** + * BASE16 characters are 4 bits in length. + * They are formed by taking an 8-bit group, + * which is converted into two BASE16 characters. + */ + private static final int BITS_PER_ENCODED_BYTE = 4; + private static final int BYTES_PER_ENCODED_BLOCK = 2; + private static final int BYTES_PER_UNENCODED_BLOCK = 1; + + /** + * This array is a lookup table that translates Unicode characters drawn from the "Base16 Alphabet" (as specified + * in Table 5 of RFC 4648) into their 4-bit positive integer equivalents. Characters that are not in the Base16 + * alphabet but fall within the bounds of the array are translated to -1. 
+ */ + private static final byte[] UPPER_CASE_DECODE_TABLE = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 00-0f + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 10-1f + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 20-2f + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, // 30-3f 0-9 + -1, 10, 11, 12, 13, 14, 15 // 40-46 A-F + }; + + /** + * This array is a lookup table that translates 4-bit positive integer index values into their "Base16 Alphabet" + * equivalents as specified in Table 5 of RFC 4648. + */ + private static final byte[] UPPER_CASE_ENCODE_TABLE = { + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', + 'A', 'B', 'C', 'D', 'E', 'F' + }; + + /** + * This array is a lookup table that translates Unicode characters drawn from the a lower-case "Base16 Alphabet" + * into their 4-bit positive integer equivalents. Characters that are not in the Base16 + * alphabet but fall within the bounds of the array are translated to -1. + */ + private static final byte[] LOWER_CASE_DECODE_TABLE = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 00-0f + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 10-1f + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 20-2f + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, // 30-3f 0-9 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 40-4f + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 50-5f + -1, 10, 11, 12, 13, 14, 15 // 60-66 a-f + }; + + /** + * This array is a lookup table that translates 4-bit positive integer index values into their "Base16 Alphabet" + * lower-case equivalents. 
+ */ + private static final byte[] LOWER_CASE_ENCODE_TABLE = { + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', + 'a', 'b', 'c', 'd', 'e', 'f' + }; + + /** Mask used to extract 4 bits, used when decoding character. */ + private static final int MASK_4BITS = 0x0f; + + /** + * Decode table to use. + */ + private final byte[] decodeTable; + + /** + * Encode table to use. + */ + private final byte[] encodeTable; + + /** + * Creates a Base16 codec used for decoding and encoding. + */ + public Base16() { + this(false); + } + + /** + * Creates a Base16 codec used for decoding and encoding. + * + * @param lowerCase if {@code true} then use a lower-case Base16 alphabet. + */ + public Base16(final boolean lowerCase) { + this(lowerCase, DECODING_POLICY_DEFAULT); + } + + /** + * Creates a Base16 codec used for decoding and encoding. + * + * @param lowerCase if {@code true} then use a lower-case Base16 alphabet. + * @param decodingPolicy Decoding policy. + */ + public Base16(final boolean lowerCase, final CodecPolicy decodingPolicy) { + super(BYTES_PER_UNENCODED_BLOCK, BYTES_PER_ENCODED_BLOCK, 0, 0, + PAD_DEFAULT, decodingPolicy); + if (lowerCase) { + this.encodeTable = LOWER_CASE_ENCODE_TABLE; + this.decodeTable = LOWER_CASE_DECODE_TABLE; + } else { + this.encodeTable = UPPER_CASE_ENCODE_TABLE; + this.decodeTable = UPPER_CASE_DECODE_TABLE; + } + } + + @Override + void decode(final byte[] data, int offset, final int length, final Context context) { + if (context.eof || length < 0) { + context.eof = true; + if (context.ibitWorkArea > 0) { + validateTrailingCharacter(); + } + return; + } + + final int dataLen = Math.min(data.length - offset, length); + final int availableChars = (context.ibitWorkArea > 0 ? 
1 : 0) + dataLen; + + // small optimisation to short-cut the rest of this method when it is fed byte-by-byte + if (availableChars == 1 && availableChars == dataLen) { + context.ibitWorkArea = decodeOctet(data[offset]) + 1; // store 1/2 byte for next invocation of decode, we offset by +1 as empty-value is 0 + return; + } + + // we must have an even number of chars to decode + final int charsToProcess = availableChars % BYTES_PER_ENCODED_BLOCK == 0 ? availableChars : availableChars - 1; + + final byte[] buffer = ensureBufferSize(charsToProcess / BYTES_PER_ENCODED_BLOCK, context); + + int result; + int i = 0; + if (dataLen < availableChars) { + // we have 1/2 byte from previous invocation to decode + result = (context.ibitWorkArea - 1) << BITS_PER_ENCODED_BYTE; + result |= decodeOctet(data[offset++]); + i = 2; + + buffer[context.pos++] = (byte)result; + + // reset to empty-value for next invocation! + context.ibitWorkArea = 0; + } + + while (i < charsToProcess) { + result = decodeOctet(data[offset++]) << BITS_PER_ENCODED_BYTE; + result |= decodeOctet(data[offset++]); + i += 2; + buffer[context.pos++] = (byte)result; + } + + // we have one char of a hex-pair left over + if (i < dataLen) { + context.ibitWorkArea = decodeOctet(data[i]) + 1; // store 1/2 byte for next invocation of decode, we offset by +1 as empty-value is 0 + } + } + + private int decodeOctet(final byte octet) { + int decoded = -1; + if (octet >= 0 && octet < decodeTable.length) { + decoded = decodeTable[octet]; + } + + if (decoded == -1) { + throw new IllegalArgumentException("Invalid octet in encoded value: " + (int)octet); + } + + return decoded; + } + + @Override + void encode(final byte[] data, final int offset, final int length, final Context context) { + if (context.eof) { + return; + } + + if (length < 0) { + context.eof = true; + return; + } + + final byte[] buffer = ensureBufferSize(length * BYTES_PER_ENCODED_BLOCK, context); + + final int end = offset + length; + for (int i = offset; i < end; 
i++) { + final int value = data[i]; + final int high = (value >> BITS_PER_ENCODED_BYTE) & MASK_4BITS; + final int low = value & MASK_4BITS; + buffer[context.pos++] = encodeTable[high]; + buffer[context.pos++] = encodeTable[low]; + } + } + + /** + * Returns whether or not the {@code octet} is in the Base16 alphabet. + * + * @param octet The value to test. + * + * @return {@code true} if the value is defined in the the Base16 alphabet {@code false} otherwise. + */ + @Override + public boolean isInAlphabet(final byte octet) { Review comment: OK. Using the lenient decoding policy to cover both case-insensitive decoding and allowing trailing characters is a mash-up of two separate concerns. A case-insensitive decoding table can be added as a separate option later. After all, you may wish to support case-insensitive decoding but still enforce no trailing characters. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org
