liyafan82 commented on a change in pull request #8689: [FLINK-12802][table-runtime-blink] Reducing the Code of BinaryString URL: https://github.com/apache/flink/pull/8689#discussion_r294640365
########## File path: flink-table/flink-table-runtime-blink/src/main/java/org/apache/flink/table/dataformat/BinaryStringUtil.java ########## @@ -0,0 +1,1134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.table.dataformat; + +import org.apache.flink.core.memory.MemorySegment; +import org.apache.flink.table.runtime.util.StringUtf8Utils; +import org.apache.flink.table.util.SegmentsUtil; +import org.apache.flink.table.utils.EncodingUtils; + +import java.math.BigDecimal; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.flink.table.dataformat.BinaryString.EMPTY_UTF8; +import static org.apache.flink.table.dataformat.BinaryString.fromAddress; +import static org.apache.flink.table.dataformat.BinaryString.fromBytes; +import static org.apache.flink.table.dataformat.BinaryString.fromString; +import static org.apache.flink.table.dataformat.BinaryString.numBytesForFirstByte; + +/** + * Util for {@link BinaryString}. 
+ */ +public class BinaryStringUtil { + + public static final BinaryString[] EMPTY_STRING_ARRAY = new BinaryString[0]; + private static final List<BinaryString> TRUE_STRINGS = + Stream.of("t", "true", "y", "yes", "1") + .map(BinaryString::fromString) + .peek(BinaryString::ensureMaterialized) + .collect(Collectors.toList()); + + private static final List<BinaryString> FALSE_STRINGS = + Stream.of("f", "false", "n", "no", "0") + .map(BinaryString::fromString) + .peek(BinaryString::ensureMaterialized) + .collect(Collectors.toList()); + + private static byte[] getTmpBytes(BinaryString str, int sizeInBytes) { + byte[] bytes = SegmentsUtil.allocateReuseBytes(sizeInBytes); + SegmentsUtil.copyToBytes(str.getSegments(), str.getOffset(), bytes, 0, sizeInBytes); + return bytes; + } + + /** + * <p>Splits the provided text into an array, separator string specified. </p> + * + * <p>The separator is not included in the returned String array. + * Adjacent separators are treated as separators for empty tokens.</p> + * + * <p>A {@code null} separator splits on whitespace.</p> + * + * <pre> + * "".splitByWholeSeparatorPreserveAllTokens(*) = [] + * "ab de fg".splitByWholeSeparatorPreserveAllTokens(null) = ["ab", "de", "fg"] + * "ab   de fg".splitByWholeSeparatorPreserveAllTokens(null) = ["ab", "", "", "de", "fg"] + * "ab:cd:ef".splitByWholeSeparatorPreserveAllTokens(":") = ["ab", "cd", "ef"] + * "ab-!-cd-!-ef".splitByWholeSeparatorPreserveAllTokens("-!-") = ["ab", "cd", "ef"] + * </pre> + * + * <p>Note: return BinaryStrings is reuse MemorySegments from this.</p> + * + * @param separator String containing the String to be used as a delimiter, + * {@code null} splits on whitespace + * @return an array of parsed Strings, {@code null} if null String was input + */ + public static BinaryString[] splitByWholeSeparatorPreserveAllTokens(BinaryString str, BinaryString separator) { + str.ensureMaterialized(); + final int sizeInBytes = str.getSizeInBytes(); + MemorySegment[] segments = 
str.getSegments(); + int offset = str.getOffset(); + + if (sizeInBytes == 0) { + return EMPTY_STRING_ARRAY; + } + + if (separator == null || EMPTY_UTF8.equals(separator)) { + // Split on whitespace. + return splitByWholeSeparatorPreserveAllTokens(str, fromString(" ")); + } + separator.ensureMaterialized(); + + int sepSize = separator.getSizeInBytes(); + MemorySegment[] sepSegs = separator.getSegments(); + int sepOffset = separator.getOffset(); + + final ArrayList<BinaryString> substrings = new ArrayList<>(); + int beg = 0; + int end = 0; + while (end < sizeInBytes) { + end = SegmentsUtil.find( + segments, offset + beg, sizeInBytes - beg, + sepSegs, sepOffset, sepSize) - offset; + + if (end > -1) { + if (end > beg) { + + // The following is OK, because String.substring( beg, end ) excludes + // the character at the position 'end'. + substrings.add(fromAddress(segments, offset + beg, end - beg)); + + // Set the starting point for the next search. + // The following is equivalent to beg = end + (separatorLength - 1) + 1, + // which is the right calculation: + beg = end + sepSize; + } else { + // We found a consecutive occurrence of the separator. + substrings.add(EMPTY_UTF8); + beg = end + sepSize; + } + } else { + // String.substring( beg ) goes from 'beg' to the end of the String. + substrings.add(fromAddress(segments, offset + beg, sizeInBytes - beg)); + end = sizeInBytes; + } + } + + return substrings.toArray(new BinaryString[0]); + } + + /** + * Decide boolean representation of a string. + */ + public static Boolean toBooleanSQL(BinaryString str) { + BinaryString lowerCase = str.toLowerCase(); + return TRUE_STRINGS.contains(lowerCase) ? Boolean.TRUE : + (FALSE_STRINGS.contains(lowerCase) ? Boolean.FALSE : null); + } + + /** + * Calculate the hash value of a given string use {@link MessageDigest}. 
+ */ + public static BinaryString hash(BinaryString str, MessageDigest md) { + return fromString(EncodingUtils.hex(md.digest(str.getBytes()))); + } + + public static BinaryString hash(BinaryString str, String algorithm) throws NoSuchAlgorithmException { + return hash(str, MessageDigest.getInstance(algorithm)); + } + + /** + * Parses this BinaryString to Decimal. + * + * @return Decimal value if the parsing was successful, or null if overflow + * @throws NumberFormatException if the parsing failed. + */ + public static Decimal toDecimal(BinaryString str, int precision, int scale) { + str.ensureMaterialized(); + + if (precision > Decimal.MAX_LONG_DIGITS || str.getSizeInBytes() > Decimal.MAX_LONG_DIGITS) { + return toBigPrecisionDecimal(str, precision, scale); + } + + int sizeInBytes = str.getSizeInBytes(); + return toDecimalFromBytes(precision, scale, getTmpBytes(str, sizeInBytes), 0, sizeInBytes); + } + + private static Decimal toDecimalFromBytes( + int precision, int scale, byte[] bytes, int offset, int sizeInBytes) { + // Data in Decimal is stored by one long value if `precision` <= Decimal.MAX_LONG_DIGITS. + // In this case we can directly extract the value from memory segment. + int i = 0; + + // Remove white spaces at the beginning + byte b = 0; + while (i < sizeInBytes) { + b = bytes[offset + i]; + if (b != ' ' && b != '\n' && b != '\t') { + break; + } + i++; + } + if (i == sizeInBytes) { + // all whitespaces + return null; + } + + // ======= Significand part begin ======= Review comment: Significand -> Significant; begin -> begins ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
