[GitHub] [flink] liyafan82 commented on a change in pull request #8689: [FLINK-12802][table-runtime-blink] Reducing the Code of BinaryString

GitBox Tue, 18 Jun 2019 00:22:24 -0700

liyafan82 commented on a change in pull request #8689: 
[FLINK-12802][table-runtime-blink] Reducing the Code of BinaryString
URL: https://github.com/apache/flink/pull/8689#discussion_r294642045


 ##########
 File path: 
flink-table/flink-table-runtime-blink/src/main/java/org/apache/flink/table/dataformat/BinaryStringUtil.java
 ##########
 @@ -0,0 +1,1134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.     See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.        You may obtain a copy of the License at
+ *
+ *             http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.table.dataformat;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.table.runtime.util.StringUtf8Utils;
+import org.apache.flink.table.util.SegmentsUtil;
+import org.apache.flink.table.utils.EncodingUtils;
+
+import java.math.BigDecimal;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static org.apache.flink.table.dataformat.BinaryString.EMPTY_UTF8;
+import static org.apache.flink.table.dataformat.BinaryString.fromAddress;
+import static org.apache.flink.table.dataformat.BinaryString.fromBytes;
+import static org.apache.flink.table.dataformat.BinaryString.fromString;
+import static 
org.apache.flink.table.dataformat.BinaryString.numBytesForFirstByte;
+
+/**
+ * Util for {@link BinaryString}.
+ */
+public class BinaryStringUtil {
+
+       public static final BinaryString[] EMPTY_STRING_ARRAY = new 
BinaryString[0];
+       private static final List<BinaryString> TRUE_STRINGS =
+                       Stream.of("t", "true", "y", "yes", "1")
+                                       .map(BinaryString::fromString)
+                                       .peek(BinaryString::ensureMaterialized)
+                                       .collect(Collectors.toList());
+
+       private static final List<BinaryString> FALSE_STRINGS =
+                       Stream.of("f", "false", "n", "no", "0")
+                                       .map(BinaryString::fromString)
+                                       .peek(BinaryString::ensureMaterialized)
+                                       .collect(Collectors.toList());
+
+       private static byte[] getTmpBytes(BinaryString str, int sizeInBytes) {
+               byte[] bytes = SegmentsUtil.allocateReuseBytes(sizeInBytes);
+               SegmentsUtil.copyToBytes(str.getSegments(), str.getOffset(), 
bytes, 0, sizeInBytes);
+               return bytes;
+       }
+
+       /**
+        * <p>Splits the provided text into an array, separator string 
specified. </p>
+        *
+        * <p>The separator is not included in the returned String array.
+        * Adjacent separators are treated as separators for empty tokens.</p>
+        *
+        * <p>A {@code null} separator splits on whitespace.</p>
+        *
+        * <pre>
+        * "".splitByWholeSeparatorPreserveAllTokens(*)                 = []
+        * "ab de fg".splitByWholeSeparatorPreserveAllTokens(null)      = 
["ab", "de", "fg"]
+        * "ab   de fg".splitByWholeSeparatorPreserveAllTokens(null)    = 
["ab", "", "", "de", "fg"]
+        * "ab:cd:ef".splitByWholeSeparatorPreserveAllTokens(":")       = 
["ab", "cd", "ef"]
+        * "ab-!-cd-!-ef".splitByWholeSeparatorPreserveAllTokens("-!-") = 
["ab", "cd", "ef"]
+        * </pre>
+        *
+        * <p>Note: return BinaryStrings is reuse MemorySegments from this.</p>
+        *
+        * @param separator  String containing the String to be used as a 
delimiter,
+        *  {@code null} splits on whitespace
+        * @return an array of parsed Strings, {@code null} if null String was 
input
+        */
+       public static BinaryString[] 
splitByWholeSeparatorPreserveAllTokens(BinaryString str, BinaryString 
separator) {
+               str.ensureMaterialized();
+               final int sizeInBytes = str.getSizeInBytes();
+               MemorySegment[] segments = str.getSegments();
+               int offset = str.getOffset();
+
+               if (sizeInBytes == 0) {
+                       return EMPTY_STRING_ARRAY;
+               }
+
+               if (separator == null || EMPTY_UTF8.equals(separator)) {
+                       // Split on whitespace.
+                       return splitByWholeSeparatorPreserveAllTokens(str, 
fromString(" "));
+               }
+               separator.ensureMaterialized();
+
+               int sepSize = separator.getSizeInBytes();
+               MemorySegment[] sepSegs = separator.getSegments();
+               int sepOffset = separator.getOffset();
+
+               final ArrayList<BinaryString> substrings = new ArrayList<>();
+               int beg = 0;
+               int end = 0;
+               while (end < sizeInBytes) {
+                       end = SegmentsUtil.find(
+                                       segments, offset + beg, sizeInBytes - 
beg,
+                                       sepSegs, sepOffset, sepSize) - offset;
+
+                       if (end > -1) {
+                               if (end > beg) {
+
+                                       // The following is OK, because 
String.substring( beg, end ) excludes
+                                       // the character at the position 'end'.
+                                       substrings.add(fromAddress(segments, 
offset + beg, end - beg));
+
+                                       // Set the starting point for the next 
search.
+                                       // The following is equivalent to beg = 
end + (separatorLength - 1) + 1,
+                                       // which is the right calculation:
+                                       beg = end + sepSize;
+                               } else {
+                                       // We found a consecutive occurrence of 
the separator.
+                                       substrings.add(EMPTY_UTF8);
+                                       beg = end + sepSize;
+                               }
+                       } else {
+                               // String.substring( beg ) goes from 'beg' to 
the end of the String.
+                               substrings.add(fromAddress(segments, offset + 
beg, sizeInBytes - beg));
+                               end = sizeInBytes;
+                       }
+               }
+
+               return substrings.toArray(new BinaryString[0]);
+       }
+
+       /**
+        * Decide boolean representation of a string.
+        */
+       public static Boolean toBooleanSQL(BinaryString str) {
+               BinaryString lowerCase = str.toLowerCase();
+               return TRUE_STRINGS.contains(lowerCase) ? Boolean.TRUE :
+                               (FALSE_STRINGS.contains(lowerCase) ? 
Boolean.FALSE : null);
+       }
+
+       /**
+        * Calculate the hash value of a given string use {@link MessageDigest}.
+        */
+       public static BinaryString hash(BinaryString str, MessageDigest md) {
+               return fromString(EncodingUtils.hex(md.digest(str.getBytes())));
+       }
+
+       public static BinaryString hash(BinaryString str, String algorithm) 
throws NoSuchAlgorithmException {
+               return hash(str, MessageDigest.getInstance(algorithm));
+       }
+
+       /**
+        * Parses this BinaryString to Decimal.
+        *
+        * @return Decimal value if the parsing was successful, or null if 
overflow
+        * @throws NumberFormatException if the parsing failed.
+        */
+       public static Decimal toDecimal(BinaryString str, int precision, int 
scale) {
+               str.ensureMaterialized();
+
+               if (precision > Decimal.MAX_LONG_DIGITS || str.getSizeInBytes() 
> Decimal.MAX_LONG_DIGITS) {
+                       return toBigPrecisionDecimal(str, precision, scale);
+               }
+
+               int sizeInBytes = str.getSizeInBytes();
+               return toDecimalFromBytes(precision, scale, getTmpBytes(str, 
sizeInBytes), 0, sizeInBytes);
+       }
+
+       private static Decimal toDecimalFromBytes(
+                       int precision, int scale, byte[] bytes, int offset, int 
sizeInBytes) {
+               // Data in Decimal is stored by one long value if `precision` 
<= Decimal.MAX_LONG_DIGITS.
+               // In this case we can directly extract the value from memory 
segment.
+               int i = 0;
+
+               // Remove white spaces at the beginning
+               byte b = 0;
+               while (i < sizeInBytes) {
+                       b = bytes[offset + i];
+                       if (b != ' ' && b != '\n' && b != '\t') {
+                               break;
+                       }
+                       i++;
+               }
+               if (i == sizeInBytes) {
+                       // all whitespaces
+                       return null;
+               }
+
+               // ======= Significand part begin =======
+               final boolean negative = b == '-';
+               if (negative || b == '+') {
+                       i++;
+                       if (i == sizeInBytes) {
+                               // only contains prefix plus/minus
+                               return null;
+                       }
+               }
+
+               long significand = 0;
+               int exp = 0;
+               int significandLen = 0, pointPos = -1;
+
+               while (i < sizeInBytes) {
+                       b = bytes[offset + i];
+                       i++;
+
+                       if (b >= '0' && b <= '9') {
+                               // No need to worry about overflow, because 
sizeInBytes <= Decimal.MAX_LONG_DIGITS
+                               significand = significand * 10 + (b - '0');
+                               significandLen++;
+                       } else if (b == '.') {
+                               if (pointPos >= 0) {
+                                       // More than one decimal point
+                                       return null;
+                               }
+                               pointPos = significandLen;
+                       } else {
+                               break;
+                       }
+               }
+
+               if (pointPos < 0) {
+                       pointPos = significandLen;
+               }
+               if (negative) {
+                       significand = -significand;
+               }
+               // ======= Significand part end =======
+
+               // ======= Exponential part begin =======
+               if ((b == 'e' || b == 'E') && i < sizeInBytes) {
+                       b = bytes[offset + i];
+                       final boolean expNegative = b == '-';
+                       if (expNegative || b == '+') {
+                               i++;
+                               if (i == sizeInBytes) {
+                                       return null;
+                               }
+                       }
+
+                       int expDigits = 0;
+                       // As `precision` <= 18, value absolute range is 
limited to 10^-18 ~ 10^18.
+                       // The worst case is <18-digits>E-36
+                       final int expStopValue = 40;
+
+                       while (i < sizeInBytes) {
+                               b = bytes[offset + i];
+                               i++;
+
+                               if (b >= '0' && b <= '9') {
+                                       // No need to worry about larger 
exponents,
+                                       // because they will produce overflow 
or underflow
+                                       if (expDigits < expStopValue) {
+                                               expDigits = expDigits * 10 + (b 
- '0');
+                                       }
+                               } else {
+                                       break;
+                               }
+                       }
+
+                       if (expNegative) {
+                               expDigits = -expDigits;
+                       }
+                       exp += expDigits;
+               }
+               exp -= significandLen - pointPos;
+               // ======= Exponential part end =======
+
+               // Check for invalid character at the end
+               while (i < sizeInBytes) {
+                       b = bytes[offset + i];
+                       i++;
+                       // White spaces are allowed at the end
+                       if (b != ' ' && b != '\n' && b != '\t') {
+                               return null;
+                       }
+               }
+
+               // Round exp to scale
+               int change = exp + scale;
+               if (significandLen + change > precision) {
+                       // Overflow
+                       return null;
+               }
+               if (change >= 0) {
+                       significand *= Decimal.POW10[change];
+               } else {
+                       int k = negative ? -5 : 5;
+                       significand = (significand + k * Decimal.POW10[-change 
- 1]) / Decimal.POW10[-change];
+               }
+               return Decimal.fromLong(significand, precision, scale);
+       }
+
+       private static Decimal toBigPrecisionDecimal(BinaryString str, int 
precision, int scale) {
+               // As data in Decimal is currently stored by BigDecimal if 
`precision` > Decimal.MAX_LONG_DIGITS,
+               // and BigDecimal only supports String or char[] for its 
constructor,
+               // we can't directly extract the value from BinaryString.
+               //
+               // As BigDecimal(char[], int, int) is faster than 
BigDecimal(String, int, int),
+               // we extract char[] from the memory segment and pass it to the 
constructor of BigDecimal.
+               int sizeInBytes = str.getSizeInBytes();
+               int offset = str.getOffset();
+               MemorySegment[] segments = str.getSegments();
+               char[] chars = SegmentsUtil.allocateReuseChars(sizeInBytes);
+               int len;
+               if (segments.length == 1) {
+                       len = StringUtf8Utils.decodeUTF8Strict(segments[0], 
offset, sizeInBytes, chars);
+               } else {
+                       byte[] bytes = 
SegmentsUtil.allocateReuseBytes(sizeInBytes);
+                       SegmentsUtil.copyToBytes(segments, offset, bytes, 0, 
sizeInBytes);
+                       len = StringUtf8Utils.decodeUTF8Strict(bytes, 0, 
sizeInBytes, chars);
+               }
+
+               if (len < 0) {
+                       return null;
+               } else {
+                       // Trim white spaces
+                       int start = 0, end = len;
+                       for (int i = 0; i < len; i++) {
+                               if (chars[i] != ' ' && chars[i] != '\n' && 
chars[i] != '\t') {
+                                       start = i;
+                                       break;
+                               }
+                       }
+                       for (int i = len - 1; i >= 0; i--) {
+                               if (chars[i] != ' ' && chars[i] != '\n' && 
chars[i] != '\t') {
+                                       end = i + 1;
+                                       break;
+                               }
+                       }
+                       try {
+                               BigDecimal bd = new BigDecimal(chars, start, 
end - start);
+                               return Decimal.fromBigDecimal(bd, precision, 
scale);
+                       } catch (NumberFormatException nfe) {
+                               return null;
+                       }
+               }
+       }
+
+       /**
+        * Parses this BinaryString to Long.
+        *
+        * <p>Note that, in this method we accumulate the result in negative 
format, and convert it to
+        * positive format at the end, if this string is not started with '-'. 
This is because min value
+        * is bigger than max value in digits, e.g. Long.MAX_VALUE is 
'9223372036854775807' and
+        * Long.MIN_VALUE is '-9223372036854775808'.
+        *
+        * <p>This code is mostly copied from LazyLong.parseLong in Hive.
+        * @return Long value if the parsing was successful else null.
+        */
+       public static Long toLong(BinaryString str) {
+               int sizeInBytes = str.getSizeInBytes();
+               byte[] tmpBytes = getTmpBytes(str, sizeInBytes);
+               if (sizeInBytes == 0) {
+                       return null;
+               }
+               int i = 0;
+
+               byte b = tmpBytes[i];
+               final boolean negative = b == '-';
+               if (negative || b == '+') {
+                       i++;
+                       if (sizeInBytes == 1) {
+                               return null;
+                       }
+               }
+
+               long result = 0;
+               final byte separator = '.';
+               final int radix = 10;
+               final long stopValue = Long.MIN_VALUE / radix;
+               while (i < sizeInBytes) {
+                       b = tmpBytes[i];
+                       i++;
+                       if (b == separator) {
+                               // We allow decimals and will return a 
truncated integral in that case.
+                               // Therefore we won't throw an exception here 
(checking the fractional
+                               // part happens below.)
+                               break;
+                       }
+
+                       int digit;
+                       if (b >= '0' && b <= '9') {
+                               digit = b - '0';
+                       } else {
+                               return null;
+                       }
+
+                       // We are going to process the new digit and accumulate 
the result. However, before
+                       // doing this, if the result is already smaller than the
+                       // stopValue(Long.MIN_VALUE / radix), then result * 10 
will definitely be smaller
+                       // than minValue, and we can stop.
+                       if (result < stopValue) {
+                               return null;
+                       }
+
+                       result = result * radix - digit;
+                       // Since the previous result is less than or equal to
+                       // stopValue(Long.MIN_VALUE / radix), we can just use 
`result > 0` to check overflow.
+                       // If result overflows, we should stop.
+                       if (result > 0) {
+                               return null;
+                       }
+               }
+
+               // This is the case when we've encountered a decimal separator. 
The fractional
+               // part will not change the number, but we will verify that the 
fractional part
+               // is well formed.
+               while (i < sizeInBytes) {
+                       byte currentByte = tmpBytes[i];
+                       if (currentByte < '0' || currentByte > '9') {
+                               return null;
+                       }
+                       i++;
+               }
+
+               if (!negative) {
+                       result = -result;
+                       if (result < 0) {
+                               return null;
+                       }
+               }
+               return result;
+       }
+
+       /**
+        * Parses this BinaryString to Int.
+        *
+        * <p>Note that, in this method we accumulate the result in negative 
format, and convert it to
+        * positive format at the end, if this string is not started with '-'. 
This is because min value
+        * is bigger than max value in digits, e.g. Integer.MAX_VALUE is 
'2147483647' and
+        * Integer.MIN_VALUE is '-2147483648'.
+        *
+        * <p>This code is mostly copied from LazyInt.parseInt in Hive.
+        *
+        * <p>Note that, this method is almost same as `toLong`, but we leave 
it duplicated for performance
+        * reasons, like Hive does.
+        * @return Integer value if the parsing was successful else null.
+        */
+       public static Integer toInt(BinaryString str) {
 
 Review comment:
   I think we could get better performance by using the primitive type int here instead of the boxed Integer.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

[GitHub] [flink] liyafan82 commented on a change in pull request #8689: [FLINK-12802][table-runtime-blink] Reducing the Code of BinaryString

Reply via email to