[GitHub] [flink] liyafan82 commented on a change in pull request #8689: [FLINK-12802][table-runtime-blink] Reducing the Code of BinaryString

GitBox Tue, 18 Jun 2019 00:17:50 -0700

liyafan82 commented on a change in pull request #8689: 
[FLINK-12802][table-runtime-blink] Reducing the Code of BinaryString
URL: https://github.com/apache/flink/pull/8689#discussion_r294640365


 ##########
 File path: 
flink-table/flink-table-runtime-blink/src/main/java/org/apache/flink/table/dataformat/BinaryStringUtil.java
 ##########
 @@ -0,0 +1,1134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.     See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.        You may obtain a copy of the License at
+ *
+ *             http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.table.dataformat;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.table.runtime.util.StringUtf8Utils;
+import org.apache.flink.table.util.SegmentsUtil;
+import org.apache.flink.table.utils.EncodingUtils;
+
+import java.math.BigDecimal;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static org.apache.flink.table.dataformat.BinaryString.EMPTY_UTF8;
+import static org.apache.flink.table.dataformat.BinaryString.fromAddress;
+import static org.apache.flink.table.dataformat.BinaryString.fromBytes;
+import static org.apache.flink.table.dataformat.BinaryString.fromString;
+import static 
org.apache.flink.table.dataformat.BinaryString.numBytesForFirstByte;
+
+/**
+ * Util for {@link BinaryString}.
+ */
+public class BinaryStringUtil {
+
+       public static final BinaryString[] EMPTY_STRING_ARRAY = new 
BinaryString[0];
+       private static final List<BinaryString> TRUE_STRINGS =
+                       Stream.of("t", "true", "y", "yes", "1")
+                                       .map(BinaryString::fromString)
+                                       .peek(BinaryString::ensureMaterialized)
+                                       .collect(Collectors.toList());
+
+       private static final List<BinaryString> FALSE_STRINGS =
+                       Stream.of("f", "false", "n", "no", "0")
+                                       .map(BinaryString::fromString)
+                                       .peek(BinaryString::ensureMaterialized)
+                                       .collect(Collectors.toList());
+
+       private static byte[] getTmpBytes(BinaryString str, int sizeInBytes) {
+               byte[] bytes = SegmentsUtil.allocateReuseBytes(sizeInBytes);
+               SegmentsUtil.copyToBytes(str.getSegments(), str.getOffset(), 
bytes, 0, sizeInBytes);
+               return bytes;
+       }
+
+       /**
+        * <p>Splits the provided text into an array, separator string 
specified. </p>
+        *
+        * <p>The separator is not included in the returned String array.
+        * Adjacent separators are treated as separators for empty tokens.</p>
+        *
+        * <p>A {@code null} separator splits on whitespace.</p>
+        *
+        * <pre>
+        * "".splitByWholeSeparatorPreserveAllTokens(*)                 = []
+        * "ab de fg".splitByWholeSeparatorPreserveAllTokens(null)      = 
["ab", "de", "fg"]
+        * "ab   de fg".splitByWholeSeparatorPreserveAllTokens(null)    = 
["ab", "", "", "de", "fg"]
+        * "ab:cd:ef".splitByWholeSeparatorPreserveAllTokens(":")       = 
["ab", "cd", "ef"]
+        * "ab-!-cd-!-ef".splitByWholeSeparatorPreserveAllTokens("-!-") = 
["ab", "cd", "ef"]
+        * </pre>
+        *
+        * <p>Note: return BinaryStrings is reuse MemorySegments from this.</p>
+        *
+        * @param separator  String containing the String to be used as a 
delimiter,
+        *  {@code null} splits on whitespace
+        * @return an array of parsed Strings, {@code null} if null String was 
input
+        */
+       public static BinaryString[] 
splitByWholeSeparatorPreserveAllTokens(BinaryString str, BinaryString 
separator) {
+               str.ensureMaterialized();
+               final int sizeInBytes = str.getSizeInBytes();
+               MemorySegment[] segments = str.getSegments();
+               int offset = str.getOffset();
+
+               if (sizeInBytes == 0) {
+                       return EMPTY_STRING_ARRAY;
+               }
+
+               if (separator == null || EMPTY_UTF8.equals(separator)) {
+                       // Split on whitespace.
+                       return splitByWholeSeparatorPreserveAllTokens(str, 
fromString(" "));
+               }
+               separator.ensureMaterialized();
+
+               int sepSize = separator.getSizeInBytes();
+               MemorySegment[] sepSegs = separator.getSegments();
+               int sepOffset = separator.getOffset();
+
+               final ArrayList<BinaryString> substrings = new ArrayList<>();
+               int beg = 0;
+               int end = 0;
+               while (end < sizeInBytes) {
+                       end = SegmentsUtil.find(
+                                       segments, offset + beg, sizeInBytes - 
beg,
+                                       sepSegs, sepOffset, sepSize) - offset;
+
+                       if (end > -1) {
+                               if (end > beg) {
+
+                                       // The following is OK, because 
String.substring( beg, end ) excludes
+                                       // the character at the position 'end'.
+                                       substrings.add(fromAddress(segments, 
offset + beg, end - beg));
+
+                                       // Set the starting point for the next 
search.
+                                       // The following is equivalent to beg = 
end + (separatorLength - 1) + 1,
+                                       // which is the right calculation:
+                                       beg = end + sepSize;
+                               } else {
+                                       // We found a consecutive occurrence of 
the separator.
+                                       substrings.add(EMPTY_UTF8);
+                                       beg = end + sepSize;
+                               }
+                       } else {
+                               // String.substring( beg ) goes from 'beg' to 
the end of the String.
+                               substrings.add(fromAddress(segments, offset + 
beg, sizeInBytes - beg));
+                               end = sizeInBytes;
+                       }
+               }
+
+               return substrings.toArray(new BinaryString[0]);
+       }
+
+       /**
+        * Decide boolean representation of a string.
+        */
+       public static Boolean toBooleanSQL(BinaryString str) {
+               BinaryString lowerCase = str.toLowerCase();
+               return TRUE_STRINGS.contains(lowerCase) ? Boolean.TRUE :
+                               (FALSE_STRINGS.contains(lowerCase) ? 
Boolean.FALSE : null);
+       }
+
+       /**
+        * Calculate the hash value of a given string use {@link MessageDigest}.
+        */
+       public static BinaryString hash(BinaryString str, MessageDigest md) {
+               return fromString(EncodingUtils.hex(md.digest(str.getBytes())));
+       }
+
+       public static BinaryString hash(BinaryString str, String algorithm) 
throws NoSuchAlgorithmException {
+               return hash(str, MessageDigest.getInstance(algorithm));
+       }
+
+       /**
+        * Parses this BinaryString to Decimal.
+        *
+        * @return Decimal value if the parsing was successful, or null if 
overflow
+        * @throws NumberFormatException if the parsing failed.
+        */
+       public static Decimal toDecimal(BinaryString str, int precision, int 
scale) {
+               str.ensureMaterialized();
+
+               if (precision > Decimal.MAX_LONG_DIGITS || str.getSizeInBytes() 
> Decimal.MAX_LONG_DIGITS) {
+                       return toBigPrecisionDecimal(str, precision, scale);
+               }
+
+               int sizeInBytes = str.getSizeInBytes();
+               return toDecimalFromBytes(precision, scale, getTmpBytes(str, 
sizeInBytes), 0, sizeInBytes);
+       }
+
+       private static Decimal toDecimalFromBytes(
+                       int precision, int scale, byte[] bytes, int offset, int 
sizeInBytes) {
+               // Data in Decimal is stored by one long value if `precision` 
<= Decimal.MAX_LONG_DIGITS.
+               // In this case we can directly extract the value from memory 
segment.
+               int i = 0;
+
+               // Remove white spaces at the beginning
+               byte b = 0;
+               while (i < sizeInBytes) {
+                       b = bytes[offset + i];
+                       if (b != ' ' && b != '\n' && b != '\t') {
+                               break;
+                       }
+                       i++;
+               }
+               if (i == sizeInBytes) {
+                       // all whitespaces
+                       return null;
+               }
+
+               // ======= Significand part begin =======
 
 Review comment:
   Significant

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

[GitHub] [flink] liyafan82 commented on a change in pull request #8689: [FLINK-12802][table-runtime-blink] Reducing the Code of BinaryString

Reply via email to