Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/16550#discussion_r95588520 --- Diff: common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java --- @@ -835,6 +835,185 @@ public UTF8String translate(Map<Character, Character> dict) { return fromString(sb.toString()); } + private int getDigit(byte b) { + if (b >= '0' && b <= '9') { + return b - '0'; + } + throw new NumberFormatException(toString()); + } + + /** + * Parses this UTF8String to long. + * + * Note that, in this method we accumulate the result in negative format, and convert it to + * positive format at the end, if this string is not started with '-'. This is because min value + * is bigger than max value in digits, e.g. Integer.MAX_VALUE is '2147483647' and + * Integer.MIN_VALUE is '-2147483648'. + * + * These codes are mostly copied from LazyLong.parseLong in Hive. + */ + public long toLong() { + if (numBytes == 0) { + throw new NumberFormatException("Empty string!"); + } + + byte b = getByte(0); + final boolean negative = b == '-'; + int offset = 0; + if (negative || b == '+') { + offset++; + if (numBytes == 1) { + throw new NumberFormatException(toString()); + } + } + + final byte separator = '.'; + final long stopValue = Long.MIN_VALUE / 10; + long result = 0; + + while (offset < numBytes) { + b = getByte(offset); + offset++; + if (b == separator) { + // We allow decimals and will return a truncated integral in that case. + // Therefore we won't throw an exception here (checking the fractional + // part happens below.) + break; + } + + int digit = getDigit(b); + // We are going to process the new digit and accumulate the result. However, before doing + // this, if the result is already smaller than the stopValue(Long.MIN_VALUE / 10), then + // result * 10 will definitely be smaller than minValue, and we can stop and throw exception. 
+ if (result < stopValue) { + throw new NumberFormatException(toString()); + } + + result = result * 10 - digit; + // Since the previous result is less than or equal to stopValue(Long.MIN_VALUE / 10), we can + // just use `result > 0` to check overflow. If result overflows, we should stop and throw + // exception. + if (result > 0) { + throw new NumberFormatException(toString()); + } + } + + // This is the case when we've encountered a decimal separator. The fractional + // part will not change the number, but we will verify that the fractional part + // is well formed. + while (offset < numBytes) { + if (getDigit(getByte(offset)) == -1) { + throw new NumberFormatException(toString()); + } + offset++; + } + + if (!negative) { + result = -result; + if (result < 0) { + throw new NumberFormatException(toString()); + } + } + + return result; + } + + /** + * Parses this UTF8String to int. + * + * Note that, in this method we accumulate the result in negative format, and convert it to + * positive format at the end, if this string is not started with '-'. This is because min value + * is bigger than max value in digits, e.g. Integer.MAX_VALUE is '2147483647' and + * Integer.MIN_VALUE is '-2147483648'. + * + * These codes are mostly copied from LazyInt.parseInt in Hive. + */ + public int toInt() { --- End diff -- Hive also duplicates the code for parsing to long and int, I'm not sure how to remove the duplication without hurting the performance.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes to enable it, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org