Github user original-brownbear commented on a diff in the pull request: https://github.com/apache/spark/pull/19180#discussion_r138990727 --- Diff: common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java --- @@ -1097,10 +1100,23 @@ public UTF8String copy() { @Override public int compareTo(@Nonnull final UTF8String other) { int len = Math.min(numBytes, other.numBytes); - // TODO: compare 8 bytes as unsigned long - for (int i = 0; i < len; i ++) { + int wordMax = len & ~7; + long roffset = other.offset; + Object rbase = other.base; + for (int i = 0; i < wordMax; i += 8) { + long left = getLong(base, offset + i); + long right = getLong(rbase, roffset + i); + if (left != right) { + if (IS_LITTLE_ENDIAN) { + return UnsignedLongs.compare(Long.reverseBytes(left), Long.reverseBytes(right)); + } else { + return UnsignedLongs.compare(left, right); + } + } + } + for (int i = wordMax; i < len; i++) { // In UTF-8, the byte should be unsigned, so we should compare them as unsigned int. - int res = (getByte(i) & 0xFF) - (other.getByte(i) & 0xFF); + int res = (getByte(i) & 0xFF) - (Platform.getByte(rbase, roffset + i) & 0xFF); --- End diff -- @srowen yea this was visible in the benchmark. Likely because `base` and `offset` aren't `final`, so you actually do the field lookup on every invocation I think.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org