Repository: tajo Updated Branches: refs/heads/master 36f691d9f -> 45100ced2
TAJO-2119: Invalid sort result when sort key columns contain non-ascii values. Closes #998 Project: http://git-wip-us.apache.org/repos/asf/tajo/repo Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/45100ced Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/45100ced Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/45100ced Branch: refs/heads/master Commit: 45100ced2c44e92eed1c24bc58a118e74f58d277 Parents: 36f691d Author: Jihoon Son <[email protected]> Authored: Fri Apr 15 14:33:02 2016 +0900 Committer: Jihoon Son <[email protected]> Committed: Fri Apr 15 14:33:02 2016 +0900 ---------------------------------------------------------------------- CHANGES | 3 + .../memory/UnSafeTupleBytesComparator.java | 52 +++++++---------- .../memory/TestUnSafeTupleBytesComparator.java | 61 ++++++++++++++++++++ 3 files changed, 84 insertions(+), 32 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tajo/blob/45100ced/CHANGES ---------------------------------------------------------------------- diff --git a/CHANGES b/CHANGES index cefb3df..7565faf 100644 --- a/CHANGES +++ b/CHANGES @@ -122,6 +122,9 @@ Release 0.12.0 - unreleased BUG FIXES + TAJO-2119: Invalid sort result when sort key columns contain non-ascii values. + (jihoon) + TAJO-2100: Add missing cancellation in defaultTaskScheduler when a worker is no respond. (jinho) http://git-wip-us.apache.org/repos/asf/tajo/blob/45100ced/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTupleBytesComparator.java ---------------------------------------------------------------------- diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTupleBytesComparator.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTupleBytesComparator.java index 53a78a8..50c4345 100644 --- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTupleBytesComparator.java +++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTupleBytesComparator.java @@ -19,6 +19,7 @@ package org.apache.tajo.tuple.memory; import com.google.common.primitives.Longs; +import com.google.common.primitives.UnsignedBytes; import com.google.common.primitives.UnsignedLongs; import org.apache.tajo.util.SizeOf; import org.apache.tajo.util.UnsafeUtil; @@ -31,10 +32,10 @@ import java.nio.ByteOrder; */ public class UnSafeTupleBytesComparator { private static final Unsafe UNSAFE = UnsafeUtil.unsafe; + private static final int UNSIGNED_MASK = 0xFF; + private static final boolean littleEndian = ByteOrder.nativeOrder().equals(ByteOrder.LITTLE_ENDIAN); - static final boolean littleEndian = - ByteOrder.nativeOrder().equals(ByteOrder.LITTLE_ENDIAN); - + // This code is borrowed from Guava's UnsignedBytes:::compare() public static int compare(long ptr1, long ptr2) { int lstrLen = UNSAFE.getInt(ptr1); int rstrLen = UNSAFE.getInt(ptr2); @@ -45,42 +46,29 @@ public class UnSafeTupleBytesComparator { int minLength = Math.min(lstrLen, rstrLen); int minWords = minLength / Longs.BYTES; - /* - * Compare 8 bytes at a time. Benchmarking shows comparing 8 bytes at a - * time is no slower than comparing 4 bytes at a time even on 32-bit. - * On the other hand, it is substantially faster on 64-bit. - */ + /* + * Compare 8 bytes at a time. Benchmarking shows comparing 8 bytes at a + * time is no slower than comparing 4 bytes at a time even on 32-bit. + * On the other hand, it is substantially faster on 64-bit. + */ for (int i = 0; i < minWords * Longs.BYTES; i += Longs.BYTES) { long lw = UNSAFE.getLong(ptr1); long rw = UNSAFE.getLong(ptr2); - long diff = lw ^ rw; - if (diff != 0) { + if (lw != rw) { if (!littleEndian) { return UnsignedLongs.compare(lw, rw); } - // Use binary search - int n = 0; - int y; - int x = (int) diff; - if (x == 0) { - x = (int) (diff >>> 32); - n = 32; - } - - y = x << 16; - if (y == 0) { - n += 16; - } else { - x = y; - } - - y = x << 8; - if (y == 0) { - n += 8; - } - return (int) (((lw >>> n) & 0xFFL) - ((rw >>> n) & 0xFFL)); + /* + * We want to compare only the first index where left[index] != right[index]. + * This corresponds to the least significant nonzero byte in lw ^ rw, since lw + * and rw are little-endian. Long.numberOfTrailingZeros(diff) tells us the least + * significant nonzero bit, and zeroing out the first three bits of L.nTZ gives us the + * shift to get that least significant nonzero byte. + */ + int n = Long.numberOfTrailingZeros(lw ^ rw) & ~0x7; + return (int) (((lw >>> n) & UNSIGNED_MASK) - ((rw >>> n) & UNSIGNED_MASK)); } ptr1 += SizeOf.SIZE_OF_LONG; @@ -89,7 +77,7 @@ public class UnSafeTupleBytesComparator { // The epilogue to cover the last (minLength % 8) elements. for (int i = minWords * Longs.BYTES; i < minLength; i++) { - int result = UNSAFE.getByte(ptr1++) - UNSAFE.getByte(ptr2++); + int result = UnsignedBytes.compare(UNSAFE.getByte(ptr1++), UNSAFE.getByte(ptr2++)); if (result != 0) { return result; } http://git-wip-us.apache.org/repos/asf/tajo/blob/45100ced/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestUnSafeTupleBytesComparator.java ---------------------------------------------------------------------- diff --git a/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestUnSafeTupleBytesComparator.java b/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestUnSafeTupleBytesComparator.java new file mode 100644 index 0000000..b2bde40 --- /dev/null +++ b/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestUnSafeTupleBytesComparator.java @@ -0,0 +1,61 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.tuple.memory; + +import org.apache.tajo.common.TajoDataTypes.DataType; +import org.apache.tajo.common.TajoDataTypes.Type; +import org.apache.tajo.tuple.RowBlockReader; +import org.junit.Test; + +import static org.junit.Assert.assertTrue; + +public class TestUnSafeTupleBytesComparator { + + @Test + public void testCompare() { + MemoryRowBlock rowBlock = null; + try { + rowBlock = new MemoryRowBlock(new DataType[]{ + DataType.newBuilder().setType(Type.TEXT).build() + }); + RowWriter builder = rowBlock.getWriter(); + builder.startRow(); + builder.putText("CÃTE D'IVOIRE"); + builder.endRow(); + builder.startRow(); + builder.putText("CANADA"); + builder.endRow(); + + RowBlockReader reader = rowBlock.getReader(); + + UnSafeTuple t1 = new UnSafeTuple(); + UnSafeTuple t2 = new UnSafeTuple(); + reader.next(t1); + reader.next(t2); + + // 'CÃTE D'IVOIRE' should occur later than 'CANADA' in ascending order. + int compare = UnSafeTupleBytesComparator.compare(t1.getFieldAddr(0), t2.getFieldAddr(0)); + assertTrue(compare > 0); + } finally { + if (rowBlock != null) { + rowBlock.release(); + } + } + } +}
