Jianfeng Jia has submitted this change and it was merged. Change subject: Fix ASTERIXDB-1566 fix UTF8 comparator and hash function. ......................................................................
Fix ASTERIXDB-1566 fix UTF8 comparator and hash function. Change-Id: I187bf1243abf143b3b265fa8098614b9a72c65ad Reviewed-on: https://asterix-gerrit.ics.uci.edu/1054 Sonar-Qube: Jenkins <[email protected]> Tested-by: Jenkins <[email protected]> Reviewed-by: Yingyi Bu <[email protected]> Integration-Tests: Jenkins <[email protected]> --- A asterixdb/asterix-app/src/test/resources/runtimets/queries/string/string-equal-public/string-equal-public.1.query.aql A asterixdb/asterix-app/src/test/resources/runtimets/results/string/string-equal-public/string-equal-public.1.adm M asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml M asterixdb/asterix-om/src/main/java/org/apache/asterix/formats/nontagged/AqlBinaryComparatorFactoryProvider.java M asterixdb/asterix-om/src/main/java/org/apache/asterix/formats/nontagged/AqlBinaryHashFunctionFactoryProvider.java M hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java 6 files changed, 44 insertions(+), 6 deletions(-) Approvals: Yingyi Bu: Looks good to me, approved Jenkins: Verified; No violations found; Verified diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/string/string-equal-public/string-equal-public.1.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/string/string-equal-public/string-equal-public.1.query.aql new file mode 100644 index 0000000..d5c2dff --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/string/string-equal-public/string-equal-public.1.query.aql @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +"的"="离" +"و"="ن" diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/string-equal-public/string-equal-public.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/string-equal-public/string-equal-public.1.adm new file mode 100644 index 0000000..4b095fd --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/string-equal-public/string-equal-public.1.adm @@ -0,0 +1,2 @@ +false +false diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml index 82f5071..f410cbe 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml @@ -5411,6 +5411,11 @@ </compilation-unit> </test-case> <test-case FilePath="string"> + <compilation-unit name="string-equal-public"> + <output-dir compare="Text">string-equal-public</output-dir> + </compilation-unit> + </test-case> + <test-case FilePath="string"> <compilation-unit name="string-join1"> <output-dir compare="Text">string-join1</output-dir> </compilation-unit> diff --git a/asterixdb/asterix-om/src/main/java/org/apache/asterix/formats/nontagged/AqlBinaryComparatorFactoryProvider.java b/asterixdb/asterix-om/src/main/java/org/apache/asterix/formats/nontagged/AqlBinaryComparatorFactoryProvider.java index feb3228..a8c6eea 100644 --- a/asterixdb/asterix-om/src/main/java/org/apache/asterix/formats/nontagged/AqlBinaryComparatorFactoryProvider.java +++ b/asterixdb/asterix-om/src/main/java/org/apache/asterix/formats/nontagged/AqlBinaryComparatorFactoryProvider.java @@ -18,8 +18,6 @@ */ package org.apache.asterix.formats.nontagged; -import java.io.Serializable; - import org.apache.asterix.dataflow.data.nontagged.comparators.ABinaryComparator; import org.apache.asterix.dataflow.data.nontagged.comparators.ACirclePartialBinaryComparatorFactory; import org.apache.asterix.dataflow.data.nontagged.comparators.ADurationPartialBinaryComparatorFactory; @@ -48,9 +46,11 @@ import org.apache.hyracks.data.std.primitive.FloatPointable; import org.apache.hyracks.data.std.primitive.IntegerPointable; import org.apache.hyracks.data.std.primitive.LongPointable; -import org.apache.hyracks.data.std.primitive.RawUTF8StringPointable; import org.apache.hyracks.data.std.primitive.ShortPointable; import org.apache.hyracks.data.std.primitive.UTF8StringLowercasePointable; +import org.apache.hyracks.data.std.primitive.UTF8StringPointable; + +import java.io.Serializable; public class AqlBinaryComparatorFactoryProvider implements IBinaryComparatorFactoryProvider, Serializable { @@ -69,7 +69,7 @@ public static final PointableBinaryComparatorFactory DOUBLE_POINTABLE_INSTANCE = new PointableBinaryComparatorFactory( DoublePointable.FACTORY); public static final PointableBinaryComparatorFactory UTF8STRING_POINTABLE_INSTANCE = new PointableBinaryComparatorFactory( - RawUTF8StringPointable.FACTORY); + UTF8StringPointable.FACTORY); // Equivalent to UTF8STRING_POINTABLE_INSTANCE but all characters are considered lower case to implement case-insensitive comparisons. public static final PointableBinaryComparatorFactory UTF8STRING_LOWERCASE_POINTABLE_INSTANCE = new PointableBinaryComparatorFactory( UTF8StringLowercasePointable.FACTORY); diff --git a/asterixdb/asterix-om/src/main/java/org/apache/asterix/formats/nontagged/AqlBinaryHashFunctionFactoryProvider.java b/asterixdb/asterix-om/src/main/java/org/apache/asterix/formats/nontagged/AqlBinaryHashFunctionFactoryProvider.java index 8cfe51e..23c245f 100644 --- a/asterixdb/asterix-om/src/main/java/org/apache/asterix/formats/nontagged/AqlBinaryHashFunctionFactoryProvider.java +++ b/asterixdb/asterix-om/src/main/java/org/apache/asterix/formats/nontagged/AqlBinaryHashFunctionFactoryProvider.java @@ -28,8 +28,8 @@ import org.apache.hyracks.data.std.primitive.DoublePointable; import org.apache.hyracks.data.std.primitive.FloatPointable; import org.apache.hyracks.data.std.primitive.IntegerPointable; -import org.apache.hyracks.data.std.primitive.RawUTF8StringPointable; import org.apache.hyracks.data.std.primitive.UTF8StringLowercasePointable; +import org.apache.hyracks.data.std.primitive.UTF8StringPointable; public class AqlBinaryHashFunctionFactoryProvider implements IBinaryHashFunctionFactoryProvider, Serializable { @@ -42,7 +42,7 @@ public static final PointableBinaryHashFunctionFactory DOUBLE_POINTABLE_INSTANCE = new PointableBinaryHashFunctionFactory( DoublePointable.FACTORY); public static final PointableBinaryHashFunctionFactory UTF8STRING_POINTABLE_INSTANCE = new PointableBinaryHashFunctionFactory( - RawUTF8StringPointable.FACTORY); + UTF8StringPointable.FACTORY); // Equivalent to UTF8STRING_POINTABLE_INSTANCE but all characters are considered lower case to implement case-insensitive hashing. public static final PointableBinaryHashFunctionFactory UTF8STRING_LOWERCASE_POINTABLE_INSTANCE = new PointableBinaryHashFunctionFactory( UTF8StringLowercasePointable.FACTORY); diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java index f101ab1..f200384 100644 --- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java +++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java @@ -38,11 +38,13 @@ import static org.apache.hyracks.util.string.UTF8StringUtil.normalize; import static org.apache.hyracks.util.string.UTF8StringUtil.rawByteCompareTo; import static org.apache.hyracks.util.string.UTF8StringUtil.hash; +import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.IOException; +import org.junit.Assert; import org.junit.Test; public class UTF8StringUtilTest { @@ -66,6 +68,14 @@ } @Test + public void testChinese() { + byte[] bufferDe = writeStringToBytes("的"); + byte[] bufferLi = writeStringToBytes("离"); + int ret = compareTo(bufferDe, 0, bufferLi, 0); + assertTrue(ret != 0); + } + + @Test public void testCompareToAndNormolize() throws Exception { testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX, OPTION.STANDARD); testCompare(STRING_UTF8_3, STRING_UTF8_MIX, OPTION.STANDARD); -- To view, visit https://asterix-gerrit.ics.uci.edu/1054 To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-MessageType: merged Gerrit-Change-Id: I187bf1243abf143b3b265fa8098614b9a72c65ad Gerrit-PatchSet: 3 Gerrit-Project: asterixdb Gerrit-Branch: master Gerrit-Owner: Jianfeng Jia <[email protected]> Gerrit-Reviewer: Jenkins <[email protected]> Gerrit-Reviewer: Jianfeng Jia <[email protected]> Gerrit-Reviewer: Taewoo Kim <[email protected]> Gerrit-Reviewer: Till Westmann <[email protected]> Gerrit-Reviewer: Wenhai Li <[email protected]> Gerrit-Reviewer: Yingyi Bu <[email protected]>
