This is an automated email from the ASF dual-hosted git repository.
dlych pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git
The following commit(s) were added to refs/heads/master by this push:
new 120d7ea [ASTERIXDB-2762] Use code point as unit in position()
120d7ea is described below
commit 120d7eac49ad855eb1ae8a295683c0250aa4fe9e
Author: Rui Guo <[email protected]>
AuthorDate: Mon Jul 27 13:33:21 2020 -0700
[ASTERIXDB-2762] Use code point as unit in position()
Change-Id: Icf1b8b3401599e4332dd09534bdf4787cd9d85d6
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7305
Integration-Tests: Jenkins <[email protected]>
Tested-by: Jenkins <[email protected]>
Reviewed-by: Dmitry Lychagin <[email protected]>
---
.../pos0_multi_code_point.1.query.sqlpp | 20 ++++++
.../pos1_multi_code_point.1.query.sqlpp | 20 ++++++
.../pos0_multi_code_point.1.adm | 1 +
.../pos1_multi_code_point.1.adm | 1 +
.../test/resources/runtimets/testsuite_sqlpp.xml | 10 +++
.../src/main/markdown/builtins/2_string_common.md | 18 +++--
.../functions/StringPositionDescriptor.java | 2 +-
.../functions/StringPositionOffset1Descriptor.java | 2 +-
.../data/std/primitive/UTF8StringPointable.java | 76 +++++++++++++++++++++-
.../std/primitive/UTF8StringPointableTest.java | 13 +++-
.../apache/hyracks/util/string/UTF8StringUtil.java | 12 ++--
.../hyracks/util/string/UTF8StringSample.java | 1 +
12 files changed, 164 insertions(+), 12 deletions(-)
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.query.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.query.sqlpp
new file mode 100644
index 0000000..e50ada6
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.query.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+pos0("👩‍👩‍👧‍👦🏀", "🏀");
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.query.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.query.sqlpp
new file mode 100644
index 0000000..55af74b
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.query.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+pos1("👩‍👩‍👧‍👦🏀", "🏀");
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.adm
new file mode 100644
index 0000000..7f8f011
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.adm
@@ -0,0 +1 @@
+7
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.adm
new file mode 100644
index 0000000..45a4fb7
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.adm
@@ -0,0 +1 @@
+8
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
index 4fd8b31..ad6bbe5 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
@@ -9498,6 +9498,11 @@
</compilation-unit>
</test-case>
<test-case FilePath="string">
+ <compilation-unit name="position/offset0/pos0_multi_code_point">
+ <output-dir
compare="Text">position/offset0/pos0_multi_code_point</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="string">
<compilation-unit name="position/offset1/position1">
<output-dir compare="Text">position/offset1/position1</output-dir>
</compilation-unit>
@@ -9508,6 +9513,11 @@
</compilation-unit>
</test-case>
<test-case FilePath="string">
+ <compilation-unit name="position/offset1/pos1_multi_code_point">
+ <output-dir
compare="Text">position/offset1/pos1_multi_code_point</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="string">
<compilation-unit name="regexp_contains/regexp_contains">
<output-dir compare="Text">regexp_contains/regexp_contains</output-dir>
</compilation-unit>
diff --git
a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
index 86695bc..b4a7a87 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
@@ -226,10 +226,13 @@
position(string, string_pattern)
- * Returns the first position of `string_pattern` within `string`. The
function returns the 0-based position. Another
+ * Returns the first position of `string_pattern` within `string`.
+ The result is counted in the unit of code points.
+ See the following example for more details.
+
+ * The function returns the 0-based position. Another
version of the function returns the 1-based position. Below are the aliases
for each version:
- * Aliases:
* 0-based: `position`, `pos`, `position0`, `pos0`.
* 1-based: `position1`, `pos1`.
@@ -249,14 +252,21 @@
"v1": position("ppphonepp", "phone"),
"v2": position("hone", "phone"),
"v3": position1("ppphonepp", "phone"),
- "v4": position1("hone", "phone"),
+ "v4": position1("hone", "phone")
};
-
* The expected result is:
{ "v1": 2, "v2": -1, "v3": 3, "v4": -1 }
+ * Example of multi-code-point character:
+
+ position("👩‍👩‍👧‍👦🏀", "🏀");
+
+ * The expected result is (the emoji family character has 7 code points):
+
+ 7
+
### regexp_contains ###
* Syntax:
diff --git
a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionDescriptor.java
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionDescriptor.java
index f7177fd..6c06056 100644
---
a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionDescriptor.java
+++
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionDescriptor.java
@@ -48,7 +48,7 @@ public class StringPositionDescriptor extends
AbstractScalarFunctionDynamicDescr
@Override
protected int compute(UTF8StringPointable left,
UTF8StringPointable right) {
- return UTF8StringPointable.find(left, right, false);
+ return UTF8StringPointable.findInCodePoint(left,
right, false);
}
};
}
diff --git
a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionOffset1Descriptor.java
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionOffset1Descriptor.java
index 10cc779..93ada0f 100644
---
a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionOffset1Descriptor.java
+++
b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionOffset1Descriptor.java
@@ -48,7 +48,7 @@ public class StringPositionOffset1Descriptor extends
AbstractScalarFunctionDynam
@Override
protected int compute(UTF8StringPointable left,
UTF8StringPointable right) {
- int pos = UTF8StringPointable.find(left, right, false);
+ int pos = UTF8StringPointable.findInCodePoint(left,
right, false);
return pos < 0 ? pos : pos + 1;
}
};
diff --git
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 9a38a4e..21c8a36 100644
---
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
@@ -18,6 +18,9 @@
*/
package org.apache.hyracks.data.std.primitive;
+import static
org.apache.hyracks.util.string.UTF8StringUtil.HIGH_SURROGATE_WITHOUT_LOW_SURROGATE;
+import static
org.apache.hyracks.util.string.UTF8StringUtil.LOW_SURROGATE_WITHOUT_HIGH_SURROGATE;
+
import java.io.IOException;
import java.nio.charset.Charset;
@@ -235,19 +238,56 @@ public final class UTF8StringPointable extends
AbstractPointable implements IHas
* the pattern string.
* @param ignoreCase,
* to ignore case or not.
+ * @return the offset in the unit of code point of the first character of
the matching string. Not including the MetaLength.
+ */
+ public static int findInCodePoint(UTF8StringPointable src,
UTF8StringPointable pattern, boolean ignoreCase) {
+ return findInByteOrCodePoint(src, pattern, ignoreCase, 0, false);
+ }
+
+ /**
+ * @param src,
+ * the source string.
+ * @param pattern,
+ * the pattern string.
+ * @param ignoreCase,
+ * to ignore case or not.
* @param startMatch,
* the start offset.
* @return the byte offset of the first character of the matching string
after <code>startMatch</code>.
* Not including the MetaLength.
*/
public static int find(UTF8StringPointable src, UTF8StringPointable
pattern, boolean ignoreCase, int startMatch) {
+ return findInByteOrCodePoint(src, pattern, ignoreCase, startMatch,
true);
+ }
+
+ /**
+ * @param src,
+ * the source string.
+ * @param pattern,
+ * the pattern string.
+ * @param ignoreCase,
+ * to ignore case or not.
+ * @param startMatch,
+ * the start offset.
+ * @return the offset in the unit of code point of the first character of
the matching string. Not including the MetaLength.
+ */
+ public static int findInCodePoint(UTF8StringPointable src,
UTF8StringPointable pattern, boolean ignoreCase,
+ int startMatch) {
+ return findInByteOrCodePoint(src, pattern, ignoreCase, startMatch,
false);
+ }
+
+ // If resultInByte is true, then return the position in bytes, otherwise
return the position in code points
+ private static int findInByteOrCodePoint(UTF8StringPointable src,
UTF8StringPointable pattern, boolean ignoreCase,
+ int startMatch, boolean resultInByte) {
int startMatchPos = startMatch;
final int srcUtfLen = src.getUTF8Length();
final int pttnUtfLen = pattern.getUTF8Length();
final int srcStart = src.getMetaDataLength();
final int pttnStart = pattern.getMetaDataLength();
+ int codePointCount = 0;
int maxStart = srcUtfLen - pttnUtfLen;
+ boolean prevHighSurrogate = false;
while (startMatchPos <= maxStart) {
int c1 = startMatchPos;
int c2 = 0;
@@ -256,6 +296,14 @@ public final class UTF8StringPointable extends
AbstractPointable implements IHas
char ch2 = pattern.charAt(pttnStart + c2);
if (ch1 != ch2) {
+ // Currently, the ignoreCase is only valid for
one-surrogate characters
+ // (i.e. characters whose UTF-16 encoding is 2-byte (1
Java char) instead of 4-byte (2 Java chars)).
+ // We may need to support the two-surrogate characters in
the future
+ //
+ // Another edge case is that one letter may have different
lower-case forms in different languages.
+ // For example, the letter I has "i" as its lower
case in English but "ı" in Turkish.
+ // We may need to use methods such as
String.toLowerCase(Locale locale) to support other languages in the future
+ // Reference:
https://stackoverflow.com/questions/11063102/using-locales-with-javas-tolowercase-and-touppercase
if (!ignoreCase || Character.toLowerCase(ch1) !=
Character.toLowerCase(ch2)) {
break;
}
@@ -263,9 +311,35 @@ public final class UTF8StringPointable extends
AbstractPointable implements IHas
c1 += src.charSize(srcStart + c1);
c2 += pattern.charSize(pttnStart + c2);
}
+
if (c2 == pttnUtfLen) {
- return startMatchPos;
+ if (resultInByte) {
+ return startMatchPos;
+ } else {
+ if (prevHighSurrogate == true) {
+ throw new
IllegalArgumentException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
+ }
+ return codePointCount;
+ }
+ }
+
+ // The result is counted in code point instead of bytes
+ if (resultInByte == false) {
+ char ch = src.charAt(srcStart + startMatchPos);
+ if (Character.isHighSurrogate(ch)) {
+ prevHighSurrogate = true;
+ } else if (Character.isLowSurrogate(ch)) {
+ if (prevHighSurrogate == true) {
+ codePointCount++;
+ prevHighSurrogate = false;
+ } else {
+ throw new
IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+ }
+ } else {
+ codePointCount++;
+ }
}
+
startMatchPos += src.charSize(srcStart + startMatchPos);
}
return -1;
diff --git
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
index 387bc03..8b62765 100644
---
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
+++
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
@@ -20,6 +20,7 @@
package org.apache.hyracks.data.std.primitive;
import static
org.apache.hyracks.data.std.primitive.UTF8StringPointable.generateUTF8Pointable;
+import static org.apache.hyracks.util.string.UTF8StringSample.EMOJI_BASKETBALL;
import static
org.apache.hyracks.util.string.UTF8StringSample.STRING_EMOJI_FAMILY_OF_2;
import static
org.apache.hyracks.util.string.UTF8StringSample.STRING_EMOJI_FAMILY_OF_4;
import static org.junit.Assert.assertEquals;
@@ -53,7 +54,7 @@ public class UTF8StringPointableTest {
generateUTF8Pointable(STRING_EMOJI_FAMILY_OF_2);
@Test
- public void testGetStringLength() throws Exception {
+ public void testGetStringUTF8Length() throws Exception {
UTF8StringPointable utf8Ptr =
generateUTF8Pointable(UTF8StringSample.STRING_LEN_127);
assertEquals(127, utf8Ptr.getUTF8Length());
assertEquals(1, utf8Ptr.getMetaDataLength());
@@ -67,6 +68,16 @@ public class UTF8StringPointableTest {
}
@Test
+ public void testFindInCodePoint() {
+ UTF8StringPointable strp =
generateUTF8Pointable(STRING_EMOJI_FAMILY_OF_4 + EMOJI_BASKETBALL);
+ UTF8StringPointable pattern = generateUTF8Pointable(EMOJI_BASKETBALL);
+
+ assertEquals(UTF8StringPointable.findInCodePoint(strp, pattern,
false), 7);
+
+ assertEquals(UTF8StringPointable.findInCodePoint(strp, pattern, true),
7);
+ }
+
+ @Test
public void testContains() throws Exception {
assertTrue(STRING_UTF8_MIX.contains(STRING_UTF8_MIX, false));
assertTrue(STRING_UTF8_MIX.contains(STRING_UTF8_MIX, true));
diff --git
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index 53271e4..d2cd050 100644
---
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++
b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -34,6 +34,11 @@ import
org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
*/
public class UTF8StringUtil {
+ public static final String LOW_SURROGATE_WITHOUT_HIGH_SURROGATE =
+ "Decoding error: got a low surrogate without a leading high
surrogate";
+ public static final String HIGH_SURROGATE_WITHOUT_LOW_SURROGATE =
+ "Decoding error: got a high surrogate without a following low
surrogate";
+
private UTF8StringUtil() {
}
@@ -95,7 +100,7 @@ public class UTF8StringUtil {
if (Character.isLowSurrogate(c1)) {
// In this case, the index s doesn't point to a correct position
- throw new IllegalArgumentException("decoding error: got a low
surrogate without a high surrogate");
+ throw new
IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
}
if (Character.isHighSurrogate(c1)) {
@@ -106,8 +111,7 @@ public class UTF8StringUtil {
if (Character.isLowSurrogate(c2)) {
return Character.toCodePoint(c1, c2);
} else {
- throw new IllegalArgumentException(
- "decoding error: the high surrogate is not followed by
a low surrogate");
+ throw new
IllegalArgumentException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
}
}
@@ -119,7 +123,7 @@ public class UTF8StringUtil {
int size1 = charSize(b, s);
if (Character.isLowSurrogate(c1)) {
- throw new IllegalArgumentException("decoding error: got a low
surrogate without a high surrogate");
+ throw new
IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
}
if (Character.isHighSurrogate(c1)) {
diff --git
a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
index 1502f25..b114351 100644
---
a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
+++
b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
@@ -42,6 +42,7 @@ public class UTF8StringSample {
public static final String STRING_EMOJI_FAMILY_OF_4 =
"\uD83D\uDC68\u200D\uD83D\uDC68\u200D\uD83D\uDC66\u200D\uD83D\uDC66";
public static final String STRING_EMOJI_FAMILY_OF_2 =
"\uD83D\uDC68\u200D\uD83D\uDC66";
+ public static final String EMOJI_BASKETBALL = "\uD83C\uDFC0";
public static final String STRING_LEN_127 =
generateStringRepeatBy(ONE_ASCII_CHAR, 127);
public static final String STRING_LEN_128 =
generateStringRepeatBy(ONE_ASCII_CHAR, 128);