Murtadha Hubail has submitted this change and it was merged.

Change subject: [ASTERIXDB-2303][API] Fix Supplementary Chars Printing
......................................................................
[ASTERIXDB-2303][API] Fix Supplementary Chars Printing - user model changes: no - storage format changes: no - interface changes: no Details: - Properly print supplementary chars as utf8 by converting their java surrogates to a string. - Add test case. Change-Id: I59e825c11ff750d5b651fb86712023c52e98367e Reviewed-on: https://asterix-gerrit.ics.uci.edu/2429 Tested-by: Jenkins <[email protected]> Contrib: Jenkins <[email protected]> Integration-Tests: Jenkins <[email protected]> Reviewed-by: Michael Blow <[email protected]> --- A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/utf8/utf8.1.query.sqlpp A asterixdb/asterix-app/src/test/resources/runtimets/results/string/utf8/utf8.1.adm M asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml M asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java 4 files changed, 50 insertions(+), 0 deletions(-) Approvals: Anon. E. Moose #1000171: Jenkins: Verified; ; Verified Michael Blow: Looks good to me, approved Objections: Jenkins: Violations found diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/utf8/utf8.1.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/utf8/utf8.1.query.sqlpp new file mode 100644 index 0000000..88909ef --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/utf8/utf8.1.query.sqlpp @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +select value "\uD83D\uDE22\uD83D\uDE22\uD83D\uDC89\uD83D\uDC89 = 😢😢💉💉. Coffee ☕‼️😃. حسنا"; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/utf8/utf8.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/utf8/utf8.1.adm new file mode 100644 index 0000000..89c6334 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/utf8/utf8.1.adm @@ -0,0 +1 @@ +"😢😢💉💉 = 😢😢💉💉. Coffee ☕‼️😃. حسنا" \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml index 4265163..9fc0b4b 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml @@ -6445,6 +6445,11 @@ <output-dir compare="Text">varlen-encoding</output-dir> </compilation-unit> </test-case> + <test-case FilePath="string"> + <compilation-unit name="utf8"> + <output-dir compare="Text">utf8</output-dir> + </compilation-unit> + </test-case> </test-group> <test-group name="subquery"> <test-case FilePath="subquery"> diff --git a/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java b/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java index b1039a5..8d05f0f 100644 --- a/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java +++ 
b/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java @@ -357,6 +357,13 @@ break; } break; + case 3: + // special treatment for surrogates + if (Character.isHighSurrogate(c)) { + position += writeSupplementaryChar(os, b, maxPosition, position, c, sz); + sz = 0; + } + break; } while (sz > 0) { os.write(b[position]); @@ -378,4 +385,22 @@ os.write(HexPrinter.hex(c & 0x0f, HexPrinter.CASE.LOWER_CASE)); } + /** + * Writes a supplementary char consisting of high and low surrogates + * + * @return The length of the surrogates + * @throws IOException + */ + private static int writeSupplementaryChar(OutputStream os, byte[] src, int limit, int highSurrogatePos, + char highSurrogate, int highSurrogateSize) throws IOException { + final int lowSurrogatePos = highSurrogatePos + highSurrogateSize; + if (lowSurrogatePos >= limit) { + throw new IllegalStateException("malformed utf8 input"); + } + final char lowSurrogate = UTF8StringUtil.charAt(src, lowSurrogatePos); + final int lowSurrogateSize = UTF8StringUtil.charSize(src, lowSurrogatePos); + os.write(new String(new char[] { highSurrogate, lowSurrogate }).getBytes()); + return highSurrogateSize + lowSurrogateSize; + } + } -- To view, visit https://asterix-gerrit.ics.uci.edu/2429 To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-MessageType: merged Gerrit-Change-Id: I59e825c11ff750d5b651fb86712023c52e98367e Gerrit-PatchSet: 4 Gerrit-Project: asterixdb Gerrit-Branch: master Gerrit-Owner: Murtadha Hubail <[email protected]> Gerrit-Reviewer: Anon. E. Moose #1000171 Gerrit-Reviewer: Jenkins <[email protected]> Gerrit-Reviewer: Michael Blow <[email protected]> Gerrit-Reviewer: Murtadha Hubail <[email protected]> Gerrit-Reviewer: Till Westmann <[email protected]>
