From Michael Blow <[email protected]>:

Michael Blow has submitted this change. ( 
https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21194?usp=email )

Change subject: [NO ISSUE][MISC] Update StringJsonParseEval for compat w/ 
Jackson 2.21[.3]
......................................................................

[NO ISSUE][MISC] Update StringJsonParseEval for compat w/ Jackson 2.21[.3]

... for GHSA-72hv-8253-57qq

Ext-ref: MB-71736
Change-Id: Ia7a7616dd362032e7442086debc52d3a6bbf5a4f
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21194
Reviewed-by: Michael Blow <[email protected]>
Tested-by: Michael Blow <[email protected]>
---
M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/evaluators/StringJsonParseEval.java
1 file changed, 67 insertions(+), 11 deletions(-)

Approvals:
  Michael Blow: Looks good to me, approved; Verified




diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/evaluators/StringJsonParseEval.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/evaluators/StringJsonParseEval.java
index 7589e33..ad9cf0f 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/evaluators/StringJsonParseEval.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/evaluators/StringJsonParseEval.java
@@ -22,6 +22,7 @@

 import java.io.DataOutput;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;

 import org.apache.asterix.common.exceptions.ErrorCode;
 import org.apache.asterix.external.parser.JSONDataParser;
@@ -41,8 +42,12 @@
 import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
 import org.apache.hyracks.data.std.util.ByteArrayAccessibleInputStream;
 import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import org.apache.hyracks.util.LogRedactionUtil;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;

 public class StringJsonParseEval implements IScalarEvaluator {
+    private static final Logger LOGGER = LogManager.getLogger();
     private final IEvaluatorContext ctx;
     private final IScalarEvaluator inputEval;
     private final JSONDataParser parser;
@@ -53,6 +58,13 @@
     private final ArrayBackedValueStorage resultStorage;
     private final DataOutput out;

+    //@AiProvenance(agent = AiProvenance.Agent.CLAUDE_SONNET_4_6, tool = 
AiProvenance.Tool.GITHUB_COPILOT, contributionKind = 
AiProvenance.ContributionKind.GENERATED, notes = "Three-outcome enum to 
distinguish parse success, EOF, and error in tryParseAndSetResult")
+    private enum ParseOutcome {
+        SUCCESS,
+        EOF,
+        ERROR
+    }
+
     public StringJsonParseEval(IEvaluatorContext ctx, IScalarEvaluator 
inputEval, SourceLocation sourceLocation)
             throws IOException {
         this.ctx = ctx;
@@ -80,22 +92,30 @@
         if (bytes[offset] == ATypeTag.SERIALIZED_STRING_TYPE_TAG) {
             utf8Val.set(bytes, offset + 1, inputVal.getLength() - 1);
             inputStream.setContent(bytes, utf8Val.getCharStartOffset(), 
utf8Val.getUTF8Length());
-            resultStorage.reset();
-            try {
-                if (parser.parseAnyValue(out)) {
-                    result.set(resultStorage);
+            ParseOutcome outcome = tryParseAndSetResult(result);
+            if (outcome == ParseOutcome.SUCCESS) {
+                return;
+            }
+            resetParser();
+            if (outcome == ParseOutcome.ERROR
+                    && containsCesu8Surrogate(bytes, 
utf8Val.getCharStartOffset(), utf8Val.getUTF8Length())) {
+                // AsterixDB stores strings in CESU-8 (surrogate pairs encoded 
as two 3-byte sequences).
+                // Jackson 2.20+ rejects raw surrogate bytes in UTF-8 streams 
per RFC 3629.
+                // If the failure looks like a surrogate encoding issue, 
decode via CESU-8 to a Java
+                // String, re-encode as proper UTF-8, and retry once before 
treating as a real error.
+                byte[] utf8Bytes = 
utf8Val.toString().getBytes(StandardCharsets.UTF_8);
+                inputStream.setContent(utf8Bytes, 0, utf8Bytes.length);
+                outcome = tryParseAndSetResult(result);
+                if (outcome == ParseOutcome.SUCCESS) {
                     return;
-                } else {
-                    //Reset the parser: EOF was encountered
-                    resetParser();
                 }
-            } catch (HyracksDataException e) {
+                resetParser();
+            }
+            if (outcome == ParseOutcome.ERROR) {
                 IWarningCollector warningCollector = ctx.getWarningCollector();
                 if (warningCollector.shouldWarn()) {
                     warningCollector.warn(Warning.of(sourceLocation, 
ErrorCode.RECORD_READER_MALFORMED_INPUT_STREAM));
                 }
-                //Reset the parser: An error was encountered.
-                resetParser();
             }
         } else {
             ExceptionUtil.warnTypeMismatch(ctx, sourceLocation, 
STRING_PARSE_JSON, bytes[offset], 0, ATypeTag.STRING);
@@ -104,6 +124,26 @@
         PointableHelper.setNull(result);
     }

+    /**
+     * Attempts to parse the current inputStream content.
+     * Returns {@link ParseOutcome#SUCCESS} and sets {@code result} on success,
+     * {@link ParseOutcome#EOF} if the input was empty, or {@link 
ParseOutcome#ERROR} on a parse failure.
+     */
+    //@AiProvenance(agent = AiProvenance.Agent.CLAUDE_SONNET_4_6, tool = 
AiProvenance.Tool.GITHUB_COPILOT, contributionKind = 
AiProvenance.ContributionKind.GENERATED, notes = "Extracted to eliminate 
duplicated try/catch parse blocks; returns ParseOutcome to preserve distinct 
EOF vs error semantics")
+    private ParseOutcome tryParseAndSetResult(IPointable result) throws 
HyracksDataException {
+        resultStorage.reset();
+        try {
+            if (parser.parseAnyValue(out)) {
+                result.set(resultStorage);
+                return ParseOutcome.SUCCESS;
+            }
+            return ParseOutcome.EOF;
+        } catch (HyracksDataException e) {
+            LOGGER.debug("failed to parse json value: {}", 
LogRedactionUtil.userData(e.toString()));
+            return ParseOutcome.ERROR;
+        }
+    }
+
     private void resetParser() throws HyracksDataException {
         try {
             parser.reset(inputStream);
@@ -111,4 +151,20 @@
             throw HyracksDataException.create(e);
         }
     }
-}
\ No newline at end of file
+
+    /**
+     * Returns true if the byte range contains a CESU-8 encoded surrogate 
(0xED [0xA0-0xBF] ...).
+     * Such sequences are valid CESU-8 but invalid UTF-8, and are rejected by 
Jackson 2.20+.
+     * Scanning for 0xED is cheap and covers the vast majority of inputs with 
zero allocation.
+     */
+    //@AiProvenance(agent = AiProvenance.Agent.CLAUDE_SONNET_4_6, tool = 
AiProvenance.Tool.GITHUB_COPILOT, contributionKind = 
AiProvenance.ContributionKind.GENERATED, notes = "Fast pre-scan to detect 
CESU-8 surrogates (0xED [0xA0-0xBF]) before triggering the more expensive 
CESU-8 to UTF-8 re-encoding retry path")
+    private static boolean containsCesu8Surrogate(byte[] bytes, int offset, 
int length) {
+        int end = offset + length;
+        for (int i = offset; i < end - 1; i++) {
+            if ((bytes[i] & 0xFF) == 0xED && (bytes[i + 1] & 0xF0) == 0xA0) {
+                return true;
+            }
+        }
+        return false;
+    }
+}

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21194?usp=email
To unsubscribe, or for help writing mail filters, visit 
https://asterix-gerrit.ics.uci.edu/settings?usp=email

Gerrit-MessageType: merged
Gerrit-Project: asterixdb
Gerrit-Branch: stabilization-667a908755
Gerrit-Change-Id: Ia7a7616dd362032e7442086debc52d3a6bbf5a4f
Gerrit-Change-Number: 21194
Gerrit-PatchSet: 8
Gerrit-Owner: Michael Blow <[email protected]>
Gerrit-Reviewer: Anon. E. Moose #1000171
Gerrit-Reviewer: Hussain Towaileb <[email protected]>
Gerrit-Reviewer: Ian Maxon <[email protected]>
Gerrit-Reviewer: Jenkins <[email protected]>
Gerrit-Reviewer: Michael Blow <[email protected]>

Reply via email to