From Michael Blow <[email protected]>: Michael Blow has submitted this change. ( https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21194?usp=email )
Change subject: [NO ISSUE][MISC] Update StringJsonParseEval for compat w/ Jackson 2.21[.3] ...................................................................... [NO ISSUE][MISC] Update StringJsonParseEval for compat w/ Jackson 2.21[.3] ... for GHSA-72hv-8253-57qq Ext-ref: MB-71736 Change-Id: Ia7a7616dd362032e7442086debc52d3a6bbf5a4f Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21194 Reviewed-by: Michael Blow <[email protected]> Tested-by: Michael Blow <[email protected]> --- M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/evaluators/StringJsonParseEval.java 1 file changed, 67 insertions(+), 11 deletions(-) Approvals: Michael Blow: Looks good to me, approved; Verified diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/evaluators/StringJsonParseEval.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/evaluators/StringJsonParseEval.java index 7589e33..ad9cf0f 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/evaluators/StringJsonParseEval.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/evaluators/StringJsonParseEval.java @@ -22,6 +22,7 @@ import java.io.DataOutput; import java.io.IOException; +import java.nio.charset.StandardCharsets; import org.apache.asterix.common.exceptions.ErrorCode; import org.apache.asterix.external.parser.JSONDataParser; @@ -41,8 +42,12 @@ import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; import org.apache.hyracks.data.std.util.ByteArrayAccessibleInputStream; import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference; +import org.apache.hyracks.util.LogRedactionUtil; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; public class StringJsonParseEval implements IScalarEvaluator { + private static final Logger LOGGER = LogManager.getLogger(); private final 
IEvaluatorContext ctx; private final IScalarEvaluator inputEval; private final JSONDataParser parser; @@ -53,6 +58,13 @@ private final ArrayBackedValueStorage resultStorage; private final DataOutput out; + //@AiProvenance(agent = AiProvenance.Agent.CLAUDE_SONNET_4_6, tool = AiProvenance.Tool.GITHUB_COPILOT, contributionKind = AiProvenance.ContributionKind.GENERATED, notes = "Three-outcome enum to distinguish parse success, EOF, and error in tryParseAndSetResult") + private enum ParseOutcome { + SUCCESS, + EOF, + ERROR + } + public StringJsonParseEval(IEvaluatorContext ctx, IScalarEvaluator inputEval, SourceLocation sourceLocation) throws IOException { this.ctx = ctx; @@ -80,22 +92,30 @@ if (bytes[offset] == ATypeTag.SERIALIZED_STRING_TYPE_TAG) { utf8Val.set(bytes, offset + 1, inputVal.getLength() - 1); inputStream.setContent(bytes, utf8Val.getCharStartOffset(), utf8Val.getUTF8Length()); - resultStorage.reset(); - try { - if (parser.parseAnyValue(out)) { - result.set(resultStorage); + ParseOutcome outcome = tryParseAndSetResult(result); + if (outcome == ParseOutcome.SUCCESS) { + return; + } + resetParser(); + if (outcome == ParseOutcome.ERROR + && containsCesu8Surrogate(bytes, utf8Val.getCharStartOffset(), utf8Val.getUTF8Length())) { + // AsterixDB stores strings in CESU-8 (surrogate pairs encoded as two 3-byte sequences). + // Jackson 2.20+ rejects raw surrogate bytes in UTF-8 streams per RFC 3629. + // If the failure looks like a surrogate encoding issue, decode via CESU-8 to a Java + // String, re-encode as proper UTF-8, and retry once before treating as a real error. 
+ byte[] utf8Bytes = utf8Val.toString().getBytes(StandardCharsets.UTF_8); + inputStream.setContent(utf8Bytes, 0, utf8Bytes.length); + outcome = tryParseAndSetResult(result); + if (outcome == ParseOutcome.SUCCESS) { return; - } else { - //Reset the parser: EOF was encountered - resetParser(); } - } catch (HyracksDataException e) { + resetParser(); + } + if (outcome == ParseOutcome.ERROR) { IWarningCollector warningCollector = ctx.getWarningCollector(); if (warningCollector.shouldWarn()) { warningCollector.warn(Warning.of(sourceLocation, ErrorCode.RECORD_READER_MALFORMED_INPUT_STREAM)); } - //Reset the parser: An error was encountered. - resetParser(); } } else { ExceptionUtil.warnTypeMismatch(ctx, sourceLocation, STRING_PARSE_JSON, bytes[offset], 0, ATypeTag.STRING); @@ -104,6 +124,26 @@ PointableHelper.setNull(result); } + /** + * Attempts to parse the current inputStream content. + * Returns {@link ParseOutcome#SUCCESS} and sets {@code result} on success, + * {@link ParseOutcome#EOF} if the input was empty, or {@link ParseOutcome#ERROR} on a parse failure. 
+ */ + //@AiProvenance(agent = AiProvenance.Agent.CLAUDE_SONNET_4_6, tool = AiProvenance.Tool.GITHUB_COPILOT, contributionKind = AiProvenance.ContributionKind.GENERATED, notes = "Extracted to eliminate duplicated try/catch parse blocks; returns ParseOutcome to preserve distinct EOF vs error semantics") + private ParseOutcome tryParseAndSetResult(IPointable result) throws HyracksDataException { + resultStorage.reset(); + try { + if (parser.parseAnyValue(out)) { + result.set(resultStorage); + return ParseOutcome.SUCCESS; + } + return ParseOutcome.EOF; + } catch (HyracksDataException e) { + LOGGER.debug("failed to parse json value: {}", LogRedactionUtil.userData(e.toString())); + return ParseOutcome.ERROR; + } + } + private void resetParser() throws HyracksDataException { try { parser.reset(inputStream); @@ -111,4 +151,20 @@ throw HyracksDataException.create(e); } } -} \ No newline at end of file + + /** + * Returns true if the byte range contains a CESU-8 encoded surrogate (0xED [0xA0-0xBF] ...). + * Such sequences are valid CESU-8 but invalid UTF-8, and are rejected by Jackson 2.20+. + * Scanning for 0xED is cheap and covers the vast majority of inputs with zero allocation. 
+ */ + //@AiProvenance(agent = AiProvenance.Agent.CLAUDE_SONNET_4_6, tool = AiProvenance.Tool.GITHUB_COPILOT, contributionKind = AiProvenance.ContributionKind.GENERATED, notes = "Fast pre-scan to detect CESU-8 surrogates (0xED [0xA0-0xBF]) before triggering the more expensive CESU-8 to UTF-8 re-encoding retry path") + private static boolean containsCesu8Surrogate(byte[] bytes, int offset, int length) { + int end = offset + length; + for (int i = offset; i < end - 1; i++) { + if ((bytes[i] & 0xFF) == 0xED && (bytes[i + 1] & 0xF0) == 0xA0) { + return true; + } + } + return false; + } +} -- To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21194?usp=email To unsubscribe, or for help writing mail filters, visit https://asterix-gerrit.ics.uci.edu/settings?usp=email Gerrit-MessageType: merged Gerrit-Project: asterixdb Gerrit-Branch: stabilization-667a908755 Gerrit-Change-Id: Ia7a7616dd362032e7442086debc52d3a6bbf5a4f Gerrit-Change-Number: 21194 Gerrit-PatchSet: 8 Gerrit-Owner: Michael Blow <[email protected]> Gerrit-Reviewer: Anon. E. Moose #1000171 Gerrit-Reviewer: Hussain Towaileb <[email protected]> Gerrit-Reviewer: Ian Maxon <[email protected]> Gerrit-Reviewer: Jenkins <[email protected]> Gerrit-Reviewer: Michael Blow <[email protected]>
