ASTERIXDB-1470 Fix escapes in String values in ClassAd Parser Change-Id: Id6e4e25263c5a6f0efe26773da7c3b8fcf7e2427 Reviewed-on: https://asterix-gerrit.ics.uci.edu/915 Reviewed-by: Jenkins <[email protected]> Tested-by: Jenkins <[email protected]> Reviewed-by: Michael Blow <[email protected]>
Project: http://git-wip-us.apache.org/repos/asf/asterixdb/repo Commit: http://git-wip-us.apache.org/repos/asf/asterixdb/commit/e639c704 Tree: http://git-wip-us.apache.org/repos/asf/asterixdb/tree/e639c704 Diff: http://git-wip-us.apache.org/repos/asf/asterixdb/diff/e639c704 Branch: refs/heads/master Commit: e639c70441b262acd826b94b5145b0a00ab4c5c6 Parents: 18d9580 Author: Abdullah Alamoudi <[email protected]> Authored: Fri Jun 10 00:39:52 2016 +0300 Committer: abdullah alamoudi <[email protected]> Committed: Fri Jun 10 14:41:16 2016 -0700 ---------------------------------------------------------------------- .../apache/asterix/external/classad/Util.java | 89 +- .../external/classad/test/ClassAdToADMTest.java | 89 +- .../asterix/external/library/ClassAdParser.java | 11 +- .../src/test/resources/escapes.txt | 4 + .../src/test/resources/jobads.txt | 34801 +++++++++++------ 5 files changed, 22445 insertions(+), 12549 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/asterixdb/blob/e639c704/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/classad/Util.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/classad/Util.java b/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/classad/Util.java index cbecb1b..789cf1d 100644 --- a/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/classad/Util.java +++ b/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/classad/Util.java @@ -18,32 +18,36 @@ */ package org.apache.asterix.external.classad; +import java.nio.charset.StandardCharsets; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.Date; import java.util.Random; import java.util.TimeZone; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.asterix.om.base.AMutableInt32; public class Util { // convert escapes in-place // the string can only shrink while converting escapes so we can safely convert in-place. - // needs verification + private static final Pattern OCTAL = Pattern.compile("\\\\([0-3][0-7]{0,2})"); + public static boolean convertEscapes(AMutableCharArrayString text) { boolean validStr = true; - if (text.getLength() == 0) + if (text.getLength() == 0) { return true; - int length = text.getLength(); + } int dest = 0; - for (int source = 0; source < length; ++source) { + boolean hasOctal = false; + for (int source = 0; source < text.getLength(); ++source) { char ch = text.charAt(source); // scan for escapes, a terminating slash cannot be an escape - if (ch == '\\' && source < length - 1) { + if (ch == '\\' && source < text.getLength() - 1) { ++source; // skip the \ character ch = text.charAt(source); - // The escape part should be re-validated switch (ch) { case 'b': @@ -66,29 +70,8 @@ public class Util { break; default: if (Lexer.isodigit(ch)) { - int number = ch - '0'; - // There can be up to 3 octal digits in an octal escape - // \[0..3]nn or \nn or \n. We quit at 3 characters or - // at the first non-octal character. - if (source + 1 < length) { - char digit = text.charAt(source + 1); // is the next digit also - if (Lexer.isodigit(digit)) { - ++source; - number = (number << 3) + digit - '0'; - if (number < 0x20 && source + 1 < length) { - digit = text.charAt(source + 1); - if (Lexer.isodigit(digit)) { - ++source; - number = (number << 3) + digit - '0'; - } - } - } - } - if (ch == 0) { // "\\0" is an invalid substring within a string literal - validStr = false; - } - } else { - // pass char after \ unmodified. + hasOctal = true; + ++dest; } break; } @@ -99,40 +82,44 @@ public class Util { // text[dest] = ch; ++dest; } else { - text.erase(dest); - text.setChar(dest, ch); - ++dest; - --source; + try { + text.erase(dest); + text.setChar(dest, ch); + ++dest; + --source; + } catch (Throwable th) { + th.printStackTrace(); + } } } - if (dest < length) { + if (dest < text.getLength()) { text.erase(dest); - length = dest; + text.setLength(dest); } // silly, but to fulfull the original contract for this function // we need to remove the last character in the string if it is a '\0' // (earlier logic guaranteed that a '\0' can ONLY be the last character) - if (length > 0 && !(text.charAt(length - 1) == '\0')) { - //text.erase(length - 1); + if (text.getLength() > 0 && (text.charAt(text.getLength() - 1) == '\0')) { + text.erase(text.getLength() - 1); } + if (hasOctal) { + Matcher m = OCTAL.matcher(text.toString()); + StringBuffer out = new StringBuffer(); + while (m.find()) { + int octet = Integer.parseInt(m.group(1), 8); + if (octet == 0 || octet > 255) { + return false; + } + m.appendReplacement(out, String.valueOf((char) octet)); + } + m.appendTail(out); + text.setValue(new String(out.toString().getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8)); + } + return validStr; } - /*************************************************************** - * Copyright (C) 1990-2007, Condor Team, Computer Sciences Department, - * University of Wisconsin-Madison, WI. - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. You may - * obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ***************************************************************/ - public static Random initialized = new Random((new Date()).getTime()); public static int getRandomInteger() { http://git-wip-us.apache.org/repos/asf/asterixdb/blob/e639c704/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/classad/test/ClassAdToADMTest.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/classad/test/ClassAdToADMTest.java b/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/classad/test/ClassAdToADMTest.java index 493bd3b..a4a64c4 100644 --- a/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/classad/test/ClassAdToADMTest.java +++ b/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/classad/test/ClassAdToADMTest.java @@ -138,6 +138,44 @@ public class ClassAdToADMTest extends TestCase { } /** + * + */ + public void testEscaping() { + try { + ClassAdObjectPool objectPool = new ClassAdObjectPool(); + ClassAd pAd = new ClassAd(objectPool); + String[] files = new String[] { "/escapes.txt" }; + ClassAdParser parser = new ClassAdParser(objectPool); + CharArrayLexerSource lexerSource = new CharArrayLexerSource(); + for (String path : files) { + List<Path> paths = new ArrayList<>(); + paths.add(Paths.get(getClass().getResource(path).toURI())); + FileSystemWatcher watcher = new FileSystemWatcher(paths, null, false); + LocalFSInputStream in = new LocalFSInputStream(watcher); + SemiStructuredRecordReader recordReader = new SemiStructuredRecordReader(in, "[", "]"); + try { + Value val = new Value(objectPool); + while (recordReader.hasNext()) { + val.reset(); + IRawRecord<char[]> record = recordReader.next(); + lexerSource.setNewSource(record.get()); + parser.setLexerSource(lexerSource); + parser.parseNext(pAd); + Assert.assertEquals( + "[ Args = \"â-1 0.1 0.1 0.5 2e-07 0.001 10 -1â\"; GlobalJobId = \"submit-4.chtc.wisc.edu#3724038.0#1462893042\" ]", + pAd.toString()); + } + } finally { + recordReader.close(); + } + } + } catch (Exception e) { + e.printStackTrace(); + assertTrue(false); + } + } + + /** * */ public void testSchemaless() { @@ -153,33 +191,36 @@ public class ClassAdToADMTest extends TestCase { FileSystemWatcher watcher = new FileSystemWatcher(paths, null, false); LocalFSInputStream in = new LocalFSInputStream(watcher); SemiStructuredRecordReader recordReader = new SemiStructuredRecordReader(in, "[", "]"); - Value val = new Value(objectPool); - while (recordReader.hasNext()) { - val.reset(); - IRawRecord<char[]> record = recordReader.next(); - lexerSource.setNewSource(record.get()); - parser.setLexerSource(lexerSource); - parser.parseNext(pAd); - Map<CaseInsensitiveString, ExprTree> attrs = pAd.getAttrList(); - for (Entry<CaseInsensitiveString, ExprTree> entry : attrs.entrySet()) { - ExprTree tree = entry.getValue(); - switch (tree.getKind()) { - case ATTRREF_NODE: - case CLASSAD_NODE: - case EXPR_ENVELOPE: - case EXPR_LIST_NODE: - case FN_CALL_NODE: - case OP_NODE: - break; - case LITERAL_NODE: - break; - default: - System.out.println("Something is wrong"); - break; + try { + Value val = new Value(objectPool); + while (recordReader.hasNext()) { + val.reset(); + IRawRecord<char[]> record = recordReader.next(); + lexerSource.setNewSource(record.get()); + parser.setLexerSource(lexerSource); + parser.parseNext(pAd); + Map<CaseInsensitiveString, ExprTree> attrs = pAd.getAttrList(); + for (Entry<CaseInsensitiveString, ExprTree> entry : attrs.entrySet()) { + ExprTree tree = entry.getValue(); + switch (tree.getKind()) { + case ATTRREF_NODE: + case CLASSAD_NODE: + case EXPR_ENVELOPE: + case EXPR_LIST_NODE: + case FN_CALL_NODE: + case OP_NODE: + break; + case LITERAL_NODE: + break; + default: + System.out.println("Something is wrong"); + break; + } } } + } finally { + recordReader.close(); } - recordReader.close(); } } catch (Exception e) { e.printStackTrace(); http://git-wip-us.apache.org/repos/asf/asterixdb/blob/e639c704/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/ClassAdParser.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/ClassAdParser.java b/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/ClassAdParser.java index 48cbeb2..a5e422c 100644 --- a/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/ClassAdParser.java +++ b/asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/library/ClassAdParser.java @@ -91,10 +91,12 @@ public class ClassAdParser extends AbstractDataParser implements IRecordDataPars private ARecordType recordType; private IObjectPool<IARecordBuilder, ATypeTag> recordBuilderPool = new ListObjectPool<IARecordBuilder, ATypeTag>( new RecordBuilderFactory()); - private IObjectPool<IAsterixListBuilder, ATypeTag> listBuilderPool = new ListObjectPool<IAsterixListBuilder, ATypeTag>( - new ListBuilderFactory()); - private IObjectPool<IMutableValueStorage, ATypeTag> abvsBuilderPool = new ListObjectPool<IMutableValueStorage, ATypeTag>( - new AbvsBuilderFactory()); + private IObjectPool<IAsterixListBuilder, ATypeTag> listBuilderPool = + new ListObjectPool<IAsterixListBuilder, ATypeTag>( + new ListBuilderFactory()); + private IObjectPool<IMutableValueStorage, ATypeTag> abvsBuilderPool = + new ListObjectPool<IMutableValueStorage, ATypeTag>( + new AbvsBuilderFactory()); private final ClassAd rootAd; private String exprPrefix = "expr="; private String exprSuffix = ""; @@ -1574,7 +1576,6 @@ public class ClassAdParser extends AbstractDataParser implements IRecordDataPars } isExpr = false; - // parse the expression parseExpression(tree); if (tree.getInnerTree() == null) { throw new HyracksDataException("parse expression returned empty tree"); http://git-wip-us.apache.org/repos/asf/asterixdb/blob/e639c704/asterixdb/asterix-external-data/src/test/resources/escapes.txt ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-external-data/src/test/resources/escapes.txt b/asterixdb/asterix-external-data/src/test/resources/escapes.txt new file mode 100644 index 0000000..50361e2 --- /dev/null +++ b/asterixdb/asterix-external-data/src/test/resources/escapes.txt @@ -0,0 +1,4 @@ +[ + Args = "\342\200\234-1 0.1 0.1 0.5 2e-07 0.001 10 -1\342\200\235"; + GlobalJobId = "submit-4.chtc.wisc.edu#3724038.0#1462893042"; + ]
