This is an automated email from the git hooks/post-receive script. henrich pushed a commit to branch debian/sid in repository jruby-joni.
commit 8dec518cf2da1e338713820b6c93b5bc02ec53cd Author: Hideki Yamane <[email protected]> Date: Sun Feb 16 17:13:59 2014 +0900 Imported Upstream version 2.1.0 --- README.md | 65 ++++++++++++++++++++++++++++ pom.xml | 29 ++++--------- src/org/joni/Analyser.java | 65 ++++++++++++++++++---------- src/org/joni/Lexer.java | 8 +++- src/org/joni/Parser.java | 54 +++++++++++++++++++---- src/org/joni/ScanEnvironment.java | 32 ++++++++++++++ src/org/joni/Syntax.java | 39 +++++++++++++++++ src/org/joni/ast/CClassNode.java | 6 ++- src/org/joni/ast/EncloseNode.java | 1 + src/org/joni/ast/QuantifierNode.java | 7 ++- src/org/joni/constants/SyntaxProperties.java | 1 + src/org/joni/exception/ErrorMessages.java | 1 + 12 files changed, 252 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md new file mode 100644 index 0000000..b51db7a --- /dev/null +++ b/README.md @@ -0,0 +1,65 @@ +joni +==== + +Java port of Oniguruma regexp library + +## Usage + +### Imports + ```java + import org.jcodings.specific.UTF8Encoding; + import org.joni.Matcher; + import org.joni.Option; + import org.joni.Regex; + ``` + +### Matching + + ```java + + byte[] pattern = "a*".getBytes(); + byte[] str = "aaa".getBytes(); + + Regex regex = new Regex(pattern, 0, pattern.length, Option.NONE, UTF8Encoding.INSTANCE); + Matcher matcher = regex.matcher(str); + int result = matcher.search(0, str.length, Option.DEFAULT); + ``` + +### Using captures + + ```java + byte[] pattern = "(a*)".getBytes(); + byte[] str = "aaa".getBytes(); + + Regex regex = new Regex(pattern, 0, pattern.length, Option.NONE, UTF8Encoding.INSTANCE); + Matcher matcher = regex.matcher(str); + int result = matcher.search(0, str.length, Option.DEFAULT); + if (result != -1) { + Region region = matcher.getEagerRegion(); + } + ``` + +### Using named captures + + ```java + byte[] pattern = "(?<name>a*)".getBytes(); + byte[] str = "aaa".getBytes(); + + Regex regex = new Regex(pattern, 0, pattern.length, Option.NONE, UTF8Encoding.INSTANCE); + Matcher matcher = regex.matcher(str); + int result = matcher.search(0, str.length, Option.DEFAULT); + if (result != -1) { + Region region = matcher.getEagerRegion(); + for (Iterator<NameEntry> entry = regex.namedBackrefIterator(); entry.hasNext();) { + NameEntry e = entry.next(); + int number = e.getBackRefs()[0]; // can have many refs per name + // int begin = region.beg[number]; + // int end = region.end[number]; + + } + } + ``` + +## License + +Joni is released under the [MIT License](http://www.opensource.org/licenses/MIT). diff --git a/pom.xml b/pom.xml index a1f09f3..3d4352d 100644 --- a/pom.xml +++ b/pom.xml @@ -1,11 +1,10 @@ <?xml version="1.0" ?> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>org.jruby.joni</groupId> <artifactId>joni</artifactId> <packaging>jar</packaging> - <version>2.0.0</version> + <version>2.1.0</version> <name>Joni</name> <description> Java port of Oniguruma: http://www.geocities.jp/kosako3/oniguruma @@ -16,6 +15,12 @@ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> </properties> + <parent> + <groupId>org.sonatype.oss</groupId> + <artifactId>oss-parent</artifactId> + <version>7</version> + </parent> + <issueManagement> <system>JIRA</system> <url>http://jira.codehaus.org/browse/JRUBY</url> @@ -35,24 +40,6 @@ </license> </licenses> - <distributionManagement> - <repository> - <id>codehaus-jruby-repository</id> - <name>JRuby Central Repository</name> - <url>dav:https://dav.codehaus.org/repository/jruby</url> - </repository> - <snapshotRepository> - <id>codehaus-jruby-snapshot-repository</id> - <name>JRuby Central Development Repository</name> - <url>dav:https://dav.codehaus.org/snapshots.repository/jruby</url> - </snapshotRepository> - <site> - <id>codehaus-jruby-site</id> - <name>JRuby Maven site</name> - <url>dav:https://dav.codehaus.org/jruby/info</url> - </site> - </distributionManagement> - <repositories> <repository> <id>codehaus</id> diff --git a/src/org/joni/Analyser.java b/src/org/joni/Analyser.java index 815a7dd..9665ba6 100644 --- a/src/org/joni/Analyser.java +++ b/src/org/joni/Analyser.java @@ -412,13 +412,19 @@ final class Analyser extends Parser { BackRefNode br = (BackRefNode)node; if (br.isRecursion()) break; - if (br.back[0] > env.numMem) newValueException(ERR_INVALID_BACKREF); - min = getMinMatchLength(env.memNodes[br.back[0]]); + if (br.back[0] > env.numMem) { + if (!syntax.op2OptionECMAScript()) newValueException(ERR_INVALID_BACKREF); + } else { + min = getMinMatchLength(env.memNodes[br.back[0]]); + } for (int i=1; i<br.backNum; i++) { - if (br.back[i] > env.numMem) newValueException(ERR_INVALID_BACKREF); - int tmin = getMinMatchLength(env.memNodes[br.back[i]]); - if (min > tmin) min = tmin; + if (br.back[i] > env.numMem) { + if (!syntax.op2OptionECMAScript()) newValueException(ERR_INVALID_BACKREF); + } else { + int tmin = getMinMatchLength(env.memNodes[br.back[i]]); + if (min > tmin) min = tmin; + } } break; @@ -546,9 +552,12 @@ final class Analyser extends Parser { } for (int i=0; i<br.backNum; i++) { - if (br.back[i] > env.numMem) newValueException(ERR_INVALID_BACKREF); - int tmax = getMaxMatchLength(env.memNodes[br.back[i]]); - if (max < tmax) max = tmax; + if (br.back[i] > env.numMem) { + if(!syntax.op2OptionECMAScript()) newValueException(ERR_INVALID_BACKREF); + } else { + int tmax = getMaxMatchLength(env.memNodes[br.back[i]]); + if (max < tmax) max = tmax; + } } break; @@ -1780,15 +1789,18 @@ final class Analyser extends Parser { case NodeType.BREF: BackRefNode br = (BackRefNode)node; for (int i=0; i<br.backNum; i++) { - if (br.back[i] > env.numMem) newValueException(ERR_INVALID_BACKREF); - env.backrefedMem = bsOnAt(env.backrefedMem, br.back[i]); - env.btMemStart = bsOnAt(env.btMemStart, br.back[i]); - if (Config.USE_BACKREF_WITH_LEVEL) { - if (br.isNestLevel()) { - env.btMemEnd = bsOnAt(env.btMemEnd, br.back[i]); - } - } // USE_BACKREF_AT_LEVEL - ((EncloseNode)env.memNodes[br.back[i]]).setMemBackrefed(); + if (br.back[i] > env.numMem) { + if (!syntax.op2OptionECMAScript()) newValueException(ERR_INVALID_BACKREF); + } else { + env.backrefedMem = bsOnAt(env.backrefedMem, br.back[i]); + env.btMemStart = bsOnAt(env.btMemStart, br.back[i]); + if (Config.USE_BACKREF_WITH_LEVEL) { + if (br.isNestLevel()) { + env.btMemEnd = bsOnAt(env.btMemEnd, br.back[i]); + } + } // USE_BACKREF_AT_LEVEL + ((EncloseNode)env.memNodes[br.back[i]]).setMemBackrefed(); + } } break; @@ -2081,14 +2093,21 @@ final class Analyser extends Parser { Node[]nodes = oenv.scanEnv.memNodes; - int min = getMinMatchLength(nodes[br.back[0]]); - int max = getMaxMatchLength(nodes[br.back[0]]); + int min = 0; + int max = 0; + + if (nodes != null && nodes[br.back[0]] != null) { + min = getMinMatchLength(nodes[br.back[0]]); + max = getMaxMatchLength(nodes[br.back[0]]); + } for (int i=1; i<br.backNum; i++) { - int tmin = getMinMatchLength(nodes[br.back[i]]); - int tmax = getMaxMatchLength(nodes[br.back[i]]); - if (min > tmin) min = tmin; - if (max < tmax) max = tmax; + if (nodes[br.back[i]] != null) { + int tmin = getMinMatchLength(nodes[br.back[i]]); + int tmax = getMaxMatchLength(nodes[br.back[i]]); + if (min > tmin) min = tmin; + if (max < tmax) max = tmax; + } } opt.length.set(min, max); break; diff --git a/src/org/joni/Lexer.java b/src/org/joni/Lexer.java index bc919ad..0b70271 100644 --- a/src/org/joni/Lexer.java +++ b/src/org/joni/Lexer.java @@ -184,7 +184,13 @@ class Lexer extends ScannerSupport { } private void fetchEscapedValueControl() { - if (!left()) newSyntaxException(ERR_END_PATTERN_AT_CONTROL); + if (!left()) { + if (syntax.op2OptionECMAScript()) { + return; + } else { + newSyntaxException(ERR_END_PATTERN_AT_CONTROL); + } + } fetch(); if (c == '?') { c = 0177; diff --git a/src/org/joni/Parser.java b/src/org/joni/Parser.java index 3d56e9e..62208a7 100644 --- a/src/org/joni/Parser.java +++ b/src/org/joni/Parser.java @@ -157,7 +157,7 @@ class Parser extends Lexer { neg = false; } - if (token.type == TokenType.CC_CLOSE) { + if (token.type == TokenType.CC_CLOSE && !syntax.op2OptionECMAScript()) { if (!codeExistCheck(']', true)) newSyntaxException(ERR_EMPTY_CHAR_CLASS); env.ccEscWarn("]"); token.type = TokenType.CHAR; /* allow []...] */ @@ -429,6 +429,9 @@ class Parser extends Lexer { break; case '!': /* preceding read */ node = new AnchorNode(AnchorType.PREC_READ_NOT); + if (syntax.op2OptionECMAScript()) { + env.pushPrecReadNotNode(node); + } break; case '>': /* (?>...) stop backtrack */ node = new EncloseNode(EncloseType.STOP_BACKTRACK); // node_new_enclose @@ -579,10 +582,16 @@ class Parser extends Lexer { if (node.getType() == NodeType.ANCHOR) { AnchorNode an = (AnchorNode) node; an.setTarget(target); + if (syntax.op2OptionECMAScript() && an.type == AnchorType.PREC_READ_NOT) { + env.popPrecReadNotNode(an); + } } else { EncloseNode en = (EncloseNode)node; en.setTarget(target); if (en.type == EncloseType.MEMORY) { + if (syntax.op2OptionECMAScript()) { + en.containingAnchor = env.currentPrecReadNotNode(); + } /* Don't move this to previous of parse_subexp() */ env.setMemNode(en.regNum, node); } @@ -750,13 +759,37 @@ class Parser extends Lexer { break; case BACKREF: - int[]backRefs = token.getBackrefNum() > 1 ? token.getBackrefRefs() : new int[]{token.getBackrefRef1()}; - node = new BackRefNode(token.getBackrefNum(), - backRefs, - token.getBackrefByName(), - token.getBackrefExistLevel(), // #ifdef USE_BACKREF_AT_LEVEL - token.getBackrefLevel(), // ... - env); + if (syntax.op2OptionECMAScript() && token.getBackrefNum() == 1 && env.memNodes != null) { + EncloseNode encloseNode = (EncloseNode) env.memNodes[token.getBackrefRef1()]; + boolean shouldIgnore = false; + if (encloseNode != null && encloseNode.containingAnchor != null) { + shouldIgnore = true; + for (Node anchorNode : env.precReadNotNodes) { + if (anchorNode == encloseNode.containingAnchor) { + shouldIgnore = false; + break; + } + } + } + if (shouldIgnore) { + node = StringNode.EMPTY; + } else { + node = new BackRefNode(token.getBackrefNum(), + new int[]{token.getBackrefRef1()}, + token.getBackrefByName(), + token.getBackrefExistLevel(), // #ifdef USE_BACKREF_AT_LEVEL + token.getBackrefLevel(), // ... + env); + } + } else { + int[]backRefs = token.getBackrefNum() > 1 ? token.getBackrefRefs() : new int[]{token.getBackrefRef1()}; + node = new BackRefNode(token.getBackrefNum(), + backRefs, + token.getBackrefByName(), + token.getBackrefExistLevel(), // #ifdef USE_BACKREF_AT_LEVEL + token.getBackrefLevel(), // ... + env); + } break; @@ -857,6 +890,9 @@ class Parser extends Lexer { while (token.type == TokenType.OP_REPEAT || token.type == TokenType.INTERVAL) { // repeat: if (target.isInvalidQuantifier()) newSyntaxException(ERR_TARGET_OF_REPEAT_OPERATOR_INVALID); + if (syntax.op2OptionECMAScript() && target.getType() == NodeType.QTFR) { + newSyntaxException(ERR_NESTED_REPEAT_NOT_ALLOWED); + } QuantifierNode qtfr = new QuantifierNode(token.getRepeatLower(), token.getRepeatUpper(), token.type == TokenType.INTERVAL); @@ -871,7 +907,7 @@ class Parser extends Lexer { qn = en; } - if (ret == 0) { + if (ret == 0 || (syntax.op2OptionECMAScript() && ret == 1)) { target = qn; } else if (ret == 2) { /* split case: /abc+/ */ target = ConsAltNode.newListNode(target, null); diff --git a/src/org/joni/ScanEnvironment.java b/src/org/joni/ScanEnvironment.java index 02a1ad7..0dbce24 100644 --- a/src/org/joni/ScanEnvironment.java +++ b/src/org/joni/ScanEnvironment.java @@ -55,6 +55,9 @@ public final class ScanEnvironment { int currMaxRegNum; boolean hasRecursion; + int numPrecReadNotNodes; + Node precReadNotNodes[]; + public ScanEnvironment(Regex regex, Syntax syntax) { this.reg = regex; option = regex.options; @@ -80,6 +83,9 @@ public final class ScanEnvironment { combExpMaxRegNum = 0; currMaxRegNum = 0; hasRecursion = false; + + numPrecReadNotNodes = 0; + precReadNotNodes = null; } public int addMemEntry() { @@ -102,6 +108,32 @@ public final class ScanEnvironment { } } + public void pushPrecReadNotNode(Node node) { + numPrecReadNotNodes++; + if (precReadNotNodes == null) { + precReadNotNodes = new Node[SCANENV_MEMNODES_SIZE]; + } else if (numPrecReadNotNodes >= precReadNotNodes.length) { + Node[]tmp = new Node[precReadNotNodes.length << 1]; + System.arraycopy(precReadNotNodes, 0, tmp, 0, precReadNotNodes.length); + precReadNotNodes = tmp; + } + precReadNotNodes[numPrecReadNotNodes - 1] = node; + } + + public void popPrecReadNotNode(Node node) { + if (precReadNotNodes != null && precReadNotNodes[numPrecReadNotNodes - 1] == node) { + precReadNotNodes[numPrecReadNotNodes - 1] = null; + numPrecReadNotNodes--; + } + } + + public Node currentPrecReadNotNode() { + if (numPrecReadNotNodes > 0) { + return precReadNotNodes[numPrecReadNotNodes - 1]; + } + return null; + } + public int convertBackslashValue(int c) { if (syntax.opEscControlChars()) { switch (c) { diff --git a/src/org/joni/Syntax.java b/src/org/joni/Syntax.java index 74662a8..4e7b5e7 100644 --- a/src/org/joni/Syntax.java +++ b/src/org/joni/Syntax.java @@ -278,6 +278,10 @@ public final class Syntax implements SyntaxProperties{ return isOp2(OP2_INEFFECTIVE_ESCAPE); } + public boolean op2OptionECMAScript() { + return isOp2(OP2_OPTION_ECMASCRIPT); + } + /** * BEHAVIOR * @@ -603,4 +607,39 @@ public final class Syntax implements SyntaxProperties{ INEFFECTIVE_META_CHAR /* anychar anytime */ ) ); + + public static final Syntax ECMAScript = new Syntax( + (( GNU_REGEX_OP | OP_QMARK_NON_GREEDY | + OP_ESC_OCTAL3 | OP_ESC_X_HEX2 | + OP_ESC_CONTROL_CHARS | OP_ESC_C_CONTROL | + OP_DECIMAL_BACKREF | OP_ESC_D_DIGIT | + OP_ESC_S_WHITE_SPACE | OP_ESC_W_WORD ) + & ~OP_ESC_LTGT_WORD_BEGIN_END ), + + ( OP2_ESC_CAPITAL_Q_QUOTE | + OP2_QMARK_GROUP_EFFECT | OP2_OPTION_PERL | + OP2_ESC_P_BRACE_CHAR_PROPERTY | + OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | + OP2_ESC_U_HEX4 | OP2_ESC_V_VTAB | + OP2_OPTION_ECMASCRIPT ), + + ( CONTEXT_INDEP_ANCHORS | + CONTEXT_INDEP_REPEAT_OPS | + CONTEXT_INVALID_REPEAT_OPS | + ALLOW_INVALID_INTERVAL | + BACKSLASH_ESCAPE_IN_CC | + ALLOW_DOUBLE_RANGE_OP_IN_CC | + DIFFERENT_LEN_ALT_LOOK_BEHIND ), + + Option.NONE, + + new MetaCharTable( + '\\', /* esc */ + INEFFECTIVE_META_CHAR, /* anychar '.' */ + INEFFECTIVE_META_CHAR, /* anytime '*' */ + INEFFECTIVE_META_CHAR, /* zero or one time '?' */ + INEFFECTIVE_META_CHAR, /* one or more time '+' */ + INEFFECTIVE_META_CHAR /* anychar anytime */ + ) + ); } diff --git a/src/org/joni/ast/CClassNode.java b/src/org/joni/ast/CClassNode.java index 7927023..6dd5d7f 100644 --- a/src/org/joni/ast/CClassNode.java +++ b/src/org/joni/ast/CClassNode.java @@ -335,8 +335,12 @@ public final class CClassNode extends Node { if (Config.NON_UNICODE_SDW) { switch(ctype) { - case CharacterType.D: case CharacterType.S: + if (!not && env.syntax.op2OptionECMAScript()) { + // treat \u2028 and \u2029 as whitespace + addCodeRange(env, 8232, 8233); + } + case CharacterType.D: case CharacterType.W: ctype ^= CharacterType.SPECIAL_MASK; if (not) { diff --git a/src/org/joni/ast/EncloseNode.java b/src/org/joni/ast/EncloseNode.java index 0a07ed1..7c45d14 100644 --- a/src/org/joni/ast/EncloseNode.java +++ b/src/org/joni/ast/EncloseNode.java @@ -34,6 +34,7 @@ public final class EncloseNode extends StateNode implements EncloseType { public int maxLength; // OnigDistance public int charLength; public int optCount; // referenced count in optimize_node_left() + public Node containingAnchor; // // node_new_enclose / onig_node_new_enclose public EncloseNode(int type) { diff --git a/src/org/joni/ast/QuantifierNode.java b/src/org/joni/ast/QuantifierNode.java index 8ec53cb..d75aa27 100644 --- a/src/org/joni/ast/QuantifierNode.java +++ b/src/org/joni/ast/QuantifierNode.java @@ -200,7 +200,12 @@ public final class QuantifierNode extends StateNode { } public int setQuantifier(Node tgt, boolean group, ScanEnvironment env, byte[]bytes, int p, int end) { - if (lower == 1 && upper == 1) return 1; + if (lower == 1 && upper == 1) { + if (env.syntax.op2OptionECMAScript()) { + setTarget(tgt); + } + return 1; + } switch(tgt.getType()) { diff --git a/src/org/joni/constants/SyntaxProperties.java b/src/org/joni/constants/SyntaxProperties.java index 61f2269..075324c 100644 --- a/src/org/joni/constants/SyntaxProperties.java +++ b/src/org/joni/constants/SyntaxProperties.java @@ -74,6 +74,7 @@ public interface SyntaxProperties { /* final int OP2_CHAR_PROPERTY_PREFIX_IS = (1<<18); */ final int OP2_ESC_H_XDIGIT = (1<<19); /* \h, \H */ final int OP2_INEFFECTIVE_ESCAPE = (1<<20); /* \ */ + final int OP2_OPTION_ECMASCRIPT = (1<<21); /* EcmaScript quirks */ /* syntax (behavior); */ final int CONTEXT_INDEP_ANCHORS = (1<<31); /* not implemented */ diff --git a/src/org/joni/exception/ErrorMessages.java b/src/org/joni/exception/ErrorMessages.java index f490713..683ff62 100644 --- a/src/org/joni/exception/ErrorMessages.java +++ b/src/org/joni/exception/ErrorMessages.java @@ -54,6 +54,7 @@ public interface ErrorMessages extends org.jcodings.exception.ErrorMessages { final String ERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS = "unmatched range specifier in char-class"; final String ERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED = "target of repeat operator is not specified"; final String ERR_TARGET_OF_REPEAT_OPERATOR_INVALID = "target of repeat operator is invalid"; + final String ERR_NESTED_REPEAT_NOT_ALLOWED = "nested repeat is not allowed"; final String ERR_NESTED_REPEAT_OPERATOR = "nested repeat operator"; final String ERR_UNMATCHED_CLOSE_PARENTHESIS = "unmatched close parenthesis"; final String ERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS = "end pattern with unmatched parenthesis"; -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-java/jruby-joni.git _______________________________________________ pkg-java-commits mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/pkg-java-commits

