From: Ito Kazumitsu <[EMAIL PROTECTED]> Subject: Re: [cp-patches] RFC: java.util.regex.Matcher#hitEnd() Date: Mon, 07 Aug 2006 02:13:34 +0900 (JST)
> I have made a patch and a mauve test for this, and put it at > http://www.jsdi.or.jp/~maczuka/programs/regex-patched.tar.gz, > but I will be on vacation soon and have no time to prepare an RFC > message. Here it is. ChangeLog: 2006-08-16 Ito Kazumitsu <[EMAIL PROTECTED]> Fixes bug #28412 * gnu/java/util/regex/CharIndexed.java(move1, setHitEnd, hitEnd): New methods. * gnu/java/util/regex/CharIndexedCharSequence.java, gnu/java/util/regex/CharIndexedInputStream.java: Implemented the new methods above. * gnu/java/util/regex/RE.java(REG_FIX_STARTING_POSITION): New flag, (match): call the new method setHitEnd of the input, (getMatchImpl): Handle the new flag REG_FIX_STARTING_POSITION, Some optimization commented out, Use CharIndexed#move1 instead of move. * gnu/java/util/regex/REMatch.java: Made some debugging methods public. * gnu/java/util/regex/REToken.java(match): The method body has been moved to an internal private method, (matchFake): New method, (setHitEnd): New method. * gnu/java/util/regex/RETokenChar.java(matchThis): Call setHitEnd if the match is not complete, (matchOneString): Count the number of characters which matched the pattern. * gnu/java/util/regex/RETokenEnd.java(fake): New field, (setFake): New method, (match): Call super.match or super.matchFake. * gnu/java/util/regex/RETokenEndSub.java(setHitEnd): New method. * gnu/java/util/regex/RETokenOneOf.java(match): call the new method setHitEnd of the input. * gnu/java/util/regex/RETokenRepeated.java(match): Likewise. * java/util/regex/Matcher.java(lookingAt, matches): Use the new flag RE.REG_FIX_STARTING_POSITION, (hitEnd, toString): New methods.
Index: classpath/gnu/java/util/regex/CharIndexed.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/java/util/regex/CharIndexed.java,v retrieving revision 1.1 diff -u -r1.1 CharIndexed.java --- classpath/gnu/java/util/regex/CharIndexed.java 7 Jun 2006 19:30:06 -0000 1.1 +++ classpath/gnu/java/util/regex/CharIndexed.java 16 Aug 2006 00:12:58 -0000 @@ -77,6 +77,13 @@ boolean move(int index); /** + * Shifts the input buffer by a given number of positions. Returns + * true if the new cursor position is valid or cursor position is at + * the end of input. + */ + boolean move1(int index); // I cannot think of a better name for this. + + /** * Returns true if the most recent move() operation placed the cursor * position at a valid position in the input. */ @@ -105,6 +112,16 @@ REMatch getLastMatch(); /** + * Sets the information used for hitEnd(). + */ + void setHitEnd(REMatch match); + + /** + * Returns whether the matcher has hit the end of input. + */ + boolean hitEnd(); + + /** * Returns the anchor. 
*/ int getAnchor(); Index: classpath/gnu/java/util/regex/CharIndexedCharSequence.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/java/util/regex/CharIndexedCharSequence.java,v retrieving revision 1.1 diff -u -r1.1 CharIndexedCharSequence.java --- classpath/gnu/java/util/regex/CharIndexedCharSequence.java 7 Jun 2006 19:30:06 -0000 1.1 +++ classpath/gnu/java/util/regex/CharIndexedCharSequence.java 16 Aug 2006 00:12:58 -0000 @@ -62,6 +62,10 @@ return ((anchor += index) < len); } + public boolean move1(int index) { + return ((anchor += index) <= len); + } + public CharIndexed lookBehind(int index, int length) { if (length > (anchor + index)) length = anchor + index; return new CharIndexedCharSequence(s, anchor + index - length); @@ -77,6 +81,15 @@ lastMatch.anchor = anchor; } public REMatch getLastMatch() { return lastMatch; } + + private int rightmostTriedPosition = 0; + public void setHitEnd(REMatch match) { + int pos = anchor + match.index; + if (pos > rightmostTriedPosition) rightmostTriedPosition = pos; + } + public boolean hitEnd() { return rightmostTriedPosition >= len; } + public int getAnchor() { return anchor; } public void setAnchor(int anchor) { this.anchor = anchor; } + } Index: classpath/gnu/java/util/regex/CharIndexedInputStream.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/java/util/regex/CharIndexedInputStream.java,v retrieving revision 1.1 diff -u -r1.1 CharIndexedInputStream.java --- classpath/gnu/java/util/regex/CharIndexedInputStream.java 7 Jun 2006 19:30:06 -0000 1.1 +++ classpath/gnu/java/util/regex/CharIndexedInputStream.java 16 Aug 2006 00:12:58 -0000 @@ -166,6 +166,16 @@ "difficult to support getLastMatch for an input stream"); } + public void setHitEnd(REMatch match) { + throw new UnsupportedOperationException( + "difficult to support setHitEnd for an input stream"); + } + + public boolean hitEnd() { + 
throw new UnsupportedOperationException( + "difficult to support hitEnd for an input stream"); + } + public int getAnchor() { throw new UnsupportedOperationException( "difficult to support getAnchor for an input stream"); @@ -176,6 +186,10 @@ "difficult to support setAnchor for an input stream"); } + public boolean move1(int index) { + throw new UnsupportedOperationException( + "difficult to support move1 for an input stream"); + } } Index: classpath/gnu/java/util/regex/RE.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/java/util/regex/RE.java,v retrieving revision 1.3 diff -u -r1.3 RE.java --- classpath/gnu/java/util/regex/RE.java 25 Jul 2006 14:34:51 -0000 1.3 +++ classpath/gnu/java/util/regex/RE.java 16 Aug 2006 00:12:59 -0000 @@ -252,6 +252,13 @@ */ public static final int REG_ICASE_USASCII = 0x0800; + /** + * Execution flag. + * Do not move the position at which the search begins. If not set, + * the starting position will be moved until a match is found. + */ + public static final int REG_FIX_STARTING_POSITION = 0x1000; + /** Returns a string representing the version of the gnu.regexp package. */ public static final String version() { return VERSION; @@ -1643,6 +1650,7 @@ /* Implements abstract method REToken.match() */ boolean match(CharIndexed input, REMatch mymatch) { + input.setHitEnd(mymatch); if (firstToken == null) { return next(input, mymatch); } @@ -1720,15 +1728,23 @@ REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) { boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0); + boolean doMove = ((eflags & REG_FIX_STARTING_POSITION) == 0); RE re = (tryEntireMatch ? 
(RE) this.clone() : this); if (tryEntireMatch) { - re.chain(new RETokenEnd(0, null)); + RETokenEnd reEnd = new RETokenEnd(0, null); + reEnd.setFake(true); + re.chain(reEnd); } // Create a new REMatch to hold results REMatch mymatch = new REMatch(numSubs, anchor, eflags); do { + /* The following optimization is commented out because + the matching should be tried even if the length of + input is obviously too short in order that + java.util.regex.Matcher#hitEnd() may work correctly. // Optimization: check if anchor + minimumLength > length if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) { + */ if (re.match(input, mymatch)) { REMatch best = mymatch; // We assume that the match that coms first is the best. @@ -1749,13 +1765,17 @@ input.setLastMatch(best); return best; } - } + /* End of the optimization commented out + } + */ mymatch.clear(++anchor); // Append character to buffer if needed if (buffer != null && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) { buffer.append(input.charAt(0)); } - } while (input.move(1)); + // java.util.regex.Matcher#hitEnd() requires that the search should + // be tried at the end of input, so we use move1(1) instead of move(1) + } while (doMove && input.move1(1)); // Special handling at end of input for e.g. 
"$" if (minimumLength == 0) { Index: classpath/gnu/java/util/regex/REMatch.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/java/util/regex/REMatch.java,v retrieving revision 1.1 diff -u -r1.1 REMatch.java --- classpath/gnu/java/util/regex/REMatch.java 7 Jun 2006 19:30:06 -0000 1.1 +++ classpath/gnu/java/util/regex/REMatch.java 16 Aug 2006 00:12:59 -0000 @@ -307,12 +307,12 @@ } /* The following are used for debugging purpose - static String d(REMatch m) { + public static String d(REMatch m) { if (m == null) return "null"; else return "[" + m.index + "]"; } - String substringUptoIndex(CharIndexed input) { + public String substringUptoIndex(CharIndexed input) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < index; i++) { sb.append(input.charAt(i)); Index: classpath/gnu/java/util/regex/REToken.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/java/util/regex/REToken.java,v retrieving revision 1.1 diff -u -r1.1 REToken.java --- classpath/gnu/java/util/regex/REToken.java 7 Jun 2006 19:30:06 -0000 1.1 +++ classpath/gnu/java/util/regex/REToken.java 16 Aug 2006 00:12:59 -0000 @@ -72,6 +72,16 @@ /** Returns true if the match succeeded, false if it failed. */ boolean match(CharIndexed input, REMatch mymatch) { + return match(input, mymatch, false); + } + boolean matchFake(CharIndexed input, REMatch mymatch) { + return match(input, mymatch, true); + } + + private boolean match(CharIndexed input, REMatch mymatch, boolean fake) { + if (!fake) { + setHitEnd(input, mymatch); + } REMatch m = matchThis(input, mymatch); if (m == null) return false; if (next(input, m)) { @@ -81,6 +91,11 @@ return false; } + /** Sets whether the matching occurs at the end of input */ + void setHitEnd(CharIndexed input, REMatch mymatch) { + input.setHitEnd(mymatch); + } + /** Returns true if the match succeeded, false if it failed. 
* The matching is done against this REToken only. Chained * tokens are not checked. Index: classpath/gnu/java/util/regex/RETokenChar.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/java/util/regex/RETokenChar.java,v retrieving revision 1.1 diff -u -r1.1 RETokenChar.java --- classpath/gnu/java/util/regex/RETokenChar.java 7 Jun 2006 19:30:06 -0000 1.1 +++ classpath/gnu/java/util/regex/RETokenChar.java 16 Aug 2006 00:12:59 -0000 @@ -58,15 +58,20 @@ } REMatch matchThis(CharIndexed input, REMatch mymatch) { - int z = ch.length; if (matchOneString(input, mymatch.index)) { - mymatch.index += z; + mymatch.index += matchedLength; return mymatch; } + // java.util.regex.Matcher#hitEnd() requires that the length of + // partial match be counted. + mymatch.index += matchedLength; + input.setHitEnd(mymatch); return null; } - boolean matchOneString(CharIndexed input, int index) { + private int matchedLength; + private boolean matchOneString(CharIndexed input, int index) { + matchedLength = 0; int z = ch.length; char c; for (int i=0; i<z; i++) { @@ -74,6 +79,7 @@ if (! charEquals(c, ch[i])) { return false; } + ++matchedLength; } return true; } Index: classpath/gnu/java/util/regex/RETokenEnd.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/java/util/regex/RETokenEnd.java,v retrieving revision 1.2 diff -u -r1.2 RETokenEnd.java --- classpath/gnu/java/util/regex/RETokenEnd.java 25 Jul 2006 14:34:51 -0000 1.2 +++ classpath/gnu/java/util/regex/RETokenEnd.java 16 Aug 2006 00:12:59 -0000 @@ -45,6 +45,12 @@ private String newline; private boolean check_java_line_terminators; + /** + * Indicates whether this token is a real one generated at compile time, + * or a fake one temporarily added by RE#getMatchImpl. 
+ */ + private boolean fake = false; + RETokenEnd(int subIndex,String newline) { super(subIndex); this.newline = newline; @@ -57,10 +63,19 @@ this.check_java_line_terminators = b; } + void setFake(boolean fake) { + this.fake = fake; + } + int getMaximumLength() { return 0; } + boolean match(CharIndexed input, REMatch mymatch) { + if (!fake) return super.match(input, mymatch); + return super.matchFake(input, mymatch); + } + REMatch matchThis(CharIndexed input, REMatch mymatch) { char ch = input.charAt(mymatch.index); if (ch == CharIndexed.OUT_OF_BOUNDS) Index: classpath/gnu/java/util/regex/RETokenEndSub.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/java/util/regex/RETokenEndSub.java,v retrieving revision 1.1 diff -u -r1.1 RETokenEndSub.java --- classpath/gnu/java/util/regex/RETokenEndSub.java 7 Jun 2006 19:30:06 -0000 1.1 +++ classpath/gnu/java/util/regex/RETokenEndSub.java 16 Aug 2006 00:12:59 -0000 @@ -58,6 +58,10 @@ return super.findMatch(input, mymatch); } + void setHitEnd(CharIndexed input, REMatch mymatch) { + // Do nothing + } + void dump(StringBuffer os) { // handled by RE // But add something for debugging. 
Index: classpath/gnu/java/util/regex/RETokenOneOf.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/java/util/regex/RETokenOneOf.java,v retrieving revision 1.1 diff -u -r1.1 RETokenOneOf.java --- classpath/gnu/java/util/regex/RETokenOneOf.java 7 Jun 2006 19:30:06 -0000 1.1 +++ classpath/gnu/java/util/regex/RETokenOneOf.java 16 Aug 2006 00:12:59 -0000 @@ -120,6 +120,7 @@ } boolean match(CharIndexed input, REMatch mymatch) { + setHitEnd(input, mymatch); if (matchesOneChar) return matchOneChar(input, mymatch); else return matchOneRE(input, mymatch); } Index: classpath/gnu/java/util/regex/RETokenRepeated.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/java/util/regex/RETokenRepeated.java,v retrieving revision 1.1 diff -u -r1.1 RETokenRepeated.java --- classpath/gnu/java/util/regex/RETokenRepeated.java 7 Jun 2006 19:30:06 -0000 1.1 +++ classpath/gnu/java/util/regex/RETokenRepeated.java 16 Aug 2006 00:12:59 -0000 @@ -318,6 +318,7 @@ } boolean match(CharIndexed input, REMatch mymatch) { + setHitEnd(input, mymatch); REMatch m1 = findMatch(input, mymatch); if (m1 != null) { mymatch.assignFrom(m1); Index: classpath/java/util/regex/Matcher.java =================================================================== RCS file: /cvsroot/classpath/classpath/java/util/regex/Matcher.java,v retrieving revision 1.17 diff -u -r1.17 Matcher.java --- classpath/java/util/regex/Matcher.java 7 Jun 2006 19:30:06 -0000 1.17 +++ classpath/java/util/regex/Matcher.java 16 Aug 2006 00:12:59 -0000 @@ -218,7 +218,7 @@ public boolean lookingAt () { - match = pattern.getRE().getMatch(inputCharIndexed, 0); + match = pattern.getRE().getMatch(inputCharIndexed, 0, RE.REG_FIX_STARTING_POSITION, null); if (match != null) { if (match.getStartIndex() == 0) @@ -243,7 +243,7 @@ */ public boolean matches () { - match = pattern.getRE().getMatch(inputCharIndexed, 0, 
RE.REG_TRY_ENTIRE_MATCH); + match = pattern.getRE().getMatch(inputCharIndexed, 0, RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION, null); if (match != null) { if (match.getStartIndex() == 0) @@ -309,6 +309,28 @@ return match.getStartIndex(group); } + /** + * @return True if and only if the matcher hit the end of input. + */ + public boolean hitEnd() + { + return inputCharIndexed.hitEnd(); + } + + /** + * @return A string expression of this matcher. + */ + public String toString() + { + StringBuilder sb = new StringBuilder(); + sb.append(this.getClass().getName()) + .append("[pattern=").append(pattern.pattern()) + .append(" region=").append("0").append(",").append(input.length()) + .append(" lastmatch=").append(match == null ? "" : match.toString()) + .append("]"); + return sb.toString(); + } + private void assertMatchOp() { if (match == null) throw new IllegalStateException();