This is an automated email from the ASF dual-hosted git repository.
elharo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/xerces-j.git
The following commit(s) were added to refs/heads/main by this push:
new dcacb2034 [XERCESJ-1781] Javadoc fixes in org.apache.xerces.impl (#41)
dcacb2034 is described below
commit dcacb2034e50cbd3f21f0547f907b65cf5264af8
Author: Samael <[email protected]>
AuthorDate: Wed Nov 5 18:01:46 2025 +0000
[XERCESJ-1781] Javadoc fixes in org.apache.xerces.impl (#41)
* [XERCESJ-1781] Javadoc fixes in org.apache.xerces.impl
---
.../impl/xpath/regex/ParserForXMLSchema.java | 10 +-
src/org/apache/xerces/impl/xpath/regex/REUtil.java | 13 +
.../apache/xerces/impl/xpath/regex/RangeToken.java | 2 +-
.../xerces/impl/xpath/regex/RegexParser.java | 16 +
.../xerces/impl/xpath/regex/RegularExpression.java | 610 ++++++++++-----------
5 files changed, 332 insertions(+), 319 deletions(-)
diff --git a/src/org/apache/xerces/impl/xpath/regex/ParserForXMLSchema.java
b/src/org/apache/xerces/impl/xpath/regex/ParserForXMLSchema.java
index fca38bb07..1ed862986 100644
--- a/src/org/apache/xerces/impl/xpath/regex/ParserForXMLSchema.java
+++ b/src/org/apache/xerces/impl/xpath/regex/ParserForXMLSchema.java
@@ -152,6 +152,7 @@ class ParserForXMLSchema extends RegexParser {
/**
* Parses a character-class-expression, not a character-class-escape.
*
+ * <pre>
* c-c-expression ::= '[' c-group ']'
* c-group ::= positive-c-group | negative-c-group |
c-c-subtraction
* positive-c-group ::= (c-range | c-c-escape)+
@@ -162,9 +163,12 @@ class ParserForXMLSchema extends RegexParser {
* single-range ::= multi-c-escape | category-c-escape |
block-c-escape | <any XML char>
* cc-normal-c ::= <any character except [, ], \>
* from-to-range ::= cc-normal-c '-' cc-normal-c
+ * </pre>
*
- * @param useNrage Ignored.
- * @return This returns no NrageToken.
+ * @param useNrange ignored
+ * @return returns no NRANGE token
+ * @throws ParseException if regex does not conform to the syntax
+ * @see Token#NRANGE
*/
protected RangeToken parseCharacterClass(boolean useNrange) throws
ParseException {
this.setContext(S_INBRACKETS);
@@ -184,7 +188,7 @@ class ParserForXMLSchema extends RegexParser {
}
int type;
boolean firstloop = true;
- while ((type = this.read()) != T_EOF) { // Don't use 'cotinue' for
this loop.
+ while ((type = this.read()) != T_EOF) { // Don't use 'continue' for
this loop.
wasDecoded = false;
// single-range | from-to-range | subtraction
diff --git a/src/org/apache/xerces/impl/xpath/regex/REUtil.java
b/src/org/apache/xerces/impl/xpath/regex/REUtil.java
index 2c547323c..f8a594412 100644
--- a/src/org/apache/xerces/impl/xpath/regex/REUtil.java
+++ b/src/org/apache/xerces/impl/xpath/regex/REUtil.java
@@ -95,6 +95,13 @@ public final class REUtil {
return ret;
}
+ /**
+ * Parses the regular expression options.
+ *
+ * @param opts a string of regular expression options consisting of "i"
"m" "s" "u" "w" "," "X" or null
+ * @return an int equivalent of the regular expression options
+ * @throws ParseException if regex is not conforming to the syntax
+ */
static final int parseOptions(String opts) throws ParseException {
if (opts == null) return 0;
int options = 0;
@@ -107,6 +114,12 @@ public final class REUtil {
return options;
}
+ /**
+ * Creates a string representation of regular expression options from an
int.
+ *
+ * @param options an int value representation of regular expression options
+ * @return a string representation of regular expression options
+ */
static final String createOptionString(int options) {
StringBuffer sb = new StringBuffer(9);
if ((options & RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION)
!= 0)
diff --git a/src/org/apache/xerces/impl/xpath/regex/RangeToken.java
b/src/org/apache/xerces/impl/xpath/regex/RangeToken.java
index 060eb6b31..5dc8cf9f7 100644
--- a/src/org/apache/xerces/impl/xpath/regex/RangeToken.java
+++ b/src/org/apache/xerces/impl/xpath/regex/RangeToken.java
@@ -317,7 +317,7 @@ final class RangeToken extends Token implements
java.io.Serializable {
}
/**
- * @param tok Ignore whether it is NRANGE or not.
+ * @param token ignore whether it is NRANGE or not
*/
protected void intersectRanges(Token token) {
RangeToken tok = (RangeToken)token;
diff --git a/src/org/apache/xerces/impl/xpath/regex/RegexParser.java
b/src/org/apache/xerces/impl/xpath/regex/RegexParser.java
index 2da0e14c4..234221932 100644
--- a/src/org/apache/xerces/impl/xpath/regex/RegexParser.java
+++ b/src/org/apache/xerces/impl/xpath/regex/RegexParser.java
@@ -84,10 +84,21 @@ class RegexParser {
public RegexParser() {
this.setLocale(Locale.getDefault());
}
+
+ /**
+ * Creates a new RegexParser with the desired locale.
+ *
+ * @param locale the desired locale or null
+ */
public RegexParser(Locale locale) {
this.setLocale(locale);
}
+ /**
+ * Sets the locale for the regex parser.
+ *
+ * @param locale the desired locale or null
+ */
public void setLocale(Locale locale) {
try {
if (locale != null) {
@@ -836,6 +847,11 @@ class RegexParser {
return tok;
}
+ /**
+ * @param c char data
+ * @return a {@link RangeToken}
+ * @throws ParseException if regex does not conform to the syntax
+ */
protected RangeToken processBacksolidus_pP(int c) throws ParseException {
this.next();
diff --git a/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java
b/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java
index 85e54ee97..78ceb0032 100644
--- a/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java
+++ b/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java
@@ -27,223 +27,212 @@ import org.apache.xerces.util.IntStack;
* A regular expression matching engine using Non-deterministic Finite
Automaton (NFA).
* This engine does not conform to the POSIX regular expression.
*
- * <hr width="50%">
* <h3>How to use</h3>
*
* <dl>
* <dt>A. Standard way
* <dd>
* <pre>
- * RegularExpression re = new RegularExpression(<var>regex</var>);
+ * {@code
+ * RegularExpression re = new RegularExpression(regex);
* if (re.matches(text)) { ... }
+ * }
* </pre>
*
* <dt>B. Capturing groups
* <dd>
* <pre>
- * RegularExpression re = new RegularExpression(<var>regex</var>);
+ * {@code
+ * RegularExpression re = new RegularExpression(regex);
* Match match = new Match();
* if (re.matches(text, match)) {
* ... // You can refer to captured texts with methods of the
Match class.
* }
+ * }
* </pre>
*
* </dl>
*
* <h4>Case-insensitive matching</h4>
* <pre>
+ * {@code
* RegularExpression re = new RegularExpression(regex, "i");
* if (re.matches(text)) { ... }
+ * }
* </pre>
*
* <h4>Options</h4>
- * <p>You can specify options to <a href="#RegularExpression(java.lang.String,
java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>,
</code><var>options</var><code>)</code></a>
- * or <a href="#setPattern(java.lang.String,
java.lang.String)"><code>setPattern(</code><var>regex</var><code>,
</code><var>options</var><code>)</code></a>.
- * This <var>options</var> parameter consists of the following characters.
- * </p>
- * <dl>
- * <dt><a name="I_OPTION"><code>"i"</code></a>
- * <dd>This option indicates case-insensitive matching.
- * <dt><a name="M_OPTION"><code>"m"</code></a>
- * <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL
characters within the text.
- * <dt><a name="S_OPTION"><code>"s"</code></a>
- * <dd class="REGEX"><kbd>.</kbd> matches any one character.
- * <dt><a name="U_OPTION"><code>"u"</code></a>
- * <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \< \></kbd>
as becoming to Unicode.
- * <dt><a name="W_OPTION"><code>"w"</code></a>
- * <dd class="REGEX">By this option, <kbd>\b \B \< \></kbd> are processed
with the method of
- * 'Unicode Regular Expression Guidelines' Revision 4.
- * When "w" and "u" are specified at the same time,
- * <kbd>\b \B \< \></kbd> are processed for the "w" option.
- * <dt><a name="COMMA_OPTION"><code>","</code></a>
- * <dd>The parser treats a comma in a character class as a range separator.
- * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or
<kbd>b</kbd> without this option.
- * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd>
with this option.
- *
- * <dt><a name="X_OPTION"><code>"X"</code></a>
- * <dd class="REGEX">
- * By this option, the engine confoms to <a
href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema:
Regular Expression</a>.
- * The <code>match()</code> method does not do subsring matching
- * but entire string matching.
+ * <p>You can specify options to {@link #RegularExpression(String, String)} or
{@link #setPattern(String, String)}.</p>
+ * <p>This <code>options</code> parameter consists of the following
characters:</p>
+ * <ul>
+ * <li><code>i</code> : This option indicates case-insensitive matching.</li>
+ * <li><code>m</code> : <code>^</code> and <code>$</code> consider the EOL
characters within the text.</li>
+ * <li><code>s</code> : <code>.</code> matches any one character.</li>
+ * <li><code>u</code> : Redefines <code>\d \D \w \W \s \S \b \B \<
\></code> as being Unicode.</li>
+ * <li><code>w</code> : With this option, <code>\b \B \< \></code> are
processed with the method of 'Unicode Regular Expression Guidelines' Revision
4. When "w" and "u" are specified at the same time, <code>\b \B \< \></code>
are processed for the "w" option.</li>
+ * <li><code>,</code> : The parser treats a comma in a character class as a
range separator.
+ * <ul>
+ * <li><code>[a,b]</code> matches <code>a</code> or <code>,</code> or
<code>b</code> without this option.</li>
+ * <li><code>[a,b]</code> matches <code>a</code> or <code>b</code> with
this option.</li>
+ * </ul>
+ * </li>
+ * <li><code>X</code> : With this option, the engine conforms to <a
href="https://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema:
Regular Expression</a>. The <code>match()</code> method does not do substring
matching but entire string matching.</li>
+ * </ul>
*
- * </dl>
- *
- * <hr width="50%">
* <h3>Syntax</h3>
- * <table border="1" bgcolor="#ddeeff">
- * <tr>
- * <td>
- * <h4>Differences from the Perl 5 regular expression</h4>
- * <ul>
- * <li>There is 6-digit hexadecimal character representation
(<kbd>\u005cv</kbd><var>HHHHHH</var>.)
- * <li>Supports subtraction, union, and intersection operations for
character classes.
- * <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character
representations),
- * <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
- * <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
- * <kbd>\E</kbd>, <kbd>\Q</kbd>,
<kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
- * <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>,
<Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
- * </ul>
- * </td>
- * </tr>
- * </table>
*
- * <p>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</p>
+ * <h4>Differences from Perl 5 regular expression</h4>
+ * <ul>
+ * <li>There is 6-digit hexadecimal character representation
(<code>\vHHHHHH</code>).
+ * <li>Supports subtraction, union, and intersection operations for character
classes.
+ * <li>Not supported:
+ * <ul>
+ * <li><code>\ooo</code> (Octal character representations)</li>
+ * <li><code>\G</code>, <code>\C</code>, <code>\lc</code></li>
+ * <li><code>\u005cuc</code>, <code>\L</code>, <code>\U</code></li>
+ * <li><code>\E</code>, <code>\Q</code>, <code>\N{name}</code></li>
+ * <li><code>(?{code})</code>, <code>(??{code})</code></li>
+ * </ul>
+ * </ul>
+ *
+ * <p>Meta characters are <code>. * + ? { [ ( ) | \ ^ $</code>.</p>
* <ul>
* <li>Character
* <dl>
- * <dt class="REGEX"><kbd>.</kbd> (A period)
+ * <dt class="REGEX"><code>.</code> (A period)
* <dd>Matches any one character except the following characters.
- * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
- * PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
+ * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D), PARAGRAPH SEPARATOR
(U+2029), LINE SEPARATOR (U+2028)
* <dd>This expression matches one code point in Unicode. It can match a
pair of surrogates.
* <dd>When the <code>s</code> option is specified,
* it matches any character including the above four characters.
*
- * <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
+ * <dt class="REGEX"><code>\e \f \n \r \t</code>
* <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
* CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
*
- * <dt class="REGEX"><kbd>\c</kbd><var>C</var>
+ * <dt class="REGEX"><code>\cC</code>
* <dd>Matches a control character.
- * The <var>C</var> must be one of '<kbd>@</kbd>',
'<kbd>A</kbd>'-'<kbd>Z</kbd>',
- * '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>',
'<kbd>^</kbd>', '<kbd>_</kbd>'.
- * It matches a control character of which the character code is
less than
- * the character code of the <var>C</var> by 0x0040.
- * <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED
(U+000A),
- * and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
+ * The <var>C</var> must be one of '<code>@</code>',
'<code>A</code>'-'<code>Z</code>',
+ * '<code>[</code>', '<code>\</code>', '<code>]</code>',
'<code>^</code>', '<code>_</code>'.
+ * It matches a control character of which the character code is
less than the character code of
+ * the <var>C</var> by 0x0040.
+ * <dd class="REGEX">For example, a <code>\cJ</code> matches a LINE FEED
(U+000A),
+ * and a <code>\c[</code> matches an ESCAPE (U+001B).
*
* <dt class="REGEX">a non-meta character
* <dd>Matches the character.
*
- * <dt class="REGEX"><KBD>\</KBD> + a meta character
+ * <dt class="REGEX"><code>\</code> + a meta character
* <dd>Matches the meta character.
*
- * <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var>
<kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
- * <dd>Matches a character of which code point is <var>HH</var>
(Hexadecimal) in Unicode.
- * You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>,
and
- * variable length digits for
<kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
+ * <dt class="REGEX"><code>\xHH</code> <code>\x{HHHH}</code>
+ * <dd>Matches a character of which code point is <var>HH</var>
(Hexadecimal) in Unicode. You can write
+ * just 2 digits for <code>\xHH</code>, and variable length digits
for <code>\x{HHHH}</code>.
*
- * <!--
- * <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var>
- * <dd>Matches a character of which code point is <var>HHHH</var>
(Hexadecimal) in Unicode.
- * -->
- *
- * <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
+ * <dt><code>\vHHHHHH</code>
* <dd>Matches a character of which code point is <var>HHHHHH</var>
(Hexadecimal) in Unicode.
*
- * <dt class="REGEX"><kbd>\g</kbd>
+ * <dt class="REGEX"><code>\g</code>
* <dd>Matches a grapheme.
- * <dd class="REGEX">It is equivalent to
<kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
+ * <dd class="REGEX">It is equivalent to
<code>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</code>
*
- * <dt class="REGEX"><kbd>\X</kbd>
- * <dd class="REGEX">Matches a combining character sequence.
- * It is equivalent to <kbd>(?:\PM\pM*)</kbd>
+ * <dt class="REGEX"><code>\X</code>
+ * <dd class="REGEX">Matches a combining character sequence. It is
equivalent to <code>(?:\PM\pM*)</code>
* </dl>
* </li>
*
* <li>Character class
* <dl>
-+ * <dt
class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd>
(without <a href="#COMMA_OPTION">"," option</a>)
-+ * <dt
class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd>
(with <a href="#COMMA_OPTION">"," option</a>)
++ * <dt
class="REGEX">[<var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var>]
(without a {@link #SPECIAL_COMMA} option)</dt>
++ * <dt
class="REGEX">[<var>R<sub>1</sub></var>,<var>R<sub>2</sub></var>,<var>...</var>,<var>R<sub>n</sub></var>]
(with a {@link #SPECIAL_COMMA} option)</dt>
* <dd>Positive character class. It matches a character in ranges.
* <dd><var>R<sub>n</sub></var>:
* <ul>
- * <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd>
<kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
<!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
- * <p>This range matches the character.
- * <li
class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
- * <p>This range matches a character which has a code point that
is >= <var>C<sub>1</sub></var>'s code point and <=
<var>C<sub>2</sub></var>'s code point.
-+ * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:]
[:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:]
[:space:] [:upper:] [:xdigit:]</kbd>,
-+ * and negative POSIX character classes in Perl like
<kbd>[:^alpha:]</kbd>
- * <p>...
- * <li class="REGEX"><kbd>\d \D \s \S \w \W
\p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
- * <p>These expressions specifies the same ranges as the following
expressions.
+ * <li class="REGEX">A character (including <code>\e \f \n \r \t \xHH
\x{HHHH} \vHHHHHH</code>)
+ * <p>This range matches the character.</p>
+ * </li>
+ * <li class="REGEX"><var>C<sub>1</sub></var>-<var>C<sub>2</sub></var>
+ * <p>This range matches a character which has a code point that
is >= <var>C1</var>'s code point and <= <var>C2</var>'s code point.</p>
+ * </li>
++ * <li class="REGEX">A POSIX character class: <code>[:alpha:]
[:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:]
[:space:] [:upper:] [:xdigit:]</code>,
++ * and negative POSIX character classes in Perl like
<code>[:^alpha:]</code></li>
+ * <li class="REGEX"><code>\d \D \s \S \w \W \p{name} \P{name}</code>
+ * <p>These expressions specify the same ranges as the following
expressions.</p>
+ * </li>
* </ul>
- * <p class="REGEX">Enumerated ranges are merged (union operation).
- * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
+ * <p>Enumerated ranges are merged (union operation).
<code>[a-ec-z]</code> is equivalent to <code>[a-z]</code></p>
+ * </dd>
*
- * <dt
class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd>
(without a <a href="#COMMA_OPTION">"," option</a>)
- * <dt
class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd>
(with a <a href="#COMMA_OPTION">"," option</a>)
- * <dd>Negative character class. It matches a character not in ranges.
+ * <dt
class="REGEX">[^<var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var>]
(without a {@link #SPECIAL_COMMA} option)</dt>
+ * <dt
class="REGEX">[^<var>R<sub>1</sub></var>,<var>R<sub>2</sub></var>,<var>...</var>,<var>R<sub>n</sub></var>]
(with a {@link #SPECIAL_COMMA} option)</dt>
+ * <dd>Negative character class. It matches a character not in
ranges.</dd>
*
- * <dt
class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd>
... <Kbd>)</kbd>
- * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
+ * <dt class="REGEX"><code>(?[ranges]op[ranges]op[ranges] ... )</code>
+ * (where <var>op</var> is <code>-</code>, <code>+</code> or
<code>&</code>.)
+ * </dt>
* <dd>Subtraction or union or intersection for character classes.
- * <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[CF])</kbd> is equivalent
to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is
equivalent to <kbd>[A-JL-Z]</kbd>.
- * <dd>The result of this operations is a <u>positive character class</u>
+ * <p>For example, <code>(?[A-Z]-[CF])</code> is equivalent to
<code>[A-BD-EG-Z]</code>, and <code>(?[0x00-0x7f]-[K]&[\p{Lu}])</code> is
equivalent to <code>[A-JL-Z]</code>.</p>
+ * <p>The result of this operation is a <u>positive character class</u>
* even if an expression includes any negative character classes.
- * You have to take care on this in case-insensitive matching.
- * For instance, <kbd>(?[^b])</kbd> is equivalent to
<kbd>[\x00-ac-\x{10ffff}]</kbd>,
- * which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
- * But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any
character because
- * it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches
'<kbd>b</kbd>'
- * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
+ * You have to take care of this in case-insensitive matching.
+ * For instance, <code>(?[^b])</code> is equivalent to
<code>[\x00-ac-\x{10ffff}]</code>,
+ * which is equivalent to <code>[^b]</code> in case-sensitive
matching.
+ * But, in case-insensitive matching, <code>(?[^b])</code> matches
any character because
+ * it includes '<code>B</code>' and '<code>B</code>' matches
'<code>b</code>'
+ * though <code>[^b]</code> is processed as <code>[^Bb]</code>.</p>
+ * </dd>
*
- * <dt
class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd>
(with an <a href="#X_OPTION">"X" option</a>)</dt>
+ * <dt
class="REGEX">[<var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var>-[<var>R<sub>n</sub></var><var>R<sub>n+1</sub>...</var>]]
(with an <code>X</code> option; {@link #XMLSCHEMA_MODE})</dt>
* <dd>Character class subtraction for the XML Schema.
- * You can use this syntax when you specify an <a
href="#X_OPTION">"X" option</a>.
- *
- * <dt class="REGEX"><kbd>\d</kbd>
- * <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
- * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is
equivalent to
- * <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
+ * You can use this syntax when you specify an <code>X</code> option
({@link #XMLSCHEMA_MODE}).
+ * </dd>
+ *
+ * <dt class="REGEX"><code>\d</code></dt>
+ * <dd class="REGEX">Equivalent to <code>[0-9]</code>.
+ * <p>When a <code>u</code> ({@link #USE_UNICODE_CATEGORY}) option is
set, it is equivalent to
+ * <code>\p{Nd}</code>.</p>
+ * </dd>
*
- * <dt class="REGEX"><kbd>\D</kbd>
- * <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
- * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is
equivalent to
- * <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
+ * <dt class="REGEX"><code>\D</code></dt>
+ * <dd class="REGEX">Equivalent to <code>[^0-9]</code>
+ * <p>When a <code>u</code> ({@link #USE_UNICODE_CATEGORY}) option is
set, it is equivalent to
+ * <code>\P{Nd}</code>.</p>
+ * </dd>
*
- * <dt class="REGEX"><kbd>\s</kbd>
- * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
- * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is
equivalent to
- * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
+ * <dt class="REGEX"><code>\s</code></dt>
+ * <dd class="REGEX">Equivalent to <code>[ \f\n\r\t]</code>
+ * <dd>When a <code>u</code> ({@link #USE_UNICODE_CATEGORY}) option is
set, it is equivalent to
+ * <code>[ \f\n\r\t\p{Z}]</code>.
*
- * <dt class="REGEX"><kbd>\S</kbd>
- * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
- * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is
equivalent to
- * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
+ * <dt class="REGEX"><code>\S</code></dt>
+ * <dd class="REGEX">Equivalent to <code>[^ \f\n\r\t]</code>
+ * <p>When a <code>u</code> ({@link #USE_UNICODE_CATEGORY}) option is
set, it is equivalent to
+ * <code>[^ \f\n\r\t\p{Z}]</code>.</p>
+ * </dd>
*
- * <dt class="REGEX"><kbd>\w</kbd>
- * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
- * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is
equivalent to
- * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
+ * <dt class="REGEX"><code>\w</code></dt>
+ * <dd class="REGEX">Equivalent to <code>[a-zA-Z0-9_]</code>
+ * <p>When a <code>u</code> ({@link #USE_UNICODE_CATEGORY}) option is
set, it is equivalent to
+ * <code>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</code>.</p>
+ * </dd>
*
- * <dt class="REGEX"><kbd>\W</kbd>
- * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
- * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is
equivalent to
- * <span
class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
+ * <dt class="REGEX"><code>\W</code></dt>
+ * <dd class="REGEX">Equivalent to <code>[^a-zA-Z0-9_]</code>
+ * <p>When a <code>u</code> ({@link #USE_UNICODE_CATEGORY}) option is
set, it is equivalent to
+ * <code>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</code>.</p>
+ * </dd>
*
- * <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
- * <dd>Matches one character in the specified General Category (the
second field in <a
href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>)
or the specified <a
href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
- * The following names are available:
+ * <dt class="REGEX"><code>\p{name}</code></dt>
+ * <dd>Matches one character in the specified General Category (the
second field in <a
href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt">UnicodeData.txt</a>)
or the specified <a
href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.</dd>
+ * <dd>The following names are available:
* <dl>
- * <dt>Unicode General Categories:
- * <dd><kbd>
- * L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs,
Zl, Zp,
- * Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
- * </kbd>
- * <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
- * <dt>Unicode Blocks:
- * <dd><kbd>
+ * <dt>Unicode General Categories:</dt>
+ * <dd><code>L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd,
Nl, No, Zs, Zl, Zp, Cc, Cf, Cn,
+ * Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So</code>
+ * </dd>
+ * <dd>(Currently the Cn category includes U+10000-U+10FFFF
characters)</dd>
+ * <dt>Unicode Blocks:</dt>
+ * <dd>
* Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
* IPA Extensions, Spacing Modifier Letters, Combining Diacritical
Marks, Greek,
* Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi,
Gujarati,
@@ -261,175 +250,137 @@ import org.apache.xerces.util.IntStack;
* Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility
Forms,
* Small Form Variants, Arabic Presentation Forms-B, Specials,
* Halfwidth and Fullwidth Forms
- * </kbd>
- * <dt>Others:
- * <dd><kbd>ALL</kbd> (Equivalent to
<kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
- * <dd><kbd>ASSGINED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to
<kbd>\P{Cn}</kbd>)
- * <dd><kbd>UNASSGINED</kbd>
- * (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
+ * </dd>
+ * <dt>Others:</dt>
+ * <dd><code>ALL</code> (Equivalent to
<code>[\u005cu0000-\v10FFFF]</code>)</dd>
+ * <dd><code>ASSIGNED</code> (<code>\p{ASSIGNED}</code> is equivalent
to <code>\P{Cn}</code>)</dd>
+ * <dd><code>UNASSIGNED</code> (<code>\p{UNASSIGNED}</code> is
equivalent to <code>\p{Cn}</code>)</dd>
* </dl>
*
- * <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
- * <dd>Matches one character not in the specified General Category or
the specified Block.
+ * <dt class="REGEX"><code>\P{name}</code></dt>
+ * <dd>Matches one character not in the specified General Category or
the specified Block.</dd>
* </dl>
* </li>
*
* <li>Selection and Quantifier
- * <dl>
- * <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
- * <dd>...
- *
- * <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
- * <dd>Matches 0 or more <var>X</var>.
- *
- * <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
- * <dd>Matches 1 or more <var>X</var>.
- *
- * <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
- * <dd>Matches 0 or 1 <var>X</var>.
- *
- * <dt
class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
- * <dd>Matches <var>number</var> times.
- *
- * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
- * <dd>...
- *
- * <dt
class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
- * <dd>...
- *
- * <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
- * <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
- * <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
- * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
- * <dt
class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
- * <dd>Non-greedy matching.
- * </dl>
+ * <ul>
+ * <li><code>X | Y</code> matches either X or Y</li>
+ * <li><code>X*</code> matches 0 or more of X</li>
+ * <li><code>X+</code> matches 1 or more of X</li>
+ * <li><code>X?</code> matches 0 or one of X</li>
+ * <li><code>X{number}</code> matches exactly <i>number</i> of X</li>
+ * <li><code>X{min,}</code> matches <i>min</i> or more of X</li>
+ * <li><code>X{min,max}</code> matches between <i>min</i> and <i>max</i>
of X</li>
+ * <li>Non-greedy equivalent of above
+ * <ul>
+ * <li><code>X*?</code> non-greedy</li>
+ * <li><code>X+?</code> non-greedy</li>
+ * <li><code>X??</code> non-greedy</li>
+ * <li><code>X{min,}?</code></li>
+ * <li><code>X{min,max}?</code></li>
+ * </ul>
+ * </li>
+ * </ul>
* </li>
*
* <li>Grouping, Capturing, and Back-reference
- * <dl>
- * <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
- * <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or
"<KBD>foooo</KBD>".
- * If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
- * you have to write "<KBD>(?:foo)+</KBD>".
- *
- * <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
- * <dd>Grouping with capturing.
- * It make a group and applications can know
- * where in target text a group matched with methods of a <code>Match</code>
instance
- * after <code><a href="#matches(java.lang.String,
org.apache.xerces.utils.regex.Match)">matches(String,Match)</a></code>.
- * The 0th group means whole of this regular expression.
- * The <VAR>N</VAR>th gorup is the inside of the <VAR>N</VAR>th left
parenthesis.
- *
- * <p>For instance, a regular expression is
- * "<FONT color=blue><KBD> *([^<:]*) +<([^>]*)> *</KBD></FONT>"
- * and target text is
- * "<FONT color=red><KBD>From: TAMURA Kent
<[email protected]></KBD></FONT>":
- * <ul>
- * <li><code>Match.getCapturedText(0)</code>:
- * "<FONT color=red><KBD> TAMURA Kent
<[email protected]></KBD></FONT>"
- * <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA
Kent</KBD></FONT>"
- * <li><code>Match.getCapturedText(2)</code>: "<FONT
color=red><KBD>[email protected]</KBD></FONT>"
- * </ul>
- *
- * <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
- * <dd>
- *
- * <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
- * <dd>Independent expression group. ................
- *
- * <dt
class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
- * <dt
class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
- * <dd>............................
- * <dd>The <var>options</var> or the <var>options2</var> consists of 'i'
'm' 's' 'w'.
- * Note that it can not contain 'u'.
- *
- * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
- * <dt
class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
- * <dd>......
- * <dd>These expressions must be at the beginning of a group.
- * </dl>
+ * <ul>
+ * <li><code>(?:X)</code> Grouping. <code>foo+</code> matches
<code>foo</code> or <code>foooo</code>.
+ * <p>If you want it to match <code>foofoo</code> or
<code>foofoofoo</code>, you have to write <code>(?:foo)+</code>.</p>
+ * </li>
+ * <li><code>(X)</code> Grouping with capturing.
+ * <p>It makes a group, and applications can learn where in the target text
the group matched by using methods of a <code>Match</code> instance after {@link
#matches(String, Match)}.</p>
+ * <p>The 0th group means whole of this regular expression.</p>
+ * <p>The <i>N</i>th group is the inside of the <i>N</i>th left
parenthesis.</p>
+ * <p>For instance, with a regular expression of <code> *([^<:]*)
+<([^>]*)> *</code> and target text of</p>
+ * <pre>From: TAMURA Kent <[email protected]></pre>
+ * <p>The result should be as follows:</p>
+ * <ul>
+ * <li><code>Match.getCapturedText(0)</code> : "<code> TAMURA Kent
<[email protected]></code>"</li>
+ * <li><code>Match.getCapturedText(1)</code> : "<code>TAMURA
Kent</code>"</li>
+ * <li><code>Match.getCapturedText(2)</code> :
"<code>[email protected]</code>"</li>
+ * </ul>
+ * </li>
+ * <li><code>\1 \2 \3 \4 \5 \6 \7 \8 \9</code></li>
+ * <li><code>(?>X)</code> Independent expression group.
................</li>
+ * <li><code>(?options:X)</code> or <code>(?options-options2:X)</code>
The <i>options</i> or the <i>options2</i>
+ * consists of 'i' 'm' 's' 'w'. Note that it can not contain 'u'.</li>
+ * <li><code>(?options)</code> or <code>(?options-options2)</code> These
expressions must be at the beginning of a group.</li>
+ * </ul>
* </li>
*
* <li>Anchor
* <dl>
- * <dt class="REGEX"><kbd>\A</kbd>
- * <dd>Matches the beginnig of the text.
+ * <dt class="REGEX"><code>\A</code>
+ * <dd>Matches the beginning of the text.
*
- * <dt class="REGEX"><kbd>\Z</kbd>
+ * <dt class="REGEX"><code>\Z</code>
* <dd>Matches the end of the text, or before an EOL character at the
end of the text,
* or CARRIAGE RETURN + LINE FEED at the end of the text.
*
- * <dt class="REGEX"><kbd>\z</kbd>
+ * <dt class="REGEX"><code>\z</code>
* <dd>Matches the end of the text.
*
- * <dt class="REGEX"><kbd>^</kbd>
- * <dd>Matches the beginning of the text. It is equivalent to <span
class="REGEX"><Kbd>\A</kbd></span>.
- * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
+ * <dt class="REGEX"><code>^</code>
+ * <dd>Matches the beginning of the text. It is equivalent to
<code>\A</code>.
+ * <dd>When the <code>m</code> ({@link #MULTIPLE_LINES}) option is set,
* it matches the beginning of the text, or after one of EOL
characters (
* LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR
(U+2028),
* PARAGRAPH SEPARATOR (U+2029).)
*
- * <dt class="REGEX"><kbd>$</kbd>
+ * <dt class="REGEX"><code>$</code>
* <dd>Matches the end of the text, or before an EOL character at the
end of the text,
* or CARRIAGE RETURN + LINE FEED at the end of the text.
- * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
+ * <dd>When the <code>m</code> ({@link #MULTIPLE_LINES}) option is set,
* it matches the end of the text, or before an EOL character.
*
- * <dt class="REGEX"><kbd>\b</kbd>
- * <dd>Matches word boundary.
- * (See <a href="#W_OPTION">a "w" option</a>)
+ * <dt class="REGEX"><code>\b</code>
+ * <dd>Matches word boundary. (See {@link #UNICODE_WORD_BOUNDARY})
*
- * <dt class="REGEX"><kbd>\B</kbd>
- * <dd>Matches non word boundary.
- * (See <a href="#W_OPTION">a "w" option</a>)
+ * <dt class="REGEX"><code>\B</code>
+ * <dd>Matches non word boundary. (See {@link #UNICODE_WORD_BOUNDARY})
*
- * <dt class="REGEX"><kbd>\<</kbd>
- * <dd>Matches the beginning of a word.
- * (See <a href="#W_OPTION">a "w" option</a>)
+ * <dt class="REGEX"><code>\<</code>
+ * <dd>Matches the beginning of a word. (See {@link
#UNICODE_WORD_BOUNDARY})
*
- * <dt class="REGEX"><kbd>\></kbd>
- * <dd>Matches the end of a word.
- * (See <a href="#W_OPTION">a "w" option</a>)
+ * <dt class="REGEX"><code>\></code>
+ * <dd>Matches the end of a word. (See {@link #UNICODE_WORD_BOUNDARY})
* </dl>
* </li>
* <li>Lookahead and lookbehind
* <dl>
- * <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
+ * <dt class="REGEX"><code>(?=X)</code>
* <dd>Lookahead.
*
- * <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
+ * <dt class="REGEX"><code>(?!X)</code>
* <dd>Negative lookahead.
*
- * <dt class="REGEX"><kbd>(?<=</kbd><var>X</var><kbd>)</kbd>
+ * <dt class="REGEX"><code>(?<=X)</code>
* <dd>Lookbehind.
* <dd>(Note for text capturing......)
*
- * <dt class="REGEX"><kbd>(?<!</kbd><var>X</var><kbd>)</kbd>
+ * <dt class="REGEX"><code>(?<!X)</code>
* <dd>Negative lookbehind.
* </dl>
* </li>
*
* <li>Misc.
* <dl>
- * <dt
class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
- * <dt
class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
+ * <dt class="REGEX"><code>(?(condition)yes-pattern|no-pattern)</code>,
+ * <dt class="REGEX"><code>(?(condition)yes-pattern)</code>
* <dd>......
- * <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
- * <dd>Comment. A comment string consists of characters except
'<kbd>)</kbd>'.
+ * <dt class="REGEX"><code>(?#comment)</code> Comment
+ * <dd>A comment string consists of characters except '<code>)</code>'.
* You can not write comments in character classes and before
quantifiers.
* </dl>
* </li>
* </ul>
*
- *
- * <hr width="50%">
- * <h3>BNF for the regular expression</h3>
+ * <h3>BNF grammar for the regular expression</h3>
* <pre>
* regex ::= ('(?' options ')')? term ('|' term)*
* term ::= factor+
- * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
- * | '(?#' [^)]* ')'
+ * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )? | '(?#'
[^)]* ')'
* minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
* atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\'
[0-9]
* | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
@@ -437,12 +388,10 @@ import org.apache.xerces.util.IntStack;
* | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)?
')'
* options ::= [imsw]* ('-' [imsw]+)?
* anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
- * looks ::= '(?=' regex ')' | '(?!' regex ')'
- * | '(?<=' regex ')' | '(?<!' regex ')'
+ * looks ::= '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' |
'(?<!' regex ')'
* char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
* category-block ::= '\' [pP] category-symbol-1
- * | ('\p{' | '\P{') (category-symbol | block-name
- * | other-properties) '}'
+ * | ('\p{' | '\P{') (category-symbol | block-name |
other-properties) '}'
* category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
* category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo'
* | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
@@ -453,36 +402,24 @@ import org.apache.xerces.util.IntStack;
* other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
* character-1 ::= (any character except meta-characters)
*
- * char-class ::= '[' ranges ']'
- * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
+ * char-class ::= '[' ranges ']' | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
* ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
* range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
* | range-char | range-char '-' range-char
* range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point |
character-2
* code-point ::= '\x' hex-char hex-char
* | '\x{' hex-char+ '}'
- * <!-- | '\u005c u' hex-char hex-char hex-char hex-char
- * --> | '\v' hex-char hex-char hex-char hex-char hex-char
hex-char
+ * | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
* hex-char ::= [0-9a-fA-F]
* character-2 ::= (any character except \[]-,)
* </pre>
*
- * <hr width="50%">
- * <h3>TODO</h3>
- * <ul>
- * <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode
Regular Expression Guidelines</a>
- * <ul>
- * <li>2.4 Canonical Equivalents
- * <li>Level 3
- * </ul>
- * <li>Parsing performance
- * </ul>
- *
- * <hr width="50%">
+ * <h3>Reference</h3>
+ * <a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular
Expression Guidelines</a>
*
* @xerces.internal
*
- * @author TAMURA Kent <[email protected]>
+ * @author TAMURA Kent <a
href="mailto:[email protected]">[email protected]</a>
* @version $Id$
*/
public class RegularExpression implements java.io.Serializable {
@@ -683,8 +620,8 @@ public class RegularExpression implements
java.io.Serializable {
/**
* Checks whether the <var>target</var> text <strong>contains</strong>
this pattern or not.
*
- * @param match A Match instance for storing matching result.
- * @return Offset of the start position in <VAR>target</VAR>; or -1 if not
match.
+ * @param match A Match instance for storing matching result
+ * @return true if the <var>target</var> text contains this pattern; false
otherwise
*/
public boolean matches(char[] target, Match match) {
return this.matches(target, 0, target .length , match);
@@ -695,10 +632,10 @@ public class RegularExpression implements
java.io.Serializable {
* Checks whether the <var>target</var> text <strong>contains</strong>
this pattern
* in specified range or not.
*
- * @param start Start offset of the range.
- * @param end End offset +1 of the range.
- * @param match A Match instance for storing matching result.
- * @return Offset of the start position in <VAR>target</VAR>; or -1 if not
match.
+ * @param start Start offset of the range
+ * @param end End offset +1 of the range
+ * @param match A Match instance for storing matching result
+ * @return true if the <var>target</var> text contains this pattern in the
specified range; false otherwise
*/
public boolean matches(char[] target, int start, int end, Match match) {
@@ -869,8 +806,8 @@ public class RegularExpression implements
java.io.Serializable {
/**
* Checks whether the <var>target</var> text <strong>contains</strong>
this pattern or not.
*
- * @param match A Match instance for storing matching result.
- * @return Offset of the start position in <VAR>target</VAR>; or -1 if not
match.
+ * @param match A Match instance for storing matching result
+ * @return true if the <var>target</var> text contains this pattern; false
otherwise
*/
public boolean matches(String target, Match match) {
return this.matches(target, 0, target .length() , match);
@@ -880,10 +817,10 @@ public class RegularExpression implements
java.io.Serializable {
* Checks whether the <var>target</var> text <strong>contains</strong>
this pattern
* in specified range or not.
*
- * @param start Start offset of the range.
- * @param end End offset +1 of the range.
- * @param match A Match instance for storing matching result.
- * @return Offset of the start position in <VAR>target</VAR>; or -1 if not
match.
+ * @param start Start offset of the range
+ * @param end End offset +1 of the range
+ * @param match A Match instance for storing matching result
+ * @return true if the <var>target</var> text contains this pattern in the
specified range; false otherwise
*/
public boolean matches(String target, int start, int end, Match match) {
@@ -1557,8 +1494,8 @@ public class RegularExpression implements
java.io.Serializable {
/**
* Checks whether the <var>target</var> text <strong>contains</strong>
this pattern or not.
*
- * @param match A Match instance for storing matching result.
- * @return Offset of the start position in <VAR>target</VAR>; or -1 if not
match.
+ * @param match A Match instance for storing matching result
+ * @return true if the <var>target</var> text contains this pattern; false
otherwise
*/
public boolean matches(CharacterIterator target, Match match) {
int start = target.getBeginIndex();
@@ -2155,10 +2092,10 @@ public class RegularExpression implements
java.io.Serializable {
}
}
- /**
+ /*
* An option.
- * If you specify this option, <span
class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
- * captures matched text, and <span
class="REGEX"><kbd>(:?</kbd><var>X</var><kbd>)</kbd></span>
+ * If you specify this option, <span><code>(X)</code></span>
+ * captures matched text, and <span><code>(?:X)</code></span>
* does not capture.
*
* @see #RegularExpression(java.lang.String,int)
@@ -2187,25 +2124,25 @@ public class RegularExpression implements
java.io.Serializable {
static final int EXTENDED_COMMENT = 1<<4;
/**
- * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s
\S</kbd></span>.
+ * This option redefines <span><code>\d \D \w \W \s \S</code></span>.
*
- * @see #RegularExpression(java.lang.String,int)
- * @see #setPattern(java.lang.String,int)
+ * @see #RegularExpression(String,String)
+ * @see #setPattern(String,int,Locale)
* @see #UNICODE_WORD_BOUNDARY
*/
static final int USE_UNICODE_CATEGORY = 1<<5; // "u"
/**
* An option.
- * This enables to process locale-independent word boundary for <span
class="REGEX"><kbd>\b \B \< \></kbd></span>.
+ * This enables to process locale-independent word boundary for
<span><code>\b \B \< \></code></span>.
* <p>By default, the engine considers a position between a word character
- * (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character
+ * (<span><code>\w</code></span>) and a non word character
* is a word boundary.
* <p>By this option, the engine checks word boundaries with the method of
* 'Unicode Regular Expression Guidelines' Revision 4.
*
- * @see #RegularExpression(java.lang.String,int)
- * @see #setPattern(java.lang.String,int)
+ * @see #RegularExpression(String,String)
+ * @see #setPattern(String,int,Locale)
*/
static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"
@@ -2234,8 +2171,8 @@ public class RegularExpression implements
java.io.Serializable {
/**
* Creates a new RegularExpression instance.
*
- * @param regex A regular expression
- * @exception org.apache.xerces.utils.regex.ParseException
<VAR>regex</VAR> is not conforming to the syntax.
+ * @param regex a regular expression
+ * @throws ParseException if regex is not conforming to the syntax
*/
public RegularExpression(String regex) throws ParseException {
this(regex, null);
@@ -2244,9 +2181,9 @@ public class RegularExpression implements
java.io.Serializable {
/**
* Creates a new RegularExpression instance with options.
*
- * @param regex A regular expression
- * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
- * @exception org.apache.xerces.utils.regex.ParseException
<VAR>regex</VAR> is not conforming to the syntax.
+ * @param regex a regular expression
+ * @param options a string of options consisting of "i" "m" "s" "u" "w" ","
"X", or null
+ * @throws ParseException if regex is not conforming to the syntax
*/
public RegularExpression(String regex, String options) throws
ParseException {
this.setPattern(regex, options);
@@ -2255,9 +2192,11 @@ public class RegularExpression implements
java.io.Serializable {
/**
* Creates a new RegularExpression instance with options.
*
- * @param regex A regular expression
- * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
- * @exception org.apache.xerces.utils.regex.ParseException
<VAR>regex</VAR> is not conforming to the syntax.
+ * @param regex a regular expression pattern
+ * @param options a string of options consisting of "i" "m" "s" "u" "w" ","
"X", or null
+ * @param locale value of the desired locale or null
+ * @throws ParseException if regex is not conforming to the syntax
+ * @see Locale
*/
public RegularExpression(String regex, String options, Locale locale)
throws ParseException {
this.setPattern(regex, options, locale);
@@ -2272,16 +2211,36 @@ public class RegularExpression implements
java.io.Serializable {
}
/**
+ * Set a new regular expression pattern with the default Locale.
*
+ * @param newPattern a new regular expression pattern
+ * @throws ParseException if regex is not conforming to the syntax
*/
public void setPattern(String newPattern) throws ParseException {
this.setPattern(newPattern, Locale.getDefault());
}
-
+
+ /**
+ * Set a new regular expression pattern with the provided Locale.
+ *
+ * @param newPattern a new regular expression pattern
+ * @param locale value of the desired locale or null
+ * @throws ParseException if regex is not conforming to the syntax
+ * @see Locale
+ */
public void setPattern(String newPattern, Locale locale) throws
ParseException {
this.setPattern(newPattern, this.options, locale);
}
+ /**
+ * Set a new regular expression pattern with the provided options and
Locale.
+ *
+ * @param newPattern a new regular expression pattern
+ * @param options an int value representation of regular expression options
+ * @param locale value of the desired locale or null
+ * @throws ParseException if regex is not conforming to the syntax
+ * @see Locale
+ */
private void setPattern(String newPattern, int options, Locale locale)
throws ParseException {
this.regex = newPattern;
this.options = options;
@@ -2294,36 +2253,55 @@ public class RegularExpression implements
java.io.Serializable {
this.operations = null;
this.context = null;
}
+
/**
+ * Set a new regular expression pattern with the provided options and the
default locale.
*
+ * @param newPattern a new regular expression pattern
+ * @param options a string of options consisting of "i" "m" "s" "u" "w" ","
"X", or null
+ * @throws ParseException if regex is not conforming to the syntax
+ * @see Locale#getDefault()
*/
public void setPattern(String newPattern, String options) throws
ParseException {
this.setPattern(newPattern, options, Locale.getDefault());
}
-
+
+ /**
+ * Set a new regular expression pattern with the provided options and
Locale.
+ *
+ * @param newPattern a new regular expression pattern
+ * @param options a string of options consisting of "i" "m" "s" "u" "w" ","
"X", or null
+ * @param locale value of the desired locale or null
+ * @throws ParseException if regex is not conforming to the syntax
+ * @see Locale
+ */
public void setPattern(String newPattern, String options, Locale locale)
throws ParseException {
this.setPattern(newPattern, REUtil.parseOptions(options), locale);
}
/**
+ * Returns the regular expression pattern.
*
+ * @return the regular expression pattern
*/
public String getPattern() {
return this.regex;
}
/**
- * Represents this instence in String.
+ * Represents this instance in String.
*/
public String toString() {
return this.tokentree.toString(this.options);
}
/**
- * Returns a option string.
- * The order of letters in it may be different from a string specified
- * in a constructor or <code>setPattern()</code>.
+ * Returns a string representation of the regular expression's options.
*
+ * <p>The order of letters in it may be different from a string specified
+ * in a constructor or <code>setPattern()</code>.</p>
+ *
+ * @return a string representation of the regular expression's options
* @see #RegularExpression(java.lang.String,java.lang.String)
* @see #setPattern(java.lang.String,java.lang.String)
*/
@@ -2332,7 +2310,9 @@ public class RegularExpression implements
java.io.Serializable {
}
/**
- * Return true if patterns are the same and the options are equivalent.
+ * Return true if patterns are the same and the options are equivalent.
+ *
+ * @return true if patterns are the same and the options are equivalent
*/
public boolean equals(Object obj) {
if (obj == null) return false;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]