This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new 5b846a30 OPENNLP-1679: Extend JavaDoc of SgmlParser (#719)
5b846a30 is described below
commit 5b846a30c7fdfd8fa15e9dd0db78cf0f1344f807
Author: Martin Wiesner <[email protected]>
AuthorDate: Sat Dec 21 17:33:11 2024 +0100
OPENNLP-1679: Extend JavaDoc of SgmlParser (#719)
---
.../tools/formats/muc/DocumentSplitterStream.java | 2 +-
.../java/opennlp/tools/formats/muc/SgmlParser.java | 92 ++++++++++++++--------
.../opennlp/tools/formats/muc/SgmlParserTest.java | 17 +++-
3 files changed, 75 insertions(+), 36 deletions(-)
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java
b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java
index 691ef6b6..aed10c86 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java
@@ -69,7 +69,7 @@ class DocumentSplitterStream extends
FilterObjectStream<String, String> {
}
}
- if (docs.size() > 0) {
+ if (!docs.isEmpty()) {
return docs.remove(0);
}
else {
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/SgmlParser.java
b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/SgmlParser.java
index e85e9950..9100536d 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/SgmlParser.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/SgmlParser.java
@@ -26,38 +26,61 @@ import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.StringUtil;
/**
- * SAX style SGML parser.
- * <p>
- * Note:<br>
- * The implementation is very limited, but good enough to
- * parse the MUC corpora. Its must very likely be extended/improved/fixed to
parse
- * a different SGML corpora.
+ * A SAX style <a
href="https://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html">SGML</a>
parser.
+ *
+ * @implNote The implementation is very limited, but good enough to parse the
+ * <a href="https://catalog.ldc.upenn.edu/LDC2003T13">MUC corpora</a>.
+ * Its must very likely be extended/improved/fixed to parse a different SGML
corpora.
*/
public class SgmlParser {
- public static abstract class ContentHandler {
-
- public void startElement(String name, Map<String, String> attributes)
throws InvalidFormatException {
- }
+ private static final char SYMBOL_CLOSE = '>';
+ private static final char SYMBOL_OPEN = '<';
+ private static final char SYMBOL_SLASH = '/';
+ private static final char SYMBOL_EQUALS = '=';
+ private static final char SYMBOL_QUOT = '"';
- public void characters(CharSequence chars) throws InvalidFormatException{
- }
+ /**
+ * Defines methods to handle content produced by a {@link SgmlParser}.
+ * A concrete implementation interprets the document specific details.
+ */
+ public static abstract class ContentHandler {
- public void endElement(String name) throws InvalidFormatException {
- }
+ /**
+ * Handles a SGML start element.
+ *
+ * @param name The name of the element's start tag.
+ * @param attributes The attributes supplied with the start tag. It may be
empty.
+ * @throws InvalidFormatException Thrown if parameters were invalid.
+ */
+ public abstract void startElement(String name, Map<String, String>
attributes)
+ throws InvalidFormatException;
+
+ /**
+ * Handles a set of characters between SGML start and end tag.
+ *
+ * @param chars The characters to process.
+ * @throws InvalidFormatException Thrown if parameters were invalid.
+ */
+ public abstract void characters(CharSequence chars)
+ throws InvalidFormatException;
+
+ /**
+ * Handles a SGML end element.
+ * @param name The name of the element's end tag.
+ */
+ public abstract void endElement(String name);
}
private static String extractTagName(CharSequence tagChars) throws
InvalidFormatException {
int fromOffset = 1;
-
- if (tagChars.length() > 1 && tagChars.charAt(1) == '/') {
+ if (tagChars.length() > 1 && tagChars.charAt(1) == SYMBOL_SLASH) {
fromOffset = 2;
}
for (int ci = 1; ci < tagChars.length(); ci++) {
-
- if (tagChars.charAt(ci) == '>' ||
StringUtil.isWhitespace(tagChars.charAt(ci))) {
+ if (tagChars.charAt(ci) == SYMBOL_CLOSE ||
StringUtil.isWhitespace(tagChars.charAt(ci))) {
return tagChars.subSequence(fromOffset, ci).toString();
}
}
@@ -90,7 +113,8 @@ public class SgmlParser {
extractKey = true;
}
// Equals sign indicated end of key name
- else if (extractKey && ('=' == tagChars.charAt(i) ||
StringUtil.isWhitespace(tagChars.charAt(i)))) {
+ else if (extractKey && (SYMBOL_EQUALS == tagChars.charAt(i) ||
+ StringUtil.isWhitespace(tagChars.charAt(i)))) {
extractKey = false;
}
// Inside key name, extract all chars
@@ -98,7 +122,7 @@ public class SgmlParser {
key.append(tagChars.charAt(i));
}
// " Indicates begin or end of value chars
- else if ('"' == tagChars.charAt(i)) {
+ else if (SYMBOL_QUOT == tagChars.charAt(i)) {
if (extractValue) {
attributes.put(key.toString(), value.toString());
@@ -107,7 +131,6 @@ public class SgmlParser {
key.setLength(0);
value.setLength(0);
}
-
extractValue = !extractValue;
}
// Inside value, extract all chars
@@ -119,6 +142,17 @@ public class SgmlParser {
return attributes;
}
+ /**
+ * Parses an SGML document available via the input in {@link Reader}.
+ * The specified {@link ContentHandler} is responsible of how to interpret
the document
+ * specific details.
+ *
+ * @param in A {@link Reader} that provides the data of the SGML
document.
+ * @param handler The {@link ContentHandler} to interpret the document with.
+ *
+ * @throws IOException Thrown if IO errors occurred.
+ * @throws InvalidFormatException Thrown if parameters were invalid.
+ */
public void parse(Reader in, ContentHandler handler) throws IOException {
StringBuilder buffer = new StringBuilder();
@@ -130,45 +164,37 @@ public class SgmlParser {
int c;
while ((c = in.read()) != -1) {
- if ('<' == c) {
+ if (SYMBOL_OPEN == c) {
if (isInsideTag) {
throw new InvalidFormatException("Did not expect < char!");
}
-
- if (buffer.toString().trim().length() > 0) {
+ if (!buffer.toString().trim().isEmpty()) {
handler.characters(buffer.toString().trim());
}
-
buffer.setLength(0);
-
isInsideTag = true;
isStartTag = true;
}
-
buffer.appendCodePoint(c);
- if ('/' == c && lastChar == '<') {
+ if (SYMBOL_SLASH == c && lastChar == SYMBOL_OPEN) {
isStartTag = false;
}
- if ('>' == c) {
+ if (SYMBOL_CLOSE == c) {
if (!isInsideTag) {
throw new InvalidFormatException("Did not expect > char!");
}
-
if (isStartTag) {
handler.startElement(extractTagName(buffer), getAttributes(buffer));
}
else {
handler.endElement(extractTagName(buffer));
}
-
buffer.setLength(0);
-
isInsideTag = false;
}
-
lastChar = c;
}
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/formats/muc/SgmlParserTest.java
b/opennlp-tools/src/test/java/opennlp/tools/formats/muc/SgmlParserTest.java
index 486da1e0..16bd54c3 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/formats/muc/SgmlParserTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/muc/SgmlParserTest.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
+import java.util.Map;
import org.junit.jupiter.api.Test;
@@ -34,9 +35,21 @@ public class SgmlParserTest extends AbstractFormatTest {
try (Reader in = new
InputStreamReader(getResourceStream("muc/parsertest1.sgml"),
StandardCharsets.UTF_8)) {
SgmlParser parser = new SgmlParser();
- parser.parse(in, new SgmlParser.ContentHandler() {
- });
+ parser.parse(in, new DummyContentHandler());
}
}
+ private static class DummyContentHandler extends SgmlParser.ContentHandler {
+ @Override
+ public void startElement(String name, Map<String, String> attributes) {
+ }
+
+ @Override
+ public void characters(CharSequence chars) {
+ }
+
+ @Override
+ public void endElement(String name) {
+ }
+ }
}