This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch migrate-wikinews-importer-to-opennlp-tools-2_1_0 in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit 608c287b4571529f9408f9d98abafdb39bb03a1f Author: Martin Wiesner <[email protected]> AuthorDate: Sun Jan 22 07:45:20 2023 +0100 updates sandbox component 'wikinews-importer' to be compatible with latest opennlp-tools release - adjusts parent project (org.apache.apache) to version 18 - adjusts Java language level to 11 - updates `uimaj` dependencies to version 3.3.1 - modernizes resource handling in `WikinewsConverter` - corrects some formatting issues - removes unused imports --- wikinews-importer/pom.xml | 50 +++++--------- .../wikinews_importer/AnnotatingMarkupParser.java | 77 ++++++++++------------ .../apache/opennlp/wikinews_importer/UimaUtil.java | 8 +-- .../wikinews_importer/WikinewsConverter.java | 67 +++++++------------ 4 files changed, 76 insertions(+), 126 deletions(-) diff --git a/wikinews-importer/pom.xml b/wikinews-importer/pom.xml index efec79e..3b67865 100644 --- a/wikinews-importer/pom.xml +++ b/wikinews-importer/pom.xml @@ -21,75 +21,55 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> <modelVersion>4.0.0</modelVersion> - <parent> <groupId>org.apache</groupId> <artifactId>apache</artifactId> - <version>9</version> + <!-- TODO OPENNLP-1452 once this is resolved, move to 29 as well. --> + <version>18</version> <relativePath /> </parent> <groupId>org.apache.opennlp</groupId> <artifactId>wikinews-importer</artifactId> - <version>0.0.1-incubating-SNAPSHOT</version> + <version>2.1.1-incubating-SNAPSHOT</version> <packaging>jar</packaging> - <name>OpenNLP Wikinews Importer</name> - - <prerequisites> - <maven>3.0</maven> - </prerequisites> + <name>Apache OpenNLP Wikinews Importer</name> - <repositories> - <repository> - <id>maven2-repository.java.net</id> - <name>Java.net Repository for Maven</name> - <url>http://download.java.net/maven/2/</url> - <layout>default</layout> - </repository> + <properties> + <uimaj.version>3.3.1</uimaj.version> + </properties> - <repository> - <id>info-bliki-repository</id> - <url>http://gwtwiki.googlecode.com/svn/maven-repository/</url> - <releases> - <enabled>true</enabled> - </releases> - <snapshots> - <enabled>false</enabled> - </snapshots> - </repository> - </repositories> - <dependencies> <dependency> <groupId>com.sun.jersey</groupId> <artifactId>jersey-json</artifactId> - <version>1.8</version> + <version>1.19.4</version> </dependency> <dependency> <groupId>com.sun.jersey</groupId> <artifactId>jersey-client</artifactId> - <version>1.8</version> + <version>1.19.4</version> </dependency> <dependency> <groupId>info.bliki.wiki</groupId> <artifactId>bliki-core</artifactId> - <version>3.0.16</version> + <version>3.0.19</version> </dependency> <dependency> <groupId>org.apache.uima</groupId> <artifactId>uimaj-core</artifactId> - <version>2.3.1</version> + <version>${uimaj.version}</version> <scope>compile</scope> </dependency> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> - <version>4.8.1</version> + <version>4.13.2</version> <scope>test</scope> </dependency> </dependencies> @@ -100,9 +80,9 @@ <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId> <configuration> - <source>1.6</source> - <target>1.6</target> - <compilerArgument>-Xlint</compilerArgument> + <source>11</source> + <target>11</target> + <compilerArgument>-Xlint</compilerArgument> </configuration> </plugin> </plugins> diff --git a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java index b1f9b00..f341a29 100644 --- a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java +++ b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java @@ -17,6 +17,13 @@ package org.apache.opennlp.wikinews_importer; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; + import info.bliki.htmlcleaner.ContentToken; import info.bliki.htmlcleaner.TagNode; import info.bliki.wiki.filter.ITextConverter; @@ -28,24 +35,15 @@ import info.bliki.wiki.model.ImageFormat; import info.bliki.wiki.model.WikiModel; import info.bliki.wiki.tags.WPATag; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.regex.Pattern; - /** * Parse mediawiki markup to strip the formatting info and extract a simple text * version suitable for NLP along with header, paragraph and link position * annotations. - * + * <p> * Use the {@code #convert(String)} and {@code #getWikiLinks()} methods. - * - * Due to the constraints imposed by the {@code ITextConverter} / - * {@code WikiModel} API, this class is not thread safe: only one instance + * <p> + * Due to the constraints imposed by the {@link ITextConverter} / + * {@link WikiModel} API, this class is not thread safe: only one instance * should be run by thread. */ public class AnnotatingMarkupParser implements ITextConverter { @@ -58,19 +56,17 @@ public class AnnotatingMarkupParser implements ITextConverter { public static final String WIKIOBJECT_ATTR_KEY = "wikiobject"; - public static final Set<String> PARAGRAPH_TAGS = new HashSet<String>( - Arrays.asList("p")); + public static final Set<String> PARAGRAPH_TAGS = Set.of("p"); - public static final Set<String> HEADING_TAGS = new HashSet<String>( - Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6")); + public static final Set<String> HEADING_TAGS = Set.of("h1", "h2", "h3", "h4", "h5", "h6"); public static final Pattern INTERWIKI_PATTERN = Pattern.compile("http://[\\w-]+\\.wikipedia\\.org/wiki/.*"); - protected final List<Annotation> wikilinks = new ArrayList<Annotation>(); + protected final List<Annotation> wikilinks = new ArrayList<>(); - protected final List<Annotation> headers = new ArrayList<Annotation>(); + protected final List<Annotation> headers = new ArrayList<>(); - protected final List<Annotation> paragraphs = new ArrayList<Annotation>(); + protected final List<Annotation> paragraphs = new ArrayList<>(); protected String languageCode = "en"; @@ -91,24 +87,21 @@ public class AnnotatingMarkupParser implements ITextConverter { model = makeWikiModel(languageCode); } - public WikiModel makeWikiModel(String languageCode) { - return new WikiModel(String.format( - "http:/%s.wikipedia.org/wiki/${image}", languageCode), - String.format("http://%s.wikipedia.org/wiki/${title}", - languageCode)) { + public WikiModel makeWikiModel(String langCode) { + return new WikiModel(String.format("https:/%s.wikipedia.org/wiki/${image}", langCode), + String.format("https://%s.wikipedia.org/wiki/${title}", langCode)) { @Override public String getRawWikiContent(String namespace, String articleName, Map<String, String> templateParameters) { // disable template support - // TODO: we need to readd template support at least for dates + // TODO: we need to read template support at least for dates return ""; } }; } - - public void nodesToText(List<? extends Object> nodes, Appendable buffer, - IWikiModel model) throws IOException { + @Override + public void nodesToText(List<?> nodes, Appendable buffer, IWikiModel model) throws IOException { CountingAppendable countingBuffer; if (buffer instanceof CountingAppendable) { countingBuffer = (CountingAppendable) buffer; @@ -179,22 +172,18 @@ public class AnnotatingMarkupParser implements ITextConverter { // sentences with links to entities hasSpecialHandling = true; ImageFormat iformat = (ImageFormat) oAttributes.get(WIKIOBJECT_ATTR_KEY); - imageNodeToText(tagNode, iformat, countingBuffer, - model); + imageNodeToText(tagNode, iformat, countingBuffer, model); } if (!hasSpecialHandling) { - nodesToText(tagNode.getChildren(), countingBuffer, - model); + nodesToText(tagNode.getChildren(), countingBuffer, model); } if (PARAGRAPH_TAGS.contains(tagName)) { paragraphs.add(new Annotation(tagBegin, - countingBuffer.currentPosition, - "paragraph", tagName)); + countingBuffer.currentPosition, "paragraph", tagName)); countingBuffer.append("\n\n"); } else if (HEADING_TAGS.contains(tagName)) { headers.add(new Annotation(tagBegin, - countingBuffer.currentPosition, "heading", - tagName)); + countingBuffer.currentPosition, "heading", tagName)); countingBuffer.append("\n\n"); } else if ("a".equals(tagName)) { String href = attributes.get(HREF_ATTR_KEY); @@ -212,11 +201,13 @@ public class AnnotatingMarkupParser implements ITextConverter { } } + @Override public void imageNodeToText(TagNode tagNode, ImageFormat imageFormat, Appendable buffer, IWikiModel model) throws IOException { // nodesToText(tagNode.getChildren(), buffer, model); } + @Override public boolean noLinks() { return true; } @@ -234,7 +225,7 @@ public class AnnotatingMarkupParser implements ITextConverter { } public List<String> getParagraphs() { - List<String> texts = new ArrayList<String>(); + List<String> texts = new ArrayList<>(); for (Annotation p : paragraphs) { texts.add(text.substring(p.begin, p.end)); } @@ -242,7 +233,7 @@ public class AnnotatingMarkupParser implements ITextConverter { } public List<String> getHeaders() { - List<String> texts = new ArrayList<String>(); + List<String> texts = new ArrayList<>(); for (Annotation h : headers) { texts.add(text.substring(h.begin, h.end)); } @@ -253,7 +244,7 @@ public class AnnotatingMarkupParser implements ITextConverter { return redirect; } - public class CountingAppendable implements Appendable { + public static class CountingAppendable implements Appendable { public int currentPosition = 0; @@ -263,18 +254,20 @@ public class AnnotatingMarkupParser implements ITextConverter { this.wrappedBuffer = wrappedBuffer; } + @Override public Appendable append(CharSequence charSeq) throws IOException { currentPosition += charSeq.length(); return wrappedBuffer.append(charSeq); } + @Override public Appendable append(char aChar) throws IOException { currentPosition += 1; return wrappedBuffer.append(aChar); } - public Appendable append(CharSequence charSeq, int start, int end) - throws IOException { + @Override + public Appendable append(CharSequence charSeq, int start, int end) throws IOException { currentPosition += end - start; return wrappedBuffer.append(charSeq, start, end); } diff --git a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java index a9fd480..745ec11 100644 --- a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java +++ b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java @@ -58,8 +58,7 @@ public class UimaUtil { TypeSystemDescription typeSystemDesciptor; try { - typeSystemDesciptor = (TypeSystemDescription) xmlParser - .parse(xmlTypeSystemSource); + typeSystemDesciptor = (TypeSystemDescription) xmlParser.parse(xmlTypeSystemSource); typeSystemDesciptor.resolveImports(); } catch (InvalidXMLException e) { @@ -109,11 +108,10 @@ public class UimaUtil { throw new IllegalStateException("SAX error while creating parser!", e); } - XmiCasDeserializer dezerializer = new XmiCasDeserializer( - cas.getTypeSystem()); + XmiCasDeserializer deserializer = new XmiCasDeserializer(cas.getTypeSystem()); try { - saxParser.parse(xmiIn, dezerializer.getXmiCasHandler(cas)); + saxParser.parse(xmiIn, deserializer.getXmiCasHandler(cas)); } catch (SAXException e) { throw new IOException("Invalid XMI input!", e); } diff --git a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java index abbcc54..9c03e74 100644 --- a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java +++ b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java @@ -22,8 +22,8 @@ import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; -import java.io.UnsupportedEncodingException; import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; @@ -49,45 +49,38 @@ public class WikinewsConverter { private final TypeSystemDescription tsDesc; private final File outputFolder; - private List<String> endOfArtilceMarkers = new ArrayList<String>(); + private final List<String> endOfArticleMarkers = new ArrayList<>(); CASArticleFilter(TypeSystemDescription tsDesc, File outputFolder) { this.tsDesc = tsDesc; this.outputFolder = outputFolder; - endOfArtilceMarkers.add("{{haveyoursay}}"); - endOfArtilceMarkers.add("== Sources =="); - endOfArtilceMarkers.add("==Sources=="); - endOfArtilceMarkers.add("== Source =="); - endOfArtilceMarkers.add("==Source=="); - endOfArtilceMarkers.add("==References=="); - endOfArtilceMarkers.add("== References =="); - endOfArtilceMarkers.add("=== References==="); + endOfArticleMarkers.add("{{haveyoursay}}"); + endOfArticleMarkers.add("== Sources =="); + endOfArticleMarkers.add("==Sources=="); + endOfArticleMarkers.add("== Source =="); + endOfArticleMarkers.add("==Source=="); + endOfArticleMarkers.add("==References=="); + endOfArticleMarkers.add("== References =="); + endOfArticleMarkers.add("=== References==="); } - - public static String titleToUri(String title) { - try { - return URLEncoder.encode(title.replaceAll(" ", "_"), "UTF-8"); - } catch (UnsupportedEncodingException e) { - throw new RuntimeException(e); - } + public static String titleToUri(String title) { + return URLEncoder.encode(title.replaceAll(" ", "_"), StandardCharsets.UTF_8); } - + + @Override public void process(WikiArticle page, Siteinfo siteinfo) throws SAXException { if (page.getIntegerNamespace() == 0 && page.isMain()) { - if (page.getText().toLowerCase().contains("{publish}")) { String pageText = page.getText(); - - int cutIndex = pageText.length(); - for (String endMarker : endOfArtilceMarkers) { + for (String endMarker : endOfArticleMarkers) { int endMarkerIndex = pageText.indexOf(endMarker); if (endMarkerIndex != -1 && endMarkerIndex < cutIndex) { cutIndex = endMarkerIndex; @@ -98,8 +91,9 @@ public class WikinewsConverter { pageText = pageText.substring(0, cutIndex); } - WikinewsWikiModel wikiModel = new WikinewsWikiModel("http://en.wikinews.org/wiki/${image}", - "http://en.wikinews.org/wiki/${title}"); + WikinewsWikiModel wikiModel = new WikinewsWikiModel( + "https://en.wikinews.org/wiki/${image}", + "https://en.wikinews.org/wiki/${title}"); AnnotatingMarkupParser converter = new AnnotatingMarkupParser(); String plainStr = wikiModel.render(converter, pageText); @@ -137,8 +131,7 @@ public class WikinewsConverter { } for (Annotation subHeadAnn : converter.getHeaderAnnotations()) { - AnnotationFS subHeadAnnFS = articleCAS.createAnnotation( - articleCAS.getTypeSystem() + AnnotationFS subHeadAnnFS = articleCAS.createAnnotation(articleCAS.getTypeSystem() .getType("org.apache.opennlp.annotations.SubHeadline"), bodyOffset + subHeadAnn.begin, bodyOffset + subHeadAnn.end); @@ -150,8 +143,7 @@ public class WikinewsConverter { Feature linkFeature = wikiLinkType.getFeatureByBaseName("link"); for (Annotation wikiLinkAnn : converter.getWikiLinkAnnotations()) { - AnnotationFS wikiLinkAnnFS = articleCAS.createAnnotation( - articleCAS.getTypeSystem() + AnnotationFS wikiLinkAnnFS = articleCAS.createAnnotation(articleCAS.getTypeSystem() .getType("org.apache.opennlp.annotations.WikiLink"), bodyOffset + wikiLinkAnn.begin, bodyOffset + wikiLinkAnn.end); @@ -164,32 +156,19 @@ public class WikinewsConverter { markupCas.setDocumentText(page.toString()); // now serialize CAS - OutputStream casOut = null; - try { - casOut = new FileOutputStream(outputFolder.getAbsolutePath() + - File.separator + titleToUri(page.getTitle()) + ".xmi"); - + try (OutputStream casOut = new FileOutputStream(outputFolder.getAbsolutePath() + + File.separator + titleToUri(page.getTitle()) + ".xmi")) { + UimaUtil.serializeCASToXmi(articleCAS, casOut); } catch (IOException e) { e.printStackTrace(); } - finally { - try { - if (casOut != null) - casOut.close(); - } catch (IOException e) { - } - } - } } } } - /** - * @param args - */ public static void main(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: Parser <XML-File> <Output-Folder>");
