[jira] [Commented] (OPENNLP-652) Add 20Newsgroups format support to the doccat component

ASF GitHub Bot (JIRA) Tue, 21 Nov 2017 00:47:16 -0800

    [ 
https://issues.apache.org/jira/browse/OPENNLP-652?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16260435#comment-16260435
 ]


ASF GitHub Bot commented on OPENNLP-652:
----------------------------------------

kottmann closed pull request #287: WIP[dont merge] OPENNLP-652 Add 20 Newsgroup 
format support
URL: https://github.com/apache/opennlp/pull/287
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java 
b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 48b80256f..c078164f9 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -36,6 +36,7 @@
 import opennlp.tools.formats.ParseSampleStreamFactory;
 import opennlp.tools.formats.SentenceSampleStreamFactory;
 import opennlp.tools.formats.TokenSampleStreamFactory;
+import opennlp.tools.formats.TwentyNewsgroupSampleStreamFactory;
 import opennlp.tools.formats.WordTagSampleStreamFactory;
 import opennlp.tools.formats.ad.ADChunkSampleStreamFactory;
 import opennlp.tools.formats.ad.ADNameSampleStreamFactory;
@@ -110,6 +111,7 @@
     ADSentenceSampleStreamFactory.registerFactory();
     ADPOSSampleStreamFactory.registerFactory();
     ADTokenSampleStreamFactory.registerFactory();
+    TwentyNewsgroupSampleStreamFactory.registerFactory();
 
     Muc6NameSampleStreamFactory.registerFactory();
 
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStream.java
 
b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStream.java
new file mode 100644
index 000000000..98b38f1b2
--- /dev/null
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStream.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+import java.nio.file.DirectoryIteratorException;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.ObjectStream;
+
+
+/**
+ * Reads a directory tree in the 20 Newsgroups layout as a stream of
+ * {@link DocumentSample}s.
+ * <p>
+ * Every sub directory of the data directory is one category, named after the
+ * directory, and every regular file inside it is one document of that
+ * category. The file listing is collected once in the constructor; the file
+ * contents are read one document at a time by {@link #read()}.
+ */
+public class TwentyNewsgroupSampleStream implements ObjectStream<DocumentSample> {
+
+  private final Tokenizer tokenizer;
+
+  // Maps each document file to the name of the category directory it lives in.
+  private final Map<Path, String> catFileMap = new HashMap<>();
+  private Iterator<Map.Entry<Path, String>> catFileTupleIterator;
+
+  /**
+   * Collects all document files below the category directories of
+   * {@code dataDir}.
+   *
+   * @param tokenizer the tokenizer applied to the text of every document
+   * @param dataDir the directory which contains one sub directory per category
+   * @throws IOException if one of the directories cannot be listed
+   */
+  TwentyNewsgroupSampleStream(Tokenizer tokenizer, Path dataDir) throws IOException {
+    this.tokenizer = tokenizer;
+
+    // A DirectoryStream keeps a directory handle open until it is closed,
+    // so both listings are wrapped in try-with-resources.
+    try (DirectoryStream<Path> categoryDirs =
+             Files.newDirectoryStream(dataDir, entry -> Files.isDirectory(entry))) {
+      for (Path dir : categoryDirs) {
+        String category = dir.getFileName().toString();
+
+        // Only regular files are documents; anything else could not be read later.
+        try (DirectoryStream<Path> files =
+                 Files.newDirectoryStream(dir, entry -> Files.isRegularFile(entry))) {
+          for (Path file : files) {
+            catFileMap.put(file, category);
+          }
+        }
+      }
+    } catch (DirectoryIteratorException e) {
+      // Iterating a DirectoryStream reports I/O failures unchecked; rethrow the
+      // IOException it wraps so callers only have to handle one exception type.
+      throw e.getCause();
+    }
+
+    // Initialized directly instead of via reset(), so that no overridable
+    // method is invoked from the constructor.
+    catFileTupleIterator = catFileMap.entrySet().iterator();
+  }
+
+  /**
+   * Reads and tokenizes the next document.
+   *
+   * @return the next sample, or {@code null} if all documents were returned
+   * @throws IOException if the document file cannot be read
+   */
+  @Override
+  public DocumentSample read() throws IOException {
+
+    if (!catFileTupleIterator.hasNext()) {
+      return null;
+    }
+
+    Map.Entry<Path, String> catFileTuple = catFileTupleIterator.next();
+
+    // NOTE: the bytes are decoded with the platform default charset.
+    String text = new String(Files.readAllBytes(catFileTuple.getKey()));
+    return new DocumentSample(catFileTuple.getValue(), tokenizer.tokenize(text));
+  }
+
+  /**
+   * Restarts the stream at the first document. The directory tree is not
+   * scanned again; the listing collected in the constructor is reused.
+   */
+  @Override
+  public void reset() throws IOException, UnsupportedOperationException {
+    catFileTupleIterator = catFileMap.entrySet().iterator();
+  }
+
+  /**
+   * Does nothing, the stream keeps no resource open between calls.
+   */
+  @Override
+  public void close() throws IOException {
+  }
+}
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactory.java
 
b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactory.java
new file mode 100644
index 000000000..5583c7c68
--- /dev/null
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactory.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.EncodingParameter;
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.ObjectStream;
+
+public class TwentyNewsgroupSampleStreamFactory extends 
AbstractSampleStreamFactory<DocumentSample> {
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(DocumentSample.class,
+        "20newsgroup",
+        new 
TwentyNewsgroupSampleStreamFactory(TwentyNewsgroupSampleStreamFactory.Parameters.class));
+  }
+
+  protected <P> TwentyNewsgroupSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  @Override
+  public ObjectStream<DocumentSample> create(String[] args) {
+
+    TwentyNewsgroupSampleStreamFactory.Parameters params =
+        ArgumentParser.parse(args, 
TwentyNewsgroupSampleStreamFactory.Parameters.class);
+
+    Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
+
+    if (params.getTokenizerModel() != null) {
+      try {
+        tokenizer = new TokenizerME(new 
TokenizerModel(params.getTokenizerModel()));
+      } catch (IOException e) {
+        throw new TerminateToolException(-1, "Failed to load tokenizer 
model!", e);
+      }
+    }
+    else if (params.getRuleBasedTokenizer() != null) {
+      String tokenizerName = params.getRuleBasedTokenizer();
+
+      if ("simple".equals(tokenizerName)) {
+        tokenizer = SimpleTokenizer.INSTANCE;
+      }
+      else if ("whitespace".equals(tokenizerName)) {
+        tokenizer = WhitespaceTokenizer.INSTANCE;
+      }
+      else {
+        throw new TerminateToolException(-1, "Unkown tokenizer: " + 
tokenizerName);
+      }
+    }
+
+    try {
+      return new TwentyNewsgroupSampleStream(
+          tokenizer, params.getDataDir().toPath());
+    } catch (IOException e) {
+      throw new TerminateToolException(-1, "IO error while opening sample 
data: " + e.getMessage(), e);
+    }
+  }
+
+  interface Parameters extends EncodingParameter {
+    @ArgumentParser.ParameterDescription(valueName = "dataDir",
+        description = "dir containing the 20newsgroup folders")
+    File getDataDir();
+
+    @ArgumentParser.ParameterDescription(valueName = "modelFile")
+    @ArgumentParser.OptionalParameter
+    File getTokenizerModel();
+
+    @ArgumentParser.ParameterDescription(valueName = "name")
+    @ArgumentParser.OptionalParameter
+    String getRuleBasedTokenizer();
+  }
+}


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Add 20Newsgroups format support to the doccat component
> -------------------------------------------------------
>
>                 Key: OPENNLP-652
>                 URL: https://issues.apache.org/jira/browse/OPENNLP-652
>             Project: OpenNLP
>          Issue Type: Improvement
>          Components: Doccat, Formats
>            Reporter: Joern Kottmann
>            Priority: Minor
>              Labels: help-wanted
>
> It would be nice to have formats support for the 20Newsgroups data. The data 
> would be nice to have for a real demonstration of the doccat component.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

[jira] [Commented] (OPENNLP-652) Add 20Newsgroups format support to the doccat component

Reply via email to