[ https://issues.apache.org/jira/browse/OPENNLP-1166?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16298140#comment-16298140 ]
ASF GitHub Bot commented on OPENNLP-1166:
-----------------------------------------
kottmann closed pull request #294: OPENNLP-1166: TwoPassDataIndexer fails if features contain \n
URL: https://github.com/apache/opennlp/pull/294
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for the sake of provenance:
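For context on the failure this patch addresses: TwoPassDataIndexer previously spooled each Event to a temp file with FileEventStream.toLine, one event per line of text, so a feature value that itself contains "\n" splits its record across two lines and the line-oriented reader can no longer parse the file. A minimal standalone sketch (not part of this PR) showing the corrupted record, assuming opennlp-tools 1.8.x on the classpath; the class name NewlineFeatureDemo is illustrative only:

import opennlp.tools.ml.model.Event;
import opennlp.tools.ml.model.FileEventStream;

public class NewlineFeatureDemo {

  public static void main(String[] args) {
    // One training event whose second context feature contains a newline.
    Event event = new Event("other", new String[] {"w=Apache", "w=\n"});

    // This is how the pre-patch TwoPassDataIndexer serialized events:
    // one event per line of text. The embedded "\n" breaks the record in two,
    // and reading the temp file back then fails with the
    // NoSuchElementException shown in the issue's stack trace below.
    String line = FileEventStream.toLine(event);
    System.out.print(line);
  }
}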
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/model/TwoPassDataIndexer.java b/opennlp-tools/src/main/java/opennlp/tools/ml/model/TwoPassDataIndexer.java
index 5e347e886..4121e36c1 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/model/TwoPassDataIndexer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/model/TwoPassDataIndexer.java
@@ -17,13 +17,16 @@
package opennlp.tools.ml.model;
-import java.io.BufferedWriter;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
import java.io.File;
+import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.nio.charset.StandardCharsets;
+import java.math.BigInteger;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -59,20 +62,28 @@ public void index(ObjectStream<Event> eventStream) throws IOException {
File tmp = File.createTempFile("events", null);
tmp.deleteOnExit();
int numEvents;
- try (Writer osw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmp),
- StandardCharsets.UTF_8))) {
- numEvents = computeEventCounts(eventStream, osw, predicateIndex, cutoff);
+ BigInteger writeHash;
+ HashSumEventStream writeEventStream = new HashSumEventStream(eventStream); // do not close.
+ try (DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tmp)))) {
+ numEvents = computeEventCounts(writeEventStream, dos, predicateIndex, cutoff);
}
+ writeHash = writeEventStream.calculateHashSum();
+
display("done. " + numEvents + " events\n");
display("\tIndexing... ");
List<ComparableEvent> eventsToCompare;
- try (FileEventStream fes = new FileEventStream(tmp)) {
- eventsToCompare = index(fes, predicateIndex);
+ BigInteger readHash = null;
+ try (HashSumEventStream readStream = new HashSumEventStream(new EventStream(tmp))) {
+ eventsToCompare = index(readStream, predicateIndex);
+ readHash = readStream.calculateHashSum();
}
-
tmp.delete();
+
+ if (readHash.compareTo(writeHash) != 0)
+ throw new IOException("Event hash for writing and reading events did not match.");
+
display("done.\n");
if (sort) {
@@ -91,12 +102,19 @@ public void index(ObjectStream<Event> eventStream) throws IOException {
* occur at least <tt>cutoff</tt> times are added to the
* <tt>predicatesInOut</tt> map along with a unique integer index.
*
+ * Protocol:
+ * 1 - (utf string) - Event outcome
+ * 2 - (int) - Event context array length
+ * 3+ - (utf string) - Event context string
+ * 4 - (int) - Event values array length
+ * 5+ - (float) - Event value
+ *
* @param eventStream an <code>EventStream</code> value
* @param eventStore a writer to which the events are written to for later processing.
* @param predicatesInOut a <code>TObjectIntHashMap</code> value
* @param cutoff an <code>int</code> value
*/
- private int computeEventCounts(ObjectStream<Event> eventStream, Writer eventStore,
+ private int computeEventCounts(ObjectStream<Event> eventStream, DataOutputStream eventStore,
Map<String,Integer> predicatesInOut, int cutoff) throws IOException {
Map<String,Integer> counter = new HashMap<>();
int eventCount = 0;
@@ -104,9 +122,23 @@ private int computeEventCounts(ObjectStream<Event> eventStream, Writer eventStor
Event ev;
while ((ev = eventStream.read()) != null) {
eventCount++;
- eventStore.write(FileEventStream.toLine(ev));
+
+ eventStore.writeUTF(ev.getOutcome());
+
+ eventStore.writeInt(ev.getContext().length);
String[] ec = ev.getContext();
update(ec, counter);
+ for (String ctxString : ec)
+ eventStore.writeUTF(ctxString);
+
+ if (ev.getValues() == null) {
+ eventStore.writeInt(0);
+ }
+ else {
+ eventStore.writeInt(ev.getValues().length);
+ for (float value : ev.getValues())
+ eventStore.writeFloat(value);
+ }
}
String[] predicateSet = counter.entrySet().stream()
@@ -122,4 +154,45 @@ private int computeEventCounts(ObjectStream<Event> eventStream, Writer eventStor
return eventCount;
}
+
+ private static class EventStream implements ObjectStream<Event> {
+
+ private final DataInputStream inputStream;
+
+ public EventStream(File file) throws IOException {
+ inputStream = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
+ }
+
+ @Override
+ public Event read() throws IOException {
+ if (inputStream.available() != 0) {
+ String outcome = inputStream.readUTF();
+ int contextLength = inputStream.readInt();
+ String[] context = new String[contextLength];
+ for (int i = 0; i < contextLength; i++)
+ context[i] = inputStream.readUTF();
+ int valuesLength = inputStream.readInt();
+ float[] values = null;
+ if (valuesLength > 0) {
+ values = new float[valuesLength];
+ for (int i = 0; i < valuesLength; i++)
+ values[i] = inputStream.readFloat();
+ }
+ return new Event(outcome, context, values);
+ }
+ else {
+ return null;
+ }
+ }
+
+ @Override
+ public void reset() throws IOException, UnsupportedOperationException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void close() throws IOException {
+ inputStream.close();
+ }
+ }
}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java
index c246936c3..a8a1b222a 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java
@@ -23,8 +23,15 @@
import org.junit.Assert;
import org.junit.Test;
+import opennlp.tools.namefind.DefaultNameContextGenerator;
+import opennlp.tools.namefind.NameContextGenerator;
+import opennlp.tools.namefind.NameFinderEventStream;
+import opennlp.tools.namefind.NameSample;
import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ObjectStreamUtils;
+import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
public class TwoPassDataIndexerTest {
@@ -61,4 +68,25 @@ public void testIndex() throws IOException {
Assert.assertArrayEquals(new String[]{"other", "org-start", "org-cont"},
indexer.getOutcomeLabels());
Assert.assertArrayEquals(new int[]{5}, indexer.getPredCounts());
}
+
+ @Test
+ public void testIndexWithNewline() throws IOException {
+
+ String[] sentence = "He belongs to Apache \n Software Foundation .".split(" ");
+
+ NameContextGenerator CG = new DefaultNameContextGenerator(
+ (AdaptiveFeatureGenerator[]) null);
+
+ NameSample nameSample = new NameSample(sentence,
+ new Span[] { new Span(3, 7) }, false);
+
+ ObjectStream<Event> eventStream = new NameFinderEventStream(
+ ObjectStreamUtils.createObjectStream(nameSample), "org", CG, null);
+
+ DataIndexer indexer = new TwoPassDataIndexer();
+ indexer.init(new TrainingParameters(Collections.emptyMap()), null);
+ indexer.index(eventStream);
+ Assert.assertEquals(5, indexer.getContexts().length);
+
+ }
}
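For readers of the diff, a self-contained sketch (not part of the commit) of the binary record layout described in the patch's Protocol comment, writing and reading a single Event against an in-memory buffer instead of the temp file; the class name BinaryEventRoundTrip is illustrative only:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Arrays;

import opennlp.tools.ml.model.Event;

public class BinaryEventRoundTrip {

  public static void main(String[] args) throws IOException {
    Event event = new Event("org-start", new String[] {"w=Apache", "w=\n"}, new float[] {1.0f, 0.5f});

    // Write one record following the patch's protocol: outcome (UTF string),
    // context length (int), context strings (UTF), values length (int, 0 if
    // the event has no values), values (float).
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    try (DataOutputStream dos = new DataOutputStream(buffer)) {
      dos.writeUTF(event.getOutcome());
      dos.writeInt(event.getContext().length);
      for (String ctx : event.getContext()) {
        dos.writeUTF(ctx);
      }
      if (event.getValues() == null) {
        dos.writeInt(0);
      } else {
        dos.writeInt(event.getValues().length);
        for (float value : event.getValues()) {
          dos.writeFloat(value);
        }
      }
    }

    // Read the record back, mirroring the EventStream inner class above.
    try (DataInputStream dis = new DataInputStream(new ByteArrayInputStream(buffer.toByteArray()))) {
      String outcome = dis.readUTF();
      String[] context = new String[dis.readInt()];
      for (int i = 0; i < context.length; i++) {
        context[i] = dis.readUTF();
      }
      int valueCount = dis.readInt();
      float[] values = null;
      if (valueCount > 0) {
        values = new float[valueCount];
        for (int i = 0; i < valueCount; i++) {
          values[i] = dis.readFloat();
        }
      }
      System.out.println(outcome + " " + Arrays.toString(context) + " " + Arrays.toString(values));
    }
  }
}

Because writeUTF and writeInt produce length- and type-prefixed fields, no character inside a feature string can be mistaken for a record delimiter, which is exactly what the old newline-delimited text format relied on.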
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> TwoPassDataIndexer fails if features contain \n
> -----------------------------------------------
>
> Key: OPENNLP-1166
> URL: https://issues.apache.org/jira/browse/OPENNLP-1166
> Project: OpenNLP
> Issue Type: Improvement
> Components: Machine Learning
> Affects Versions: 1.8.3
> Reporter: Peter Thygesen
> Assignee: Peter Thygesen
>
> Training a model with newline tokens causes TwoPassDataIndexer to throw an exception:
> Exception in thread "main" java.util.NoSuchElementException
> at java.util.StringTokenizer.nextToken(StringTokenizer.java:349)
> at opennlp.tools.ml.model.FileEventStream.read(FileEventStream.java:71)
> at opennlp.tools.ml.model.FileEventStream.read(FileEventStream.java:35)
> at opennlp.tools.ml.model.AbstractDataIndexer.index(AbstractDataIndexer.java:168)
> at opennlp.tools.ml.model.TwoPassDataIndexer.index(TwoPassDataIndexer.java:72)
> at opennlp.tools.ml.AbstractEventTrainer.getDataIndexer(AbstractEventTrainer.java:68)
> at opennlp.tools.ml.AbstractEventTrainer.train(AbstractEventTrainer.java:90)
> at opennlp.tools.namefind.NameFinderME.train(NameFinderME.java:244)
> at opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool.run(TokenNameFinderTrainerTool.java:169)
> at opennlp.tools.cmdline.CLI.main(CLI.java:256)
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)