Author: ragerri
Date: Mon Mar 9 21:16:12 2015
New Revision: 1665360
URL: http://svn.apache.org/r1665360
Log:
OPENNLP-715 Refactor the word2vec-specific names to generic word-cluster names
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterDictionary.java
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterFeatureGenerator.java
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGenWithSerializerMapping.java
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java?rev=1665360&r1=1665359&r2=1665360&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
Mon Mar 9 21:16:12 2015
@@ -36,7 +36,7 @@ import opennlp.tools.util.SequenceCodec;
import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
import opennlp.tools.util.featuregen.AggregatedFeatureGenerator;
import opennlp.tools.util.featuregen.BrownCluster;
-import opennlp.tools.util.featuregen.W2VClassesDictionary;
+import opennlp.tools.util.featuregen.WordClusterDictionary;
import opennlp.tools.util.model.ArtifactSerializer;
import opennlp.tools.util.model.BaseModel;
import opennlp.tools.util.model.ModelUtil;
@@ -253,7 +253,7 @@ public class TokenNameFinderModel extend
* objects, the convention is to add its element tag name as key of the
serializer map.
* For example, the element tag name for the {@code
WordClusterFeatureGenerator} which
* uses {@code W2VClassesDictionary} objects serialized by the {@code
W2VClassesDictionarySerializer}
- * is 'w2vwordcluster', which is the key used to add the serializer to the
map.
+ * is 'wordcluster', which is the key used to add the serializer to the map.
* @return the map containing the added serializers
*/
public static Map<String, ArtifactSerializer> createArtifactSerializers() {
@@ -268,7 +268,7 @@ public class TokenNameFinderModel extend
Map<String, ArtifactSerializer> serializers =
BaseModel.createArtifactSerializers();
serializers.put("featuregen", new ByteArraySerializer());
- serializers.put("w2vwordcluster", new
W2VClassesDictionary.W2VClassesDictionarySerializer());
+ serializers.put("wordcluster", new
WordClusterDictionary.WordClusterDictionarySerializer());
serializers.put("brownclustertoken", new
BrownCluster.BrownClusterSerializer());
serializers.put("brownclustertokenclass", new
BrownCluster.BrownClusterSerializer());
serializers.put("brownclusterbigram", new
BrownCluster.BrownClusterSerializer());
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java?rev=1665360&r1=1665359&r2=1665360&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
Mon Mar 9 21:16:12 2015
@@ -278,7 +278,7 @@ public class GeneratorFactory {
* 'w2vwordcluster' as a tag name; these clusters are typically produced by
* word2vec or clark pos induction systems.
*/
- static class W2VClassesFeatureGeneratorFactory implements
XmlFeatureGeneratorFactory {
+ static class WordClusterFeatureGeneratorFactory implements
XmlFeatureGeneratorFactory {
public AdaptiveFeatureGenerator create(Element generatorElement,
FeatureGeneratorResourceProvider resourceManager) throws
InvalidFormatException {
@@ -288,15 +288,15 @@ public class GeneratorFactory {
Object dictResource = resourceManager.getResource(dictResourceKey);
- if (!(dictResource instanceof W2VClassesDictionary)) {
- throw new InvalidFormatException("Not a W2VClassesDictionary resource
for key: " + dictResourceKey);
+ if (!(dictResource instanceof WordClusterDictionary)) {
+ throw new InvalidFormatException("Not a WordClusterDictionary resource
for key: " + dictResourceKey);
}
- return new WordClusterFeatureGenerator((W2VClassesDictionary)
dictResource, dictResourceKey);
+ return new WordClusterFeatureGenerator((WordClusterDictionary)
dictResource, dictResourceKey);
}
static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
- factoryMap.put("w2vwordcluster", new
W2VClassesFeatureGeneratorFactory());
+ factoryMap.put("wordcluster", new WordClusterFeatureGeneratorFactory());
}
}
@@ -628,7 +628,7 @@ public class GeneratorFactory {
PrefixFeatureGeneratorFactory.register(factories);
SuffixFeatureGeneratorFactory.register(factories);
WindowFeatureGeneratorFactory.register(factories);
- W2VClassesFeatureGeneratorFactory.register(factories);
+ WordClusterFeatureGeneratorFactory.register(factories);
BrownClusterTokenFeatureGeneratorFactory.register(factories);
BrownClusterTokenClassFeatureGeneratorFactory.register(factories);
BrownClusterBigramFeatureGeneratorFactory.register(factories);
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterDictionary.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterDictionary.java?rev=1665360&view=auto
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterDictionary.java
(added)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterDictionary.java
Mon Mar 9 21:16:12 2015
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.ArtifactSerializer;
+import opennlp.tools.util.model.SerializableArtifact;
+
+public class WordClusterDictionary implements SerializableArtifact {
+
+ public static class WordClusterDictionarySerializer implements
ArtifactSerializer<WordClusterDictionary> {
+
+ public WordClusterDictionary create(InputStream in) throws IOException,
+ InvalidFormatException {
+ return new WordClusterDictionary(in);
+ }
+
+ public void serialize(WordClusterDictionary artifact, OutputStream out)
+ throws IOException {
+ artifact.serialize(out);
+ }
+ }
+
+ private Map<String, String> tokenToClusterMap = new HashMap<String,
String>();
+
+ /**
+ * Read word2vec and clark clustering style lexicons.
+ * @param in the inputstream
+ * @throws IOException the io exception
+ */
+ public WordClusterDictionary(InputStream in) throws IOException {
+
+ BufferedReader reader = new BufferedReader(new InputStreamReader(in,
Charset.forName("UTF-8")));
+
+ String line;
+ while ((line = reader.readLine()) != null) {
+ String parts[] = line.split(" ");
+ if (parts.length == 3) {
+ tokenToClusterMap.put(parts[0], parts[1]);
+ } else if (parts.length == 2) {
+ tokenToClusterMap.put(parts[0], parts[1]);
+ }
+ }
+ }
+
+ public String lookupToken(String string) {
+ return tokenToClusterMap.get(string);
+ }
+
+ public void serialize(OutputStream out) throws IOException {
+ Writer writer = new BufferedWriter(new OutputStreamWriter(out));
+
+ for (Map.Entry<String, String> entry : tokenToClusterMap.entrySet()) {
+ writer.write(entry.getKey() + " " + entry.getValue() + "\n");
+ }
+
+ writer.flush();
+ }
+
+ public Class<?> getArtifactSerializerClass() {
+ return WordClusterDictionarySerializer.class;
+ }
+}
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterFeatureGenerator.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterFeatureGenerator.java?rev=1665360&r1=1665359&r2=1665360&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterFeatureGenerator.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WordClusterFeatureGenerator.java
Mon Mar 9 21:16:12 2015
@@ -21,10 +21,10 @@ import java.util.List;
public class WordClusterFeatureGenerator extends FeatureGeneratorAdapter {
- private W2VClassesDictionary tokenDictionary;
+ private WordClusterDictionary tokenDictionary;
private String resourceName;
- public WordClusterFeatureGenerator(W2VClassesDictionary dict, String
dictResourceKey) {
+ public WordClusterFeatureGenerator(WordClusterDictionary dict, String
dictResourceKey) {
tokenDictionary = dict;
resourceName = dictResourceKey;
}
Modified:
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGenWithSerializerMapping.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGenWithSerializerMapping.java?rev=1665360&r1=1665359&r2=1665360&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGenWithSerializerMapping.java
(original)
+++
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGenWithSerializerMapping.java
Mon Mar 9 21:16:12 2015
@@ -44,7 +44,7 @@ public class FeatureGenWithSerializerMap
@Override
public Map<String, ArtifactSerializer<?>> getArtifactSerializerMapping() {
Map<String, ArtifactSerializer<?>> mapping = new HashMap<>();
- mapping.put("test.resource", new
W2VClassesDictionary.W2VClassesDictionarySerializer());
+ mapping.put("test.resource", new
WordClusterDictionary.WordClusterDictionarySerializer());
return Collections.unmodifiableMap(mapping);
}
Modified:
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java?rev=1665360&r1=1665359&r2=1665360&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
(original)
+++
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
Mon Mar 9 21:16:12 2015
@@ -28,7 +28,7 @@ import java.util.Collection;
import java.util.Map;
import opennlp.tools.util.InvalidFormatException;
-import
opennlp.tools.util.featuregen.W2VClassesDictionary.W2VClassesDictionarySerializer;
+import
opennlp.tools.util.featuregen.WordClusterDictionary.WordClusterDictionarySerializer;
import opennlp.tools.util.model.ArtifactSerializer;
import opennlp.tools.util.model.SerializableArtifact;
@@ -113,6 +113,6 @@ public class GeneratorFactoryTest {
Map<String, ArtifactSerializer<?>> mapping =
GeneratorFactory.extractCustomArtifactSerializerMappings(descIn);
- assertTrue(mapping.get("test.resource") instanceof
W2VClassesDictionarySerializer);
+ assertTrue(mapping.get("test.resource") instanceof
WordClusterDictionarySerializer);
}
}
\ No newline at end of file