Repository: opennlp Updated Branches: refs/heads/master 041507d3a -> f74a86f4b
OPENNLP-1034: Move serializers to resource mapping to GeneratorFactory Closes #173 Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f74a86f4 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f74a86f4 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f74a86f4 Branch: refs/heads/master Commit: f74a86f4b6a6f93d3a1e10f2a4852c5898feefb3 Parents: 041507d Author: Jörn Kottmann <[email protected]> Authored: Wed Apr 19 18:34:15 2017 +0200 Committer: Jörn Kottmann <[email protected]> Committed: Thu Apr 20 18:05:40 2017 +0200 ---------------------------------------------------------------------- .../TokenNameFinderCrossValidatorTool.java | 10 ++- .../namefind/TokenNameFinderTrainerTool.java | 77 ++++---------------- .../postag/POSTaggerCrossValidatorTool.java | 9 ++- .../cmdline/postag/POSTaggerTrainerTool.java | 11 ++- .../tools/util/featuregen/GeneratorFactory.java | 41 ++++++++++- .../util/featuregen/GeneratorFactoryTest.java | 2 +- 6 files changed, 78 insertions(+), 72 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java index 0ee3738..6e62577 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java @@ -71,8 +71,14 @@ public final class TokenNameFinderCrossValidatorTool byte[] featureGeneratorBytes = TokenNameFinderTrainerTool.openFeatureGeneratorBytes(params.getFeaturegen()); - Map<String, Object> resources = - TokenNameFinderTrainerTool.loadResources(params.getResources(), params.getFeaturegen()); + Map<String, Object> resources; + + try { + resources = TokenNameFinderTrainerTool.loadResources(params.getResources(), params.getFeaturegen()); + } + catch (IOException e) { + throw new TerminateToolException(-1,"IO error while loading resources", e); + } if (params.getNameTypes() != null) { String[] nameTypes = params.getNameTypes().split(","); http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java index 4fb8cb9..f3cef48 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java @@ -20,13 +20,9 @@ package opennlp.tools.cmdline.namefind; import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; import java.util.HashMap; -import java.util.List; import java.util.Map; -import org.w3c.dom.Element; - import opennlp.tools.cmdline.AbstractTrainerTool; import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.TerminateToolException; @@ -89,79 +85,31 @@ public final class TokenNameFinderTrainerTool * @param featureGenDescriptor the feature xml descriptor * @return a map consisting of the file name of the resource and its corresponding Object */ - public static Map<String, Object> loadResources(File resourcePath, File featureGenDescriptor) { + public static Map<String, Object> loadResources(File resourcePath, File featureGenDescriptor) + throws IOException { Map<String, Object> resources = new HashMap<>(); if (resourcePath != null) { + Map<String, ArtifactSerializer> artifactSerializers = new HashMap<>(); - Map<String, ArtifactSerializer> artifactSerializers = TokenNameFinderModel - .createArtifactSerializers(); - List<Element> elements = new ArrayList<>(); - ArtifactSerializer serializer = null; - - - // TODO: If there is descriptor file, it should be consulted too if (featureGenDescriptor != null) { try (InputStream xmlDescriptorIn = CmdLineUtil.openInFile(featureGenDescriptor)) { artifactSerializers.putAll( - GeneratorFactory.extractCustomArtifactSerializerMappings(xmlDescriptorIn)); - } catch (IOException e) { - // TODO: Improve error handling! - e.printStackTrace(); - } - - try (InputStream inputStreamXML = CmdLineUtil.openInFile(featureGenDescriptor)) { - elements = GeneratorFactory.getDescriptorElements(inputStreamXML); - } catch (IOException e) { - e.printStackTrace(); + GeneratorFactory.extractArtifactSerializerMappings(xmlDescriptorIn)); } } - File[] resourceFiles = resourcePath.listFiles(); - - for (File resourceFile : resourceFiles) { - String resourceName = resourceFile.getName(); - //gettting the serializer key from the element tag name - //if the element contains a dict attribute - for (Element xmlElement : elements) { - String dictName = xmlElement.getAttribute("dict"); - if (dictName != null && dictName.equals(resourceName)) { - serializer = artifactSerializers.get(xmlElement.getTagName()); - } - } - // TODO: Do different? For now just ignore .... - if (serializer == null) - continue; - - try (InputStream resourceIn = CmdLineUtil.openInFile(resourceFile)) { - resources.put(resourceName, serializer.create(resourceIn)); - } catch (IOException e) { - // TODO: Fix exception handling - e.printStackTrace(); + for (Map.Entry<String, ArtifactSerializer> serializerMapping : artifactSerializers.entrySet()) { + String resourceName = serializerMapping.getKey(); + try (InputStream resourceIn = CmdLineUtil.openInFile(new File(resourcePath, resourceName))) { + resources.put(resourceName, serializerMapping.getValue().create(resourceIn)); } } } return resources; } - /** - * Calls a loadResources method above to load any external resource required for training. - * @param resourceDirectory the directory where the resources are to be found - * @param featureGeneratorDescriptor the xml feature generator - * @return a map containing the file name of the resource and its mapped Object - */ - static Map<String, Object> loadResources(String resourceDirectory, File featureGeneratorDescriptor) { - - if (resourceDirectory != null) { - File resourcePath = new File(resourceDirectory); - - return loadResources(resourcePath, featureGeneratorDescriptor); - } - - return new HashMap<>(); - } - public void run(String format, String[] args) { super.run(format, args); @@ -174,12 +122,17 @@ public final class TokenNameFinderTrainerTool byte[] featureGeneratorBytes = openFeatureGeneratorBytes(params.getFeaturegen()); - // TODO: Support Custom resources: // Must be loaded into memory, or written to tmp file until descriptor // is loaded which defines parses when model is loaded - Map<String, Object> resources = loadResources(params.getResources(), params.getFeaturegen()); + Map<String, Object> resources; + try { + resources = loadResources(params.getResources(), params.getFeaturegen()); + } + catch (IOException e) { + throw new TerminateToolException(-1, e.getMessage(), e); + } CmdLineUtil.checkOutputFile("name finder model", modelOutFile); http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java index 67ad2b9..c6a37a8 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java @@ -77,8 +77,13 @@ public final class POSTaggerCrossValidatorTool } } - Map<String, Object> resources = TokenNameFinderTrainerTool.loadResources( - params.getResources(), params.getFeaturegen()); + Map<String, Object> resources; + try { + resources = TokenNameFinderTrainerTool.loadResources(params.getResources(), params.getFeaturegen()); + } + catch (IOException e) { + throw new TerminateToolException(-1,"IO error while loading resources", e); + } byte[] featureGeneratorBytes = TokenNameFinderTrainerTool.openFeatureGeneratorBytes(params.getFeaturegen()); http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java index b922176..ca614f9 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java @@ -67,8 +67,15 @@ public final class POSTaggerTrainerTool File modelOutFile = params.getModel(); CmdLineUtil.checkOutputFile("pos tagger model", modelOutFile); - Map<String, Object> resources = TokenNameFinderTrainerTool.loadResources( - params.getResources(), params.getFeaturegen()); + Map<String, Object> resources; + + try { + resources = TokenNameFinderTrainerTool.loadResources( + params.getResources(), params.getFeaturegen()); + } + catch (IOException e) { + throw new TerminateToolException(-1,"IO error while loading resources", e); + } byte[] featureGeneratorBytes = TokenNameFinderTrainerTool.openFeatureGeneratorBytes(params.getFeaturegen()); http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java index a1ac72b..5060961 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java @@ -755,7 +755,7 @@ public class GeneratorFactory { return createGenerator(generatorElement, resourceManager); } - public static Map<String, ArtifactSerializer<?>> extractCustomArtifactSerializerMappings( + public static Map<String, ArtifactSerializer<?>> extractArtifactSerializerMappings( InputStream xmlDescriptorIn) throws IOException { Map<String, ArtifactSerializer<?>> mapping = new HashMap<>(); @@ -764,7 +764,6 @@ public class GeneratorFactory { XPath xPath = XPathFactory.newInstance().newXPath(); - NodeList customElements; try { XPathExpression exp = xPath.compile("//custom"); @@ -774,7 +773,6 @@ public class GeneratorFactory { } for (int i = 0; i < customElements.getLength(); i++) { - if (customElements.item(i) instanceof Element) { Element customElement = (Element) customElements.item(i); @@ -788,6 +786,43 @@ public class GeneratorFactory { } } } + + NodeList allElements; + try { + XPathExpression exp = xPath.compile("//*"); + allElements = (NodeList) exp.evaluate(xmlDescriptorDOM.getDocumentElement(), XPathConstants.NODESET); + } catch (XPathExpressionException e) { + throw new IllegalStateException("The hard coded XPath expression should always be valid!"); + } + + for (int i = 0; i < allElements.getLength(); i++) { + if (allElements.item(i) instanceof Element) { + Element xmlElement = (Element) allElements.item(i); + + String dictName = xmlElement.getAttribute("dict"); + if (dictName != null) { + + switch (xmlElement.getTagName()) { + case "wordcluster": + mapping.put(dictName, new WordClusterDictionary.WordClusterDictionarySerializer()); + break; + + case "brownclustertoken": + mapping.put(dictName, new BrownCluster.BrownClusterSerializer()); + break; + + case "brownclustertokenclass"://, ; + mapping.put(dictName, new BrownCluster.BrownClusterSerializer()); + break; + + case "brownclusterbigram": //, ; + mapping.put(dictName, new BrownCluster.BrownClusterSerializer()); + break; + } + } + } + } + return mapping; } http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java index 8a48575..dd569b0 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java @@ -120,7 +120,7 @@ public class GeneratorFactoryTest { "/opennlp/tools/util/featuregen/CustomClassLoadingWithSerializers.xml"); Map<String, ArtifactSerializer<?>> mapping = - GeneratorFactory.extractCustomArtifactSerializerMappings(descIn); + GeneratorFactory.extractArtifactSerializerMappings(descIn); Assert.assertTrue(mapping.get("test.resource") instanceof WordClusterDictionarySerializer); }
