This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch migrate-opennlp-coref-to-opennlp-tools-2_1_0 in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit ba7fcf589ec2fd34b738e9b19c57fb374f1f3553 Author: Martin Wiesner <[email protected]> AuthorDate: Thu Jan 19 14:07:10 2023 +0100 updates sandbox component 'opennlp-coref' to be compatible with latest opennlp-tools release - adjusts opennlp-tools to 2.1.0 - adjusts parent project (org.apache.apache) to version 18 - adjusts Java language level to 11 --- opennlp-coref/pom.xml | 10 +- .../tools/cmdline/coref/CoreferencerTool.java | 37 +++-- .../tools/coref/resolver/AbstractResolver.java | 16 +- .../resolver/DefaultNonReferentialResolver.java | 35 +++-- .../tools/coref/resolver/MaxentResolver.java | 44 ++++-- .../java/opennlp/tools/coref/sim/GenderModel.java | 124 +++++++-------- .../java/opennlp/tools/coref/sim/NumberModel.java | 88 ++++++----- .../opennlp/tools/coref/sim/SimilarityModel.java | 170 ++++++++++----------- .../tools/formats/CorefSampleStreamFactory.java | 19 ++- 9 files changed, 280 insertions(+), 263 deletions(-) diff --git a/opennlp-coref/pom.xml b/opennlp-coref/pom.xml index 033ffc2..a3d3d14 100644 --- a/opennlp-coref/pom.xml +++ b/opennlp-coref/pom.xml @@ -25,12 +25,12 @@ <parent> <groupId>org.apache</groupId> <artifactId>apache</artifactId> - <version>13</version> + <version>18</version> <relativePath /> </parent> <artifactId>opennlp-coref</artifactId> - <version>1.6.0-SNAPSHOT</version> + <version>2.1.0-SNAPSHOT</version> <packaging>jar</packaging> <name>Apache OpenNLP Coreferencer</name> @@ -38,7 +38,7 @@ <dependency> <groupId>org.apache.opennlp</groupId> <artifactId>opennlp-tools</artifactId> - <version>1.6.0</version> + <version>2.1.0</version> <scope>compile</scope> </dependency> @@ -69,8 +69,8 @@ <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId> <configuration> - <source>1.8</source> - <target>1.8</target> + <source>11</source> + <target>11</target> <compilerArgument>-Xlint</compilerArgument> </configuration> </plugin> diff --git a/opennlp-coref/src/main/java/opennlp/tools/cmdline/coref/CoreferencerTool.java b/opennlp-coref/src/main/java/opennlp/tools/cmdline/coref/CoreferencerTool.java index 885951c..9ad4276 100644 --- a/opennlp-coref/src/main/java/opennlp/tools/cmdline/coref/CoreferencerTool.java +++ b/opennlp-coref/src/main/java/opennlp/tools/cmdline/coref/CoreferencerTool.java @@ -18,7 +18,7 @@ package opennlp.tools.cmdline.coref; import java.io.IOException; -import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -30,6 +30,7 @@ import opennlp.tools.cmdline.BasicCmdLineTool; import opennlp.tools.cmdline.CLI; import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.PerformanceMonitor; +import opennlp.tools.cmdline.SystemInputStreamFactory; import opennlp.tools.cmdline.TerminateToolException; import opennlp.tools.coref.DiscourseEntity; import opennlp.tools.coref.LinkerMode; @@ -47,12 +48,12 @@ public class CoreferencerTool extends BasicCmdLineTool { class CorefParse { - private Map<Parse, Integer> parseMap; - private List<Parse> parses; + private final Map<Parse, Integer> parseMap; + private final List<Parse> parses; public CorefParse(List<Parse> parses, DiscourseEntity[] entities) { this.parses = parses; - parseMap = new HashMap<Parse, Integer>(); + parseMap = new HashMap<>(); for (int ei = 0, en = entities.length; ei < en;ei++) { if (entities[ei].getNumMentions() > 1) { for (Iterator<MentionContext> mi = entities[ei].getMentions(); mi.hasNext();) { @@ -65,8 +66,7 @@ public class CoreferencerTool extends BasicCmdLineTool { } public void show() { - for (int pi = 0, pn = parses.size(); pi < pn;pi++) { - Parse p = parses.get(pi); + for (Parse p : parses) { show(p); System.out.println(); } @@ -85,8 +85,7 @@ public class CoreferencerTool extends BasicCmdLineTool { System.out.print(" "); } Parse[] children = p.getChildren(); - for (int pi = 0, pn = children.length; pi < pn;pi++) { - Parse c = children[pi]; + for (Parse c : children) { Span s = c.getSpan(); if (start < s.getStart()) { System.out.print(p.getText().substring(start, s.getStart())); @@ -104,7 +103,8 @@ public class CoreferencerTool extends BasicCmdLineTool { public String getShortDescription() { return "learnable noun phrase coreferencer"; } - + + @Override public void run(String[] args) { if (args.length != 1) { System.out.println(getHelp()); @@ -118,17 +118,15 @@ public class CoreferencerTool extends BasicCmdLineTool { throw new TerminateToolException(-1, "Failed to load all coreferencer models!", e); } - ObjectStream<String> lineStream = - new PlainTextByLineStream(new InputStreamReader(System.in)); - PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "parses"); perfMon.start(); - try { + try (ObjectStream<String> lineStream = new PlainTextByLineStream( + new SystemInputStreamFactory(), StandardCharsets.UTF_8)) { int sentenceNumber = 0; - List<Mention> document = new ArrayList<Mention>(); - List<Parse> parses = new ArrayList<Parse>(); + List<Mention> document = new ArrayList<>(); + List<Parse> parses = new ArrayList<>(); String line; while ((line = lineStream.read()) != null) { @@ -148,14 +146,14 @@ public class CoreferencerTool extends BasicCmdLineTool { Mention[] extents = treebankLinker.getMentionFinder().getMentions(new DefaultParse(p,sentenceNumber)); //construct new parses for mentions which don't have constituents. - for (int ei = 0, en = extents.length; ei < en;ei++) { + for (Mention extent : extents) { //System.err.println("PennTreebankLiner.main: "+ei+" "+extents[ei]); - if (extents[ei].getParse() == null) { + if (extent.getParse() == null) { //not sure how to get head index, but its not used at this point. - Parse snp = new Parse(p.getText(),extents[ei].getSpan(),"NML",1.0,0); + Parse snp = new Parse(p.getText(), extent.getSpan(), "NML", 1.0, 0); p.insert(snp); - extents[ei].setParse(new DefaultParse(snp,sentenceNumber)); + extent.setParse(new DefaultParse(snp, sentenceNumber)); } } @@ -174,6 +172,7 @@ public class CoreferencerTool extends BasicCmdLineTool { } } + @Override public String getHelp() { return "Usage: " + CLI.CMD + " " + getName() + " model_directory < parses"; } diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/AbstractResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/AbstractResolver.java index 77b1384..370e209 100644 --- a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/AbstractResolver.java +++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/AbstractResolver.java @@ -18,12 +18,13 @@ package opennlp.tools.coref.resolver; import java.io.IOException; +import java.util.HashMap; +import java.util.Map; import opennlp.tools.coref.DiscourseEntity; import opennlp.tools.coref.DiscourseModel; import opennlp.tools.coref.mention.MentionContext; import opennlp.tools.coref.mention.Parse; -import opennlp.tools.util.CountedSet; /** * Default implementation of some methods in the {@link Resolver} interface. @@ -46,7 +47,7 @@ public abstract class AbstractResolver implements Resolver { * Debugging variable which holds statistics about mention distances * during training. */ - protected CountedSet<Integer> distances; + protected Map<Integer, Integer> distances; /** * The number of sentences back this resolver should look for a referent. @@ -56,7 +57,7 @@ public abstract class AbstractResolver implements Resolver { public AbstractResolver(int neb) { numEntitiesBack = neb; showExclusions = true; - distances = new CountedSet<Integer>(); + distances = new HashMap<>(); } /** @@ -169,7 +170,14 @@ public abstract class AbstractResolver implements Resolver { DiscourseEntity cde = dm.getEntity(ei); MentionContext cec = cde.getLastExtent(); // candidate extent context if (cec.getId() == mention.getId()) { - distances.add(ei); + // adding counts + Integer count = distances.get(ei); + if (count == null ) { + distances.put(ei, 1); + } + else { + distances.put(ei, count + 1); + } return cde; } } diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/DefaultNonReferentialResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/DefaultNonReferentialResolver.java index 142bab1..7439e76 100644 --- a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/DefaultNonReferentialResolver.java +++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/DefaultNonReferentialResolver.java @@ -17,8 +17,10 @@ package opennlp.tools.coref.resolver; +import java.io.BufferedInputStream; import java.io.DataInputStream; import java.io.File; +import java.io.FileInputStream; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; @@ -27,13 +29,14 @@ import java.util.List; import opennlp.tools.coref.mention.MentionContext; import opennlp.tools.coref.mention.Parse; -import opennlp.tools.ml.maxent.GIS; +import opennlp.tools.ml.maxent.GISModel; +import opennlp.tools.ml.maxent.GISTrainer; import opennlp.tools.ml.maxent.io.BinaryGISModelReader; -import opennlp.tools.ml.maxent.io.SuffixSensitiveGISModelReader; -import opennlp.tools.ml.maxent.io.SuffixSensitiveGISModelWriter; +import opennlp.tools.ml.maxent.io.BinaryGISModelWriter; import opennlp.tools.ml.model.Event; import opennlp.tools.ml.model.MaxentModel; import opennlp.tools.util.ObjectStreamUtils; +import opennlp.tools.util.TrainingParameters; /** * Default implementation of the {@link NonReferentialResolver} interface. @@ -43,10 +46,10 @@ public class DefaultNonReferentialResolver implements NonReferentialResolver { private MaxentModel model; private List<Event> events; private boolean loadAsResource; - private boolean debugOn = false; - private ResolverMode mode; - private String modelName; - private String modelExtension = ".bin.gz"; + private final boolean debugOn = false; + private final ResolverMode mode; + private final String modelName; + private final String modelExtension = ".bin.gz"; private int nonRefIndex; public DefaultNonReferentialResolver(String projectName, String name, ResolverMode mode) @@ -62,7 +65,10 @@ public class DefaultNonReferentialResolver implements NonReferentialResolver { this.getClass().getResourceAsStream(modelName))).getModel(); } else { - model = (new SuffixSensitiveGISModelReader(new File(modelName + modelExtension))).getModel(); + try (DataInputStream dis = new DataInputStream( + new BufferedInputStream(new FileInputStream(modelName + modelExtension)))) { + model = new BinaryGISModelReader(dis).getModel(); + } } nonRefIndex = model.getIndex(MaxentResolver.SAME); } @@ -71,6 +77,7 @@ public class DefaultNonReferentialResolver implements NonReferentialResolver { } } + @Override public double getNonReferentialProbability(MentionContext mention) { List<String> features = getFeatures(mention); double r = model.eval(features.toArray(new String[features.size()]))[nonRefIndex]; @@ -78,6 +85,7 @@ public class DefaultNonReferentialResolver implements NonReferentialResolver { return r; } + @Override public void addEvent(MentionContext ec) { List<String> features = getFeatures(ec); if (-1 == ec.getId()) { @@ -115,6 +123,7 @@ public class DefaultNonReferentialResolver implements NonReferentialResolver { return features; } + @Override public void train() throws IOException { if (ResolverMode.TRAIN == mode) { System.err.println(this + " referential"); @@ -126,9 +135,13 @@ public class DefaultNonReferentialResolver implements NonReferentialResolver { } writer.close(); } - new SuffixSensitiveGISModelWriter(GIS.trainModel( - ObjectStreamUtils.createObjectStream(events),100,10), - new File(modelName + modelExtension)).persist(); + TrainingParameters params = TrainingParameters.defaultParams(); + params.put(TrainingParameters.ITERATIONS_PARAM, 100); + params.put(TrainingParameters.CUTOFF_PARAM, 10); + GISTrainer trainer = new GISTrainer(); + trainer.init(params, null); + GISModel trainedModel = trainer.trainModel(ObjectStreamUtils.createObjectStream(events)); + new BinaryGISModelWriter(trainedModel, new File(modelName + modelExtension)).persist(); } } } diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/MaxentResolver.java b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/MaxentResolver.java index 12ff359..3710608 100644 --- a/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/MaxentResolver.java +++ b/opennlp-coref/src/main/java/opennlp/tools/coref/resolver/MaxentResolver.java @@ -17,23 +17,27 @@ package opennlp.tools.coref.resolver; +import java.io.BufferedInputStream; +import java.io.DataInputStream; import java.io.File; +import java.io.FileInputStream; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; -import java.util.Iterator; import java.util.List; import opennlp.tools.coref.DiscourseEntity; import opennlp.tools.coref.DiscourseModel; import opennlp.tools.coref.mention.MentionContext; import opennlp.tools.coref.sim.TestSimilarityModel; -import opennlp.tools.ml.maxent.GIS; -import opennlp.tools.ml.maxent.io.SuffixSensitiveGISModelReader; -import opennlp.tools.ml.maxent.io.SuffixSensitiveGISModelWriter; +import opennlp.tools.ml.maxent.GISModel; +import opennlp.tools.ml.maxent.GISTrainer; +import opennlp.tools.ml.maxent.io.BinaryGISModelReader; +import opennlp.tools.ml.maxent.io.BinaryGISModelWriter; import opennlp.tools.ml.model.Event; import opennlp.tools.ml.model.MaxentModel; import opennlp.tools.util.ObjectStreamUtils; +import opennlp.tools.util.TrainingParameters; /** * Provides common functionality used by classes which implement the {@link Resolver} class @@ -118,7 +122,10 @@ public abstract class MaxentResolver extends AbstractResolver { this.mode = mode; this.modelName = modelDirectory + "/" + name; if (ResolverMode.TEST == this.mode) { - model = (new SuffixSensitiveGISModelReader(new File(modelName + modelExtension))).getModel(); + try (DataInputStream dis = new DataInputStream( + new BufferedInputStream(new FileInputStream(modelName + modelExtension)))) { + model = new BinaryGISModelReader(dis).getModel(); + } sameIndex = model.getIndex(SAME); } else if (ResolverMode.TRAIN == this.mode) { @@ -169,6 +176,7 @@ public abstract class MaxentResolver extends AbstractResolver { new FixedNonReferentialResolver(nonReferentialProbability)); } + @Override public DiscourseEntity resolve(MentionContext ec, DiscourseModel dm) { DiscourseEntity de; int ei = 0; @@ -229,8 +237,8 @@ public abstract class MaxentResolver extends AbstractResolver { /** * Returns whether the specified entity satisfies the criteria for being a default referent. - * This criteria is used to perform sample selection on the training data and to select a single - * non-referent entity. Typically the criteria is a heuristic for a likely referent. + * These criteria are used to perform sample selection on the training data and to select a single + * non-referent entity. Typically, the criteria is a heuristic for a likely referent. * @param de The discourse entity being considered for non-reference. * @return True if the entity should be used as a default referent, false otherwise. */ @@ -286,7 +294,14 @@ public abstract class MaxentResolver extends AbstractResolver { events.add(new Event(SAME, features.toArray(new String[features.size()]))); de = cde; //System.err.println("MaxentResolver.retain: resolved at "+ei); - distances.add(ei); + // adding counts + Integer count = distances.get(ei); + if (count == null ) { + distances.put(ei, 1); + } + else { + distances.put(ei, count + 1); + } } else if (!pairedSampleSelection || (!nonReferentFound && useAsDifferentExample)) { nonReferentFound = true; @@ -333,14 +348,19 @@ public abstract class MaxentResolver extends AbstractResolver { if (debugOn) { System.err.println(this + " referential"); FileWriter writer = new FileWriter(modelName + ".events"); - for (Iterator<Event> ei = events.iterator(); ei.hasNext();) { - Event e = ei.next(); + for (Event e : events) { writer.write(e.toString() + "\n"); } writer.close(); } - (new SuffixSensitiveGISModelWriter(GIS.trainModel(ObjectStreamUtils.createObjectStream(events), - 100,10),new File(modelName + modelExtension))).persist(); + TrainingParameters params = TrainingParameters.defaultParams(); + params.put(TrainingParameters.ITERATIONS_PARAM, 100); + params.put(TrainingParameters.CUTOFF_PARAM, 10); + GISTrainer trainer = new GISTrainer(); + trainer.init(params, null); + GISModel trainedModel = trainer.trainModel(ObjectStreamUtils.createObjectStream(events)); + new BinaryGISModelWriter(trainedModel, new File(modelName + modelExtension)).persist(); + nonReferentialResolver.train(); } } diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/GenderModel.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/GenderModel.java index 13e8300..2c06836 100644 --- a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/GenderModel.java +++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/GenderModel.java @@ -18,34 +18,31 @@ package opennlp.tools.coref.sim; +import java.io.BufferedInputStream; import java.io.BufferedReader; +import java.io.DataInputStream; import java.io.File; +import java.io.FileInputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.List; import java.util.Set; import opennlp.tools.coref.resolver.ResolverUtils; -import opennlp.tools.ml.maxent.GIS; -import opennlp.tools.ml.maxent.io.SuffixSensitiveGISModelReader; -import opennlp.tools.ml.maxent.io.SuffixSensitiveGISModelWriter; -import opennlp.tools.ml.model.AbstractModel; +import opennlp.tools.ml.maxent.GISModel; +import opennlp.tools.ml.maxent.GISTrainer; +import opennlp.tools.ml.maxent.io.BinaryGISModelReader; +import opennlp.tools.ml.maxent.io.BinaryGISModelWriter; import opennlp.tools.ml.model.Event; import opennlp.tools.ml.model.MaxentModel; -import opennlp.tools.util.HashList; import opennlp.tools.util.ObjectStreamUtils; - -//import opennlp.maxent.GIS; -//import opennlp.maxent.io.SuffixSensitiveGISModelReader; -//import opennlp.maxent.io.SuffixSensitiveGISModelWriter; -//import opennlp.model.Event; -//import opennlp.model.MaxentModel; +import opennlp.tools.util.TrainingParameters; /** * Class which models the gender of a particular mentions and entities made up of mentions. @@ -56,27 +53,25 @@ public class GenderModel implements TestGenderModel, TrainSimilarityModel { private int femaleIndex; private int neuterIndex; - private String modelName; - private String modelExtension = ".bin.gz"; + private final String modelName; + private final String modelExtension = ".bin.gz"; private MaxentModel testModel; private Collection<Event> events; - private boolean debugOn = true; + private final boolean debugOn = true; - private Set<String> maleNames; - private Set<String> femaleNames; + private final Set<String> maleNames; + private final Set<String> femaleNames; public static TestGenderModel testModel(String name) throws IOException { - GenderModel gm = new GenderModel(name, false); - return gm; + return new GenderModel(name, false); } public static TrainSimilarityModel trainModel(String name) throws IOException { - GenderModel gm = new GenderModel(name, true); - return gm; + return new GenderModel(name, true); } private Set<String> readNames(String nameFile) throws IOException { - Set<String> names = new HashSet<String>(); + Set<String> names = new HashSet<>(); BufferedReader nameReader = new BufferedReader(new FileReader(nameFile)); for (String line = nameReader.readLine(); line != null; line = nameReader.readLine()) { names.add(line); @@ -89,17 +84,16 @@ public class GenderModel implements TestGenderModel, TrainSimilarityModel { maleNames = readNames(modelName + ".mas"); femaleNames = readNames(modelName + ".fem"); if (train) { - events = new ArrayList<Event>(); + events = new ArrayList<>(); } else { - //if (MaxentResolver.loadAsResource()) { - // testModel = (new BinaryGISModelReader(new DataInputStream( - // this.getClass().getResourceAsStream(modelName)))).getModel(); - //} - testModel = (new SuffixSensitiveGISModelReader(new File(modelName + modelExtension))).getModel(); - maleIndex = testModel.getIndex(GenderEnum.MALE.toString()); - femaleIndex = testModel.getIndex(GenderEnum.FEMALE.toString()); - neuterIndex = testModel.getIndex(GenderEnum.NEUTER.toString()); + try (DataInputStream dis = new DataInputStream( + new BufferedInputStream(new FileInputStream(modelName + modelExtension)))) { + testModel = new BinaryGISModelReader(dis).getModel(); + maleIndex = testModel.getIndex(GenderEnum.MALE.toString()); + femaleIndex = testModel.getIndex(GenderEnum.FEMALE.toString()); + neuterIndex = testModel.getIndex(GenderEnum.NEUTER.toString()); + } } } @@ -168,8 +162,7 @@ public class GenderModel implements TestGenderModel, TrainSimilarityModel { } private GenderEnum getGender(List<Context> entity) { - for (Iterator<Context> ci = entity.iterator(); ci.hasNext();) { - Context ec = ci.next(); + for (Context ec : entity) { GenderEnum ge = getGender(ec); if (ge != GenderEnum.UNKNOWN) { return ge; @@ -181,62 +174,51 @@ public class GenderModel implements TestGenderModel, TrainSimilarityModel { @SuppressWarnings("unchecked") public void setExtents(Context[] extentContexts) { - HashList entities = new HashList(); - List<Context> singletons = new ArrayList<Context>(); - for (int ei = 0, el = extentContexts.length; ei < el; ei++) { - Context ec = extentContexts[ei]; + HashMap<Integer,Context> entities = new HashMap<>(); + List<Context> singletons = new ArrayList<>(); + for (Context ec : extentContexts) { //System.err.println("GenderModel.setExtents: ec("+ec.getId()+") "+ec.toText()); if (ec.getId() != -1) { entities.put(ec.getId(), ec); - } - else { + } else { singletons.add(ec); } } - List<Context> males = new ArrayList<Context>(); - List<Context> females = new ArrayList<Context>(); - List<Context> eunuches = new ArrayList<Context>(); + List<Context> males = new ArrayList<>(); + List<Context> females = new ArrayList<>(); + List<Context> eunuches = new ArrayList<>(); //coref entities - for (Iterator<Integer> ei = entities.keySet().iterator(); ei.hasNext();) { - Integer key = ei.next(); + for (Integer key : entities.keySet()) { List<Context> entityContexts = (List<Context>) entities.get(key); GenderEnum gender = getGender(entityContexts); if (gender != null) { if (gender == GenderEnum.MALE) { males.addAll(entityContexts); - } - else if (gender == GenderEnum.FEMALE) { + } else if (gender == GenderEnum.FEMALE) { females.addAll(entityContexts); - } - else if (gender == GenderEnum.NEUTER) { + } else if (gender == GenderEnum.NEUTER) { eunuches.addAll(entityContexts); } } } //non-coref entities - for (Iterator<Context> ei = singletons.iterator(); ei.hasNext();) { - Context ec = ei.next(); + for (Context ec : singletons) { GenderEnum gender = getGender(ec); if (gender == GenderEnum.MALE) { males.add(ec); - } - else if (gender == GenderEnum.FEMALE) { + } else if (gender == GenderEnum.FEMALE) { females.add(ec); - } - else if (gender == GenderEnum.NEUTER) { + } else if (gender == GenderEnum.NEUTER) { eunuches.add(ec); } } - for (Iterator<Context> mi = males.iterator(); mi.hasNext();) { - Context ec = mi.next(); + for (Context ec : males) { addEvent(GenderEnum.MALE.toString(), ec); } - for (Iterator<Context> fi = females.iterator(); fi.hasNext();) { - Context ec = fi.next(); + for (Context ec : females) { addEvent(GenderEnum.FEMALE.toString(), ec); } - for (Iterator<Context> ei = eunuches.iterator(); ei.hasNext();) { - Context ec = ei.next(); + for (Context ec : eunuches) { addEvent(GenderEnum.NEUTER.toString(), ec); } } @@ -259,38 +241,40 @@ public class GenderModel implements TestGenderModel, TrainSimilarityModel { } } + @Override public double[] genderDistribution(Context np1) { List<String> features = getFeatures(np1); - if (debugOn) { - //System.err.println("GenderModel.genderDistribution: "+features); - } + //System.err.println("GenderModel.genderDistribution: "+features); return testModel.eval(features.toArray(new String[features.size()])); } + @Override public void trainModel() throws IOException { if (debugOn) { FileWriter writer = new FileWriter(modelName + ".events"); - for (Iterator<Event> ei = events.iterator();ei.hasNext();) { - Event e = ei.next(); + for (Event e : events) { writer.write(e.toString() + "\n"); } writer.close(); } - - new SuffixSensitiveGISModelWriter( - // GIS.trainModel((EventStream)new CollectionEventStream(events), true)).persist(); - (AbstractModel) GIS.trainModel(ObjectStreamUtils.createObjectStream(events), true), - new File(modelName + modelExtension)).persist(); + GISTrainer trainer = new GISTrainer(); + trainer.init(TrainingParameters.defaultParams(), null); + trainer.setSmoothing(true); + GISModel trainedModel = trainer.trainModel(ObjectStreamUtils.createObjectStream(events)); + new BinaryGISModelWriter(trainedModel, new File(modelName + modelExtension)).persist(); } + @Override public int getFemaleIndex() { return femaleIndex; } + @Override public int getMaleIndex() { return maleIndex; } + @Override public int getNeuterIndex() { return neuterIndex; } diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/NumberModel.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/NumberModel.java index 6f3be6d..fa8070a 100644 --- a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/NumberModel.java +++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/NumberModel.java @@ -17,34 +17,33 @@ package opennlp.tools.coref.sim; +import java.io.BufferedInputStream; +import java.io.DataInputStream; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; -import java.util.Iterator; +import java.util.HashMap; import java.util.List; +import java.util.Map; import opennlp.tools.coref.resolver.ResolverUtils; -import opennlp.tools.ml.maxent.GIS; -import opennlp.tools.ml.maxent.io.SuffixSensitiveGISModelReader; -import opennlp.tools.ml.maxent.io.SuffixSensitiveGISModelWriter; +import opennlp.tools.ml.maxent.GISModel; +import opennlp.tools.ml.maxent.GISTrainer; +import opennlp.tools.ml.maxent.io.BinaryGISModelReader; +import opennlp.tools.ml.maxent.io.BinaryGISModelWriter; import opennlp.tools.ml.model.Event; import opennlp.tools.ml.model.MaxentModel; -import opennlp.tools.util.HashList; import opennlp.tools.util.ObjectStreamUtils; - -//import opennlp.maxent.GIS; -//import opennlp.maxent.io.SuffixSensitiveGISModelReader; -//import opennlp.maxent.io.SuffixSensitiveGISModelWriter; -//import opennlp.model.Event; -//import opennlp.model.MaxentModel; +import opennlp.tools.util.TrainingParameters; /** * Class which models the number of particular mentions and the entities made up of mentions. */ public class NumberModel implements TestNumberModel, TrainSimilarityModel { - private String modelName; - private String modelExtension = ".bin.gz"; + private final String modelName; + private final String modelExtension = ".bin.gz"; private MaxentModel testModel; private List<Event> events; @@ -52,13 +51,11 @@ public class NumberModel implements TestNumberModel, TrainSimilarityModel { private int pluralIndex; public static TestNumberModel testModel(String name) throws IOException { - NumberModel nm = new NumberModel(name, false); - return nm; + return new NumberModel(name, false); } public static TrainSimilarityModel trainModel(String modelName) throws IOException { - NumberModel gm = new NumberModel(modelName, true); - return gm; + return new NumberModel(modelName, true); } private NumberModel(String modelName, boolean train) throws IOException { @@ -67,18 +64,17 @@ public class NumberModel implements TestNumberModel, TrainSimilarityModel { events = new ArrayList<Event>(); } else { - //if (MaxentResolver.loadAsResource()) { - // testModel = (new PlainTextGISModelReader(new BufferedReader(new InputStreamReader( - // this.getClass().getResourceAsStream(modelName))))).getModel(); - //} - testModel = (new SuffixSensitiveGISModelReader(new File(modelName + modelExtension))).getModel(); + try (DataInputStream dis = new DataInputStream( + new BufferedInputStream(new FileInputStream(modelName + modelExtension)))) { + testModel = new BinaryGISModelReader(dis).getModel(); + } singularIndex = testModel.getIndex(NumberEnum.SINGULAR.toString()); pluralIndex = testModel.getIndex(NumberEnum.PLURAL.toString()); } } private List<String> getFeatures(Context np1) { - List<String> features = new ArrayList<String>(); + List<String> features = new ArrayList<>(); features.add("default"); Object[] npTokens = np1.getTokens(); for (int ti = 0, tl = npTokens.length - 1; ti < tl; ti++) { @@ -107,8 +103,7 @@ public class NumberModel implements TestNumberModel, TrainSimilarityModel { } private NumberEnum getNumber(List<Context> entity) { - for (Iterator<Context> ci = entity.iterator(); ci.hasNext();) { - Context ec = ci.next(); + for (Context ec : entity) { NumberEnum ne = getNumber(ec); if (ne != NumberEnum.UNKNOWN) { return ne; @@ -117,10 +112,11 @@ public class NumberModel implements TestNumberModel, TrainSimilarityModel { return NumberEnum.UNKNOWN; } + @Override @SuppressWarnings("unchecked") public void setExtents(Context[] extentContexts) { - HashList entities = new HashList(); - List<Context> singletons = new ArrayList<Context>(); + Map<Integer,Context> entities = new HashMap<>(); + List<Context> singletons = new ArrayList<>(); for (int ei = 0, el = extentContexts.length; ei < el; ei++) { Context ec = extentContexts[ei]; //System.err.println("NumberModel.setExtents: ec("+ec.getId()+") "+ec.toText()); @@ -131,58 +127,60 @@ public class NumberModel implements TestNumberModel, TrainSimilarityModel { singletons.add(ec); } } - List<Context> singles = new ArrayList<Context>(); - List<Context> plurals = new ArrayList<Context>(); + List<Context> singles = new ArrayList<>(); + List<Context> plurals = new ArrayList<>(); // coref entities - for (Iterator<Integer> ei = entities.keySet().iterator(); ei.hasNext();) { - Integer key = ei.next(); + for (Integer key : entities.keySet()) { List<Context> entityContexts = (List<Context>) entities.get(key); NumberEnum number = getNumber(entityContexts); if (number == NumberEnum.SINGULAR) { singles.addAll(entityContexts); - } - else if (number == NumberEnum.PLURAL) { + } else if (number == NumberEnum.PLURAL) { plurals.addAll(entityContexts); } } // non-coref entities. - for (Iterator<Context> ei = singletons.iterator(); ei.hasNext();) { - Context ec = ei.next(); + for (Context ec : singletons) { NumberEnum number = getNumber(ec); if (number == NumberEnum.SINGULAR) { singles.add(ec); - } - else if (number == NumberEnum.PLURAL) { + } else if (number == NumberEnum.PLURAL) { plurals.add(ec); } } - for (Iterator<Context> si = singles.iterator(); si.hasNext();) { - Context ec = si.next(); + for (Context ec : singles) { addEvent(NumberEnum.SINGULAR.toString(), ec); } - for (Iterator<Context> fi = plurals.iterator(); fi.hasNext();) { - Context ec = fi.next(); - addEvent(NumberEnum.PLURAL.toString(),ec); + for (Context ec : plurals) { + addEvent(NumberEnum.PLURAL.toString(), ec); } } + @Override public double[] numberDist(Context c) { List<String> feats = getFeatures(c); return testModel.eval(feats.toArray(new String[feats.size()])); } + @Override public int getSingularIndex() { return singularIndex; } + @Override public int getPluralIndex() { return pluralIndex; } + @Override public void trainModel() throws IOException { - new SuffixSensitiveGISModelWriter(GIS.trainModel( - ObjectStreamUtils.createObjectStream(events),100,10), - new File(modelName + modelExtension)).persist(); + TrainingParameters params = TrainingParameters.defaultParams(); + params.put(TrainingParameters.ITERATIONS_PARAM, 100); + params.put(TrainingParameters.CUTOFF_PARAM, 10); + GISTrainer trainer = new GISTrainer(); + trainer.init(params, null); + GISModel trainedModel = trainer.trainModel(ObjectStreamUtils.createObjectStream(events)); + new BinaryGISModelWriter(trainedModel, new File(modelName + modelExtension)).persist(); } } diff --git a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/SimilarityModel.java b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/SimilarityModel.java index e54c427..8bf468c 100644 --- a/opennlp-coref/src/main/java/opennlp/tools/coref/sim/SimilarityModel.java +++ b/opennlp-coref/src/main/java/opennlp/tools/coref/sim/SimilarityModel.java @@ -17,27 +17,30 @@ package opennlp.tools.coref.sim; +import java.io.BufferedInputStream; import java.io.BufferedReader; +import java.io.DataInputStream; import java.io.File; +import java.io.FileInputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import opennlp.tools.coref.resolver.ResolverUtils; -import opennlp.tools.ml.maxent.GIS; -import opennlp.tools.ml.maxent.io.SuffixSensitiveGISModelReader; -import opennlp.tools.ml.maxent.io.SuffixSensitiveGISModelWriter; +import opennlp.tools.ml.maxent.GISModel; +import opennlp.tools.ml.maxent.GISTrainer; +import opennlp.tools.ml.maxent.io.BinaryGISModelReader; +import opennlp.tools.ml.maxent.io.BinaryGISModelWriter; import opennlp.tools.ml.model.Event; import opennlp.tools.ml.model.MaxentModel; -import opennlp.tools.util.HashList; import opennlp.tools.util.ObjectStreamUtils; +import opennlp.tools.util.TrainingParameters; /** * Models semantic similarity between two mentions and returns a score based on @@ -45,31 +48,33 @@ import opennlp.tools.util.ObjectStreamUtils; */ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityModel { - private String modelName; - private String modelExtension = ".bin.gz"; + private final String modelName; + private final String modelExtension = ".bin.gz"; private MaxentModel testModel; private List<Event> events; private int SAME_INDEX; private static final String SAME = "same"; private static final String DIFF = "diff"; - private boolean debugOn = false; + private final boolean debugOn = false; public static TestSimilarityModel testModel(String name) throws IOException { return new SimilarityModel(name, false); } public static TrainSimilarityModel trainModel(String name) throws IOException { - SimilarityModel sm = new SimilarityModel(name, true); - return sm; + return new SimilarityModel(name, true); } private SimilarityModel(String modelName, boolean train) throws IOException { this.modelName = modelName; if (train) { - events = new ArrayList<Event>(); + events = new ArrayList<>(); } else { - testModel = (new SuffixSensitiveGISModelReader(new File(modelName + modelExtension))).getModel(); + try (DataInputStream dis = new DataInputStream( + new BufferedInputStream(new FileInputStream(modelName + modelExtension)))) { + testModel = new BinaryGISModelReader(dis).getModel(); + } SAME_INDEX = testModel.getIndex(SAME); } } @@ -98,16 +103,15 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode */ private Set<String> constructHeadSet(List<Context> mentions) { Set<String> headSet = new HashSet<String>(); - for (Iterator<Context> ei = mentions.iterator(); ei.hasNext();) { - Context ec = ei.next(); + for (Context ec : mentions) { headSet.add(ec.getHeadTokenText().toLowerCase()); } return headSet; } private boolean hasSameHead(Set<String> entityHeadSet, Set<String> candidateHeadSet) { - for (Iterator<String> hi = entityHeadSet.iterator(); hi.hasNext();) { - if (candidateHeadSet.contains(hi.next())) { + for (String s : entityHeadSet) { + if (candidateHeadSet.contains(s)) { return true; } } @@ -115,8 +119,8 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode } private boolean hasSameNameType(Set<String> entityNameSet, Set<String> candidateNameSet) { - for (Iterator<String> hi = entityNameSet.iterator(); hi.hasNext();) { - if (candidateNameSet.contains(hi.next())) { + for (String s : entityNameSet) { + if (candidateNameSet.contains(s)) { return true; } } @@ -124,10 +128,9 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode } private boolean hasSuperClass(List<Context> entityContexts, List<Context> candidateContexts) { - for (Iterator<Context> ei = entityContexts.iterator(); ei.hasNext();) { - Context ec = ei.next(); - for (Iterator<Context> cei = candidateContexts.iterator(); cei.hasNext();) { - if (inSuperClass(ec, cei.next())) { + for (Context ec : entityContexts) { + for (Context candidateContext : candidateContexts) { + if (inSuperClass(ec, candidateContext)) { return true; } } @@ -149,48 +152,39 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode * with entity indicated by the specified key. */ @SuppressWarnings("unchecked") - private Set<Context> constructExclusionSet(Integer entityKey, HashList entities, Map<Integer, + private Set<Context> constructExclusionSet(Integer entityKey, Map<Integer, Context> entities, Map<Integer, Set<String>> headSets, Map<Integer, Set<String>> nameSets, List<Context> singletons) { - Set<Context> exclusionSet = new HashSet<Context>(); + Set<Context> exclusionSet = new HashSet<>(); Set<String> entityHeadSet = headSets.get(entityKey); Set<String> entityNameSet = nameSets.get(entityKey); List<Context> entityContexts = (List<Context>) entities.get(entityKey); //entities - for (Iterator<Integer> ei = entities.keySet().iterator(); ei.hasNext();) { - Integer key = ei.next(); + for (Integer key : entities.keySet()) { List<Context> candidateContexts = (List<Context>) entities.get(key); if (key.equals(entityKey)) { exclusionSet.addAll(candidateContexts); - } - else if (nameSets.get(key).isEmpty()) { + } else if (nameSets.get(key).isEmpty()) { exclusionSet.addAll(candidateContexts); - } - else if (hasSameHead(entityHeadSet, headSets.get(key))) { + } else if (hasSameHead(entityHeadSet, headSets.get(key))) { exclusionSet.addAll(candidateContexts); - } - else if (hasSameNameType(entityNameSet, nameSets.get(key))) { + } else if (hasSameNameType(entityNameSet, nameSets.get(key))) { exclusionSet.addAll(candidateContexts); - } - else if (hasSuperClass(entityContexts, candidateContexts)) { + } else if (hasSuperClass(entityContexts, candidateContexts)) { exclusionSet.addAll(candidateContexts); } } //singles - List<Context> singles = new ArrayList<Context>(1); - for (Iterator<Context> si = singletons.iterator(); si.hasNext();) { - Context sc = si.next(); + List<Context> singles = new ArrayList<>(1); + for (Context sc : singletons) { singles.clear(); singles.add(sc); if (entityHeadSet.contains(sc.getHeadTokenText().toLowerCase())) { exclusionSet.add(sc); - } - else if (sc.getNameType() == null) { + } else if (sc.getNameType() == null) { exclusionSet.add(sc); - } - else if (entityNameSet.contains(sc.getNameType())) { + } else if (entityNameSet.contains(sc.getNameType())) { exclusionSet.add(sc); - } - else if (hasSuperClass(entityContexts, singles)) { + } else if (hasSuperClass(entityContexts, singles)) { exclusionSet.add(sc); } } @@ -206,10 +200,9 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode * generated from the mentions associated with that key. */ @SuppressWarnings("unchecked") - private Map<Integer, Set<String>> constructHeadSets(HashList entities) { - Map<Integer, Set<String>> headSets = new HashMap<Integer, Set<String>>(); - for (Iterator<Integer> ei = entities.keySet().iterator(); ei.hasNext();) { - Integer key = ei.next(); + private Map<Integer, Set<String>> constructHeadSets(Map<Integer, Context> entities) { + Map<Integer, Set<String>> headSets = new HashMap<>(); + for (Integer key : entities.keySet()) { List<Context> entityContexts = (List<Context>) entities.get(key); headSets.put(key, constructHeadSet(entityContexts)); } @@ -221,12 +214,11 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode * * @param mentions A list of mentions. * - * @return A set set of name types assigned to the specified mentions. + * @return A set of name types assigned to the specified mentions. */ private Set<String> constructNameSet(List<Context> mentions) { - Set<String> nameSet = new HashSet<String>(); - for (Iterator<Context> ei = mentions.iterator(); ei.hasNext();) { - Context ec = ei.next(); + Set<String> nameSet = new HashSet<>(); + for (Context ec : mentions) { if (ec.getNameType() != null) { nameSet.add(ec.getNameType()); } @@ -243,10 +235,9 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode * with the each mention of that entity. */ @SuppressWarnings("unchecked") - private Map<Integer, Set<String>> constructNameSets(HashList entities) { - Map<Integer, Set<String>> nameSets = new HashMap<Integer, Set<String>>(); - for (Iterator<Integer> ei = entities.keySet().iterator(); ei.hasNext();) { - Integer key = ei.next(); + private Map<Integer, Set<String>> constructNameSets(Map<Integer, Context> entities) { + Map<Integer, Set<String>> nameSets = new HashMap<>(); + for (Integer key : entities.keySet()) { List<Context> entityContexts = (List<Context>) entities.get(key); nameSets.put(key, constructNameSet(entityContexts)); } @@ -259,8 +250,7 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode } else { int numCommonSynsets = 0; - for (Iterator<String> si = ec.getSynsets().iterator(); si.hasNext();) { - String synset = si.next(); + for (String synset : ec.getSynsets()) { if (cec.getSynsets().contains(synset)) { numCommonSynsets++; } @@ -283,20 +273,19 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode } */ + @Override @SuppressWarnings("unchecked") public void setExtents(Context[] extentContexts) { - HashList entities = new HashList(); - /** Extents which are not in a coreference chain. */ - List<Context> singletons = new ArrayList<Context>(); - List<Context> allExtents = new ArrayList<Context>(); + Map<Integer, Context> entities = new HashMap<>(); + /* Extents which are not in a coreference chain. */ + List<Context> singletons = new ArrayList<>(); + List<Context> allExtents = new ArrayList<>(); //populate data structures - for (int ei = 0, el = extentContexts.length; ei < el; ei++) { - Context ec = extentContexts[ei]; + for (Context ec : extentContexts) { //System.err.println("SimilarityModel: setExtents: ec("+ec.getId()+") "+ec.getNameType()+" "+ec); if (ec.getId() == -1) { singletons.add(ec); - } - else { + } else { entities.put(ec.getId(), ec); } allExtents.add(ec); @@ -306,8 +295,7 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode Map<Integer, Set<String>> headSets = constructHeadSets(entities); Map<Integer, Set<String>> nameSets = constructNameSets(entities); - for (Iterator<Integer> ei = entities.keySet().iterator(); ei.hasNext();) { - Integer key = ei.next(); + for (Integer key : entities.keySet()) { Set<String> entityNameSet = nameSets.get(key); if (entityNameSet.isEmpty()) { continue; @@ -333,7 +321,7 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode axi = (axi + 1) % allExtents.size(); if (!exclusionSet.contains(sec1)) { if (debugOn) System.err.println(ec1.toString() + " " + entityNameSet + " " - + sec1.toString() + " " + nameSets.get(sec1.getId())); + + sec1.toString() + " " + nameSets.get(sec1.getId())); addEvent(false, ec1, sec1); break; } @@ -354,6 +342,7 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode * @return a number between 0 and 1 which represents the models belief that the specified * mentions are compatible. */ + @Override public double compatible(Context mention1, Context mention2) { List<String> feats = getFeatures(mention1, mention2); if (debugOn) System.err.println("SimilarityModel.compatible: feats=" + feats); @@ -364,18 +353,22 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode * Train a model based on the previously supplied evidence. * @see #setExtents(Context[]) */ + @Override public void trainModel() throws IOException { if (debugOn) { FileWriter writer = new FileWriter(modelName + ".events"); - for (Iterator<Event> ei = events.iterator();ei.hasNext();) { - Event e = ei.next(); + for (Event e : events) { writer.write(e.toString() + "\n"); } writer.close(); } - new SuffixSensitiveGISModelWriter(GIS.trainModel( - ObjectStreamUtils.createObjectStream(events),100,10), - new File(modelName + modelExtension)).persist(); + TrainingParameters params = TrainingParameters.defaultParams(); + params.put(TrainingParameters.ITERATIONS_PARAM, 100); + params.put(TrainingParameters.CUTOFF_PARAM, 10); + GISTrainer trainer = new GISTrainer(); + trainer.init(params, null); + GISModel trainedModel = trainer.trainModel(ObjectStreamUtils.createObjectStream(events)); + new BinaryGISModelWriter(trainedModel, new File(modelName + modelExtension)).persist(); } private boolean isName(Context np) { @@ -399,8 +392,8 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode List<String> features = new ArrayList<>(2 + synsets.size()); features.add("nn=" + name.getNameType() + "," + common.getNameType()); features.add("nw=" + name.getNameType() + "," + common.getHeadTokenText().toLowerCase()); - for (Iterator<String> si = synsets.iterator(); si.hasNext();) { - features.add("ns=" + name.getNameType() + "," + si.next()); + for (String synset : synsets) { + features.add("ns=" + name.getNameType() + "," + synset); } if (name.getNameType() == null) { //features.addAll(getCommonCommonFeatures(name,common)); @@ -409,14 +402,14 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode } private List<String> getNameNumberFeatures(Context name, Context number) { - List<String> features = new ArrayList<String>(2); + List<String> features = new ArrayList<>(2); features.add("nt=" + name.getNameType() + "," + number.getHeadTokenTag()); features.add("nn=" + name.getNameType() + "," + number.getNameType()); return features; } private List<String> getNamePronounFeatures(Context name, Context pronoun) { - List<String> features = new ArrayList<String>(2); + List<String> features = new ArrayList<>(2); features.add("nw=" + name.getNameType() + "," + pronoun.getHeadTokenText().toLowerCase()); features.add("ng=" + name.getNameType() + "," + ResolverUtils.getPronounGender( pronoun.getHeadTokenText().toLowerCase())); @@ -424,13 +417,12 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode } private List<String> getCommonPronounFeatures(Context common, Context pronoun) { - List<String> features = new ArrayList<String>(); + List<String> features = new ArrayList<>(); Set<String> synsets1 = common.getSynsets(); String p = pronoun.getHeadTokenText().toLowerCase(); String gen = ResolverUtils.getPronounGender(p); features.add("wn=" + p + "," + common.getNameType()); - for (Iterator<String> si = synsets1.iterator(); si.hasNext();) { - String synset = si.next(); + for (String synset : synsets1) { features.add("ws=" + p + "," + synset); features.add("gs=" + gen + "," + synset); } @@ -438,10 +430,9 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode } private List<String> getCommonNumberFeatures(Context common, Context number) { - List<String> features = new ArrayList<String>(); + List<String> features = new ArrayList<>(); Set<String> synsets1 = common.getSynsets(); - for (Iterator<String> si = synsets1.iterator(); si.hasNext();) { - String synset = si.next(); + for (String synset : synsets1) { features.add("ts=" + number.getHeadTokenTag() + "," + synset); features.add("ns=" + number.getNameType() + "," + synset); } @@ -450,7 +441,7 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode } private List<String> getNumberPronounFeatures(Context number, Context pronoun) { - List<String> features = new ArrayList<String>(); + List<String> features = new ArrayList<>(); String p = pronoun.getHeadTokenText().toLowerCase(); String gen = ResolverUtils.getPronounGender(p); features.add("wt=" + p + "," + number.getHeadTokenTag()); @@ -461,7 +452,7 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode } private List<String> getNameNameFeatures(Context name1, Context name2) { - List<String> features = new ArrayList<String>(1); + List<String> features = new ArrayList<>(1); if (name1.getNameType() == null && name2.getNameType() == null) { features.add("nn=" + name1.getNameType() + "," + name2.getNameType()); //features.addAll(getCommonCommonFeatures(name1,name2)); @@ -489,7 +480,7 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode } private List<String> getCommonCommonFeatures(Context common1, Context common2) { - List<String> features = new ArrayList<String>(); + List<String> features = new ArrayList<>(); Set<String> synsets1 = common1.getSynsets(); Set<String> synsets2 = common2.getSynsets(); @@ -502,8 +493,7 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode return features; } int numCommonSynsets = 0; - for (Iterator<String> si = synsets1.iterator(); si.hasNext();) { - String synset = si.next(); + for (String synset : synsets1) { if (synsets2.contains(synset)) { features.add("ss=" + synset); numCommonSynsets++; @@ -527,7 +517,7 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode } private List<String> getPronounPronounFeatures(Context pronoun1, Context pronoun2) { - List<String> features = new ArrayList<String>(); + List<String> features = new ArrayList<>(); String g1 = ResolverUtils.getPronounGender(pronoun1.getHeadTokenText()); String g2 = ResolverUtils.getPronounGender(pronoun2.getHeadTokenText()); if (g1.equals(g2)) { @@ -540,7 +530,7 @@ public class SimilarityModel implements TestSimilarityModel, TrainSimilarityMode } private List<String> getFeatures(Context np1, Context np2) { - List<String> features = new ArrayList<String>(); + List<String> features = new ArrayList<>(); features.add("default"); // semantic categories String w1 = np1.getHeadTokenText().toLowerCase(); diff --git a/opennlp-coref/src/main/java/opennlp/tools/formats/CorefSampleStreamFactory.java b/opennlp-coref/src/main/java/opennlp/tools/formats/CorefSampleStreamFactory.java index 2dbbf74..9d4895d 100644 --- a/opennlp-coref/src/main/java/opennlp/tools/formats/CorefSampleStreamFactory.java +++ b/opennlp-coref/src/main/java/opennlp/tools/formats/CorefSampleStreamFactory.java @@ -18,6 +18,7 @@ package opennlp.tools.formats; import java.io.FileInputStream; +import java.io.IOException; import opennlp.tools.cmdline.ArgumentParser; import opennlp.tools.cmdline.CmdLineUtil; @@ -25,6 +26,7 @@ import opennlp.tools.cmdline.StreamFactoryRegistry; import opennlp.tools.cmdline.params.BasicFormatParams; import opennlp.tools.coref.CorefSample; import opennlp.tools.coref.CorefSampleDataStream; +import opennlp.tools.util.MarkableFileInputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.ParagraphStream; import opennlp.tools.util.PlainTextByLineStream; @@ -42,16 +44,19 @@ public class CorefSampleStreamFactory extends AbstractSampleStreamFactory<CorefS StreamFactoryRegistry.registerFactory(CorefSample.class, StreamFactoryRegistry.DEFAULT_FORMAT, new CorefSampleStreamFactory()); } - + + @Override public ObjectStream<CorefSample> create(String[] args) { Parameters params = ArgumentParser.parse(args, Parameters.class); CmdLineUtil.checkInputFile("Data", params.getData()); - FileInputStream sampleDataIn = CmdLineUtil.openInFile(params.getData()); - - ObjectStream<String> lineStream = new ParagraphStream(new PlainTextByLineStream(sampleDataIn - .getChannel(), params.getEncoding())); - - return new CorefSampleDataStream(lineStream); + try { + MarkableFileInputStreamFactory factory = new MarkableFileInputStreamFactory(params.getData()); + ObjectStream<String> lineStream = new ParagraphStream(new PlainTextByLineStream( + factory, params.getEncoding())); + return new CorefSampleDataStream(lineStream); + } catch (IOException e) { + throw new RuntimeException("Error loading input data from parameters!", e); + } } }
