Revision: 19514 http://sourceforge.net/p/gate/code/19514 Author: markagreenwood Date: 2016-08-19 10:17:23 +0000 (Fri, 19 Aug 2016) Log Message: ----------- some cleaning up and fixing bugs highlighted by findbugs
Modified Paths: -------------- gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Chunker.java gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/ChunkingApp.java gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/GATEWrapper.java gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Rule.java Modified: gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Chunker.java =================================================================== --- gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Chunker.java 2016-08-19 01:22:42 UTC (rev 19513) +++ gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Chunker.java 2016-08-19 10:17:23 UTC (rev 19514) @@ -1,5 +1,5 @@ /************************************************************************ - * Copyright (C) 2004-2009 The University of Sheffield * + * Copyright (C) 2004-2016 The University of Sheffield * * Developed by Mark Greenwood <m.greenw...@dcs.shef.ac.uk> * * * * This program is free software; you can redistribute it and/or modify * @@ -22,203 +22,105 @@ import gate.util.BomStrippingInputStreamReader; import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; import java.io.IOException; -import java.io.InputStreamReader; +import java.io.Serializable; import java.net.URL; import java.util.ArrayList; -import java.util.HashMap; import java.util.Iterator; import java.util.List; -import java.util.Map; -public class Chunker -{ - private List<Rule> rules = new ArrayList<Rule>(); +public class Chunker implements Serializable { - public static void main(String args[]) throws Exception - { - Chunker c = new Chunker((new File(args[0])).toURI().toURL()); + private static final long serialVersionUID = 9147365383638459068L; - BufferedReader in = new BufferedReader(new FileReader(args[1])); + private List<Rule> rules = new ArrayList<Rule>(); - String line = in.readLine(); - - Map<String,String> chunkTags = new HashMap<String,String>(); - - while (line != null) - { - if (!line.trim().equals("")) - { - String[] tags = line.split(" "); - chunkTags.put(tags[0],tags[1]); - } - - line = in.readLine(); - } - - in.close(); - - in = new BomStrippingInputStreamReader(System.in); - - line = in.readLine(); - - while (line != null) - { - String[] tokens = line.split(" "); - - List<String> wl = new ArrayList<String>(); - List<String> tl = new ArrayList<String>(); - List<String> pl = new ArrayList<String>(); - - for (int i = 0 ; i < tokens.length ; ++i) - { - String[] data = tokens[i].split("/"); - - wl.add(data[0]); - pl.add(data[1]); - - String ct = chunkTags.get(data[1]); - - if (ct == null) ct = "I"; - - tl.add(ct); - } - - tl = c.chunkSentence(wl,tl,pl); - - boolean inBaseNP = false; - boolean lineBegin = true; - - for (int i = 0 ; i < wl.size() ; ++i) - { - String ct = tl.get(i); - - if (inBaseNP) - { - if (ct.equals("B")) - { - System.out.print(" ] ["); - } - else if (ct.equals("O")) - { - System.out.print(" ]"); - inBaseNP = false; - } - } - else - { - if (ct.equals("B") || ct.equals("I")) - { - if (!lineBegin) System.out.print(" "); - lineBegin = false; - System.out.print("["); - inBaseNP = true; - } - } - if (!lineBegin) System.out.print(" "); - lineBegin = false; - System.out.print(wl.get(i) + "/" + pl.get(i)); - } - - if (inBaseNP) - { - System.out.print("]"); - } - - System.out.println(); - - line = in.readLine(); - } - } - /** * The only constructor that reads the rules from a URL. - * @param u the URL of the rules file. + * + * @param u + * the URL of the rules file. **/ - public Chunker(URL u) throws IOException - { - //Open up the rules file read for reading - BufferedReader in = new BomStrippingInputStreamReader(u.openStream()); + public Chunker(URL u) throws IOException { + // Open up the rules file read for reading + try (BufferedReader in = new BomStrippingInputStreamReader( + u.openStream())) { - //read in the first rule from the file - String rule = in.readLine(); + // read in the first rule from the file + String rule = in.readLine(); - while (rule != null) - { - //while there are still rules to process... + while (rule != null) { + // while there are still rules to process... - if (!rule.trim().equals("")) - { - //create and add a rule to the list of rules - rules.add(new Rule(rule)); + if (!rule.trim().equals("")) { + // create and add a rule to the list of rules + rules.add(new Rule(rule)); + } + + // read in the next rule; + rule = in.readLine(); } - - //read in the next rule; - rule = in.readLine(); } } /** - * This is the method which does all the work and returns - * an updated set of chunk tags. - * @param words an ordered List of the words within the sentence. - * @param tags an ordered List of the chunk tags within the sentence. - * @param pos an ordered List of the POS tags within the sentence. + * This is the method which does all the work and returns an updated set of + * chunk tags. + * + * @param words + * an ordered List of the words within the sentence. + * @param tags + * an ordered List of the chunk tags within the sentence. + * @param pos + * an ordered List of the POS tags within the sentence. * @return an ordered List of the updated chunk tags for the sentence. **/ - public List<String> chunkSentence(List<String> words, List<String> tags, List<String> pos) - { - //add the word/pos/tag that represents the end of - //the sentence, cos some of the rules match against - //the end of the sentence + public List<String> chunkSentence(List<String> words, List<String> tags, + List<String> pos) { + // add the word/pos/tag that represents the end of + // the sentence, cos some of the rules match against + // the end of the sentence words.add("ZZZ"); pos.add("ZZZ"); tags.add("Z"); - //Get an iterator over the rules and loop - //through them... + // Get an iterator over the rules and loop + // through them... Iterator<Rule> it = rules.iterator(); - while (it.hasNext()) - { - //create an empty list to hold the new - //chunk tags for this iterations + while (it.hasNext()) { + // create an empty list to hold the new + // chunk tags for this iterations List<String> newTags = new ArrayList<String>(); - //get the next rule we are going to apply + // get the next rule we are going to apply Rule r = it.next(); - //loop over all the words in the sentence - for (int i = 0 ; i < words.size() ; ++i) - { - if (r.match(i,words,tags,pos)) - { - //if the rule matches against the current - //word in the sentence then and the new tag - //from the rule to the new tag list + // loop over all the words in the sentence + for (int i = 0; i < words.size(); ++i) { + if (r.match(i, words, tags, pos)) { + // if the rule matches against the current + // word in the sentence then and the new tag + // from the rule to the new tag list newTags.add(r.getNewTag()); - } - else - { - //the rule didn't match so simply copy the - //chunk tag that was already assigned + } else { + // the rule didn't match so simply copy the + // chunk tag that was already assigned newTags.add(tags.get(i)); } } - //now replace the old tags with the new ones ready - //for running the next rule, this stops rule-chaining + // now replace the old tags with the new ones ready + // for running the next rule, this stops rule-chaining tags = newTags; } - //remove the last token from each list as these - //are not part of the original input sentence - words.remove(words.size()-1); - pos.remove(pos.size()-1); - tags.remove(tags.size()-1); + // remove the last token from each list as these + // are not part of the original input sentence + words.remove(words.size() - 1); + pos.remove(pos.size() - 1); + tags.remove(tags.size() - 1); - //return the final updated chunk tag lists + // return the final updated chunk tag lists return tags; } } \ No newline at end of file Modified: gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/ChunkingApp.java =================================================================== --- gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/ChunkingApp.java 2016-08-19 01:22:42 UTC (rev 19513) +++ gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/ChunkingApp.java 2016-08-19 10:17:23 UTC (rev 19514) @@ -1,3 +1,22 @@ +/************************************************************************ + * Copyright (C) 2004-2016 The University of Sheffield * + * Developed by Mark Greenwood <m.greenw...@dcs.shef.ac.uk> * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU Lesser General Public License as * + * published by the Free Software Foundation; either version 2.1 of the * + * License, or (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public * + * License along with this program; if not, write to the Free Software * + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * + ************************************************************************/ + package mark.chunking; import gate.creole.PackagedController; Modified: gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/GATEWrapper.java =================================================================== --- gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/GATEWrapper.java 2016-08-19 01:22:42 UTC (rev 19513) +++ gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/GATEWrapper.java 2016-08-19 10:17:23 UTC (rev 19514) @@ -1,5 +1,5 @@ /************************************************************************ - * Copyright (C) 2004-2009 The University of Sheffield * + * Copyright (C) 2004-2016 The University of Sheffield * * Developed by Mark Greenwood <m.greenw...@dcs.shef.ac.uk> * * * * This program is free software; you can redistribute it and/or modify * @@ -23,7 +23,6 @@ import gate.AnnotationSet; import gate.Factory; import gate.FeatureMap; -import gate.ProcessingResource; import gate.Resource; import gate.creole.AbstractLanguageAnalyser; import gate.creole.ExecutionException; @@ -37,7 +36,7 @@ import gate.util.OffsetComparator; import java.io.BufferedReader; -import java.io.Serializable; +import java.io.IOException; import java.net.URL; import java.text.NumberFormat; import java.util.ArrayList; @@ -48,343 +47,340 @@ import java.util.Map; @CreoleResource(name = "Noun Phrase Chunker", comment = "Implementation of the Ramshaw and Marcus base noun phrase chunker", helpURL = "http://gate.ac.uk/userguide/sec:parsers:npchunker", icon = "NpChunker") -public class GATEWrapper extends AbstractLanguageAnalyser implements - ProcessingResource, - Serializable { - +public class GATEWrapper extends AbstractLanguageAnalyser { + private static final long serialVersionUID = -801244032207014722L; - private Chunker c = null; + private Chunker c = null; - private Map<String,String> chunkTags = null; + private Map<String, String> chunkTags = null; - private OffsetComparator offsetComparator = new OffsetComparator(); + private URL posTagURL; - private URL posTagURL; + @CreoleParameter(defaultValue = "resources/pos_tag_dict", comment = "The URL of the pos_tag_dict file.") + public void setPosTagURL(URL posTagURL) { + this.posTagURL = posTagURL; + } - @CreoleParameter(defaultValue="resources/pos_tag_dict",comment="The URL of the pos_tag_dict file.") - public void setPosTagURL(URL posTagURL) { - this.posTagURL = posTagURL; - } + public URL getPosTagURL() { + return posTagURL; + } - public URL getPosTagURL() { - return posTagURL; - } + private URL rulesURL; - private URL rulesURL; + @CreoleParameter(defaultValue = "resources/rules", comment = "The URL of the rules file.") + public void setRulesURL(URL rulesURL) { + this.rulesURL = rulesURL; + } - @CreoleParameter(defaultValue="resources/rules",comment="The URL of the rules file.") - public void setRulesURL(URL rulesURL) { - this.rulesURL = rulesURL; - } + public URL getRulesURL() { + return rulesURL; + } - public URL getRulesURL() { - return rulesURL; - } + private String posFeature; - private String posFeature; + @RunTime + @CreoleParameter(defaultValue = "category", comment = "The name of the feature which holds the POS tag.") + public void setPosFeature(String posFeature) { + this.posFeature = posFeature; + } - @RunTime - @CreoleParameter(defaultValue="category",comment="The name of the feature which holds the POS tag.") - public void setPosFeature(String posFeature) { - this.posFeature = posFeature; - } + public String getPosFeature() { + return posFeature; + } - public String getPosFeature() { - return posFeature; - } + private String unknownTag; - private String unknownTag; + @RunTime + @CreoleParameter(defaultValue = "I", comment = "The chunk tag to use for an unknown POS tag.") + public void setUnknownTag(String unknownTag) { + this.unknownTag = unknownTag; + } - @RunTime - @CreoleParameter(defaultValue="I",comment="The chunk tag to use for an unknown POS tag.") - public void setUnknownTag(String unknownTag) { - this.unknownTag = unknownTag; - } + public String getUnknownTag() { + return unknownTag; + } - public String getUnknownTag() { - return unknownTag; - } + private String inputASName; - private String inputASName; + @Optional + @RunTime + @CreoleParameter(comment = "The name of the annotation set used for input.") + public void setInputASName(String inputASName) { + this.inputASName = inputASName; + } - @Optional - @RunTime - @CreoleParameter(comment="The name of the annotation set used for input.") - public void setInputASName(String inputASName) { - this.inputASName = inputASName; - } + public String getInputASName() { + return inputASName; + } - public String getInputASName() { - return inputASName; - } + private String outputASName; - private String outputASName; + @Optional + @RunTime + @CreoleParameter(comment = "The name of the annotation set used for output.") + public void setOutputASName(String outputASName) { + this.outputASName = outputASName; + } - @Optional - @RunTime - @CreoleParameter(comment="The name of the annotation set used for output.") - public void setOutputASName(String outputASName) { - this.outputASName = outputASName; - } + public String getOutputASName() { + return outputASName; + } - public String getOutputASName() { - return outputASName; - } + private String annotationName; - private String annotationName; + @RunTime + @CreoleParameter(defaultValue = "NounChunk", comment = "The name of the annotations added to mark noun chunks.") + public void setAnnotationName(String annotationName) { + this.annotationName = annotationName; + } - @RunTime - @CreoleParameter(defaultValue="NounChunk",comment="The name of the annotations added to mark noun chunks.") - public void setAnnotationName(String annotationName) { - this.annotationName = annotationName; - } + public String getAnnotationName() { + return annotationName; + } - public String getAnnotationName() { - return annotationName; - } + public Resource init() throws ResourceInstantiationException { + if (rulesURL == null) { + throw new ResourceInstantiationException( + "Rules URL must be specified"); + } - public Resource init() throws ResourceInstantiationException { - if(rulesURL == null) { - throw new ResourceInstantiationException("Rules URL must be specified"); - } + if (posTagURL == null) { + throw new ResourceInstantiationException( + "POS tag dictionary URL must be specified"); + } - if(posTagURL == null) { - throw new ResourceInstantiationException( - "POS tag dictionary URL must be specified"); - } + try (BufferedReader in = new BomStrippingInputStreamReader( + posTagURL.openStream())) { + // lets create a new Chunker using the URL provided (which we know + // is not null as we already checked it). + c = new Chunker(rulesURL); - try { - // lets create a new Chunker using the URL provided (which we know - // is not null as we already checked it). - c = new Chunker(rulesURL); + // read in the first line of the file + String line = in.readLine(); - // Open a reader over the pos_tag_dict file so we can load - // the database - BufferedReader in = new BomStrippingInputStreamReader(posTagURL - .openStream()); + // create a new empty map to hold the pos and chunk tags + chunkTags = new HashMap<String, String>(); - // read in the first line of the file - String line = in.readLine(); + while (line != null) { + // while there is still data in the file... - // create a new empty map to hold the pos and chunk tags - chunkTags = new HashMap<String,String>(); + // split the current line into two parts + String[] tags = line.split(" "); - while(line != null) { - // while there is still data in the file... + // put the data in the map, POS tags as key + // chunk tag as value + chunkTags.put(tags[0], tags[1]); - // split the current line into two parts - String[] tags = line.split(" "); + // get the next line from the data file + line = in.readLine(); + } - // put the data in the map, POS tags as key - // chunk tag as value - chunkTags.put(tags[0], tags[1]); + // close the data file now we have finished with it + in.close(); + } catch (IOException e) { + // if an error occurred then throw an exception so that the user + // knows + throw new ResourceInstantiationException( + "Unable to correctly init the chunker: " + e.getMessage()); + } - // get the next line from the data file - line = in.readLine(); - } + // if we get to here then everything has initialised correctly + // so return this instance + return this; + } - // close the data file now we have finished with it - in.close(); - } - catch(Exception e) { - // if an error occurred then throw an exception so that the user - // knows - throw new ResourceInstantiationException( - "Unable to correctly init the chunker: " + e.getMessage()); - } + public void execute() throws ExecutionException { + // lets get the AnnotationSet we are using as input. Get either the + // set the user has asked for or if they haven't specified use the + // default set + if (inputASName != null && inputASName.equals("")) + inputASName = null; + AnnotationSet inputAS = (inputASName == null) ? document + .getAnnotations() : document.getAnnotations(inputASName); - // if we get to here then everything has initialised correctly - // so return this instance - return this; - } + // lets get the AnnotationSet we are using as output. Get either the + // set the user has asked for or if they haven't specified use the + // default set + if (outputASName != null && outputASName.equals("")) + outputASName = null; + AnnotationSet outputAS = (outputASName == null) ? document + .getAnnotations() : document.getAnnotations(outputASName); - public void execute() throws ExecutionException { - // lets get the AnnotationSet we are using as input. Get either the - // set the user has asked for or if they haven't specified use the - // default set - if(inputASName != null && inputASName.equals("")) inputASName = null; - AnnotationSet inputAS = (inputASName == null) - ? document.getAnnotations() - : document.getAnnotations(inputASName); + // Get the set of sentences contained within the current document + AnnotationSet sentences = inputAS.get(SENTENCE_ANNOTATION_TYPE); - // lets get the AnnotationSet we are using as output. Get either the - // set the user has asked for or if they haven't specified use the - // default set - if(outputASName != null && outputASName.equals("")) outputASName = null; - AnnotationSet outputAS = (outputASName == null) - ? document.getAnnotations() - : document.getAnnotations(outputASName); + // All annotations of type tokens + AnnotationSet tokenas = inputAS.get(TOKEN_ANNOTATION_TYPE); - // Get the set of sentences contained within the current document - AnnotationSet sentences = inputAS.get(SENTENCE_ANNOTATION_TYPE); + if (sentences != null && sentences.size() > 0) { + // assuming there are sentences... - // All annotations of type tokens - AnnotationSet tokenas = inputAS.get(TOKEN_ANNOTATION_TYPE); + // get the current time to use as part of the progress feedback + long startTime = System.currentTimeMillis(); - if(sentences != null && sentences.size() > 0) { - // assuming there are sentences... + // tell the user we are just starting to chunk the document + fireStatusChanged("Chunking " + document.getName()); + fireProgressChanged(0); - // get the current time to use as part of the progress feedback - long startTime = System.currentTimeMillis(); + // we are just starting so we haven't processed a document yet + // so remember this ready for the progress feedback + int i = 0; - // tell the user we are just starting to chunk the document - fireStatusChanged("Chunking " + document.getName()); - fireProgressChanged(0); + // Loop through all the sentences + Iterator<Annotation> sit = sentences.iterator(); + while (sit.hasNext()) { + // get the current sentence to process + Annotation sentence = sit.next(); - // we are just starting so we haven't processed a document yet - // so remember this ready for the progress feedback - int i = 0; + // Get a sorted list of the tokens within the current sentence + List<Annotation> tokens = new ArrayList<Annotation>(); + tokens.addAll(tokenas.getContained(sentence.getStartNode() + .getOffset(), sentence.getEndNode().getOffset())); + Collections.sort(tokens, new OffsetComparator()); - // Loop through all the sentences - Iterator<Annotation> sit = sentences.iterator(); - while(sit.hasNext()) { - // get the current sentence to process - Annotation sentence = sit.next(); + // Create three empty lists to hold the words, pos and chunk + // tags of the tokens in the current sentence + List<String> wl = new ArrayList<String>(); + List<String> tl = new ArrayList<String>(); + List<String> pl = new ArrayList<String>(); - // Get a sorted list of the tokens within the current sentence - List<Annotation> tokens = new ArrayList<Annotation>(); - tokens.addAll(tokenas.getContained(sentence.getStartNode().getOffset(), - sentence.getEndNode().getOffset())); - Collections.sort(tokens, offsetComparator); + // Loop through all the tokens in the current sentence + Iterator<Annotation> tit = tokens.iterator(); + while (tit.hasNext()) { + // get the current token to process + Annotation token = tit.next(); - // Create three empty lists to hold the words, pos and chunk - // tags of the tokens in the current sentence - List<String> wl = new ArrayList<String>(); - List<String> tl = new ArrayList<String>(); - List<String> pl = new ArrayList<String>(); + // add the string spanned by the current token to the list + // of + // words + wl.add((String) token.getFeatures().get("string")); - // Loop through all the tokens in the current sentence - Iterator<Annotation> tit = tokens.iterator(); - while(tit.hasNext()) { - // get the current token to process - Annotation token = tit.next(); + // get the POS tag for the current token + String pos = (String) token.getFeatures().get(posFeature); - // add the string spanned by the current token to the list of - // words - wl.add((String)token.getFeatures().get("string")); + // add the POS tag to the list of POS tags + pl.add(pos); - // get the POS tag for the current token - String pos = (String)token.getFeatures().get(posFeature); + // get the initial chunk tag for this POS tag + String chunkTag = chunkTags.get(pos); - // add the POS tag to the list of POS tags - pl.add(pos); + // if the chunk tag is null then use the unknown chunk tag + if (chunkTag == null) + chunkTag = unknownTag; - // get the initial chunk tag for this POS tag - String chunkTag = chunkTags.get(pos); + // now add the chunk tag to the list of chunk tags + tl.add(chunkTag); + } - // if the chunk tag is null then use the unknown chunk tag - if(chunkTag == null) chunkTag = unknownTag; + // run the chunker over the current sentence and get back + // an updated list of chunk tags + tl = c.chunkSentence(wl, tl, pl); - // now add the chunk tag to the list of chunk tags - tl.add(chunkTag); - } + // a variable to hold the index of the token which + // starts the current noun chunk + int start = 0; - // run the chunker over the current sentence and get back - // an updated list of chunk tags - tl = c.chunkSentence(wl, tl, pl); + // a flag so we know if we are in an NP or not + boolean inBaseNP = false; - // a variable to hold the index of the token which - // starts the current noun chunk - int start = 0; + // Loop through all the chunk tags in the current sentence + // so we can find the noun chunks + for (int tIndex = 0; tIndex < tl.size(); ++tIndex) { + // get the current chunk tag + String ct = tl.get(tIndex); - // a flag so we know if we are in an NP or not - boolean inBaseNP = false; + if (inBaseNP) { + // if we are currently inside a noun chunk then... - // Loop through all the chunk tags in the current sentence - // so we can find the noun chunks - for(int tIndex = 0; tIndex < tl.size(); ++tIndex) { - // get the current chunk tag - String ct = tl.get(tIndex); + if (ct.equals("B")) { + // if the chunk tag is "B" then we are about to + // start a + // new chunk so record the one that has just + // finished + addAnnotation(outputAS, tokens, start, tIndex - 1); - if(inBaseNP) { - // if we are currently inside a noun chunk then... + // now reset the beginning of the chunk to the + // current + // token + start = tIndex; + } else if (ct.equals("O")) { + // if the chunk tag is "O" then we have dropped out + // the end of a chunk so add the chunk we just + // finished + addAnnotation(outputAS, tokens, start, tIndex - 1); - if(ct.equals("B")) { - // if the chunk tag is "B" then we are about to start a - // new chunk so record the one that has just finished - addAnnotation(outputAS, tokens, start, tIndex - 1); + // now flag that we are outside of any chunk + inBaseNP = false; + } + } else { + // we aren't currently in a noun chunk so... - // now reset the beginning of the chunk to the current - // token - start = tIndex; - } - else if(ct.equals("O")) { - // if the chunk tag is "O" then we have dropped out - // the end of a chunk so add the chunk we just finished - addAnnotation(outputAS, tokens, start, tIndex - 1); + if (ct.equals("B") || ct.equals("I")) { + // if the chunk tag is "B" or "I" then we have found + // the beginning of a chunk, so.... - // now flag that we are outside of any chunk - inBaseNP = false; - } - } - else { - // we aren't currently in a noun chunk so... + // record the start index + start = tIndex; - if(ct.equals("B") || ct.equals("I")) { - // if the chunk tag is "B" or "I" then we have found - // the beginning of a chunk, so.... + // and flag that we are now inside a chunk + inBaseNP = true; + } + } + } - // record the start index - start = tIndex; + if (inBaseNP) { + // if we got to the end of a sentence and we are still in a + // noun chunk then we need to close the end and add the + // annotation + addAnnotation(outputAS, tokens, start, tl.size() - 1); + } - // and flag that we are now inside a chunk - inBaseNP = true; - } - } - } + // update the progress stuff to show the precentage of sentences + // we have processed so far + fireProgressChanged(i++ * 100 / sentences.size()); + } - if(inBaseNP) { - // if we got to the end of a sentence and we are still in a - // noun chunk then we need to close the end and add the - // annotation - addAnnotation(outputAS, tokens, start, tl.size() - 1); - } + // we have finished! so update the progress and tell + // the user how long it took to chunk the document + fireProcessFinished(); + fireStatusChanged(document.getName() + + " chunked in " + + NumberFormat + .getInstance() + .format((double) (System.currentTimeMillis() - startTime) / 1000) + + " seconds!"); + } else { + // if there are no sentence annotations then throw an exception as + // theres + // not much we can do + throw new GateRuntimeException( + "No sentences to process! Please run a sentence splitter first!"); + } + } - // update the progress stuff to show the precentage of sentences - // we have processed so far - fireProgressChanged(i++ * 100 / sentences.size()); - } + private void addAnnotation(AnnotationSet outputAS, List<Annotation> tokens, + int start, int end) { + // Create a new FeatureMap to act as the features for the new + // annotation + // but we will leave it blank for now as we don't have anything to + // add + FeatureMap params = Factory.newFeatureMap(); - // we have finished! so update the progress and tell - // the user how long it took to chunk the document - fireProcessFinished(); - fireStatusChanged(document.getName() - + " chunked in " - + NumberFormat.getInstance().format( - (double)(System.currentTimeMillis() - startTime) / 1000) - + " seconds!"); - } - else { - // if there are no sentence annotations then throw an exception as - // theres - // not much we can do - throw new GateRuntimeException( - "No sentences to process! Please run a sentence splitter first!"); - } - } + // Get the token annotation from the beginning of the chunk + Annotation aStart = tokens.get(start); - private void addAnnotation(AnnotationSet outputAS, List<Annotation> tokens, int start, - int end) { - // Create a new FeatureMap to act as the features for the new - // annotation - // but we will leave it blank for now as we don't have anything to - // add - FeatureMap params = Factory.newFeatureMap(); + // Get the token annotation from the end of the chunk + Annotation aEnd = tokens.get(end); - // Get the token annotation from the beginning of the chunk - Annotation aStart = tokens.get(start); + // This spots errors where the start is after the end. What + // we should do is figure out why this occurs in the first place + if (aStart.getStartNode().getOffset().longValue() >= aEnd.getEndNode() + .getOffset().longValue()) + return; - // Get the token annotation from the end of the chunk - Annotation aEnd = tokens.get(end); - - // This spots errors where the start is after the end. What - // we should do is figure out why this occurs in the first place - if(aStart.getStartNode().getOffset().longValue() >= aEnd.getEndNode() - .getOffset().longValue()) return; - - // add a new annotation to mark the noun chunk - outputAS.add(aStart.getStartNode(), aEnd.getEndNode(), annotationName, - params); - } + // add a new annotation to mark the noun chunk + outputAS.add(aStart.getStartNode(), aEnd.getEndNode(), annotationName, + params); + } } Modified: gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Rule.java =================================================================== --- gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Rule.java 2016-08-19 01:22:42 UTC (rev 19513) +++ gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Rule.java 2016-08-19 10:17:23 UTC (rev 19514) @@ -1,5 +1,5 @@ /************************************************************************ - * Copyright (C) 2004-2009 The University of Sheffield * + * Copyright (C) 2004-2016 The University of Sheffield * * Developed by Mark Greenwood <m.greenw...@dcs.shef.ac.uk> * * * * This program is free software; you can redistribute it and/or modify * @@ -19,18 +19,19 @@ package mark.chunking; +import java.io.Serializable; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** - * This class encapulates chunking rules, providing methods to - * see if it matches against text and to see if it fits within - * a sentence. + * This class encapsulates chunking rules, providing methods to see if it + * matches against text and to see if it fits within a sentence. **/ -public class Rule -{ +public class Rule implements Serializable { + private static final long serialVersionUID = 787395188113920930L; + /** * A Pattern which will split the rule name into type and offsets. **/ @@ -42,32 +43,28 @@ private static final Pattern po = Pattern.compile("_?[0-3]"); /** - * The new chunk tag that is assigned if this rule matches the - * input text. + * The new chunk tag that is assigned if this rule matches the input text. **/ private String outTag = null; /** - * The full line of the rules file which created this rule, - * used mainly for the toString() method. + * The full line of the rules file which created this rule, used mainly for + * the toString() method. **/ private String rule = null; /** - * A List to hold the types (T, W or P) - * of the parts of the rule. + * A List to hold the types (T, W or P) of the parts of the rule. **/ private List<String> types = new ArrayList<String>(); /** - * A List to hold the offsets for the - * parts of the rule. + * A List to hold the offsets for the parts of the rule. **/ private List<List<Integer>> offsets = new ArrayList<List<Integer>>(); /** - * A List to hold the values for the - * parts of the rule. + * A List to hold the values for the parts of the rule. **/ private List<String> values = new ArrayList<String>(); @@ -81,174 +78,180 @@ **/ private int end = Integer.MIN_VALUE; - public Rule(String rule) - { - //store a copy of the defining text + public Rule(String rule) { + // store a copy of the defining text this.rule = rule; - //split the rule into pieces at the spaces + // split the rule into pieces at the spaces String[] parts = rule.split(" "); - //store the last part of the rule as the out tag - outTag = parts[parts.length-1]; + // store the last part of the rule as the out tag + outTag = parts[parts.length - 1]; - //Use the Pattern to split the template type - //into the different token/offsets + // Use the Pattern to split the template type + // into the different token/offsets Matcher mt = pt.matcher(parts[0]); - //We know that the first value is at position 1 - //in the split array + // We know that the first value is at position 1 + // in the split array int index = 1; - while (mt.find()) - { - //while there are still parts to process, - //get the next one + while (mt.find()) { + // while there are still parts to process, + // get the next one String to = mt.group(); - //store the type of this part - types.add(to.substring(0,1)); + // store the type of this part + types.add(to.substring(0, 1)); - //create a new list to hold the offsets - //for this part + // create a new list to hold the offsets + // for this part List<Integer> ofs = new ArrayList<Integer>(); - //split the offsets into separate parts + // split the offsets into separate parts Matcher mo = po.matcher(to.substring(1)); - //store the value associated with this part of the rule + // store the value associated with this part of the rule values.add(parts[index++]); - while (mo.find()) - { - //while there are more offsets, + while (mo.find()) { + // while there are more offsets, - //get the next one and make an Integer from it - //(we have to replace '_' by '-' first for it to work) - Integer offset = new Integer(mo.group().replaceAll("_","-")); + // get the next one and make an Integer from it + // (we have to replace '_' by '-' first for it to work) + Integer offset = Integer.valueOf(mo.group().replaceAll("_", "-")); - //if the current offset is before the known beginning then - //make this the beginning - if (offset.intValue() < begin) begin = offset.intValue(); + // if the current offset is before the known beginning then + // make this the beginning + if (offset.intValue() < begin) + begin = offset.intValue(); - //if the current offset is after the known ending then - //make this offset the end - if (offset.intValue() > end) end = offset.intValue(); + // if the current offset is after the known ending then + // make this offset the end + if (offset.intValue() > end) + end = offset.intValue(); - //store the offset in the list + // store the offset in the list ofs.add(offset); } - //store the list of offsets for this part + // store the list of offsets for this part offsets.add(ofs); } } /** - * Simply returns true if this rule matches against the sentence at - * a given position. This method makes no alterations to the - * tags assigned to any specific offset. - * @param currentToken the index within the sentece of the token - * upon which the rule is centered. - * @param words an ordered List of the words within the sentence. - * @param tags an ordered List of the chunk tags within the sentence. - * @param pos an ordered List of the POS tags within the sentence. + * Simply returns true if this rule matches against the sentence at a given + * position. This method makes no alterations to the tags assigned to any + * specific offset. + * + * @param currentToken + * the index within the sentece of the token upon which the rule + * is centered. + * @param words + * an ordered List of the words within the sentence. + * @param tags + * an ordered List of the chunk tags within the sentence. + * @param pos + * an ordered List of the POS tags within the sentence. * @return true if the rule matches the input sentence, false otherwise. **/ - public boolean match(int currentToken, List<String> words, List<String> tags, List<String> pos) - { - //if the rule doesn't fit within the sentence then it can never - //match so simply return false - if (!withinSentence(words.size(), currentToken)) return false; + public boolean match(int currentToken, List<String> words, + List<String> tags, List<String> pos) { + // if the rule doesn't fit within the sentence then it can never + // match so simply return false + if (!withinSentence(words.size(), currentToken)) + return false; - //assume the rule will match + // assume the rule will match boolean matched = true; - //loop through all the parts of this rule - for (int i = 0 ; i < types.size() ; ++i) - { - //get the current type + // loop through all the parts of this rule + for (int i = 0; i < types.size(); ++i) { + // get the current type String type = types.get(i); - //get the list of offsets for the part + // get the list of offsets for the part List<Integer> ofs = offsets.get(i); - //get the value for this part + // get the value for this part String value = values.get(i); - //A placeholder for the right list + // A placeholder for the right list List<String> working = null; - if (type.equals("T")) - { - //if the type is "T" then the list we - //are going to work on contains chunk tags + if (type.equals("T")) { + // if the type is "T" then the list we + // are going to work on contains chunk tags working = tags; - } - else if (type.equals("W")) - { - //if the type is "W" then the list we - //are going to work on contains words + } else if (type.equals("W")) { + // if the type is "W" then the list we + // are going to work on contains words working = words; - } - else if (type.equals("P")) - { - //if the type is "P" then the list we - //are going to work on contains POS tags + } else if (type.equals("P")) { + // if the type is "P" then the list we + // are going to work on contains POS tags working = pos; + } else { + // we have an invalid rule so we can't match it + return false; } - //get the first (maybe the only) offset for this part + // get the first (maybe the only) offset for this part int offset = ofs.get(0).intValue(); - //does the value of this offset match the value given in the rule - boolean matchOffset = working.get(currentToken+offset).equals(value); + // does the value of this offset match the value given in the rule + boolean matchOffset = working.get(currentToken + offset).equals( + value); - for (int j = 1 ; j < ofs.size() ; ++j) - { - //if there is more than one offset then... + for (int j = 1; j < ofs.size(); ++j) { + // if there is more than one offset then... - //get the next offset + // get the next offset offset = ofs.get(j).intValue(); - //or the truth of matching the value in the rule against - //the value of the offset - matchOffset = matchOffset || working.get(currentToken+offset).equals(value); + // or the truth of matching the value in the rule against + // the value of the offset + matchOffset = matchOffset + || working.get(currentToken + offset).equals(value); } - //combine the success/failure of matching this part with that - //of matching the rest of the rule + // combine the success/failure of matching this part with that + // of matching the rest of the rule matched = matched && matchOffset; - //if we have failed to match there is no point trying - //to match the rest of the rule so jump out of this loop - if (!matched) i = types.size(); + // if we have failed to match there is no point trying + // to match the rest of the rule so jump out of this loop + if (!matched) + i = types.size(); } - //return the result of matching we have found + // return the result of matching we have found return matched; } /** * Simply returns the new chunk tag to use if this rule matched. + * * @return the new chunk tag. **/ - public String getNewTag() - { - //simply return the out tag + public String getNewTag() { + // simply return the out tag return outTag; } /** - * A method which allows you to check that this rule fits within - * the sentence when centered on a specific token. - * @param numTokens the total number of tokens in the sentence. - * @param currentToken the index of the token upon which the - * rule is going to be centered. + * A method which allows you to check that this rule fits within the + * sentence when centered on a specific token. + * + * @param numTokens + * the total number of tokens in the sentence. + * @param currentToken + * the index of the token upon which the rule is going to be + * centered. * @return true if the rule fits within the sentence, false otherwise. **/ - public boolean withinSentence(int numTokens, int currentToken) - { + public boolean withinSentence(int numTokens, int currentToken) { int start = currentToken + begin; int finish = currentToken + end; @@ -259,9 +262,9 @@ return within; } - @Override public String toString() - { - //simply return the line of the rules file + @Override + public String toString() { + // simply return the line of the rules file return rule; } } \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs