Revision: 19526 http://sourceforge.net/p/gate/code/19526 Author: markagreenwood Date: 2016-08-19 17:01:15 +0000 (Fri, 19 Aug 2016) Log Message: ----------- formatting and removed some unused stuff
Modified Paths: -------------- gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyMode.java gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyRelation.java gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/NER.java gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Parser.java gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/StanfordSentence.java gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tagger.java gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tokenizer.java gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishDependencies.java gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishPOSDependencies.java Modified: gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyMode.java =================================================================== --- gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyMode.java 2016-08-19 16:50:05 UTC (rev 19525) +++ gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyMode.java 2016-08-19 17:01:15 UTC (rev 19526) @@ -26,35 +26,23 @@ import edu.stanford.nlp.trees.TypedDependency; public enum DependencyMode { - Typed, - AllTyped, - TypedCollapsed, - TypedCCprocessed; - - - protected static Collection<TypedDependency> getDependencies(GrammaticalStructure gs, - DependencyMode mode, boolean includeExtras) { + Typed, AllTyped, TypedCollapsed, TypedCCprocessed; + protected static Collection<TypedDependency> getDependencies( + GrammaticalStructure gs, DependencyMode mode, boolean includeExtras) { Collection<TypedDependency> result = null; - Extras incl = Extras.NONE; if(includeExtras) { incl = Extras.MAXIMAL; } - - if (mode.equals(Typed)) { + if(mode.equals(Typed)) { result = gs.typedDependencies(incl); - } - else if (mode.equals(AllTyped)) { + } else if(mode.equals(AllTyped)) { result = gs.allTypedDependencies(); - } - else if (mode.equals(TypedCollapsed)) { + } else if(mode.equals(TypedCollapsed)) { result = gs.typedDependenciesCollapsed(incl); - } - else if (mode.equals(TypedCCprocessed)) { + } else if(mode.equals(TypedCCprocessed)) { result = gs.typedDependenciesCCprocessed(incl); } - return result; } - } Modified: gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyRelation.java =================================================================== --- gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyRelation.java 2016-08-19 16:50:05 UTC (rev 19525) +++ gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyRelation.java 2016-08-19 17:01:15 UTC (rev 19526) @@ -14,34 +14,33 @@ * * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. - * - * $Id: DependencyRelation.java 15600 2012-03-19 15:40:56Z adamfunk $ + * + * $Id: DependencyRelation.java 15600 2012-03-19 15:40:56Z adamfunk $ */ package gate.stanford; import java.io.Serializable; /** - * Simple data structure representing a single dependency relation. The "target" - * is the Annotation ID of the dependent; the "type" is the dependency - * tag (<a href="http://nlp.stanford.edu/software/parser-faq.shtml#c">the - * Stanford Parser documentation</a> contains links to the tagset</a>; for example, - * nsubj = "nominal subject", dobj = "direct object). + * Simple data structure representing a single dependency relation. The "target" + * is the Annotation ID of the dependent; the "type" is the dependency tag (<a + * href="http://nlp.stanford.edu/software/parser-faq.shtml#c">the Stanford + * Parser documentation</a> contains links to the tagset</a>; for example, nsubj + * = "nominal subject", dobj = "direct object). */ public class DependencyRelation implements Serializable { - private static final long serialVersionUID = -7842607116149222052L; /** * The type of the dependency relation (det, amod, etc.). */ private String type; - + /** * The ID of the token that is the target of this relation. */ private Integer targetId; - + public DependencyRelation(String type, Integer targetId) { this.type = type; this.targetId = targetId; @@ -49,6 +48,7 @@ /** * Return the dependency tag (type). + * * @return the dependency tag */ public String getType() { @@ -57,7 +57,9 @@ /** * Set the dependency tag. - * @param type dependency tag + * + * @param type + * dependency tag */ public void setType(String type) { this.type = type; @@ -65,6 +67,7 @@ /** * Return the GATE Annotation ID of the dependent. + * * @return the Annotation ID */ public Integer getTargetId() { @@ -73,16 +76,17 @@ /** * Set the Annotation ID of the dependent. - * @param targetId the Annotation ID + * + * @param targetId + * the Annotation ID */ public void setTargetId(Integer targetId) { this.targetId = targetId; } - + /** - * Format the data structure for display. - * For example, if type is "dobj" and the dependent has Annotation ID 37, - * return the String "dobj(37)". + * Format the data structure for display. For example, if type is "dobj" and + * the dependent has Annotation ID 37, return the String "dobj(37)". */ public String toString() { return type + "(" + targetId + ")"; Modified: gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/NER.java =================================================================== --- gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/NER.java 2016-08-19 16:50:05 UTC (rev 19525) +++ gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/NER.java 2016-08-19 17:01:15 UTC (rev 19526) @@ -19,7 +19,6 @@ * * $Id: NER.java 15468 2013-10-22 21:13:15Z $ */ - package gate.stanford; import edu.stanford.nlp.ie.AbstractSequenceClassifier; @@ -56,9 +55,8 @@ /** * This class is a wrapper for the Stanford NER tool v3.2.0. */ -@CreoleResource(name = "Stanford NER", comment = "Stanford Named Entity Recogniser", icon = "ne-transducer", helpURL="http://gate.ac.uk/userguide/sec:misc:creole:stanford") +@CreoleResource(name = "Stanford NER", comment = "Stanford Named Entity Recogniser", icon = "ne-transducer", helpURL = "http://gate.ac.uk/userguide/sec:misc:creole:stanford") public class NER extends AbstractLanguageAnalyser { - private static final long serialVersionUID = -6001372186847970080L; public static final String TAG_DOCUMENT_PARAMETER_NAME = "document"; @@ -68,10 +66,10 @@ public static final String TAG_ENCODING_PARAMETER_NAME = "encoding"; public static final String BASE_TOKEN_ANNOTATION_TYPE_PARAMETER_NAME = - "baseTokenAnnotationType"; + "baseTokenAnnotationType"; public static final String BASE_SENTENCE_ANNOTATION_TYPE_PARAMETER_NAME = - "baseSentenceAnnotationType"; + "baseSentenceAnnotationType"; public static final String TAG_OUTPUT_AS_PARAMETER_NAME = "outputASName"; @@ -97,7 +95,8 @@ if(tagger == null) { fireStatusChanged("Loading Stanford NER model"); try { - // nasty workaround for stanford NER's path format inconsistency - tagger is content with uris beginning file:, ner labeller is not + // nasty workaround for stanford NER's path format inconsistency - + // tagger is content with uris beginning file:, ner labeller is not tagger = CRFClassifier.getClassifier(modelFile.toString().substring(5)); } catch(Exception e) { throw new ResourceInstantiationException(e); @@ -117,172 +116,143 @@ // check the parameters if(document == null) throw new ExecutionException("No document to process!"); - AnnotationSet inputAS = document.getAnnotations(inputASName); AnnotationSet outputAS = document.getAnnotations(outputASName); - - if(baseTokenAnnotationType == null || - baseTokenAnnotationType.trim().length() == 0) { throw new ExecutionException( - "No base Token Annotation Type provided!"); } - - if(baseSentenceAnnotationType == null || - baseSentenceAnnotationType.trim().length() == 0) { throw new ExecutionException( - "No base Sentence Annotation Type provided!"); } - + if(baseTokenAnnotationType == null + || baseTokenAnnotationType.trim().length() == 0) { throw new ExecutionException( + "No base Token Annotation Type provided!"); } + if(baseSentenceAnnotationType == null + || baseSentenceAnnotationType.trim().length() == 0) { throw new ExecutionException( + "No base Sentence Annotation Type provided!"); } AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType); AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType); - if(sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null && - tokensAS.size() > 0) { + if(sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null + && tokensAS.size() > 0) { long startTime = System.currentTimeMillis(); fireStatusChanged("NER searching " + document.getName()); fireProgressChanged(0); - // prepare the input for CRFClassifier List<CoreLabel> sentenceForTagger = new ArrayList<CoreLabel>(); - // define a comparator for annotations by start offset OffsetComparator offsetComparator = new OffsetComparator(); - // read all the tokens and all the sentences List<Annotation> sentencesList = new ArrayList<Annotation>(sentencesAS); Collections.sort(sentencesList, offsetComparator); List<Annotation> tokensList = new ArrayList<Annotation>(tokensAS); Collections.sort(tokensList, offsetComparator); - Iterator<Annotation> sentencesIter = sentencesList.iterator(); ListIterator<Annotation> tokensIter = tokensList.listIterator(); - List<Annotation> tokensInCurrentSentence = new ArrayList<Annotation>(); Annotation currentToken = tokensIter.next(); int sentIndex = 0; int sentCnt = sentencesAS.size(); - // go through sentence annotations in the document while(sentencesIter.hasNext()) { Annotation currentSentence = sentencesIter.next(); - // reset sentence-level processing variables tokensInCurrentSentence.clear(); sentenceForTagger.clear(); - // while we have sane tokens - while(currentToken != null && - currentToken.getEndNode().getOffset() - .compareTo(currentSentence.getEndNode().getOffset()) <= 0) { - + while(currentToken != null + && currentToken.getEndNode().getOffset() + .compareTo(currentSentence.getEndNode().getOffset()) <= 0) { // If we're only labelling Tokens within baseSentenceAnnotationType, // don't add the sentence if the Tokens aren't within the span of // baseSentenceAnnotationType if(currentToken.withinSpanOf(currentSentence)) { tokensInCurrentSentence.add(currentToken); - - // build a stanford nlp representation of the token and add it to the sequence + // build a stanford nlp representation of the token and add it to + // the sequence CoreLabel currentLabel = new CoreLabel(); - currentLabel.setWord((String)currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME)); - + currentLabel.setWord((String)currentToken.getFeatures().get( + TOKEN_STRING_FEATURE_NAME)); sentenceForTagger.add(currentLabel); } currentToken = (tokensIter.hasNext() ? tokensIter.next() : null); } - // if the sentence doesn't contain any tokens (which is a bit weird but // is possible) then don't try running the labeller if(sentenceForTagger.isEmpty()) continue; - // run the labeller List<CoreLabel> taggerResults = - tagger.classifySentence(sentenceForTagger); - + tagger.classifySentence(sentenceForTagger); // add the results // make sure no malfunction occurred if(taggerResults.size() != tokensInCurrentSentence.size()) throw new ExecutionException( - "NER labeller malfunction: the output size (" + - taggerResults.size() + ") is different from the input size (" + - tokensInCurrentSentence.size() + ")!"); - + "NER labeller malfunction: the output size (" + + taggerResults.size() + + ") is different from the input size (" + + tokensInCurrentSentence.size() + ")!"); // proceed through the annotated sequence Iterator<CoreLabel> resIter = taggerResults.iterator(); Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator(); - String previousLabel = outsideLabel; Long previousEnd = new Long(-1); Long entityStart = new Long(-1); - - //No idea why this was there so lets comment it out - //Long entityEnd = new Long(-1); - + // No idea why this was there so lets comment it out + // Long entityEnd = new Long(-1); Annotation annot; String nerLabel = ""; - while(resIter.hasNext()) { - // for each labelled token.. annot = tokIter.next(); CoreLabel word = resIter.next(); nerLabel = word.get(CoreAnnotations.AnswerAnnotation.class); - // falling edge transition: entity ends // guard against this triggering at document start - if (!nerLabel.equals(previousLabel) && !previousLabel.equals(outsideLabel) && entityStart != -1) { - -// System.out.println("falling edge"); + if(!nerLabel.equals(previousLabel) + && !previousLabel.equals(outsideLabel) && entityStart != -1) { + // System.out.println("falling edge"); // get final bound; add new annotation in output AS try { - outputAS.add(entityStart, previousEnd, previousLabel, new SimpleFeatureMapImpl()); - } catch (InvalidOffsetException e) { + outputAS.add(entityStart, previousEnd, previousLabel, + new SimpleFeatureMapImpl()); + } catch(InvalidOffsetException e) { System.out.println("Token alignment problem:" + e); } - } - // rising edge transition: entity starts - if (!nerLabel.equals(previousLabel) && !nerLabel.equals(outsideLabel)) { -// System.out.println("rising edge"); + if(!nerLabel.equals(previousLabel) && !nerLabel.equals(outsideLabel)) { + // System.out.println("rising edge"); entityStart = annot.getStartNode().getOffset(); } -// System.out.println(word.word() + "/" + nerLabel); - + // System.out.println(word.word() + "/" + nerLabel); previousLabel = nerLabel; previousEnd = annot.getEndNode().getOffset(); - } - // clean up, in case last token in sentence was in an entity - if (!nerLabel.equals(outsideLabel)) { + if(!nerLabel.equals(outsideLabel)) { try { - outputAS.add(entityStart, previousEnd, previousLabel, new SimpleFeatureMapImpl()); - } catch (InvalidOffsetException e) { + outputAS.add(entityStart, previousEnd, previousLabel, + new SimpleFeatureMapImpl()); + } catch(InvalidOffsetException e) { System.out.println("Token alignment problem:" + e); } } - fireProgressChanged(sentIndex++ * 100 / sentCnt); - } - fireProcessFinished(); - fireStatusChanged(document.getName() + - " tagged in " + - NumberFormat.getInstance().format( - (double)(System.currentTimeMillis() - startTime) / 1000) + - " seconds!"); + fireStatusChanged(document.getName() + + " tagged in " + + NumberFormat.getInstance().format( + (double)(System.currentTimeMillis() - startTime) / 1000) + + " seconds!"); } else { if(failOnMissingInputAnnotations) { throw new ExecutionException( - "No sentences or tokens to process in document " + - document.getName() + "\n" + "Please run a sentence splitter " + - "and tokeniser first!"); + "No sentences or tokens to process in document " + + document.getName() + "\n" + "Please run a sentence splitter " + + "and tokeniser first!"); } else { Utils - .logOnce( - logger, - Level.INFO, - "NE labeller: no sentence or token annotations in input document - see debug log for details."); + .logOnce( + logger, + Level.INFO, + "NE labeller: no sentence or token annotations in input document - see debug log for details."); logger.debug("No input annotations in document " + document.getName()); } } - } public void setEncoding(String encoding) { @@ -335,7 +305,6 @@ this.outputASName = outputASName; } - @RunTime @CreoleParameter(comment = "Label used by model for tokens outside entities", defaultValue = "O") public void setOutsideLabel(String outsideLabel) { @@ -346,14 +315,11 @@ return this.outsideLabel; } - - @CreoleParameter(comment = "Path to the NER model file", defaultValue = "resources/english.all.3class.distsim.crf.ser.gz", suffixes="tagger;model;gz") + @CreoleParameter(comment = "Path to the NER model file", defaultValue = "resources/english.all.3class.distsim.crf.ser.gz", suffixes = "tagger;model;gz") public void setModelFile(URL modelFile) { this.modelFile = modelFile; } - - public URL getModelFile() { return this.modelFile; } Modified: gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Parser.java =================================================================== --- gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Parser.java 2016-08-19 16:50:05 UTC (rev 19525) +++ gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Parser.java 2016-08-19 17:01:15 UTC (rev 19526) @@ -14,8 +14,8 @@ * * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. - * - * $Id: Parser.java 17831 2014-04-15 09:37:23Z ian_roberts $ + * + * $Id: Parser.java 17831 2014-04-15 09:37:23Z ian_roberts $ */ package gate.stanford; @@ -43,7 +43,6 @@ import gate.creole.metadata.Optional; import gate.creole.metadata.RunTime; import gate.creole.metadata.Sharable; -import gate.util.Files; import gate.util.InvalidOffsetException; import java.io.BufferedReader; @@ -67,197 +66,190 @@ * be stored in the outputAS in various ways, controlled by CREOLE run-time * parameters. */ -@CreoleResource(name = "StanfordParser", comment = "Stanford parser wrapper", - helpURL = "http://gate.ac.uk/userguide/sec:parsers:stanford") -public class Parser extends AbstractLanguageAnalyser -implements ProcessingResource { - +@CreoleResource(name = "StanfordParser", comment = "Stanford parser wrapper", helpURL = "http://gate.ac.uk/userguide/sec:parsers:stanford") +public class Parser extends AbstractLanguageAnalyser implements + ProcessingResource { private static final long serialVersionUID = -3062171258011850283L; protected LexicalizedParser stanfordParser; - /* Type "SyntaxTreeNode" with feature "cat" is compatible with the - * classic SyntaxTreeViewer. */ - public static final String PHRASE_ANNOTATION_TYPE = "SyntaxTreeNode" ; - public static final String PHRASE_CAT_FEATURE = "cat" ; - - /* But "category" feature is compatible with the ANNIE POS tagger. */ - private static final String POS_TAG_FEATURE = ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME; + /* + * Type "SyntaxTreeNode" with feature "cat" is compatible with the classic + * SyntaxTreeViewer. + */ + public static final String PHRASE_ANNOTATION_TYPE = "SyntaxTreeNode"; - public static final String DEPENDENCY_ANNOTATION_TYPE = "Dependency"; - public static final String DEPENDENCY_ARG_FEATURE = "args"; - public static final String DEPENDENCY_LABEL_FEATURE = "kind"; + public static final String PHRASE_CAT_FEATURE = "cat"; - protected String annotationSetName; - private URL parserFile; - protected boolean debugMode; - private boolean reusePosTags; + /* But "category" feature is compatible with the ANNIE POS tagger. */ + private static final String POS_TAG_FEATURE = + ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME; - private Map<String, String> tagMap; - protected GrammaticalStructureFactory gsf; - + public static final String DEPENDENCY_ANNOTATION_TYPE = "Dependency"; - /* CREOLE parameters for optional mapping */ - private boolean useMapping = false; - private URL mappingFileURL; - - /* internal variables for mapping */ - private File mappingFile; - private boolean mappingLoaded = false; - - /* CREOLE parameters: what are we going to annotate, and how? */ - private String inputSentenceType; - private String inputTokenType; - private boolean addConstituentAnnotations; - private boolean addDependencyFeatures; - private boolean addDependencyAnnotations; - private boolean addPosTags; - private boolean includeExtraDependencies; + public static final String DEPENDENCY_ARG_FEATURE = "args"; + + public static final String DEPENDENCY_LABEL_FEATURE = "kind"; + + protected String annotationSetName; + + private URL parserFile; + + protected boolean debugMode; + + private boolean reusePosTags; + + private Map<String, String> tagMap; + + protected GrammaticalStructureFactory gsf; + + /* CREOLE parameters for optional mapping */ + private boolean useMapping = false; + + private URL mappingFileURL; + + /* internal variables for mapping */ + private File mappingFile; + + private boolean mappingLoaded = false; + + /* CREOLE parameters: what are we going to annotate, and how? */ + private String inputSentenceType; + + private String inputTokenType; + + private boolean addConstituentAnnotations; + + private boolean addDependencyFeatures; + + private boolean addDependencyAnnotations; + + private boolean addPosTags; + + private boolean includeExtraDependencies; + private DependencyMode dependencyMode; - /** - * The {@link TreebankLangParserParams} implementation to use. This is - * where we get the language pack, and then the - * {@link GrammaticalStructureFactory} used to extract the - * dependencies from the parse. In most cases you should leave this at - * the default value, which is suitable for English text. + * The {@link TreebankLangParserParams} implementation to use. This is where + * we get the language pack, and then the {@link GrammaticalStructureFactory} + * used to extract the dependencies from the parse. In most cases you should + * leave this at the default value, which is suitable for English text. */ private String tlppClass; - /** * The name of the feature to add to tokens. The feature value is a - * {@link List} of {@link DependencyRelation} objects giving the - * dependencies from this token to other tokens. + * {@link List} of {@link DependencyRelation} objects giving the dependencies + * from this token to other tokens. */ protected String dependenciesFeature = "dependencies"; - - /** - * Parse the current document. (This is the principal - * method called by a CorpusController.) + * Parse the current document. (This is the principal method called by a + * CorpusController.) */ public void execute() throws ExecutionException { interrupted = false; long startTime = System.currentTimeMillis(); - if(document == null) { - throw new ExecutionException("No document to process!"); - } + if(document == null) { throw new ExecutionException( + "No document to process!"); } fireStatusChanged("Running " + this.getName() + " on " + document.getName()); fireProgressChanged(0); - - if (debugMode) { + if(debugMode) { System.out.println("Parsing document: " + document.getName()); } - - if (useMapping && (! mappingLoaded) ) { + if(useMapping && (!mappingLoaded)) { System.err.println("Warning: no mapping loaded!"); } - checkInterruption(); - if (addConstituentAnnotations || addDependencyFeatures || addDependencyAnnotations || addPosTags) { + if(addConstituentAnnotations || addDependencyFeatures + || addDependencyAnnotations || addPosTags) { parseSentences(document.getAnnotations(annotationSetName)); - } - else { + } else { System.err.println("There is nothing for the parser to do."); - System.err.println("Please enable at least one of the \"add...\" options."); + System.err + .println("Please enable at least one of the \"add...\" options."); } - fireProcessFinished(); - fireStatusChanged("Finished " + this.getName() + " on " + document.getName() - + " in " + NumberFormat.getInstance().format( + fireStatusChanged("Finished " + + this.getName() + + " on " + + document.getName() + + " in " + + NumberFormat.getInstance().format( (double)(System.currentTimeMillis() - startTime) / 1000) - + " seconds!"); + + " seconds!"); } - /** - * Initialize the Parser resource. In particular, load the trained data - * file. + * Initialize the Parser resource. In particular, load the trained data file. */ public Resource init() throws ResourceInstantiationException { instantiateStanfordParser(); - if (mappingFile != null) { + if(mappingFile != null) { loadTagMapping(mappingFile); } - super.init(); - - if(tlppClass == null || tlppClass.equals("")) { - throw new ResourceInstantiationException( - "TLPP class name must be specified"); - } + if(tlppClass == null || tlppClass.equals("")) { throw new ResourceInstantiationException( + "TLPP class name must be specified"); } try { - Class<?> tlppClassObj = - Class.forName(tlppClass); - if(!TreebankLangParserParams.class.isAssignableFrom(tlppClassObj)) { - throw new ResourceInstantiationException(tlppClassObj - + " does not implement " - + TreebankLangParserParams.class.getName()); - } + Class<?> tlppClassObj = Class.forName(tlppClass); + if(!TreebankLangParserParams.class.isAssignableFrom(tlppClassObj)) { throw new ResourceInstantiationException( + tlppClassObj + " does not implement " + + TreebankLangParserParams.class.getName()); } TreebankLangParserParams tlpp = - TreebankLangParserParams.class.cast(tlppClassObj.newInstance()); + TreebankLangParserParams.class.cast(tlppClassObj.newInstance()); gsf = tlpp.treebankLanguagePack().grammaticalStructureFactory(); - } - catch(UnsupportedOperationException e) { + } catch(UnsupportedOperationException e) { throw new ResourceInstantiationException(e); - } - catch(ClassNotFoundException e) { + } catch(ClassNotFoundException e) { throw new ResourceInstantiationException("Class " + tlppClass - + " not found", e); - } - catch(InstantiationException e) { + + " not found", e); + } catch(InstantiationException e) { throw new ResourceInstantiationException("Error creating TLPP object", e); - } - catch(IllegalAccessException e) { + } catch(IllegalAccessException e) { throw new ResourceInstantiationException("Error creating TLPP object", e); } return this; } - /** - * Re-initialize the Parser resource. In particular, reload the trained - * data file. + * Re-initialize the Parser resource. In particular, reload the trained data + * file. */ - @Override + @Override public void reInit() throws ResourceInstantiationException { stanfordParser = null; init(); - } + } - - /** * Find all the Sentence annotations and iterate through them, parsing one * sentence at a time and storing the result in the output AS. (Sentences are * scanned for Tokens. You have to run the ANNIE tokenizer and splitter before * this PR.) - * @throws ExecutionInterruptedException + * + * @throws ExecutionInterruptedException */ - private void parseSentences(AnnotationSet annotationSet) throws ExecutionInterruptedException { - List<Annotation> sentences = gate.Utils.inDocumentOrder(annotationSet.get(inputSentenceType)); + private void parseSentences(AnnotationSet annotationSet) + throws ExecutionInterruptedException { + List<Annotation> sentences = + gate.Utils.inDocumentOrder(annotationSet.get(inputSentenceType)); int sentencesDone = 0; int nbrSentences = sentences.size(); - - for (Annotation sentence : sentences) { + for(Annotation sentence : sentences) { parseOneSentence(annotationSet, sentence, sentencesDone, nbrSentences); sentencesDone++; checkInterruption(); } - sentencesDone++; fireProgressChanged(100 * sentencesDone / nbrSentences); + } - } - - - /** * Generate the special data structure for one sentence and pass the List of - * Word to the parser. Apply the annotations back to the document. + * Word to the parser. Apply the annotations back to the document. * * @param sentence * the Sentence annotation @@ -265,106 +257,100 @@ * sentence number of debugging output * @param ofS * total number of sentences for debugging output - * @return null if the sentence is empty - * @throws ExecutionInterruptedException + * @return null if the sentence is empty + * @throws ExecutionInterruptedException */ - private void parseOneSentence(AnnotationSet annotationSet, Annotation sentence, int sentCtr, int sentCount) throws ExecutionInterruptedException { + private void parseOneSentence(AnnotationSet annotationSet, + Annotation sentence, int sentCtr, int sentCount) + throws ExecutionInterruptedException { Tree tree; - - StanfordSentence stanfordSentence = new StanfordSentence(sentence, inputTokenType, annotationSet, reusePosTags); - if (debugMode) { + StanfordSentence stanfordSentence = + new StanfordSentence(sentence, inputTokenType, annotationSet, + reusePosTags); + if(debugMode) { System.out.println(stanfordSentence.toString()); } - - /* Ignore an empty Sentence (sometimes the regex splitter can create one + /* + * Ignore an empty Sentence (sometimes the regex splitter can create one * with no Token annotations in it). */ - if ( stanfordSentence.isNotEmpty() ) { + if(stanfordSentence.isNotEmpty()) { List<Word> wordList = stanfordSentence.getWordList(); - - if (reusePosTags) { + if(reusePosTags) { int nbrMissingTags = stanfordSentence.numberOfMissingPosTags(); - if (nbrMissingTags > 0) { - double percentMissing = Math.ceil(100.0 * (nbrMissingTags) / - (stanfordSentence.numberOfTokens()) ); - System.err.println("Warning (sentence " + sentCtr + "): " + (int) percentMissing - + "% of the Tokens are missing POS tags." ); + if(nbrMissingTags > 0) { + double percentMissing = + Math.ceil(100.0 * (nbrMissingTags) + / (stanfordSentence.numberOfTokens())); + System.err.println("Warning (sentence " + sentCtr + "): " + + (int)percentMissing + "% of the Tokens are missing POS tags."); } } - - tree = stanfordParser.parse(wordList); + tree = stanfordParser.parse(wordList); checkInterruption(); - - if (addConstituentAnnotations || addPosTags) { - annotatePhraseStructureRecursively(annotationSet, stanfordSentence, tree, tree); + if(addConstituentAnnotations || addPosTags) { + annotatePhraseStructureRecursively(annotationSet, stanfordSentence, + tree, tree); } - checkInterruption(); - if (addDependencyFeatures || addDependencyAnnotations) { + if(addDependencyFeatures || addDependencyAnnotations) { annotateDependencies(annotationSet, stanfordSentence, tree); } - - if (debugMode) { + if(debugMode) { System.out.println("Parsed sentence " + sentCtr + " of " + sentCount); } + } else if(debugMode) { + System.out.println("Ignored empty sentence " + sentCtr + " of " + + sentCount); } - - else if (debugMode) { - System.out.println("Ignored empty sentence " + sentCtr + " of " + sentCount); - } } - /** - * Generate a SyntaxTreeNode Annotation corresponding to this Tree. Work - * recursively so that the annotations are actually generated from the - * bottom up, in order to build the consists list of annotation IDs. + * Generate a SyntaxTreeNode Annotation corresponding to this Tree. Work + * recursively so that the annotations are actually generated from the bottom + * up, in order to build the consists list of annotation IDs. * - * @param tree the current subtree - * @param rootTree the whole sentence, used to find the span of the current subtree + * @param tree + * the current subtree + * @param rootTree + * the whole sentence, used to find the span of the current subtree * @return a GATE Annotation of type "SyntaxTreeNode" */ - protected Annotation annotatePhraseStructureRecursively(AnnotationSet annotationSet, StanfordSentence stanfordSentence, Tree tree, Tree rootTree) { + protected Annotation annotatePhraseStructureRecursively( + AnnotationSet annotationSet, StanfordSentence stanfordSentence, + Tree tree, Tree rootTree) { Annotation annotation = null; Annotation child; - String label = tree.value(); - + String label = tree.value(); List<Tree> children = tree.getChildrenAsList(); - - if (children.size() == 0) { - return null; - } + if(children.size() == 0) { return null; } /* implied else */ - - /* following line generates ClassCastException - * IntPair span = tree.getSpan(); - * edu.stanford.nlp.ling.CategoryWordTag - * at edu.stanford.nlp.trees.Tree.getSpan(Tree.java:393) - * but I think it's a bug in the parser, so I'm hacking - * around it as follows. */ + /* + * following line generates ClassCastException IntPair span = + * tree.getSpan(); edu.stanford.nlp.ling.CategoryWordTag at + * edu.stanford.nlp.trees.Tree.getSpan(Tree.java:393) but I think it's a bug + * in the parser, so I'm hacking around it as follows. + */ int startPos = Trees.leftEdge(tree, rootTree); - int endPos = Trees.rightEdge(tree, rootTree); - + int endPos = Trees.rightEdge(tree, rootTree); Long startNode = stanfordSentence.startPos2offset(startPos); - Long endNode = stanfordSentence.endPos2offset(endPos); - + Long endNode = stanfordSentence.endPos2offset(endPos); List<Integer> consists = new ArrayList<Integer>(); - Iterator<Tree> childIter = children.iterator(); - while (childIter.hasNext()) { - child = annotatePhraseStructureRecursively(annotationSet, stanfordSentence, childIter.next(), rootTree); - if ( (child != null) && - (! child.getType().equals(inputTokenType) )) { + while(childIter.hasNext()) { + child = + annotatePhraseStructureRecursively(annotationSet, stanfordSentence, + childIter.next(), rootTree); + if((child != null) && (!child.getType().equals(inputTokenType))) { consists.add(child.getId()); } } - annotation = annotatePhraseStructureConstituent(annotationSet, startNode, endNode, label, consists, tree.depth()); - + annotation = + annotatePhraseStructureConstituent(annotationSet, startNode, endNode, + label, consists, tree.depth()); return annotation; } - - /** * Record one constituent as an annotation. * @@ -375,169 +361,148 @@ * @param depth * @return */ - private Annotation annotatePhraseStructureConstituent(AnnotationSet annotationSet, Long startOffset, Long endOffset, String label, - List<Integer> consists, int depth) { + private Annotation annotatePhraseStructureConstituent( + AnnotationSet annotationSet, Long startOffset, Long endOffset, + String label, List<Integer> consists, int depth) { Annotation phrAnnotation = null; Integer phrID; - try { String cat; - if (useMapping && mappingLoaded) { - cat = translateTag(label); + if(useMapping && mappingLoaded) { + cat = translateTag(label); + } else { + cat = label; } - else { - cat = label; - } - - if (addConstituentAnnotations) { - String text = document.getContent().getContent(startOffset, endOffset).toString(); + if(addConstituentAnnotations) { + String text = + document.getContent().getContent(startOffset, endOffset).toString(); FeatureMap fm = gate.Factory.newFeatureMap(); fm.put(PHRASE_CAT_FEATURE, cat); fm.put("text", text); - /* Ignore empty list features on the token-equivalent annotations. */ - if (consists.size() > 0) { + if(consists.size() > 0) { fm.put("consists", consists); } - - phrID = annotationSet.add(startOffset, endOffset, PHRASE_ANNOTATION_TYPE, fm); + phrID = + annotationSet.add(startOffset, endOffset, PHRASE_ANNOTATION_TYPE, + fm); phrAnnotation = annotationSet.get(phrID); recordID(annotationSet, phrID); } - - if ( addPosTags && (depth == 1) ) { + if(addPosTags && (depth == 1)) { /* Expected to be a singleton set! */ - AnnotationSet tokenSet = annotationSet.get(inputTokenType, startOffset, endOffset); - if (tokenSet.size() == 1) { + AnnotationSet tokenSet = + annotationSet.get(inputTokenType, startOffset, endOffset); + if(tokenSet.size() == 1) { Annotation token = tokenSet.iterator().next(); - - /* Add POS tag to token. - * (Note: GATE/Hepple uses "(" and ")" for Penn/Stanford's - * "-LRB-" and "-RRB-". */ + /* + * Add POS tag to token. (Note: GATE/Hepple uses "(" and ")" for + * Penn/Stanford's "-LRB-" and "-RRB-". + */ String hepCat = StanfordSentence.unescapePosTag(cat); token.getFeatures().put(POS_TAG_FEATURE, hepCat); - + } else { + System.err.println("Found a tokenSet with " + tokenSet.size() + + " members!"); } - else { - System.err.println("Found a tokenSet with " + tokenSet.size() + " members!"); - } } - } - catch (InvalidOffsetException e) { + } catch(InvalidOffsetException e) { e.printStackTrace(); } - return phrAnnotation; } - - @SuppressWarnings("unchecked") - private void annotateDependencies(AnnotationSet annotationSet, StanfordSentence stanfordSentence, Tree tree) { + private void annotateDependencies(AnnotationSet annotationSet, + StanfordSentence stanfordSentence, Tree tree) { GrammaticalStructure gs = gsf.newGrammaticalStructure(tree); - Collection<TypedDependency> dependencies = DependencyMode.getDependencies(gs, dependencyMode, includeExtraDependencies); - - if (dependencies == null) { - if (debugMode) { + Collection<TypedDependency> dependencies = + DependencyMode.getDependencies(gs, dependencyMode, + includeExtraDependencies); + if(dependencies == null) { + if(debugMode) { System.out.println("dependencies == null"); } return; } - String dependencyKind; FeatureMap depFeatures; Integer dependentTokenID, governorTokenID; List<Integer> argList; Long offsetLH0, offsetRH0, offsetLH1, offsetRH1, depLH, depRH; Annotation governor, dependent; - for(TypedDependency dependency : dependencies) { if(debugMode) { System.out.println(dependency); } - // Does not work in version 3.5.2 any more - //int governorIndex = dependency.gov().label().index() - 1; - int governorIndex = dependency.gov().index()-1; - governor = stanfordSentence.startPos2token(governorIndex); - - //int dependentIndex = dependency.dep().label().index() - 1; - int dependentIndex = dependency.dep().index()-1; + // int governorIndex = dependency.gov().label().index() - 1; + int governorIndex = dependency.gov().index() - 1; + governor = stanfordSentence.startPos2token(governorIndex); + // int dependentIndex = dependency.dep().label().index() - 1; + int dependentIndex = dependency.dep().index() - 1; dependent = stanfordSentence.startPos2token(dependentIndex); - dependencyKind = dependency.reln().toString(); governorTokenID = governor.getId(); dependentTokenID = dependent.getId(); - - if (addDependencyFeatures) { + if(addDependencyFeatures) { List<DependencyRelation> depsForTok = - (List<DependencyRelation>) governor.getFeatures().get(dependenciesFeature); - + (List<DependencyRelation>)governor.getFeatures().get( + dependenciesFeature); if(depsForTok == null) { depsForTok = new ArrayList<DependencyRelation>(); governor.getFeatures().put(dependenciesFeature, depsForTok); } - - depsForTok.add(new DependencyRelation(dependencyKind, dependentTokenID)); + depsForTok + .add(new DependencyRelation(dependencyKind, dependentTokenID)); } - - if (addDependencyAnnotations) { + if(addDependencyAnnotations) { depFeatures = gate.Factory.newFeatureMap(); argList = new ArrayList<Integer>(); argList.add(governorTokenID); argList.add(dependentTokenID); depFeatures.put(DEPENDENCY_ARG_FEATURE, argList); depFeatures.put(DEPENDENCY_LABEL_FEATURE, dependencyKind); - offsetLH0 = governor.getStartNode().getOffset(); offsetRH0 = governor.getEndNode().getOffset(); offsetLH1 = dependent.getStartNode().getOffset(); offsetRH1 = dependent.getEndNode().getOffset(); - depLH = Math.min(offsetLH0, offsetLH1); depRH = Math.max(offsetRH0, offsetRH1); - try { - annotationSet.add(depLH, depRH, DEPENDENCY_ANNOTATION_TYPE, depFeatures); + annotationSet.add(depLH, depRH, DEPENDENCY_ANNOTATION_TYPE, + depFeatures); + } catch(InvalidOffsetException e) { + e.printStackTrace(); } - catch(InvalidOffsetException e) { - e.printStackTrace(); - } } } } - - private void instantiateStanfordParser() - throws ResourceInstantiationException { + throws ResourceInstantiationException { if(stanfordParser != null) return; - try { - //String filepath = Files.fileFromURL(parserFile).getAbsolutePath(); - stanfordParser = LexicalizedParser.getParserFromSerializedFile(parserFile.toExternalForm()); - } - catch(Exception e) { + // String filepath = Files.fileFromURL(parserFile).getAbsolutePath(); + stanfordParser = + LexicalizedParser.getParserFromSerializedFile(parserFile + .toExternalForm()); + } catch(Exception e) { throw new ResourceInstantiationException(e); } - } + } - - private void loadTagMapping(File mappingFile) { + private void loadTagMapping(File mappingFile) { tagMap = new HashMap<String, String>(); mappingLoaded = false; - try { - if (mappingFile.exists() && mappingFile.canRead()) { - + if(mappingFile.exists() && mappingFile.canRead()) { BufferedReader br = new BufferedReader(new FileReader(mappingFile)); String line = ""; - // read until it reaches to an end of the file while((line = br.readLine()) != null) { - // two columns delimited by whitespace - String [] data = line.split("\\s+", 2); - + // two columns delimited by whitespace + String[] data = line.split("\\s+", 2); // are there key and value available if(data == null || data.length < 2) { continue; @@ -546,27 +511,22 @@ tagMap.put(data[0].trim(), data[1].trim()); } } - br.close(); + } else { + System.err.println("Can't find or read mapping file " + + mappingFile.getPath() + " so no mappings will be used."); } - - else { - System.err.println("Can't find or read mapping file " - + mappingFile.getPath() + " so no mappings will be used."); - } - } - catch(Exception e) { + } catch(Exception e) { System.err.println("Exception trying to load mapping file " - + mappingFile.getPath()); + + mappingFile.getPath()); e.printStackTrace(); } - int nbrMapped = tagMap.size(); - System.out.println("Loaded " + nbrMapped + " mappings from file " + mappingFile); + System.out.println("Loaded " + nbrMapped + " mappings from file " + + mappingFile); mappingLoaded = (nbrMapped > 0); } - /** * This method stores the annotation ID as a value of feature "ID" on the * relevant annotation. (Mainly to make the ID visible in the GUI for @@ -579,13 +539,11 @@ annSet.get(annotationID).getFeatures().put("ID", annotationID); } - private void checkInterruption() throws ExecutionInterruptedException { if(isInterrupted()) { throw new ExecutionInterruptedException( "Execution of " + this.getName() + " has been abruptly interrupted!"); } } - /** * Translate the tag in the map, or leave it the same if there is no * translation. @@ -595,27 +553,22 @@ */ private String translateTag(String stanfordTag) { String translatedTag = stanfordTag; - - if (tagMap.containsKey(stanfordTag)) { + if(tagMap.containsKey(stanfordTag)) { translatedTag = tagMap.get(stanfordTag); } - return translatedTag; } - /* get & set methods for the CREOLE parameters */ - @CreoleParameter(comment = "TreebankLangParserParams implementation used to extract the dependencies", - defaultValue = "edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams") + @CreoleParameter(comment = "TreebankLangParserParams implementation used to extract the dependencies", defaultValue = "edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams") public void setTlppClass(String tlppClass) { this.tlppClass = tlppClass; } - + public String getTlppClass() { return tlppClass; } - @Optional @RunTime @CreoleParameter(comment = "annotationSet used for input (Token and " @@ -628,8 +581,7 @@ return this.annotationSetName; } - @CreoleParameter(comment = "path to the parser's grammar file", - defaultValue = "resources/englishRNN.ser.gz") + @CreoleParameter(comment = "path to the parser's grammar file", defaultValue = "resources/englishRNN.ser.gz") public void setParserFile(URL parserFile) { this.parserFile = parserFile; } @@ -649,8 +601,7 @@ } @RunTime - @CreoleParameter(comment = "verbose mode for debugging", - defaultValue = "false") + @CreoleParameter(comment = "verbose mode for debugging", defaultValue = "false") public void setDebug(Boolean debug) { this.debugMode = debug.booleanValue(); } @@ -658,10 +609,9 @@ public Boolean getDebug() { return new Boolean(this.debugMode); } - + @RunTime - @CreoleParameter(comment = "Re-use existing POS tags on tokens", - defaultValue = "false") + @CreoleParameter(comment = "Re-use existing POS tags on tokens", defaultValue = "false") public void setReusePosTags(Boolean reusePosTags) { this.reusePosTags = reusePosTags.booleanValue(); } @@ -669,91 +619,79 @@ public Boolean getReusePosTags() { return new Boolean(this.reusePosTags); } - + @RunTime - @CreoleParameter(comment = "Create POS tags on the Token annotations", - defaultValue = "false") + @CreoleParameter(comment = "Create POS tags on the Token annotations", defaultValue = "false") public void setAddPosTags(Boolean posTagTokens) { this.addPosTags = posTagTokens.booleanValue(); } - + public Boolean getAddPosTags() { return new Boolean(this.addPosTags); } @RunTime - @CreoleParameter(comment = "use tag mapping", - defaultValue = "false") + @CreoleParameter(comment = "use tag mapping", defaultValue = "false") public void setUseMapping(Boolean useMapping) { this.useMapping = useMapping.booleanValue(); } - + public Boolean getUseMapping() { return new Boolean(this.useMapping); } - + @RunTime - @CreoleParameter(comment = "Create dependency features on Token annotations", - defaultValue = "true") + @CreoleParameter(comment = "Create dependency features on Token annotations", defaultValue = "true") public void setAddDependencyFeatures(Boolean useDependency) { this.addDependencyFeatures = useDependency.booleanValue(); } - + public Boolean getAddDependencyFeatures() { return new Boolean(this.addDependencyFeatures); } - + @RunTime - @CreoleParameter(comment = "Create annotations to show dependencies", - defaultValue = "true") + @CreoleParameter(comment = "Create annotations to show dependencies", defaultValue = "true") public void setAddDependencyAnnotations(Boolean useDependency) { this.addDependencyAnnotations = useDependency.booleanValue(); } - + public Boolean getAddDependencyAnnotations() { return new Boolean(this.addDependencyAnnotations); } - - + @RunTime - @CreoleParameter(comment = "input annotation type for each sentence", - defaultValue = ANNIEConstants.SENTENCE_ANNOTATION_TYPE ) + @CreoleParameter(comment = "input annotation type for each sentence", defaultValue = ANNIEConstants.SENTENCE_ANNOTATION_TYPE) public void setInputSentenceType(String sType) { this.inputSentenceType = sType; } - + public String getInputSentenceType() { return this.inputSentenceType; } - @RunTime - @CreoleParameter(comment = "input annotation type for each token", - defaultValue = ANNIEConstants.TOKEN_ANNOTATION_TYPE ) + @CreoleParameter(comment = "input annotation type for each token", defaultValue = ANNIEConstants.TOKEN_ANNOTATION_TYPE) public void setInputTokenType(String tType) { this.inputTokenType = tType; } - + public String getInputTokenType() { return this.inputTokenType; } - @RunTime - @CreoleParameter(comment = "Create annotations to show phrase structures", - defaultValue = "true") + @CreoleParameter(comment = "Create annotations to show phrase structures", defaultValue = "true") public void setAddConstituentAnnotations(Boolean usePhraseStructure) { this.addConstituentAnnotations = usePhraseStructure.booleanValue(); } - + public Boolean getAddConstituentAnnotations() { return new Boolean(this.addConstituentAnnotations); } - - + @RunTime - @CreoleParameter(comment = "Dependency Mode", - defaultValue = "Typed") + @CreoleParameter(comment = "Dependency Mode", defaultValue = "Typed") public void setDependencyMode(DependencyMode mode) { this.dependencyMode = mode; } @@ -761,38 +699,34 @@ public DependencyMode getDependencyMode() { return this.dependencyMode; } - + @RunTime - @CreoleParameter(comment = "include extra dependencies", - defaultValue = "false") + @CreoleParameter(comment = "include extra dependencies", defaultValue = "false") public void setIncludeExtraDependencies(Boolean include) { this.includeExtraDependencies = include; } - + public Boolean getIncludeExtraDependencies() { return this.includeExtraDependencies; } - - - /* Made mappingFile an init parameter to simplify things. - * The CREOLE parameter is called "mappingFile" but it's actually a URL. + + /* + * Made mappingFile an init parameter to simplify things. The CREOLE parameter + * is called "mappingFile" but it's actually a URL. */ @Optional @CreoleParameter(comment = "path to the tag mapping file") public void setMappingFile(URL mappingFileURL) { this.mappingFile = null; // override below this.mappingFileURL = mappingFileURL; - - if ( (this.mappingFileURL != null) && - (! this.mappingFileURL.toString().trim().equals("")) ) { + if((this.mappingFileURL != null) + && (!this.mappingFileURL.toString().trim().equals(""))) { try { this.mappingFile = new File(this.mappingFileURL.toURI()); - } - catch(URISyntaxException e) { + } catch(URISyntaxException e) { e.printStackTrace(); } } - } public URL getMappingFile() { @@ -800,22 +734,21 @@ } /** - * Inject an existing instance of the LexicalizedParser. - * <b>This method is intended for use by {@link Factory#ducplicate} - * and should not be called directly.</b> + * Inject an existing instance of the LexicalizedParser. <b>This method is + * intended for use by {@link Factory#ducplicate} and should not be called + * directly.</b> */ @Sharable public void setStanfordParser(LexicalizedParser parser) { this.stanfordParser = parser; } - + /** - * Get the LexicalizedParser used internally by this PR. - * <b>This method is intended for use by {@link Factory#ducplicate} - * and should not be called directly.</b> + * Get the LexicalizedParser used internally by this PR. <b>This method is + * intended for use by {@link Factory#ducplicate} and should not be called + * directly.</b> */ public LexicalizedParser getStanfordParser() { return stanfordParser; } - } Modified: gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/StanfordSentence.java =================================================================== --- gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/StanfordSentence.java 2016-08-19 16:50:05 UTC (rev 19525) +++ gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/StanfordSentence.java 2016-08-19 17:01:15 UTC (rev 19526) @@ -14,8 +14,8 @@ * * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. - * - * $Id: StanfordSentence.java 15600 2012-03-19 15:40:56Z adamfunk $ + * + * $Id: StanfordSentence.java 15600 2012-03-19 15:40:56Z adamfunk $ */ package gate.stanford; @@ -28,210 +28,174 @@ import gate.util.Strings; /** - * The Stanford Parser itself takes as input a List of edu.stanford.nlp.ling.Word. - * This data structure is constructed from a Sentence Annotation, using the enclosed - * Token Annotations, and yields the required List, as well as methods for - * converting the parser's output spans into GATE Annotation offsets. + * The Stanford Parser itself takes as input a List of + * edu.stanford.nlp.ling.Word. This data structure is constructed from a + * Sentence Annotation, using the enclosed Token Annotations, and yields the + * required List, as well as methods for converting the parser's output spans + * into GATE Annotation offsets. */ public class StanfordSentence { - private Map<Integer, Long> startPosToOffset; + private Map<Integer, Long> endPosToOffset; + private Map<Integer, Annotation> startPosToToken; + private Map<Integer, String> startPosToString; - private List<Word> words; - private Long sentenceStartOffset, sentenceEndOffset; - private List<Annotation> tokens; - private static final String POS_TAG_FEATURE = ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME; - private static final String STRING_FEATURE = ANNIEConstants.TOKEN_STRING_FEATURE_NAME; - + private List<Word> words; + + private Long sentenceStartOffset, sentenceEndOffset; + + private List<Annotation> tokens; + + private static final String POS_TAG_FEATURE = + ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME; + + private static final String STRING_FEATURE = + ANNIEConstants.TOKEN_STRING_FEATURE_NAME; + int nbrOfTokens, nbrOfMissingPosTags; - - - /* This is probably dodgy, but I can't find an "unknown" tag - * in the Penn documentation. */ - private static final String UNKNOWN_TAG = "NN"; - - public StanfordSentence(Annotation sentence, String tokenType, - AnnotationSet inputAS, boolean usePosTags) { - + /* + * This is probably dodgy, but I can't find an "unknown" tag in the Penn + * documentation. + */ + private static final String UNKNOWN_TAG = "NN"; + + public StanfordSentence(Annotation sentence, String tokenType, + AnnotationSet inputAS, boolean usePosTags) { startPosToOffset = new HashMap<Integer, Long>(); - endPosToOffset = new HashMap<Integer, Long>(); - startPosToToken = new HashMap<Integer, Annotation>(); + endPosToOffset = new HashMap<Integer, Long>(); + startPosToToken = new HashMap<Integer, Annotation>(); startPosToString = new HashMap<Integer, String>(); - sentenceStartOffset = sentence.getStartNode().getOffset(); - sentenceEndOffset = sentence.getEndNode().getOffset(); - - nbrOfTokens = 0; + sentenceEndOffset = sentence.getEndNode().getOffset(); + nbrOfTokens = 0; nbrOfMissingPosTags = 0; - - tokens = Utils.inDocumentOrder(inputAS.getContained(sentenceStartOffset, sentenceEndOffset).get(tokenType)); + tokens = + Utils.inDocumentOrder(inputAS.getContained(sentenceStartOffset, + sentenceEndOffset).get(tokenType)); words = new ArrayList<Word>(); - add(-1, sentence, "S"); - int tokenNo = 0; - - for (Annotation token : tokens) { - String tokenString = escapeToken(token.getFeatures().get(STRING_FEATURE).toString()); + for(Annotation token : tokens) { + String tokenString = + escapeToken(token.getFeatures().get(STRING_FEATURE).toString()); add(tokenNo, token, tokenString); - - /* The FAQ says the parser will automatically use existing POS tags - * if the List elements are of type TaggedWord. + /* + * The FAQ says the parser will automatically use existing POS tags if the + * List elements are of type TaggedWord. * http://nlp.stanford.edu/software/parser-faq.shtml#f */ - - if (usePosTags) { + if(usePosTags) { words.add(new TaggedWord(tokenString, getEscapedPosTag(token))); - } - else { + } else { words.add(new Word(tokenString)); } - tokenNo++; } - nbrOfTokens = tokenNo; } - public String toString() { StringBuffer output = new StringBuffer(); - output.append("S: ").append(Strings.toString(startPosToOffset)).append('\n'); - output.append(" ").append(Strings.toString(startPosToString)).append('\n'); + output.append("S: ").append(Strings.toString(startPosToOffset)) + .append('\n'); + output.append(" ").append(Strings.toString(startPosToString)) + .append('\n'); output.append(" ").append(Strings.toString(endPosToOffset)); return output.toString(); } - - - private String getEscapedPosTag(Annotation token) { + + private String getEscapedPosTag(Annotation token) { String pos = UNKNOWN_TAG; FeatureMap tokenFeatures = token.getFeatures(); - - if (tokenFeatures.containsKey(POS_TAG_FEATURE)) { + if(tokenFeatures.containsKey(POS_TAG_FEATURE)) { Object temp = tokenFeatures.get(POS_TAG_FEATURE); - - if (temp instanceof String) { - pos = (String) temp; - } - else { + if(temp instanceof String) { + pos = (String)temp; + } else { nbrOfMissingPosTags++; } - - } - else { + } else { nbrOfMissingPosTags++; } - return escapePosTag(pos); } - - private void add(int tokenNbr, Annotation token, String tokenString) { Long tokenStartOffset = token.getStartNode().getOffset(); - Long tokenEndOffset = token.getEndNode().getOffset(); - + Long tokenEndOffset = token.getEndNode().getOffset(); startPosToOffset.put(tokenNbr, tokenStartOffset); endPosToOffset.put(new Integer(tokenNbr + 1), tokenEndOffset); startPosToToken.put(tokenNbr, token); startPosToString.put(tokenNbr, tokenString); } - - - /* Explanation of the position conversion: - * The output of the Stanford Parser specifies each constituent's span in terms of - * token boundaries re-numbered within each sentence, which we need to convert to - * GATE character offsets within the whole document. + /* + * Explanation of the position conversion: The output of the Stanford Parser + * specifies each constituent's span in terms of token boundaries re-numbered + * within each sentence, which we need to convert to GATE character offsets + * within the whole document. * - * Example: - * "This is a test." starting at document offset 100, containing five tokens. - * Stanford says "This" starts at 0 and ends at 1; GATE says 100 to 104. - * Stanford says "is a test" starts at 1 and ends at 4; - * GATE says 105 to 114. + * Example: "This is a test." starting at document offset 100, containing five + * tokens. Stanford says "This" starts at 0 and ends at 1; GATE says 100 to + * 104. Stanford says "is a test" starts at 1 and ends at 4; GATE says 105 to + * 114. */ - - public int numberOfTokens() { return nbrOfTokens; } - + public int numberOfMissingPosTags() { return nbrOfMissingPosTags; } - + public boolean isNotEmpty() { return (nbrOfTokens > 0); } - - + /** - * Change the Token's string to match the Penn Treebank's - * escaping system. - * See Stanford parser FAQ "How can I provide the correct tokenization of my - * sentence to the parser?" - - * @param token original string feature of Token + * Change the Token's string to match the Penn Treebank's escaping system. See + * Stanford parser FAQ "How can I provide the correct tokenization of my + * sentence to the parser?" + * + * @param token + * original string feature of Token * @return escaped version of string */ protected static String escapeToken(String token) { - // ( --> -LRB- - if (token.equals("(")) { - return "-LRB-"; - } - - // ) --> -RRB- - if (token.equals(")")) { - return "-RRB-"; - } - - // / --> \/ - // * --> \* - if (token.contains("/") || token.contains("*")) { - return token.replace("/", "\\/").replace("*", "\\*"); - } - + // ( --> -LRB- + if(token.equals("(")) { return "-LRB-"; } + // ) --> -RRB- + if(token.equals(")")) { return "-RRB-"; } + // / --> \/ + // * --> \* + if(token.contains("/") || token.contains("*")) { return token.replace("/", + "\\/").replace("*", "\\*"); } return token; } - protected static String escapePosTag(String tag) { - // ( --> -LRB- - if (tag.equals("(")) { - return "-LRB-"; - } - - // ) --> -RRB- - if (tag.equals(")")) { - return "-RRB-"; - } - + // ( --> -LRB- + if(tag.equals("(")) { return "-LRB-"; } + // ) --> -RRB- + if(tag.equals(")")) { return "-RRB-"; } return tag; } - protected static String unescapePosTag(String tag) { - // ( <-- -LRB- - if (tag.equals("-LRB-")) { - return "("; - } - - // ) <-- -RRB- - if (tag.equals("-RRB-")) { - return ")"; - } - + // ( <-- -LRB- + if(tag.equals("-LRB-")) { return "("; } + // ) <-- -RRB- + if(tag.equals("-RRB-")) { return ")"; } return tag; } - /** - * Convert a Stanford start position to the GATE Annotation of type - * "Token" that starts there. + * Convert a Stanford start position to the GATE Annotation of type "Token" + * that starts there. */ public Annotation startPos2token(int startPos) { return startPosToToken.get(startPos); @@ -239,6 +203,7 @@ /** * Convert a Stanford start position to a GATE offset. + * * @param startPos * @return the offset in the GATE document */ @@ -248,6 +213,7 @@ /** * Convert a Stanford end position to a GATE offset. + * * @param endPos * @return the offset in the GATE document */ @@ -255,7 +221,6 @@ return endPosToOffset.get(endPos); } - /** * @return The data structure that is passed to the Stanford Parser itself. */ Modified: gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tagger.java =================================================================== --- gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tagger.java 2016-08-19 16:50:05 UTC (rev 19525) +++ gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tagger.java 2016-08-19 17:01:15 UTC (rev 19526) @@ -19,7 +19,6 @@ * * $Id: Tagger.java 15468 2012-02-25 14:41:15Z $ */ - package gate.stanford; import edu.stanford.nlp.ling.TaggedWord; @@ -56,9 +55,8 @@ /** * This class is a wrapper for the Stanford PoS tagger v3.2.0. */ -@CreoleResource(name = "Stanford POS Tagger", comment = "Stanford Part-of-Speech Tagger", icon = "pos-tagger", helpURL="http://gate.ac.uk/userguide/sec:misc:creole:stanford") +@CreoleResource(name = "Stanford POS Tagger", comment = "Stanford Part-of-Speech Tagger", icon = "pos-tagger", helpURL = "http://gate.ac.uk/userguide/sec:misc:creole:stanford") public class Tagger extends AbstractLanguageAnalyser { - private static final long serialVersionUID = -6001372186847970081L; public static final String TAG_DOCUMENT_PARAMETER_NAME = "document"; @@ -68,13 +66,13 @@ public static final String TAG_ENCODING_PARAMETER_NAME = "encoding"; public static final String BASE_TOKEN_ANNOTATION_TYPE_PARAMETER_NAME = - "baseTokenAnnotationType"; + "baseTokenAnnotationType"; public static final String OUTPUT_ANNOTATION_TYPE_PARAMETER_NAME = - "outputAnnotationType"; + "outputAnnotationType"; public static final String BASE_SENTENCE_ANNOTATION_TYPE_PARAMETER_NAME = - "baseSentenceAnnotationType"; + "baseSentenceAnnotationType"; public static final String TAG_OUTPUT_AS_PARAMETER_NAME = "outputASName"; @@ -108,9 +106,9 @@ @RunTime @Optional - @CreoleParameter(comment = "Should existing " + TOKEN_CATEGORY_FEATURE_NAME + - " features on input annotations be respected (true) or ignored (false)?", - defaultValue = "true") + @CreoleParameter(comment = "Should existing " + + TOKEN_CATEGORY_FEATURE_NAME + + " features on input annotations be respected (true) or ignored (false)?", defaultValue = "true") public void setUseExistingTags(Boolean useTags) { useExistingTags = useTags; } @@ -118,6 +116,7 @@ public Boolean getUseExistingTags() { return useExistingTags; } + private Boolean useExistingTags; protected Logger logger = Logger.getLogger(this.getClass().getName()); @@ -145,43 +144,34 @@ // check the parameters if(document == null) throw new ExecutionException("No document to process!"); - AnnotationSet inputAS = document.getAnnotations(inputASName); - - if(baseTokenAnnotationType == null || - baseTokenAnnotationType.trim().length() == 0) { throw new ExecutionException( - "No base Token Annotation Type provided!"); } - - if(baseSentenceAnnotationType == null || - baseSentenceAnnotationType.trim().length() == 0) { throw new ExecutionException( - "No base Sentence Annotation Type provided!"); } - - if(outputAnnotationType == null || - outputAnnotationType.trim().length() == 0) { throw new ExecutionException( - "No AnnotationType provided to store the new feature!"); } - + if(baseTokenAnnotationType == null + || baseTokenAnnotationType.trim().length() == 0) { throw new ExecutionException( + "No base Token Annotation Type provided!"); } + if(baseSentenceAnnotationType == null + || baseSentenceAnnotationType.trim().length() == 0) { throw new ExecutionException( + "No base Sentence Annotation Type provided!"); } + if(outputAnnotationType == null + || outputAnnotationType.trim().length() == 0) { throw new ExecutionException( + "No AnnotationType provided to store the new feature!"); } AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType); AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType); - if(sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null && - tokensAS.size() > 0) { + if(sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null + && tokensAS.size() > 0) { long startTime = System.currentTimeMillis(); fireStatusChanged("POS tagging " + document.getName()); fireProgressChanged(0); // prepare the input for MaxentTagger List<Word> sentenceForTagger = new ArrayList<Word>(); - // define a comparator for annotations by start offset OffsetComparator offsetComparator = new OffsetComparator(); - // read all the tokens and all the sentences List<Annotation> sentencesList = new ArrayList<Annotation>(sentencesAS); Collections.sort(sentencesList, offsetComparator); List<Annotation> tokensList = new ArrayList<Annotation>(tokensAS); Collections.sort(tokensList, offsetComparator); - Iterator<Annotation> sentencesIter = sentencesList.iterator(); ListIterator<Annotation> tokensIter = tokensList.listIterator(); - List<Annotation> tokensInCurrentSentence = new ArrayList<Annotation>(); Annotation currentToken = tokensIter.next(); int sentIndex = 0; @@ -190,131 +180,125 @@ Annotation currentSentence = sentencesIter.next(); tokensInCurrentSentence.clear(); sentenceForTagger.clear(); - while(currentToken != null && - currentToken.getEndNode().getOffset() - .compareTo(currentSentence.getEndNode().getOffset()) <= 0) { + while(currentToken != null + && currentToken.getEndNode().getOffset() + .compareTo(currentSentence.getEndNode().getOffset()) <= 0) { // If we're only POS tagging Tokens within baseSentenceAnnotationType, // don't add the sentence if the Tokens aren't within the span of // baseSentenceAnnotationType if(posTagAllTokens || currentToken.withinSpanOf(currentSentence)) { tokensInCurrentSentence.add(currentToken); - - if(useExistingTags && currentToken.getFeatures().containsKey( - TOKEN_CATEGORY_FEATURE_NAME)) { - sentenceForTagger.add(new TaggedWord( - (String)currentToken.getFeatures() - .get(TOKEN_STRING_FEATURE_NAME), - (String)currentToken.getFeatures() - .get(TOKEN_CATEGORY_FEATURE_NAME))); + if(useExistingTags + && currentToken.getFeatures().containsKey( + TOKEN_CATEGORY_FEATURE_NAME)) { + sentenceForTagger.add(new TaggedWord((String)currentToken + .getFeatures().get(TOKEN_STRING_FEATURE_NAME), + (String)currentToken.getFeatures().get( + TOKEN_CATEGORY_FEATURE_NAME))); } else { sentenceForTagger.add(new Word((String)currentToken.getFeatures() - .get(TOKEN_STRING_FEATURE_NAME))); + .get(TOKEN_STRING_FEATURE_NAME))); } } currentToken = (tokensIter.hasNext() ? tokensIter.next() : null); } - // if the sentence doesn't contain any tokens (which is a bit weird but // is possible) then don't try running the POS tagger as you will get an // array index out of bounds exception if(sentenceForTagger.isEmpty()) continue; - // run the POS tagger List<TaggedWord> taggerResults = - tagger.tagSentence(sentenceForTagger, useExistingTags); - + tagger.tagSentence(sentenceForTagger, useExistingTags); // add the results // make sure no malfunction occurred if(taggerResults.size() != tokensInCurrentSentence.size()) throw new ExecutionException( - "POS Tagger malfunction: the output size (" + - taggerResults.size() + ") is different from the input size (" + - tokensInCurrentSentence.size() + ")!"); + "POS Tagger malfunction: the output size (" + + taggerResults.size() + + ") is different from the input size (" + + tokensInCurrentSentence.size() + ")!"); Iterator<TaggedWord> resIter = taggerResults.iterator(); Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator(); while(resIter.hasNext()) { Annotation annot = tokIter.next(); - addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, (resIter.next().tag())); + addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, + (resIter.next().tag())); } fireProgressChanged(sentIndex++ * 100 / sentCnt); }// while(sentencesIter.hasNext()) - if(currentToken != null && posTagAllTokens) { // Tag remaining Tokens if we are not considering those only within // baseSentenceAnnotationType - // we have remaining tokens after the last sentence tokensInCurrentSentence.clear(); sentenceForTagger.clear(); while(currentToken != null) { tokensInCurrentSentence.add(currentToken); - if(useExistingTags && currentToken.getFeatures().containsKey( - TOKEN_CATEGORY_FEATURE_NAME)) { - sentenceForTagger.add(new TaggedWord( - (String)currentToken.getFeatures() - .get(TOKEN_STRING_FEATURE_NAME), - (String)currentToken.getFeatures() - .get(TOKEN_CATEGORY_FEATURE_NAME))); + if(useExistingTags + && currentToken.getFeatures().containsKey( + TOKEN_CATEGORY_FEATURE_NAME)) { + sentenceForTagger.add(new TaggedWord((String)currentToken + .getFeatures().get(TOKEN_STRING_FEATURE_NAME), + (String)currentToken.getFeatures().get( + TOKEN_CATEGORY_FEATURE_NAME))); } else { sentenceForTagger.add(new Word((String)currentToken.getFeatures() - .get(TOKEN_STRING_FEATURE_NAME))); + .get(TOKEN_STRING_FEATURE_NAME))); } currentToken = (tokensIter.hasNext() ? tokensIter.next() : null); } - // run the POS tagger on remaining tokens List<TaggedWord> taggerResults = - tagger.tagSentence(sentenceForTagger, useExistingTags); - + tagger.tagSentence(sentenceForTagger, useExistingTags); // add the results and make sure no malfunction occurred if(taggerResults.size() != tokensInCurrentSentence.size()) throw new ExecutionException( - "POS Tagger malfunction: the output size (" + taggerResults.size() + - ") is different from the input size (" + - tokensInCurrentSentence.size() + ")!"); + "POS Tagger malfunction: the output size (" + + taggerResults.size() + + ") is different from the input size (" + + tokensInCurrentSentence.size() + ")!"); Iterator<TaggedWord> resIter = taggerResults.iterator(); Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator(); while(resIter.hasNext()) { Annotation annot = tokIter.next(); - addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, (resIter.next().tag())); + addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, + (resIter.next().tag())); } }// if(currentToken != null) fireProcessFinished(); - fireStatusChanged(document.getName() + - " tagged in " + - NumberFormat.getInstance().format( - (double)(System.currentTimeMillis() - startTime) / 1000) + - " seconds!"); + fireStatusChanged(document.getName() + + " tagged in " + + NumberFormat.getInstance().format( + (double)(System.currentTimeMillis() - startTime) / 1000) + + " seconds!"); } else { if(failOnMissingInputAnnotations) { throw new ExecutionException( - "No sentences or tokens to process in document " + - document.getName() + "\n" + "Please run a sentence splitter " + - "and tokeniser first!"); + "No sentences or tokens to process in document " + + document.getName() + "\n" + "Please run a sentence splitter " + + "and tokeniser first!"); } else { Utils - .logOnce( - logger, - Level.INFO, - "POS tagger: no sentence or token annotations in input document - see debug log for details."); + .logOnce( + logger, + Level.INFO, + "POS tagger: no sentence or token annotations in input document - see debug log for details."); logger.debug("No input annotations in document " + document.getName()); } } - } protected void addFeatures(Annotation annot, String featureName, - String featureValue) throws GateRuntimeException { + String featureValue) throws GateRuntimeException { String tempIASN = inputASName == null ? "" : inputASName; String tempOASN = outputASName == null ? "" : outputASName; - if(outputAnnotationType.equals(baseTokenAnnotationType) && - tempIASN.equals(tempOASN)) { + if(outputAnnotationType.equals(baseTokenAnnotationType) + && tempIASN.equals(tempOASN)) { annot.getFeatures().put(featureName, featureValue); return; } else { int start = annot.getStartNode().getOffset().intValue(); int end = annot.getEndNode().getOffset().intValue(); - // get the annotations of type outputAnnotationType AnnotationSet outputAS = document.getAnnotations(outputASName); AnnotationSet annotations = outputAS.get(outputAnnotationType); @@ -324,7 +308,7 @@ features.put(featureName, featureValue); try { outputAS.add(new Long(start), new Long(end), outputAnnotationType, - features); + features); } catch(Exception e) { throw new GateRuntimeException("Invalid Offsets"); } @@ -332,26 +316,25 @@ // search for the annotation if there is one with the same start and end // offsets ArrayList<Annotation> tempList = - new ArrayList<Annotation>(annotations.get()); + new ArrayList<Annotation>(annotations.get()); boolean found = false; for(int i = 0; i < tempList.size(); i++) { Annotation annotation = tempList.get(i); - if(annotation.getStartNode().getOffset().intValue() == start && - annotation.getEndNode().getOffset().intValue() == end) { + if(annotation.getStartNode().getOffset().intValue() == start + && annotation.getEndNode().getOffset().intValue() == end) { // this is the one annotation.getFeatures().put(featureName, featureValue); found = true; break; } } - if(!found) { // add new annotation FeatureMap features = Factory.newFeatureMap(); features.put(featureName, featureValue); try { outputAS.add(new Long(start), new Long(end), outputAnnotationType, - features); + features); } catch(Exception e) { throw new GateRuntimeException("Invalid Offsets"); } @@ -420,7 +403,7 @@ this.outputASName = outputASName; } - @CreoleParameter(comment = "Path to the tagger's model file", defaultValue = "resources/english-left3words-distsim.tagger", suffixes="tagger;model") + @CreoleParameter(comment = "Path to the tagger's model file", defaultValue = "resources/english-left3words-distsim.tagger", suffixes = "tagger;model") public void setModelFile(URL modelFile) { this.modelFile = modelFile; } Modified: gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tokenizer.java =================================================================== --- gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tokenizer.java 2016-08-19 16:50:05 UTC (rev 19525) +++ gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tokenizer.java 2016-08-19 17:01:15 UTC (rev 19526) @@ -19,10 +19,8 @@ * * $Id: Tokenizer.java 15468 2013-10-22 21:13:15Z $ */ - package gate.stanford; - import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.process.CoreLabelTokenFactory; import edu.stanford.nlp.process.PTBTokenizer; @@ -46,9 +44,8 @@ /** * This class is a wrapper for the Stanford Tokenizer v3.2.0. */ -@CreoleResource(name = "Stanford PTB Tokenizer", comment = "Stanford Penn Treebank v3 Tokenizer, for English", icon = "tokeniser", helpURL="http://gate.ac.uk/userguide/sec:misc:creole:stanford") +@CreoleResource(name = "Stanford PTB Tokenizer", comment = "Stanford Penn Treebank v3 Tokenizer, for English", icon = "tokeniser", helpURL = "http://gate.ac.uk/userguide/sec:misc:creole:stanford") public class Tokenizer extends AbstractLanguageAnalyser { - private static final long serialVersionUID = -6001371186847970080L; public static final String TAG_DOCUMENT_PARAMETER_NAME = "document"; @@ -95,94 +92,65 @@ // check the parameters if(document == null) throw new ExecutionException("No document to process!"); - - AnnotationSet inputAS = document.getAnnotations(inputASName); + AnnotationSet outputAS = document.getAnnotations(outputASName); - - long startTime = System.currentTimeMillis(); fireStatusChanged("Tokenising " + document.getName()); - fireProgressChanged(0); - - + fireProgressChanged(0); // tokenising goes here String rawText = ""; try { - rawText = document.getContent().getContent(new Long(0), document.getContent().size()).toString(); - } catch (Exception e) { + rawText = + document.getContent() + .getContent(new Long(0), document.getContent().size()).toString(); + } catch(Exception e) { System.out.println("Document content offsets wrong: " + e); } - PTBTokenizer<CoreLabel> ptbt; try { - ptbt = new PTBTokenizer<CoreLabel>(new StringReader(rawText), new CoreLabelTokenFactory(), "invertible=true"); - } catch (Exception e) { + ptbt = + new PTBTokenizer<CoreLabel>(new StringReader(rawText), + new CoreLabelTokenFactory(), "invertible=true"); + } catch(Exception e) { System.out.println("Failed when calling tokenizer: " + e); return; } - Long tokenStart; Long tokenEnd; - Long prevTokenEnd = new Long(0); // this default value lets us capture leading spaces - - for (CoreLabel label; ptbt.hasNext(); ) { + Long prevTokenEnd = new Long(0); // this default value lets us capture + // leading spaces + for(CoreLabel label; ptbt.hasNext();) { label = ptbt.next(); tokenStart = new Long(label.beginPosition()); tokenEnd = new Long(label.endPosition()); - - SimpleFeatureMapImpl tokenMap = new SimpleFeatureMapImpl(); - // add the token annotation try { - tokenMap.put(TOKEN_STRING_FEATURE, document.getContent().getContent(tokenStart, tokenEnd).toString()); + tokenMap.put(TOKEN_STRING_FEATURE, + document.getContent().getContent(tokenStart, tokenEnd).toString()); outputAS.add(tokenStart, tokenEnd, tokenLabel, tokenMap); - } catch (InvalidOffsetException e) { + } catch(InvalidOffsetException e) { System.out.println("Token alignment problem:" + e); } - // do we need to add a space annotation? - if (tokenStart > prevTokenEnd) { + if(tokenStart > prevTokenEnd) { try { - outputAS.add(prevTokenEnd, tokenStart, spaceLabel, new SimpleFeatureMapImpl()); - } catch (InvalidOffsetException e) { + outputAS.add(prevTokenEnd, tokenStart, spaceLabel, + new SimpleFeatureMapImpl()); + } catch(InvalidOffsetException e) { System.out.println("Space token alignment problem:" + e); } - } - prevTokenEnd = tokenEnd; - } - - fireProcessFinished(); - fireStatusChanged(document.getName() + - " tokenised in " + - NumberFormat.getInstance().format( - (double)(System.currentTimeMillis() - startTime) / 1000) + - " seconds!"); + fireStatusChanged(document.getName() + + " tokenised in " + + NumberFormat.getInstance().format( + (double)(System.currentTimeMillis() - startTime) / 1000) + + " seconds!"); } - public void setEncoding(String encoding) { - this.encoding = encoding; - } - - @Optional - @RunTime - @CreoleParameter(comment = "Input annotation set name", defaultValue = "") - public void setInputASName(String newInputASName) { - inputASName = newInputASName; - } - - public String getInputASName() { - return inputASName; - } - - public String getEncoding() { - return this.encoding; - } - public String getOutputASName() { return this.outputASName; } @@ -194,7 +162,6 @@ this.outputASName = outputASName; } - public String getTokenLabel() { return this.tokenLabel; } @@ -217,14 +184,9 @@ this.spaceLabel = spaceLabel; } - private String inputASName; - - private String encoding; - private String outputASName; private String tokenLabel; private String spaceLabel; - } Modified: gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishDependencies.java =================================================================== --- gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishDependencies.java 2016-08-19 16:50:05 UTC (rev 19525) +++ gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishDependencies.java 2016-08-19 17:01:15 UTC (rev 19526) @@ -15,7 +15,6 @@ * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. */ - package gate.stanford.apps; import gate.creole.PackagedController; @@ -26,21 +25,18 @@ import java.net.URL; import java.util.List; -@CreoleResource(name="English Dependency Parser", - comment = "Ready-made application for Stanford English parser", - autoinstances = @AutoInstance) +@CreoleResource(name = "English Dependency Parser", comment = "Ready-made application for Stanford English parser", autoinstances = @AutoInstance) public class EnglishDependencies extends PackagedController { - private static final long serialVersionUID = 3163023140886167369L; @Override - @CreoleParameter(defaultValue="resources/sample_parser_en.gapp") + @CreoleParameter(defaultValue = "resources/sample_parser_en.gapp") public void setPipelineURL(URL url) { - this.url = url; + this.url = url; } - + @Override - @CreoleParameter(defaultValue="Stanford Parser") + @CreoleParameter(defaultValue = "Stanford Parser") public void setMenu(List<String> menu) { super.setMenu(menu); } Modified: gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishPOSDependencies.java =================================================================== --- gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishPOSDependencies.java 2016-08-19 16:50:05 UTC (rev 19525) +++ gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishPOSDependencies.java 2016-08-19 17:01:15 UTC (rev 19526) @@ -15,7 +15,6 @@ * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. */ - package gate.stanford.apps; import gate.creole.PackagedController; @@ -26,21 +25,18 @@ import java.net.URL; import java.util.List; -@CreoleResource(name="English POS Tagger and Dependency Parser", - comment = "Ready-made application for Stanford English POS tagger and parser", - autoinstances = @AutoInstance) +@CreoleResource(name = "English POS Tagger and Dependency Parser", comment = "Ready-made application for Stanford English POS tagger and parser", autoinstances = @AutoInstance) public class EnglishPOSDependencies extends PackagedController { - private static final long serialVersionUID = 3163023140886167369L; @Override - @CreoleParameter(defaultValue="resources/sample_pos+parser_en.gapp") + @CreoleParameter(defaultValue = "resources/sample_pos+parser_en.gapp") public void setPipelineURL(URL url) { - this.url = url; + this.url = url; } - + @Override - @CreoleParameter(defaultValue="Stanford Parser") + @CreoleParameter(defaultValue = "Stanford Parser") public void setMenu(List<String> menu) { super.setMenu(menu); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs