Revision: 17168
http://sourceforge.net/p/gate/code/17168
Author: markagreenwood
Date: 2013-12-09 17:38:07 +0000 (Mon, 09 Dec 2013)
Log Message:
-----------
too many changes to list -- I think possibly the class name is about the only
thing to have been retained (and I was tempted to change that too)
Modified Paths:
--------------
gate/trunk/plugins/Lang_Bulgarian/src/gate/bulstem/BulStemPR.java
Modified: gate/trunk/plugins/Lang_Bulgarian/src/gate/bulstem/BulStemPR.java
===================================================================
--- gate/trunk/plugins/Lang_Bulgarian/src/gate/bulstem/BulStemPR.java	2013-12-09 16:45:10 UTC (rev 17167)
+++ gate/trunk/plugins/Lang_Bulgarian/src/gate/bulstem/BulStemPR.java	2013-12-09 17:38:07 UTC (rev 17168)
@@ -1,9 +1,8 @@
/*
* BulStemPR.java
*
+ * Copyright (c) 2013 The University of Sheffield.
*
- * Copyright (c) 2010,2011 The University of Sheffield.
- *
* This file is part of GATE (see http://gate.ac.uk/), and is free software,
* licenced under the GNU Library General Public License, Version 2, June 1991.
*
@@ -18,6 +17,7 @@
import gate.AnnotationSet;
import gate.ProcessingResource;
import gate.Resource;
+import gate.Utils;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
@@ -27,136 +27,152 @@
import gate.creole.metadata.RunTime;
import java.io.BufferedReader;
-import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
-import java.net.URISyntaxException;
import java.net.URL;
-import java.util.Hashtable;
+import java.text.NumberFormat;
+import java.util.HashMap;
+import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.apache.commons.io.IOUtils;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+
/**
* Stemming algorithm by Preslav Nakov.
*
* @author Alexander Alexandrov, e-mail: [email protected], provided the JAVA
* implementation of the algorithm
* @author Ivelina Nikolova, e-mail:[email protected], wrapped the stemmer for GATE
- * @since 2013-12-05
*/
-@CreoleResource(name = "Stemmer BulStem", helpURL = "http://lml.bas.bg/~nakov/bulstem/", comment = "This plugin is an implementation of the BulStem stemmer algorithm for Bulgarian developed by Preslav Nakov.")
+@CreoleResource(name = "BulStem", helpURL = "http://lml.bas.bg/~nakov/bulstem/", comment = "This plugin is an implementation of the BulStem stemmer algorithm for Bulgarian developed by Preslav Nakov.")
public class BulStemPR extends AbstractLanguageAnalyser implements
ProcessingResource, Serializable {
+ private static final long serialVersionUID = 257778017962925274L;
+
+ protected Logger logger = Logger.getLogger(this.getClass());
+
private URL rulesURL;
- public Hashtable stemmingRules = new Hashtable();
+ private String annotationSetName;
- public int STEM_BOUNDARY = 1;
+ private String annotationType;
- public static Pattern vocals = Pattern.compile("[^аъоуеиюя]*[аъоуеиюя]");
+ private Map<String, String> stemmingRules;;
- public static Pattern p = Pattern
+ // should we make this an init param?
+ // at the moment this always excludes 8556 entries from the default rules file
+ private static final int STEM_BOUNDARY = 1;
+
+ private Boolean failOnMissingInputAnnotations = true;
+
+ private static final Pattern vocals = Pattern
+ .compile("[^аъоуеиюя]*[аъоуеиюя]");
+
+ public static final Pattern p = Pattern
.compile("([а-я]+)\\s==>\\s([а-я]+)\\s([0-9]+)");
- // Exit gracefully if exception caught on init()
- private boolean gracefulExit;
-
@Override
public Resource init() throws ResourceInstantiationException {
+
// check required parameters are set
- if(rulesURL == null) {
- // throw new
- // ResourceInstantiationException("outputMode parameter must be set");
- gate.util.Err.println("rulesURL parameter must be set");
- gracefulExit = true;
+ if(rulesURL == null) { throw new ResourceInstantiationException(
+ "rulesURL param must be set"); }
+
+ stemmingRules = new HashMap<String, String>();
+
+ BufferedReader br = null;
+ try {
+ br = new BufferedReader(new InputStreamReader(rulesURL.openStream()));
+ String s = null;
+ while((s = br.readLine()) != null) {
+ Matcher m = p.matcher(s);
+ if(m.matches()) {
+ if(Integer.parseInt(m.group(3)) > STEM_BOUNDARY) {
+ stemmingRules.put(m.group(1), m.group(2));
+ }
+ }
+ }
+ } catch(Exception e) {
+ throw new ResourceInstantiationException(e);
+ } finally {
+ if(br != null) IOUtils.closeQuietly(br);
}
return this;
-
}
- /* Set gracefulExit flag and clean up */
- private void gracefulExit(String msg) {
- gate.util.Err.println(msg);
- cleanup();
- fireProcessFinished();
- }
-
@Override
public void execute() throws ExecutionException {
- // check required parameters are set
- if(rulesURL == null) {
- // throw new
- // ResourceInstantiationException("outputMode parameter must be set");
- gracefulExit("rulesURL parameter must be set in BulStem PR");
- return;
- }
- try {
- loadStemmingRules(this.rulesURL.getPath());
- } catch(URISyntaxException e1) {
- // TODO Auto-generated catch block
- e1.printStackTrace();
- } catch(Exception e1) {
- // TODO Auto-generated catch block
- e1.printStackTrace();
- }
+ // get all the tokens from the specified annotation set
+ AnnotationSet allTokens =
+ document.getAnnotations(annotationSetName).get(annotationType);
- // Just process the entire document
- // String docText = document.getContent().toString();
- AnnotationSet allTokens = document.getAnnotations().get("Token");
- try {
- // System.out.println("bustem works");
- this.processWithBulstem(allTokens);
- } catch(Exception e) {
- gracefulExit(e.getMessage());
- }
- }
+ if(allTokens.size() > 0) {
- private void processWithBulstem(AnnotationSet allTokens) {
- // TODO Auto-generated method stub
- for(Annotation token : allTokens) {
- String tokenString = token.getFeatures().get("string").toString();
- String stem = stem(tokenString).toLowerCase();
- token.getFeatures().put("stem", stem);
- }
+ // sort out the status reporting stuff
+ long startTime = System.currentTimeMillis();
+ fireStatusChanged("Running BulStem over " + document.getName());
+ fireProgressChanged(0);
+ int tokenCount = 0;
- }
+ for(Annotation token : allTokens) {
+ // for each Token annotation...
- public void loadStemmingRules(String fileName) throws Exception {
- stemmingRules.clear();
- FileInputStream fis = new FileInputStream(fileName);
- BufferedReader br = new BufferedReader(new InputStreamReader(fis));
- String s = null;
- while((s = br.readLine()) != null) {
- Matcher m = p.matcher(s);
- if(m.matches()) {
- int j = m.groupCount();
- if(j == 3) {
- if(Integer.parseInt(m.group(3)) > STEM_BOUNDARY) {
- stemmingRules.put(m.group(1), m.group(2));
- }
- }
+ // get the string feature
+ String tokenString = token.getFeatures().get("string").toString();
+
+ // stem the string feature and change it to lowercase
+ String stem = stem(tokenString).toLowerCase();
+
+ // store the new feature
+ token.getFeatures().put("stem", stem);
+
+ // report our progress
+ fireProgressChanged(tokenCount++ * 100 / allTokens.size());
}
+
+ // we've finished so report this
+ fireProcessFinished();
+ fireStatusChanged(document.getName() +
+ " stemmed in " +
+ NumberFormat.getInstance().format(
+ (double)(System.currentTimeMillis() - startTime) / 1000) +
+ " seconds!");
+ } else {
+ if(failOnMissingInputAnnotations) {
+ throw new ExecutionException("No tokens to process in document " +
+ document.getName() + "\n" + "Please run a tokeniser first!");
+ } else {
+ Utils
+ .logOnce(logger, Level.INFO,
+ "BulStem: no token annotations in input document - see debug log for details.");
+ logger.debug("No input annotations in document " + document.getName());
+ }
}
}
- public String stem(String word) {
+ private String stem(String word) {
Matcher m = vocals.matcher(word);
if(!m.lookingAt()) { return word; }
+
for(int i = m.end() + 1; i < word.length(); i++) {
String suffix = word.substring(i);
- if((suffix = (String)stemmingRules.get(suffix)) != null) { return word
- .substring(0, i) + suffix; }
+ if((suffix = stemmingRules.get(suffix)) != null) {
+ // get the new stem by cutting up the word and adding the right suffix
+ // from the rules
+ return word.substring(0, i) + suffix;
+ }
}
return word;
}
// PR parameters
- @Optional
- @RunTime
- @CreoleParameter(comment = "Path to rules", defaultValue = "resources/stem_rules_context_2_UTF-8.txt")
+ @CreoleParameter(comment = "Stemming Rules File", defaultValue = "resources/stem_rules_context_2_UTF-8.txt")
public void setPathToRules(URL rulesURL) {
this.rulesURL = rulesURL;
}
@@ -165,5 +181,35 @@
return rulesURL;
}
-} // class MetaMapPR
+ @Optional
+ @RunTime
+ @CreoleParameter(comment = "The annotation set to use as input")
+ public void setAnnotationSetName(String annotationSetName) {
+ this.annotationSetName = annotationSetName;
+ }
+ public String getAnnotationSetName() {
+ return annotationSetName;
+ }
+
+ @RunTime
+ @CreoleParameter(comment = "The name of the base 'Token' annotation type", defaultValue = "Token")
+ public void setAnnotationType(String annotationType) {
+ this.annotationType = annotationType;
+ }
+
+ public String getAnnotationType() {
+ return annotationType;
+ }
+
+ @RunTime
+ @Optional
+ @CreoleParameter(comment = "Throw an exception when there are none of the required input annotations", defaultValue = "true")
+ public void setFailOnMissingInputAnnotations(Boolean fail) {
+ failOnMissingInputAnnotations = fail;
+ }
+
+ public Boolean getFailOnMissingInputAnnotations() {
+ return failOnMissingInputAnnotations;
+ }
+}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Sponsored by Intel(R) XDK
Develop, test and display web and hybrid apps with a single code base.
Download it for free now!
http://pubads.g.doubleclick.net/gampad/clk?id=111408631&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs