Revision: 17168
          http://sourceforge.net/p/gate/code/17168
Author:   markagreenwood
Date:     2013-12-09 17:38:07 +0000 (Mon, 09 Dec 2013)
Log Message:
-----------
Too many changes to list -- I think the class name is about the only 
thing to have been retained (and I was tempted to change that too).

Modified Paths:
--------------
    gate/trunk/plugins/Lang_Bulgarian/src/gate/bulstem/BulStemPR.java

Modified: gate/trunk/plugins/Lang_Bulgarian/src/gate/bulstem/BulStemPR.java
===================================================================
--- gate/trunk/plugins/Lang_Bulgarian/src/gate/bulstem/BulStemPR.java   
2013-12-09 16:45:10 UTC (rev 17167)
+++ gate/trunk/plugins/Lang_Bulgarian/src/gate/bulstem/BulStemPR.java   
2013-12-09 17:38:07 UTC (rev 17168)
@@ -1,9 +1,8 @@
 /*
  * BulStemPR.java
  * 
+ * Copyright (c) 2013 The University of Sheffield.
  * 
- * Copyright (c) 2010,2011 The University of Sheffield.
- * 
  * This file is part of GATE (see http://gate.ac.uk/), and is free software,
  * licenced under the GNU Library General Public License, Version 2, June1991.
  * 
@@ -18,6 +17,7 @@
 import gate.AnnotationSet;
 import gate.ProcessingResource;
 import gate.Resource;
+import gate.Utils;
 import gate.creole.AbstractLanguageAnalyser;
 import gate.creole.ExecutionException;
 import gate.creole.ResourceInstantiationException;
@@ -27,136 +27,152 @@
 import gate.creole.metadata.RunTime;
 
 import java.io.BufferedReader;
-import java.io.FileInputStream;
 import java.io.InputStreamReader;
 import java.io.Serializable;
-import java.net.URISyntaxException;
 import java.net.URL;
-import java.util.Hashtable;
+import java.text.NumberFormat;
+import java.util.HashMap;
+import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.apache.commons.io.IOUtils;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+
 /**
  * Stemming algorithm by Preslav Nakov.
  * 
  * @author Alexander Alexandrov, e-mail: [email protected], provided the JAVA
  *         implementation of the algorithm
  * @author Ivelina Nikolova, e-mail:[email protected], wrapped the stemmer for 
GATE
- * @since 2013-12-05
  */
-@CreoleResource(name = "Stemmer BulStem", helpURL = 
"http://lml.bas.bg/~nakov/bulstem/", comment = "This plugin is an 
implementation of the BulStem stemmer algorithm for Bulgarian developed by 
Preslav Nakov.")
+@CreoleResource(name = "BulStem", helpURL = 
"http://lml.bas.bg/~nakov/bulstem/", comment = "This plugin is an 
implementation of the BulStem stemmer algorithm for Bulgarian developed by 
Preslav Nakov.")
 public class BulStemPR extends AbstractLanguageAnalyser implements
   ProcessingResource, Serializable {
 
+  private static final long serialVersionUID = 257778017962925274L;
+
+  protected Logger logger = Logger.getLogger(this.getClass());
+
   private URL rulesURL;
 
-  public Hashtable stemmingRules = new Hashtable();
+  private String annotationSetName;
 
-  public int STEM_BOUNDARY = 1;
+  private String annotationType;
 
-  public static Pattern vocals = Pattern.compile("[^аъоуеиюя]*[аъоуеиюя]");
+  private Map<String, String> stemmingRules;;
 
-  public static Pattern p = Pattern
+  // should we make this an init param?
+  // at the moment this always excludes 8556 entries from the default rules 
file
+  private static final int STEM_BOUNDARY = 1;
+
+  private Boolean failOnMissingInputAnnotations = true;
+
+  private static final Pattern vocals = Pattern
+    .compile("[^аъоуеиюя]*[аъоуеиюя]");
+
+  public static final Pattern p = Pattern
     .compile("([а-я]+)\\s==>\\s([а-я]+)\\s([0-9]+)");
 
-  // Exit gracefully if exception caught on init()
-  private boolean gracefulExit;
-
   @Override
   public Resource init() throws ResourceInstantiationException {
+
     // check required parameters are set
-    if(rulesURL == null) {
-      // throw new
-      // ResourceInstantiationException("outputMode parameter must be set");
-      gate.util.Err.println("rulesURL parameter must be set");
-      gracefulExit = true;
+    if(rulesURL == null) { throw new ResourceInstantiationException(
+      "rulesURL param must be set"); }
+
+    stemmingRules = new HashMap<String, String>();
+
+    BufferedReader br = null;
+    try {
+      br = new BufferedReader(new InputStreamReader(rulesURL.openStream()));
+      String s = null;
+      while((s = br.readLine()) != null) {
+        Matcher m = p.matcher(s);
+        if(m.matches()) {
+          if(Integer.parseInt(m.group(3)) > STEM_BOUNDARY) {
+            stemmingRules.put(m.group(1), m.group(2));
+          }
+        }
+      }
+    } catch(Exception e) {
+      throw new ResourceInstantiationException(e);
+    } finally {
+      if(br != null) IOUtils.closeQuietly(br);
     }
 
     return this;
-
   }
 
-  /* Set gracefulExit flag and clean up */
-  private void gracefulExit(String msg) {
-    gate.util.Err.println(msg);
-    cleanup();
-    fireProcessFinished();
-  }
-
   @Override
   public void execute() throws ExecutionException {
-    // check required parameters are set
-    if(rulesURL == null) {
-      // throw new
-      // ResourceInstantiationException("outputMode parameter must be set");
-      gracefulExit("rulesURL parameter must be set in BulStem PR");
-      return;
-    }
 
-    try {
-      loadStemmingRules(this.rulesURL.getPath());
-    } catch(URISyntaxException e1) {
-      // TODO Auto-generated catch block
-      e1.printStackTrace();
-    } catch(Exception e1) {
-      // TODO Auto-generated catch block
-      e1.printStackTrace();
-    }
+    // get all the tokens from the specified annotation set
+    AnnotationSet allTokens =
+      document.getAnnotations(annotationSetName).get(annotationType);
 
-    // Just process the entire document
-    // String docText = document.getContent().toString();
-    AnnotationSet allTokens = document.getAnnotations().get("Token");
-    try {
-      // System.out.println("bustem works");
-      this.processWithBulstem(allTokens);
-    } catch(Exception e) {
-      gracefulExit(e.getMessage());
-    }
-  }
+    if(allTokens.size() > 0) {
 
-  private void processWithBulstem(AnnotationSet allTokens) {
-    // TODO Auto-generated method stub
-    for(Annotation token : allTokens) {
-      String tokenString = token.getFeatures().get("string").toString();
-      String stem = stem(tokenString).toLowerCase();
-      token.getFeatures().put("stem", stem);
-    }
+      // sort out the status reporting stuff
+      long startTime = System.currentTimeMillis();
+      fireStatusChanged("Running BulStem over " + document.getName());
+      fireProgressChanged(0);
+      int tokenCount = 0;
 
-  }
+      for(Annotation token : allTokens) {
+        // for each Token annotation...
 
-  public void loadStemmingRules(String fileName) throws Exception {
-    stemmingRules.clear();
-    FileInputStream fis = new FileInputStream(fileName);
-    BufferedReader br = new BufferedReader(new InputStreamReader(fis));
-    String s = null;
-    while((s = br.readLine()) != null) {
-      Matcher m = p.matcher(s);
-      if(m.matches()) {
-        int j = m.groupCount();
-        if(j == 3) {
-          if(Integer.parseInt(m.group(3)) > STEM_BOUNDARY) {
-            stemmingRules.put(m.group(1), m.group(2));
-          }
-        }
+        // get the string feature
+        String tokenString = token.getFeatures().get("string").toString();
+
+        // stem the string feature and change it to lowercase
+        String stem = stem(tokenString).toLowerCase();
+
+        // store the new feature
+        token.getFeatures().put("stem", stem);
+
+        // report our progress
+        fireProgressChanged(tokenCount++ * 100 / allTokens.size());
       }
+
+      // we've finished so report this
+      fireProcessFinished();
+      fireStatusChanged(document.getName() +
+        " stemmed in " +
+        NumberFormat.getInstance().format(
+          (double)(System.currentTimeMillis() - startTime) / 1000) +
+        " seconds!");
+    } else {
+      if(failOnMissingInputAnnotations) {
+        throw new ExecutionException("No tokens to process in document " +
+          document.getName() + "\n" + "Please run a tokeniser first!");
+      } else {
+        Utils
+          .logOnce(logger, Level.INFO,
+            "BulStem: no token annotations in input document - see debug log 
for details.");
+        logger.debug("No input annotations in document " + document.getName());
+      }
     }
   }
 
-  public String stem(String word) {
+  private String stem(String word) {
     Matcher m = vocals.matcher(word);
     if(!m.lookingAt()) { return word; }
+
     for(int i = m.end() + 1; i < word.length(); i++) {
       String suffix = word.substring(i);
-      if((suffix = (String)stemmingRules.get(suffix)) != null) { return word
-        .substring(0, i) + suffix; }
+      if((suffix = stemmingRules.get(suffix)) != null) {
+        // get the new stem by cutting up the word and adding the right suffix
+        // from the rules
+        return word.substring(0, i) + suffix;
+      }
     }
     return word;
   }
 
   // PR parameters
-  @Optional
-  @RunTime
-  @CreoleParameter(comment = "Path to rules", defaultValue = 
"resources/stem_rules_context_2_UTF-8.txt")
+  @CreoleParameter(comment = "Stemming Rules File", defaultValue = 
"resources/stem_rules_context_2_UTF-8.txt")
   public void setPathToRules(URL rulesURL) {
     this.rulesURL = rulesURL;
   }
@@ -165,5 +181,35 @@
     return rulesURL;
   }
 
-} // class MetaMapPR
+  @Optional
+  @RunTime
+  @CreoleParameter(comment = "The annotation set to use as input")
+  public void setAnnotationSetName(String annotationSetName) {
+    this.annotationSetName = annotationSetName;
+  }
 
+  public String getAnnotationSetName() {
+    return annotationSetName;
+  }
+
+  @RunTime
+  @CreoleParameter(comment = "The name of the base 'Token' annotation type", 
defaultValue = "Token")
+  public void setAnnotationType(String annotationType) {
+    this.annotationType = annotationType;
+  }
+
+  public String getAnnotationType() {
+    return annotationType;
+  }
+
+  @RunTime
+  @Optional
+  @CreoleParameter(comment = "Throw an exception when there are none of the 
required input annotations", defaultValue = "true")
+  public void setFailOnMissingInputAnnotations(Boolean fail) {
+    failOnMissingInputAnnotations = fail;
+  }
+
+  public Boolean getFailOnMissingInputAnnotations() {
+    return failOnMissingInputAnnotations;
+  }
+}

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Sponsored by Intel(R) XDK 
Develop, test and display web and hybrid apps with a single code base.
Download it for free now!
http://pubads.g.doubleclick.net/gampad/clk?id=111408631&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to