TextCategorizationPR.java

valyt Tue, 19 Nov 2013 04:28:18 -0800

Revision: 17105
          http://sourceforge.net/p/gate/code/17105
Author:   valyt
Date:     2013-11-19 12:27:16 +0000 (Tue, 19 Nov 2013)
Log Message:
-----------
The categorization PR is now implemented, and it even compiles.


No warranties provided as to it actually doing anything just yet...

Modified Paths:
--------------
    
gate/trunk/plugins/Text_Categorization/src/gate/ml/categorization/TextCategorizationPR.java

Modified: 
gate/trunk/plugins/Text_Categorization/src/gate/ml/categorization/TextCategorizationPR.java
===================================================================
--- 
gate/trunk/plugins/Text_Categorization/src/gate/ml/categorization/TextCategorizationPR.java
 2013-11-19 11:15:12 UTC (rev 17104)
+++ 
gate/trunk/plugins/Text_Categorization/src/gate/ml/categorization/TextCategorizationPR.java
 2013-11-19 12:27:16 UTC (rev 17105)
@@ -1,23 +1,20 @@
-/**
- * 
+/*
+ *  TextCategorizationPR.java
+ *
+ *  Copyright (c) 1995-2013, The University of Sheffield. See the file
+ *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
+ *
+ *  This file is part of GATE (see http://gate.ac.uk/), and is free
+ *  software, licenced under the GNU Library General Public License,
+ *  Version 2, June 1991 (in the distribution as file licence.html,
+ *  and also available at http://gate.ac.uk/gate/licence.html).
+ *
+ *  Valentin Tablan, 19 Nov 2013
+ *
+ *  $Id$
  */
 package gate.ml.categorization;
 
-import java.io.BufferedInputStream;
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.StringReader;
-import java.io.UnsupportedEncodingException;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Properties;
-import java.util.Set;
-
 import de.bwaldvogel.liblinear.Feature;
 import de.bwaldvogel.liblinear.FeatureNode;
 import de.bwaldvogel.liblinear.Linear;
@@ -29,6 +26,8 @@
 import edu.ucla.sspace.vector.DoubleVector;
 import gate.Annotation;
 import gate.AnnotationSet;
+import gate.Factory;
+import gate.FeatureMap;
 import gate.LanguageAnalyser;
 import gate.Resource;
 import gate.Utils;
@@ -39,6 +38,20 @@
 import gate.creole.metadata.CreoleResource;
 import gate.creole.metadata.RunTime;
 
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.StringReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Properties;
+import java.util.Set;
+
+import org.apache.log4j.Logger;
+
 /**
  * A simple text classification PR, using a <a 
  * href="https://github.com/fozziethebeat/S-Space/";>Semantic Space</a>
@@ -50,6 +63,8 @@
 public class TextCategorizationPR extends AbstractLanguageAnalyser implements
                                                                    
LanguageAnalyser {
   
+  private static Logger logger = Logger.getLogger(TextCategorizationPR.class);
+  
   /**
    * Serialisation UID.
    */
@@ -68,22 +83,34 @@
    */
   protected Model libLinearModel;
   
+
   /**
    * Thresholds to be used when classifying text annotations. If the model 
    * classifies an input annotation as class &quot;X&quot;, and &quot;X&quot;
    * exists as a key in this map, then the classification is only effected if 
    * the classification probability emitted by the model is greater than or 
    * equal to the value in the map. Values in this map are expected to be
-   * probabilities, i.e. positive values between 0.0 and 1.0.
+   * probabilities, i.e. positive values between 0.0 and 1.0. All negative 
+   * values are ignored and the default will be used instead (the category with 
+   * the highest probability wins, regardless of what the probability actually 
+   * is).
    * 
    * Note that the liblinear model used must be able to produce classification 
    * probabilities. At the time of writing, only Linear Regression models are 
    * able to do so, while SVM-based one are not. Models that cannot supply
    * probabilities will simply return 1.0 as the classification probability, 
    * causing values in this map to have no effect.   
+   */  
+  protected double[] categoryThresholds;
+  
+  
+  /**
+   * The String labels for the categories (the LIBLINEAR classifier simply 
+   * returns integer values for the labels).
    */
-  protected Map<String, Double> customClassThresholds;
+  protected String[] categoryLabels;
   
+  
   /**
    * The set of stop-words to be used. The values in this list are matched
    * verbatim against the values of the configured input feature on the input
@@ -147,6 +174,10 @@
    */
   protected URL modelURL;
 
+  /**
+   * URL to a file defining the categories used by this PR.
+   */
+  protected URL categoriesURL;
   
   @Override
   public Resource init() throws ResourceInstantiationException {
@@ -192,15 +223,16 @@
     // load the stop words
     stopWords = null;
     if(stopWordsURL != null) {
+      BufferedReader swReader = null;
       try {
-        BufferedReader swReader = new BufferedReader(
+        swReader = new BufferedReader(
             new InputStreamReader(stopWordsURL.openStream(), "UTF-8"));
         stopWords = new HashSet<String>();
         String line = swReader.readLine();
         while(line != null) {
           line = line.trim();
-          if(line.startsWith("#") || line.startsWith("//")) {
-            //ignore comment
+          if(line.startsWith("#") || line.startsWith("//") || line.length() == 
0) {
+            // ignore comment and empty lines
           } else {
             stopWords.add(line);
           }
@@ -209,8 +241,90 @@
       } catch(IOException e) {
         throw new ResourceInstantiationException(
             "I/O error while reading the stop words.", e);
+      } finally {
+        if (swReader != null) {
+          try {
+            swReader.close();
+          } catch(IOException e) {
+            throw new ResourceInstantiationException(
+                "I/O error whilke closing the stop words file.", e); 
+          }
+        }
       }
     }
+    
+    // read the categories
+    List<String> categoryNames = new ArrayList<String>();
+    List<Double> categoryProbs = new ArrayList<Double>();
+    boolean customProbs = false;
+    BufferedReader catReader = null;
+    if(categoriesURL != null) {
+      try {
+        catReader = new BufferedReader(
+            new InputStreamReader(categoriesURL.openStream(), "UTF-8"));
+        String line = catReader.readLine();
+        while(line != null) {
+          line = line.trim();
+          if(line.startsWith("#") || line.startsWith("//") || line.length() == 
0) {
+            // ignore comment and empty lines
+          } else {
+            String[] elems = line.split(",");
+            if(elems.length == 0) {
+              logger.warn("Ignoring illegal line in categories file: \"" + 
+                line + "\".");
+            } else {
+              categoryNames.add(elems[0]);
+              double prob = -1;
+              if(elems.length > 1) {
+                try {
+                  prob = Double.parseDouble(elems[1]);
+                } catch(NumberFormatException e) {
+                  logger.error(
+                      "Illegal value for probablity in categories file \"" + 
+                      elems[1] + "\" was ignored.");
+                }
+              }
+              categoryProbs.add(prob);
+              if(prob > 0) customProbs = true;
+              if(elems.length > 2) {
+                logger.warn("Line in categories file has more than 2 entries." 
+
+                " Entries starting with " + elems[2] + " were ignored. " +
+                "Line was:\n" + line);
+              }
+            }
+          }
+          line = catReader.readLine();
+        }
+        categoryLabels = categoryNames.toArray(new 
String[categoryNames.size()]);
+        if(customProbs) {
+          if(libLinearModel.isProbabilityModel()) {
+            categoryThresholds = new double[categoryProbs.size()];
+            for(int i = 0; i< categoryThresholds.length; i++) {
+              categoryThresholds[i] = categoryProbs.get(i);
+            }            
+          } else {
+            logger.warn("The LIBLINEAR model provided cannot supply " + 
+                "probabilities. Custom probability thresholds will be 
ignored.");
+            categoryThresholds = null;
+          }
+        }
+      } catch (IOException ioe){
+        throw new ResourceInstantiationException(
+            "I/O error while reading the categories file.", ioe); 
+      } finally {
+        if(catReader != null) try {
+          catReader.close();
+        } catch (IOException ioe) {
+          throw new ResourceInstantiationException(
+              "I/O error while closing the categories file.", ioe); 
+        }
+      }
+    } else {
+      throw new ResourceInstantiationException(
+          "No categories file URL was provided.");
+    }
+    
+    
     return super.init();
   }
 
@@ -239,6 +353,7 @@
       throw new ExecutionException("No output feature name provided."); 
     }
     
+    AnnotationSet outputAS = document.getAnnotations(outputASName);
     
     // collect instance annotations
     AnnotationSet inputAS = document.getAnnotations(inputASName);
@@ -273,12 +388,35 @@
           features[i] = new FeatureNode(i, instanceVector.get(i));
         }
         double[] probs = new double[libLinearModel.getNrClass()];
-        double label = Linear.predictValues(libLinearModel, features, probs);
+        // the value returned is always an int. Returned as a double because
+        // that's what cool C programmers do, or whatever...
+        // We're uncool Java types, so we bring it back to int.
+        int label = (int) Linear.predictValues(libLinearModel, features, 
probs);
+        double probability = 1.0;
         // do we need to check probabilities?
-        // TODO
-        if(customClassThresholds != null && customClassThresholds.size() > 0) {
-          
+        if(categoryThresholds != null && categoryThresholds[label] > 0) {
+          // prob[i] is the value of $\theta_T x$ 
+          // Once mapped through the logistic function, this becomes the 
+          // probability of the instance belonging to class i, as opposed to 
+          // all other classes.
+          // LIBLINEAR would normalize this so that all probs sum up to 1,
+          // but we don't want that, as we're only interested in the confidence
+          // the model has in this particular classification.
+          probability = probs[label];
+          // convert to an actual probability, by applying the logistic function
+          probability = 1 / (1 + Math.exp(-probability));
+          if(probability < categoryThresholds[label]) probability = -1;        
    
         }
+        if(probability > 0) {
+          // effect the classification
+          if(sameAnnotation) {
+            instAnn.getFeatures().put(outputFeatureName, 
categoryLabels[label]);
+          } else {
+            FeatureMap fm = Factory.newFeatureMap();
+            fm.put(outputFeatureName, categoryLabels[label]);
+            Utils.addAnn(outputAS, instAnn, outputAnnotationType, fm);
+          }
+        }
       }
       
     }
@@ -287,8 +425,9 @@
 
   @Override
   public void cleanup() {
-    // TODO Auto-generated method stub
-    super.cleanup();
+    libLinearModel = null;
+    semanticSpace = null;
+    stopWords = null;
   }
 
   public String getInputASName() {
@@ -445,7 +584,19 @@
   public void setStopWordsURL(URL stopWordsURL) {
     this.stopWordsURL = stopWordsURL;
   }
+
+  public URL getCategoriesURL() {
+    return categoriesURL;
+  }
+
+  /**
+   * Sets the URL to the file containing the categories.
+   * @param categoriesURL
+   */
+  @CreoleParameter(comment = "A comma-separated file containing the category " 
+ 
+      "name and, optionally, a custom probability threshold for that 
category.")
+  public void setCategoriesURL(URL categoriesURL) {
+    this.categoriesURL = categoriesURL;
+  }
   
-  
-  
 }

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Shape the Mobile Experience: Free Subscription
Software experts and developers: Be at the forefront of tech innovation.
Intel(R) Software Adrenaline delivers strategic insight and game-changing 
conversations that shape the rapidly evolving mobile landscape. Sign up now. 
http://pubads.g.doubleclick.net/gampad/clk?id=63431311&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

[gate-cvs] SF.net SVN: gate:[17105] gate/trunk/plugins/Text_Categorization/src/gate/ml/categorization/TextCategorizationPR.java

Reply via email to