Revision: 17104
http://sourceforge.net/p/gate/code/17104
Author: valyt
Date: 2013-11-19 11:15:12 +0000 (Tue, 19 Nov 2013)
Log Message:
-----------
Classes are dead, long live categories!
Modified Paths:
--------------
gate/trunk/plugins/Text_Categorization/.classpath
gate/trunk/plugins/Text_Categorization/.project
gate/trunk/plugins/Text_Categorization/build.xml
gate/trunk/plugins/Text_Categorization/creole.xml
Added Paths:
-----------
gate/trunk/plugins/Text_Categorization/src/gate/ml/categorization/
gate/trunk/plugins/Text_Categorization/src/gate/ml/categorization/TextCategorizationPR.java
Removed Paths:
-------------
gate/trunk/plugins/Text_Categorization/src/gate/ml/categorization/TextClassificationPR.java
gate/trunk/plugins/Text_Categorization/src/gate/ml/textclass/
Property Changed:
----------------
gate/trunk/plugins/Text_Categorization/
Index: gate/trunk/plugins/Text_Categorization
===================================================================
--- gate/trunk/plugins/Text_Categorization 2013-11-19 11:10:02 UTC (rev 17103)
+++ gate/trunk/plugins/Text_Categorization 2013-11-19 11:15:12 UTC (rev 17104)
Property changes on: gate/trunk/plugins/Text_Categorization
___________________________________________________________________
Modified: svn:ignore
## -1,3 +1,3 ##
classes
-textClassification.jar
+textCategorization.jar
.settings
Modified: gate/trunk/plugins/Text_Categorization/.classpath
===================================================================
--- gate/trunk/plugins/Text_Categorization/.classpath 2013-11-19 11:10:02 UTC (rev 17103)
+++ gate/trunk/plugins/Text_Categorization/.classpath 2013-11-19 11:15:12 UTC (rev 17104)
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" path="src"/>
- <classpathentry kind="con" path="org.apache.ivyde.eclipse.cpcontainer.IVYDE_CONTAINER/?project=gate-plugin-text-classification&amp;ivyXmlPath=build%2Fivy.xml&amp;confs=*"/>
+ <classpathentry kind="con" path="org.apache.ivyde.eclipse.cpcontainer.IVYDE_CONTAINER/?project=gate-plugin-text-categorization&amp;ivyXmlPath=build%2Fivy.xml&amp;confs=*"/>
<classpathentry combineaccessrules="false" kind="src" path="/GATE"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
<classpathentry kind="output" path="classes"/>
Modified: gate/trunk/plugins/Text_Categorization/.project
===================================================================
--- gate/trunk/plugins/Text_Categorization/.project 2013-11-19 11:10:02 UTC (rev 17103)
+++ gate/trunk/plugins/Text_Categorization/.project 2013-11-19 11:15:12 UTC (rev 17104)
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
- <name>gate-plugin-text-classification</name>
+ <name>gate-plugin-text-categorization</name>
<comment></comment>
<projects>
</projects>
Modified: gate/trunk/plugins/Text_Categorization/build.xml
===================================================================
--- gate/trunk/plugins/Text_Categorization/build.xml 2013-11-19 11:10:02 UTC (rev 17103)
+++ gate/trunk/plugins/Text_Categorization/build.xml 2013-11-19 11:15:12 UTC (rev 17104)
@@ -8,7 +8,7 @@
<property name="src" location="src"/>
<property name="build" location="classes"/>
<property name="dist" location="dist"/>
- <property name="jar.location" location="textClassification.jar" />
+ <property name="jar.location" location="textCategorization.jar" />
<property name="test.jar.location" location="mltests.jar" />
<property name="gate.home" location="../.." />
Modified: gate/trunk/plugins/Text_Categorization/creole.xml
===================================================================
--- gate/trunk/plugins/Text_Categorization/creole.xml 2013-11-19 11:10:02 UTC (rev 17103)
+++ gate/trunk/plugins/Text_Categorization/creole.xml 2013-11-19 11:15:12 UTC (rev 17104)
@@ -1,5 +1,5 @@
<?xml version="1.0"?>
<!-- $Id: creole.xml 11850 2009-10-30 15:22:04Z ian_roberts $ -->
<CREOLE-DIRECTORY>
- <JAR SCAN="true">textClassification.jar</JAR>
+ <JAR SCAN="true">textCategorization.jar</JAR>
</CREOLE-DIRECTORY>
Copied: gate/trunk/plugins/Text_Categorization/src/gate/ml/categorization/TextCategorizationPR.java (from rev 17103, gate/trunk/plugins/Text_Categorization/src/gate/ml/textclass/TextClassificationPR.java)
===================================================================
--- gate/trunk/plugins/Text_Categorization/src/gate/ml/categorization/TextCategorizationPR.java (rev 0)
+++ gate/trunk/plugins/Text_Categorization/src/gate/ml/categorization/TextCategorizationPR.java 2013-11-19 11:15:12 UTC (rev 17104)
@@ -0,0 +1,451 @@
+/**
+ *
+ */
+package gate.ml.categorization;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+import de.bwaldvogel.liblinear.Feature;
+import de.bwaldvogel.liblinear.FeatureNode;
+import de.bwaldvogel.liblinear.Linear;
+import de.bwaldvogel.liblinear.Model;
+import edu.ucla.sspace.common.DocumentVectorBuilder;
+import edu.ucla.sspace.common.SemanticSpace;
+import edu.ucla.sspace.common.SemanticSpaceIO;
+import edu.ucla.sspace.vector.DenseVector;
+import edu.ucla.sspace.vector.DoubleVector;
+import gate.Annotation;
+import gate.AnnotationSet;
+import gate.LanguageAnalyser;
+import gate.Resource;
+import gate.Utils;
+import gate.creole.AbstractLanguageAnalyser;
+import gate.creole.ExecutionException;
+import gate.creole.ResourceInstantiationException;
+import gate.creole.metadata.CreoleParameter;
+import gate.creole.metadata.CreoleResource;
+import gate.creole.metadata.RunTime;
+
+/**
+ * A simple text classification PR, using a <a
+ * href="https://github.com/fozziethebeat/S-Space/">Semantic Space</a>
+ * implementation to generate vectors for the annotations to be classified,
+ * and a <a href="http://www.csie.ntu.edu.tw/~cjlin/liblinear">LIBLINEAR</a>
+ * model to perform the actual classification.
+ */
+@CreoleResource
+public class TextCategorizationPR extends AbstractLanguageAnalyser implements
+ LanguageAnalyser {
+
+ /**
+ * Serialisation UID.
+ */
+ private static final long serialVersionUID = 141243024466478134L;
+
+ /**
+ * The semantic space used to generate 'document' feature vectors for the
+ * annotations to be classified.
+ */
+ protected SemanticSpace semanticSpace;
+
+ protected DocumentVectorBuilder vectorBuilder;
+
+ /**
+ * The LibLinear model used for classification.
+ */
+ protected Model libLinearModel;
+
+ /**
+ * Thresholds to be used when classifying text annotations. If the model
+ * classifies an input annotation as class "X", and "X"
+ * exists as a key in this map, then the classification is only effected if
+ * the classification probability emitted by the model is greater than or
+ * equal to the value in the map. Values in this map are expected to be
+ * probabilities, i.e. positive values between 0.0 and 1.0.
+ *
+ * Note that the liblinear model used must be able to produce classification
+ * probabilities. At the time of writing, only Linear Regression models are
+ * able to do so, while SVM-based one are not. Models that cannot supply
+ * probabilities will simply return 1.0 as the classification probability,
+ * causing values in this map to have no effect.
+ */
+ protected Map<String, Double> customClassThresholds;
+
+ /**
+ * The set of stop-words to be used. The values in this list are matched
+ * verbatim against the values of the configured input feature on the input
+ * annotations.
+ */
+ protected Set<String> stopWords;
+
+ /**
+ * URL to a file containing stop words, one on each line.
+ * Lines starting with # or // are considered comments and are ignored.
+ * White space at the start and end of each line's content will be ignored.
+ */
+ protected URL stopWordsURL;
+
+ /**
+ * The name for the annotation set used for input.
+ */
+ protected String inputASName;
+
+ /**
+ * The type of input annotations.
+ */
+ protected String inputAnnotationType;
+
+ /**
+ * The type for token annotations.
+ */
+ protected String tokenAnnotationType;
+
+ /**
+ * The feature on token annotations used to collect the input text.
+ */
+ protected String inputFeatureName;
+
+ /**
+ * The name for the output annotation set.
+ */
+ protected String outputASName;
+
+ /**
+ * The type of annotations produced. If this is the same as
+ * inputAnnotationType, and the input and output annotation sets are the same,
+ * then no new annotations will be created. The output feature value will
+ * simply be added to the input annotations.
+ */
+ protected String outputAnnotationType;
+
+ /**
+ * The name feature of the feature on the output annotations that stores the
+ * class.
+ */
+ protected String outputFeatureName;
+
+ /**
+ * URL for the file containing the serialized semantic space.
+ */
+ protected URL sematicSpaceURL;
+
+ /**
+ * The URL to the LIBLINEAR model used for classification.
+ */
+ protected URL modelURL;
+
+
+ @Override
+ public Resource init() throws ResourceInstantiationException {
+ // load the model
+ if(modelURL != null) {
+ InputStreamReader isr = null;
+ try{
+ // the default implementation uses ISO-8859-1, so we do the same
+ isr = new InputStreamReader(
+ new BufferedInputStream(modelURL.openStream()), "ISO-8859-1");
+ libLinearModel = Linear.loadModel(isr);
+ isr.close();
+ } catch(IOException ioe) {
+ throw new ResourceInstantiationException(
+ "IO Error while loading the model from " + modelURL, ioe);
+ }
+ } else throw new ResourceInstantiationException("No model URL provided.");
+ // load the semantic space
+ if(sematicSpaceURL != null) {
+ if(sematicSpaceURL.getProtocol().equalsIgnoreCase("file")) {
+ try {
+ semanticSpace = SemanticSpaceIO.load(sematicSpaceURL.getFile());
+ } catch(IOException e) {
+ throw new ResourceInstantiationException(
+ "I/O error while loading the semantic space from " +
+ sematicSpaceURL, e);
+ }
+ } else {
+ throw new ResourceInstantiationException(
+ "URL provided for the semantic space file (" + sematicSpaceURL +
+ ")was not to a local file.");
+ }
+ } else {
+ throw new ResourceInstantiationException(
+ "No URL provided for the semantic space file.");
+ }
+
+ // create the vector builder
+ Properties config = new Properties();
+ config.put(DocumentVectorBuilder.USE_TERM_FREQUENCIES_PROPERTY, true);
+ vectorBuilder = new DocumentVectorBuilder(semanticSpace, config);
+
+ // load the stop words
+ stopWords = null;
+ if(stopWordsURL != null) {
+ try {
+ BufferedReader swReader = new BufferedReader(
+ new InputStreamReader(stopWordsURL.openStream(), "UTF-8"));
+ stopWords = new HashSet<String>();
+ String line = swReader.readLine();
+ while(line != null) {
+ line = line.trim();
+ if(line.startsWith("#") || line.startsWith("//")) {
+ //ignore comment
+ } else {
+ stopWords.add(line);
+ }
+ line = swReader.readLine();
+ }
+ } catch(IOException e) {
+ throw new ResourceInstantiationException(
+ "I/O error while reading the stop words.", e);
+ }
+ }
+ return super.init();
+ }
+
+ @Override
+ public void execute() throws ExecutionException {
+ // normalize parameters
+ if(inputASName == null || inputASName.length() == 0) inputASName = "";
+ if(outputASName == null || outputASName.length() == 0) outputASName = "";
+ // should we use the input annotations for output
+ boolean sameAnnotation = inputASName.equals(outputASName) &&
+ inputAnnotationType.equals(outputAnnotationType);
+ // validate parameter values
+ if(inputAnnotationType == null || inputAnnotationType.length() == 0) {
+ throw new ExecutionException("No input annotation type provided.");
+ }
+ if(inputFeatureName == null || inputFeatureName.length() == 0) {
+ throw new ExecutionException("No input feature name provided.");
+ }
+ if(tokenAnnotationType == null || tokenAnnotationType.length() == 0) {
+ throw new ExecutionException("No token annotation type provided.");
+ }
+ if(outputAnnotationType == null || outputAnnotationType.length() == 0) {
+ throw new ExecutionException("No output annotation type provided.");
+ }
+ if(outputFeatureName == null || outputFeatureName.length() == 0) {
+ throw new ExecutionException("No output feature name provided.");
+ }
+
+
+ // collect instance annotations
+ AnnotationSet inputAS = document.getAnnotations(inputASName);
+ AnnotationSet instances = inputAS.get(inputAnnotationType);
+ for(Annotation instAnn : instances) {
+ // collect the tokens
+ List<Annotation> instTokens = Utils.inDocumentOrder(
+ Utils.getContainedAnnotations(inputAS, instAnn, tokenAnnotationType));
+ // create a new context
+ StringBuilder instanceStrBld = new StringBuilder();
+ boolean first = true;
+ for(Annotation token : instTokens) {
+ String tokenString = (String)token.getFeatures().get(inputFeatureName);
+ if(tokenString != null && tokenString.length() > 0) {
+ if(first) {
+ first = false;
+ } else {
+ instanceStrBld.append(' ');
+ }
+ instanceStrBld.append(tokenString);
+ }
+ }
+
+ String instanceText = instanceStrBld.toString();
+ if(instanceText.length() > 0) {
+ DoubleVector instanceVector = vectorBuilder.buildVector(
+ new BufferedReader(new StringReader(instanceText)),
+ new DenseVector(semanticSpace.getVectorLength()));
+ // classify the vector
+ Feature[] features = new Feature[libLinearModel.getNrFeature()];
+ for(int i = 0; i < features.length; i++) {
+ features[i] = new FeatureNode(i, instanceVector.get(i));
+ }
+ double[] probs = new double[libLinearModel.getNrClass()];
+ double label = Linear.predictValues(libLinearModel, features, probs);
+ // do we need to check probabilities?
+ // TODO
+ if(customClassThresholds != null && customClassThresholds.size() > 0) {
+
+ }
+ }
+
+ }
+
+ }
+
+ @Override
+ public void cleanup() {
+ // TODO Auto-generated method stub
+ super.cleanup();
+ }
+
+ public String getInputASName() {
+ return inputASName;
+ }
+
+ /**
+ * Set the name for the annotation set used for input.
+ * @param inputASName
+ */
+ @CreoleParameter(comment="The name for the annotation set used for input",
+ defaultValue = "")
+ @RunTime
+ public void setInputASName(String inputASName) {
+ this.inputASName = inputASName;
+ }
+
+ public String getInputAnnotationType() {
+ return inputAnnotationType;
+ }
+
+ /**
+ * Set the type of input annotations
+ * @param inputAnnotationType
+ */
+ @CreoleParameter(comment="The type of input annotations",
+ defaultValue="Sentence")
+ @RunTime
+ public void setInputAnnotationType(String inputAnnotationType) {
+ this.inputAnnotationType = inputAnnotationType;
+ }
+
+ public String getTokenAnnotationType() {
+ return tokenAnnotationType;
+ }
+
+ /**
+ * Set the type for token annotations.
+ * @param tokenAnnotationType
+ */
+ @CreoleParameter(comment = "The type for token annotations.",
+ defaultValue = "Token")
+ @RunTime
+ public void setTokenAnnotationType(String tokenAnnotationType) {
+ this.tokenAnnotationType = tokenAnnotationType;
+ }
+
+ public String getInputFeatureName() {
+ return inputFeatureName;
+ }
+
+ /**
+ * Set the feature on token annotations used to collect the input text.
+ * @param inputFeatureName
+ */
+ @CreoleParameter(comment =
+ "The feature on the token annotations used to collect the text for " +
+ "each input annotation.", defaultValue = "root")
+ @RunTime
+ public void setInputFeatureName(String inputFeatureName) {
+ this.inputFeatureName = inputFeatureName;
+ }
+
+ public String getOutputASName() {
+ return outputASName;
+ }
+
+ /**
+ * Set the name for the output annotation set
+ * @param outputASName
+ */
+ @CreoleParameter(comment = "The name for the output annotation set",
+ defaultValue = "")
+ @RunTime
+ public void setOutputASName(String outputASName) {
+ this.outputASName = outputASName;
+ }
+
+ public String getOutputAnnotationType() {
+ return outputAnnotationType;
+ }
+
+ /**
+ * Set the type of annotations produced. If this is the same as
+ * inputAnnotationType, and the input and output annotation sets are the same,
+ * then no new annotations will be created. The output feature value will
+ * simply be added to the input annotations.
+ * @param outputAnnotationName
+ */
+ @CreoleParameter(defaultValue = "Sentence", comment =
+ "The type of annotations produced. If this is the same as " +
+ "inputAnnotationType, and the input and output annotation sets are the same, " +
+ "then no new annotations will be created. The output feature value will " +
+ "simply be added to the input annotations. ")
+ @RunTime
+ public void setOutputAnnotationType(String outputAnnotationName) {
+ this.outputAnnotationType = outputAnnotationName;
+ }
+
+ public String getOutputFeatureName() {
+ return outputFeatureName;
+ }
+
+ /**
+ * Set the name feature of the feature on the output annotations that stores
+ * the class.
+ * @param outputFeatureName
+ */
+ @CreoleParameter(defaultValue = "class", comment =
+ "The name feature of the feature on the output annotations that " +
+ "stores the class. ")
+ @RunTime
+ public void setOutputFeatureName(String outputFeatureName) {
+ this.outputFeatureName = outputFeatureName;
+ }
+
+ public URL getSematicSpaceURL() {
+ return sematicSpaceURL;
+ }
+
+ /**
+ * Set the URL for the file containing the serialized semantic space.
+ * @param sematicSpaceURL
+ */
+ @CreoleParameter(comment =
+ "URL for the file containing the serialized semantic space. This must be a local file:// URL.")
+ public void setSematicSpaceURL(URL sematicSpaceURL) {
+ this.sematicSpaceURL = sematicSpaceURL;
+ }
+
+ public URL getModelURL() {
+ return modelURL;
+ }
+
+ /**
+ * Set the URL to the LibLinear model used for classification.
+ * @param modelURL
+ */
+ @CreoleParameter(comment = "The URL to the LIBLINEAR model used for classification")
+ public void setModelURL(URL modelURL) {
+ this.modelURL = modelURL;
+ }
+
+ public URL getStopWordsURL() {
+ return stopWordsURL;
+ }
+
+ /**
+ * Set the URL to a file containing stop words, one on each line.
+ * @param stopWordsURL
+ */
+ @CreoleParameter(
+ comment = "URL to a file containing stop words, one on each line, using UTF-8.")
+ public void setStopWordsURL(URL stopWordsURL) {
+ this.stopWordsURL = stopWordsURL;
+ }
+
+
+
+}
Deleted: gate/trunk/plugins/Text_Categorization/src/gate/ml/categorization/TextClassificationPR.java
===================================================================
--- gate/trunk/plugins/Text_Categorization/src/gate/ml/textclass/TextClassificationPR.java 2013-11-19 11:10:02 UTC (rev 17103)
+++ gate/trunk/plugins/Text_Categorization/src/gate/ml/categorization/TextClassificationPR.java 2013-11-19 11:15:12 UTC (rev 17104)
@@ -1,451 +0,0 @@
-/**
- *
- */
-package gate.ml.textclass;
-
-import java.io.BufferedInputStream;
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.StringReader;
-import java.io.UnsupportedEncodingException;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Properties;
-import java.util.Set;
-
-import de.bwaldvogel.liblinear.Feature;
-import de.bwaldvogel.liblinear.FeatureNode;
-import de.bwaldvogel.liblinear.Linear;
-import de.bwaldvogel.liblinear.Model;
-import edu.ucla.sspace.common.DocumentVectorBuilder;
-import edu.ucla.sspace.common.SemanticSpace;
-import edu.ucla.sspace.common.SemanticSpaceIO;
-import edu.ucla.sspace.vector.DenseVector;
-import edu.ucla.sspace.vector.DoubleVector;
-import gate.Annotation;
-import gate.AnnotationSet;
-import gate.LanguageAnalyser;
-import gate.Resource;
-import gate.Utils;
-import gate.creole.AbstractLanguageAnalyser;
-import gate.creole.ExecutionException;
-import gate.creole.ResourceInstantiationException;
-import gate.creole.metadata.CreoleParameter;
-import gate.creole.metadata.CreoleResource;
-import gate.creole.metadata.RunTime;
-
-/**
- * A simple text classification PR, using a <a
- * href="https://github.com/fozziethebeat/S-Space/">Semantic Space</a>
- * implementation to generate vectors for the annotations to be classified,
- * and a <a href="http://www.csie.ntu.edu.tw/~cjlin/liblinear">LIBLINEAR</a>
- * model to perform the actual classification.
- */
-@CreoleResource
-public class TextClassificationPR extends AbstractLanguageAnalyser implements
- LanguageAnalyser {
-
- /**
- * Serialisation UID.
- */
- private static final long serialVersionUID = 141243024466478134L;
-
- /**
- * The semantic space used to generate 'document' feature vectors for the
- * annotations to be classified.
- */
- protected SemanticSpace semanticSpace;
-
- protected DocumentVectorBuilder vectorBuilder;
-
- /**
- * The LibLinear model used for classification.
- */
- protected Model libLinearModel;
-
- /**
- * Thresholds to be used when classifying text annotations. If the model
- * classifies an input annotation as class "X", and "X"
- * exists as a key in this map, then the classification is only effected if
- * the classification probability emitted by the model is greater than or
- * equal to the value in the map. Values in this map are expected to be
- * probabilities, i.e. positive values between 0.0 and 1.0.
- *
- * Note that the liblinear model used must be able to produce classification
- * probabilities. At the time of writing, only Linear Regression models are
- * able to do so, while SVM-based one are not. Models that cannot supply
- * probabilities will simply return 1.0 as the classification probability,
- * causing values in this map to have no effect.
- */
- protected Map<String, Double> customClassThresholds;
-
- /**
- * The set of stop-words to be used. The values in this list are matched
- * verbatim against the values of the configured input feature on the input
- * annotations.
- */
- protected Set<String> stopWords;
-
- /**
- * URL to a file containing stop words, one on each line.
- * Lines starting with # or // are considered comments and are ignored.
- * White space at the start and end of each line's content will be ignored.
- */
- protected URL stopWordsURL;
-
- /**
- * The name for the annotation set used for input.
- */
- protected String inputASName;
-
- /**
- * The type of input annotations.
- */
- protected String inputAnnotationType;
-
- /**
- * The type for token annotations.
- */
- protected String tokenAnnotationType;
-
- /**
- * The feature on token annotations used to collect the input text.
- */
- protected String inputFeatureName;
-
- /**
- * The name for the output annotation set.
- */
- protected String outputASName;
-
- /**
- * The type of annotations produced. If this is the same as
- * inputAnnotationType, and the input and output annotation sets are the same,
- * then no new annotations will be created. The output feature value will
- * simply be added to the input annotations.
- */
- protected String outputAnnotationType;
-
- /**
- * The name feature of the feature on the output annotations that stores the
- * class.
- */
- protected String outputFeatureName;
-
- /**
- * URL for the file containing the serialized semantic space.
- */
- protected URL sematicSpaceURL;
-
- /**
- * The URL to the LIBLINEAR model used for classification.
- */
- protected URL modelURL;
-
-
- @Override
- public Resource init() throws ResourceInstantiationException {
- // load the model
- if(modelURL != null) {
- InputStreamReader isr = null;
- try{
- // the default implementation uses ISO-8859-1, so we do the same
- isr = new InputStreamReader(
- new BufferedInputStream(modelURL.openStream()), "ISO-8859-1");
- libLinearModel = Linear.loadModel(isr);
- isr.close();
- } catch(IOException ioe) {
- throw new ResourceInstantiationException(
- "IO Error while loading the model from " + modelURL, ioe);
- }
- } else throw new ResourceInstantiationException("No model URL provided.");
- // load the semantic space
- if(sematicSpaceURL != null) {
- if(sematicSpaceURL.getProtocol().equalsIgnoreCase("file")) {
- try {
- semanticSpace = SemanticSpaceIO.load(sematicSpaceURL.getFile());
- } catch(IOException e) {
- throw new ResourceInstantiationException(
- "I/O error while loading the semantic space from " +
- sematicSpaceURL, e);
- }
- } else {
- throw new ResourceInstantiationException(
- "URL provided for the semantic space file (" + sematicSpaceURL +
- ")was not to a local file.");
- }
- } else {
- throw new ResourceInstantiationException(
- "No URL provided for the semantic space file.");
- }
-
- // create the vector builder
- Properties config = new Properties();
- config.put(DocumentVectorBuilder.USE_TERM_FREQUENCIES_PROPERTY, true);
- vectorBuilder = new DocumentVectorBuilder(semanticSpace, config);
-
- // load the stop words
- stopWords = null;
- if(stopWordsURL != null) {
- try {
- BufferedReader swReader = new BufferedReader(
- new InputStreamReader(stopWordsURL.openStream(), "UTF-8"));
- stopWords = new HashSet<String>();
- String line = swReader.readLine();
- while(line != null) {
- line = line.trim();
- if(line.startsWith("#") || line.startsWith("//")) {
- //ignore comment
- } else {
- stopWords.add(line);
- }
- line = swReader.readLine();
- }
- } catch(IOException e) {
- throw new ResourceInstantiationException(
- "I/O error while reading the stop words.", e);
- }
- }
- return super.init();
- }
-
- @Override
- public void execute() throws ExecutionException {
- // normalize parameters
- if(inputASName == null || inputASName.length() == 0) inputASName = "";
- if(outputASName == null || outputASName.length() == 0) outputASName = "";
- // should we use the input annotations for output
- boolean sameAnnotation = inputASName.equals(outputASName) &&
- inputAnnotationType.equals(outputAnnotationType);
- // validate parameter values
- if(inputAnnotationType == null || inputAnnotationType.length() == 0) {
- throw new ExecutionException("No input annotation type provided.");
- }
- if(inputFeatureName == null || inputFeatureName.length() == 0) {
- throw new ExecutionException("No input feature name provided.");
- }
- if(tokenAnnotationType == null || tokenAnnotationType.length() == 0) {
- throw new ExecutionException("No token annotation type provided.");
- }
- if(outputAnnotationType == null || outputAnnotationType.length() == 0) {
- throw new ExecutionException("No output annotation type provided.");
- }
- if(outputFeatureName == null || outputFeatureName.length() == 0) {
- throw new ExecutionException("No output feature name provided.");
- }
-
-
- // collect instance annotations
- AnnotationSet inputAS = document.getAnnotations(inputASName);
- AnnotationSet instances = inputAS.get(inputAnnotationType);
- for(Annotation instAnn : instances) {
- // collect the tokens
- List<Annotation> instTokens = Utils.inDocumentOrder(
- Utils.getContainedAnnotations(inputAS, instAnn, tokenAnnotationType));
- // create a new context
- StringBuilder instanceStrBld = new StringBuilder();
- boolean first = true;
- for(Annotation token : instTokens) {
- String tokenString = (String)token.getFeatures().get(inputFeatureName);
- if(tokenString != null && tokenString.length() > 0) {
- if(first) {
- first = false;
- } else {
- instanceStrBld.append(' ');
- }
- instanceStrBld.append(tokenString);
- }
- }
-
- String instanceText = instanceStrBld.toString();
- if(instanceText.length() > 0) {
- DoubleVector instanceVector = vectorBuilder.buildVector(
- new BufferedReader(new StringReader(instanceText)),
- new DenseVector(semanticSpace.getVectorLength()));
- // classify the vector
- Feature[] features = new Feature[libLinearModel.getNrFeature()];
- for(int i = 0; i < features.length; i++) {
- features[i] = new FeatureNode(i, instanceVector.get(i));
- }
- double[] probs = new double[libLinearModel.getNrClass()];
- double label = Linear.predictValues(libLinearModel, features, probs);
- // do we need to check probabilities?
- // TODO
- if(customClassThresholds != null && customClassThresholds.size() > 0) {
-
- }
- }
-
- }
-
- }
-
- @Override
- public void cleanup() {
- // TODO Auto-generated method stub
- super.cleanup();
- }
-
- public String getInputASName() {
- return inputASName;
- }
-
- /**
- * Set the name for the annotation set used for input.
- * @param inputASName
- */
- @CreoleParameter(comment="The name for the annotation set used for input",
- defaultValue = "")
- @RunTime
- public void setInputASName(String inputASName) {
- this.inputASName = inputASName;
- }
-
- public String getInputAnnotationType() {
- return inputAnnotationType;
- }
-
- /**
- * Set the type of input annotations
- * @param inputAnnotationType
- */
- @CreoleParameter(comment="The type of input annotations",
- defaultValue="Sentence")
- @RunTime
- public void setInputAnnotationType(String inputAnnotationType) {
- this.inputAnnotationType = inputAnnotationType;
- }
-
- public String getTokenAnnotationType() {
- return tokenAnnotationType;
- }
-
- /**
- * Set the type for token annotations.
- * @param tokenAnnotationType
- */
- @CreoleParameter(comment = "The type for token annotations.",
- defaultValue = "Token")
- @RunTime
- public void setTokenAnnotationType(String tokenAnnotationType) {
- this.tokenAnnotationType = tokenAnnotationType;
- }
-
- public String getInputFeatureName() {
- return inputFeatureName;
- }
-
- /**
- * Set the feature on token annotations used to collect the input text.
- * @param inputFeatureName
- */
- @CreoleParameter(comment =
- "The feature on the token annotations used to collect the text for " +
- "each input annotation.", defaultValue = "root")
- @RunTime
- public void setInputFeatureName(String inputFeatureName) {
- this.inputFeatureName = inputFeatureName;
- }
-
- public String getOutputASName() {
- return outputASName;
- }
-
- /**
- * Set the name for the output annotation set
- * @param outputASName
- */
- @CreoleParameter(comment = "The name for the output annotation set",
- defaultValue = "")
- @RunTime
- public void setOutputASName(String outputASName) {
- this.outputASName = outputASName;
- }
-
- public String getOutputAnnotationType() {
- return outputAnnotationType;
- }
-
- /**
- * Set the type of annotations produced. If this is the same as
- * inputAnnotationType, and the input and output annotation sets are the same,
- * then no new annotations will be created. The output feature value will
- * simply be added to the input annotations.
- * @param outputAnnotationName
- */
- @CreoleParameter(defaultValue = "Sentence", comment =
- "The type of annotations produced. If this is the same as " +
- "inputAnnotationType, and the input and output annotation sets are the same, " +
- "then no new annotations will be created. The output feature value will " +
- "simply be added to the input annotations. ")
- @RunTime
- public void setOutputAnnotationType(String outputAnnotationName) {
- this.outputAnnotationType = outputAnnotationName;
- }
-
- public String getOutputFeatureName() {
- return outputFeatureName;
- }
-
- /**
- * Set the name feature of the feature on the output annotations that stores
- * the class.
- * @param outputFeatureName
- */
- @CreoleParameter(defaultValue = "class", comment =
- "The name feature of the feature on the output annotations that " +
- "stores the class. ")
- @RunTime
- public void setOutputFeatureName(String outputFeatureName) {
- this.outputFeatureName = outputFeatureName;
- }
-
- public URL getSematicSpaceURL() {
- return sematicSpaceURL;
- }
-
- /**
- * Set the URL for the file containing the serialized semantic space.
- * @param sematicSpaceURL
- */
- @CreoleParameter(comment =
- "URL for the file containing the serialized semantic space. This must be a local file:// URL.")
- public void setSematicSpaceURL(URL sematicSpaceURL) {
- this.sematicSpaceURL = sematicSpaceURL;
- }
-
- public URL getModelURL() {
- return modelURL;
- }
-
- /**
- * Set the URL to the LibLinear model used for classification.
- * @param modelURL
- */
- @CreoleParameter(comment = "The URL to the LIBLINEAR model used for classification")
- public void setModelURL(URL modelURL) {
- this.modelURL = modelURL;
- }
-
- public URL getStopWordsURL() {
- return stopWordsURL;
- }
-
- /**
- * Set the URL to a file containing stop words, one on each line.
- * @param stopWordsURL
- */
- @CreoleParameter(
- comment = "URL to a file containing stop words, one on each line, using UTF-8.")
- public void setStopWordsURL(URL stopWordsURL) {
- this.stopWordsURL = stopWordsURL;
- }
-
-
-
-}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Shape the Mobile Experience: Free Subscription
Software experts and developers: Be at the forefront of tech innovation.
Intel(R) Software Adrenaline delivers strategic insight and game-changing
conversations that shape the rapidly evolving mobile landscape. Sign up now.
http://pubads.g.doubleclick.net/gampad/clk?id=63431311&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs