Author: pkluegl
Date: Tue Jun 18 08:12:20 2013
New Revision: 1494062

URL: http://svn.apache.org/r1494062
Log:
UIMA-3005
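- use the # wildcard with a CONTAINS condition in place of ANY*{-PARTOF(...)} -
not really better, but maybe a bit faster
- add shortcuts in rule evaluation (for testing): reuse the statistics of rules
that were already tested and stop testing a rule once it exceeds the maximum
error rate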

Modified:
    
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/AnnotationRule.java
    
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ExpansionRule.java
    
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ShiftAllRule.java
    
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/TrabalLearner.java

Modified: 
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/AnnotationRule.java
URL: 
http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/AnnotationRule.java?rev=1494062&r1=1494061&r2=1494062&view=diff
==============================================================================
--- 
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/AnnotationRule.java
 (original)
+++ 
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/AnnotationRule.java
 Tue Jun 18 08:12:20 2013
@@ -100,8 +100,10 @@ public class AnnotationRule extends Trab
       ruleString += frontBoundaryItem + after + " (";
 
     // We include all tokens between the boundaries.
-    ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? but 
faster
-
+    //ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? 
but faster
+    ruleString += "#{-CONTAINS(" + rearBoundaryItem + ")}"; // like ... but 
faster
+    
+    
     // Check, if the rear item should be included and mark all tokens between 
the brackets as the
     // new annotation.
     if (rearItemInBorders)

Modified: 
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ExpansionRule.java
URL: 
http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ExpansionRule.java?rev=1494062&r1=1494061&r2=1494062&view=diff
==============================================================================
--- 
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ExpansionRule.java
 (original)
+++ 
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ExpansionRule.java
 Tue Jun 18 08:12:20 2013
@@ -89,9 +89,10 @@ public class ExpansionRule extends Shift
       ruleString += frontBoundaryItem + after + " (";
 
     // We include all tokens between the boundaries.
-    ruleString += "ANY*{-PARTOF(" + annotation.getType().getShortName() + ")} 
"; // like ANY*? but
+    //ruleString += "ANY*{-PARTOF(" + annotation.getType().getShortName() + 
")} "; // like ANY*? but
     // faster
-
+    ruleString += "#{-CONTAINS(" + annotation.getType().getShortName() + ")} ";
+    
     // these strings will be attached to the brackets
     String mark;
     if (((TrabalLearner) algorithm).getEnableFeatures())
@@ -133,8 +134,9 @@ public class ExpansionRule extends Shift
     ruleString += "(" + annotation.getType().getShortName() + unmark + " ";
 
     // We include all tokens between the boundaries.
-    ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? but 
faster
-
+//    ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? 
but faster
+    ruleString += "#{-CONTAINS(" + rearBoundaryItem + ")}"; // like ... but 
faster
+    
     // these strings will be attached to the brackets
     String mark;
     if (((TrabalLearner) algorithm).getEnableFeatures())

Modified: 
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ShiftAllRule.java
URL: 
http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ShiftAllRule.java?rev=1494062&r1=1494061&r2=1494062&view=diff
==============================================================================
--- 
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ShiftAllRule.java
 (original)
+++ 
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ShiftAllRule.java
 Tue Jun 18 08:12:20 2013
@@ -75,7 +75,7 @@ public class ShiftAllRule extends Shifti
         after = "{" + after + "}";
 
       // this part will delete the original annotation
-      String unmark = errorBoundaryItem + "{STARTSWITH(" + 
annotation.getType().getShortName()
+      String unmark = errorBoundaryItem + (frontItemInBorders ? "" : " ANY") 
+"{STARTSWITH(" + annotation.getType().getShortName()
               + ") -> UNMARK(" + annotation.getType().getShortName() + ", 
true)}";
 
       if (shiftToLeft)
@@ -96,9 +96,12 @@ public class ShiftAllRule extends Shifti
       ruleString += frontBoundaryItem + after + " (";
 
     // We include all tokens between the boundaries.
-    ruleString += "ANY*{-PARTOF(" + errorBoundaryItem + ")} " + unmark + " 
ANY*{-PARTOF("
+//    ruleString += "ANY*{-PARTOF(" + errorBoundaryItem + ")} " + unmark + " 
ANY*{-PARTOF("
+//            + rearBoundaryItem + ")}"; // like ANY*? but faster
+    ruleString += "#{-CONTAINS(" + errorBoundaryItem + ")} " + unmark + " 
#{-CONTAINS("
             + rearBoundaryItem + ")}"; // like ANY*? but faster
-
+    
+    
     // Check, if the rear item should be included and mark all tokens between 
the brackets as
     // the
     // new annotation.
@@ -111,7 +114,8 @@ public class ShiftAllRule extends Shifti
   private void compileShiftToRight(boolean frontItemInBorders, boolean 
rearItemInBorders,
           String before, String after, String action, String unmark) {
     // The old annotation begins before the new annotation
-    ruleString += unmark + " ANY*{-PARTOF(" + frontBoundaryItem + ")} ";
+//    ruleString += unmark + " ANY*{-PARTOF(" + frontBoundaryItem + ")} ";
+    ruleString += unmark + " #{-CONTAINS(" + frontBoundaryItem + ")} ";
 
     // If the front item is part of the future annotation, it has to be 
included in the
     // brackets.
@@ -121,8 +125,11 @@ public class ShiftAllRule extends Shifti
       ruleString += frontBoundaryItem + after + " (";
 
     // We include all tokens between the boundaries.
-    ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? but 
faster
+//    ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? 
but faster
+    ruleString += "#{-CONTAINS(" + rearBoundaryItem + ")}"; // like ANY*? but 
faster
 
+    
+    
     // Check, if the rear item should be included and mark all tokens between 
the brackets as
     // the
     // new annotation.
@@ -148,7 +155,8 @@ public class ShiftAllRule extends Shifti
     else
       ruleString += frontBoundaryItem + unmark + " (";
 
-    ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? but 
faster
+//    ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? 
but faster
+    ruleString += "#{-CONTAINS(" + rearBoundaryItem + ")}"; // like ANY*? but 
faster
 
     // Check, if the rear item should be included and mark all tokens between 
the brackets as the
     // new annotation.

Modified: 
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/TrabalLearner.java
URL: 
http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/TrabalLearner.java?rev=1494062&r1=1494061&r2=1494062&view=diff
==============================================================================
--- 
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/TrabalLearner.java
 (original)
+++ 
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/TrabalLearner.java
 Tue Jun 18 08:12:20 2013
@@ -15,7 +15,7 @@
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
-*/
+ */
 
 package org.apache.uima.ruta.textruler.learner.trabal;
 
@@ -31,6 +31,7 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.TreeMap;
 
 import org.apache.commons.lang3.StringUtils;
 import org.apache.uima.cas.CAS;
@@ -40,6 +41,7 @@ import org.apache.uima.cas.FeatureStruct
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.text.AnnotationFS;
 import org.apache.uima.cas.text.AnnotationIndex;
+import org.apache.uima.ruta.textruler.core.GlobalCASSource;
 import org.apache.uima.ruta.textruler.core.TextRulerBasicLearner;
 import org.apache.uima.ruta.textruler.core.TextRulerExample;
 import org.apache.uima.ruta.textruler.core.TextRulerExampleDocument;
@@ -126,11 +128,13 @@ public class TrabalLearner extends TextR
 
   private Map<String, Double> idf;
 
+  private Map<String, TextRulerStatisticsCollector> inducedRules = new 
TreeMap<String, TextRulerStatisticsCollector>();
+
   public TrabalLearner(String inputFolderPath, String additionalFolderPath,
           String preprocessorTMfile, String tempFolderPath, String[] 
fullSlotTypeNames,
           Set<String> filterSet, boolean skip, TextRulerLearnerDelegate 
delegate) {
-    super(inputFolderPath, preprocessorTMfile, tempFolderPath, 
fullSlotTypeNames, filterSet,
-            skip, delegate);
+    super(inputFolderPath, preprocessorTMfile, tempFolderPath, 
fullSlotTypeNames, filterSet, skip,
+            delegate);
     this.inputDirectory = inputFolderPath;
     this.additionalFolderPath = additionalFolderPath;
   }
@@ -515,6 +519,8 @@ public class TrabalLearner extends TextR
    */
   private List<TrabalRule> runAlgorithm(Map<String, List<AnnotationError>> 
errorGrps) {
     removeBasics();
+    inducedRules.clear();
+
     List<TrabalRule> rules = new ArrayList<TrabalRule>();
     bestRulesForStatus.clear();
     int i = 1;
@@ -618,11 +624,12 @@ public class TrabalLearner extends TextR
         conditions = createConditions(learntRules);
       for (int i = 0; i < learntRules.size(); i++) {
         rules.add(learntRules.get(i));
-//        TODO amount of conditions? parameter for 50!
+        // TODO amount of conditions? parameter for 50!
         for (int j = 0; j < conditions.size() && j < 50; j++) {
           TrabalRule newRule = learntRules.get(i).copy();
           if (!newRule.getConditions().contains(conditions.get(j))) {
             newRule.addCondition(conditions.get(j), (j + 1));
+            newRule.getRuleString();
             newRules.add(newRule);
           }
         }
@@ -633,8 +640,6 @@ public class TrabalLearner extends TextR
     return learntRules;
   }
 
- 
-
   /**
    * Chooses the best final rules from the results of runAlgorithm().
    * 
@@ -1560,57 +1565,80 @@ public class TrabalLearner extends TextR
       return rules;
 
     List<TextRulerStatisticsCollector> sums = new 
ArrayList<TextRulerStatisticsCollector>();
+    for (TrabalRule each : rules) {
+      sums.add(new TextRulerStatisticsCollector());
+    }
     List<TextRulerExampleDocument> goldDocs;
     List<TextRulerExampleDocument> additionalDocs;
     goldDocs = documents.getDocuments();
     additionalDocs = additionalDocuments.getDocuments();
     CAS theTestCAS = getTestCAS();
-    for (int ruleIndex = 0; ruleIndex < rules.size(); ruleIndex++) {
-      for (int i = 0; i < goldDocs.size(); i++) {
-        String ruleInfo;
-        if (rules.get(ruleIndex).getAnnotation() != null
-                && rules.get(ruleIndex).getTargetAnnotation() != null) {
-          ruleInfo = " " + 
rules.get(ruleIndex).getAnnotation().getType().getShortName() + "("
-                  + rules.get(ruleIndex).getAnnotation().getBegin() + ","
-                  + rules.get(ruleIndex).getAnnotation().getEnd() + ") -> "
-                  + 
rules.get(ruleIndex).getTargetAnnotation().getType().getShortName() + "("
-                  + rules.get(ruleIndex).getTargetAnnotation().getBegin() + ","
-                  + rules.get(ruleIndex).getTargetAnnotation().getEnd() + ")";
-        } else if (rules.get(ruleIndex).getTargetAnnotation() != null) {
-          ruleInfo = " Annotate "
-                  + 
rules.get(ruleIndex).getTargetAnnotation().getType().getShortName() + "("
-                  + rules.get(ruleIndex).getTargetAnnotation().getBegin() + ","
-                  + rules.get(ruleIndex).getTargetAnnotation().getEnd() + ")";
-        } else {
-          ruleInfo = " Delete " + 
rules.get(ruleIndex).getAnnotation().getType().getShortName()
-                  + "(" + rules.get(ruleIndex).getAnnotation().getBegin() + ","
-                  + rules.get(ruleIndex).getAnnotation().getEnd() + ")";
-        }
-        TextRulerExampleDocument goldDoc = goldDocs.get(i);
-        TextRulerExampleDocument additionalDoc = additionalDocs.get(i);
-        sendStatusUpdateToDelegate("Testing " + ruleSet + ruleInfo + " on 
document " + (i + 1)
-                + " of " + goldDocs.size() + " : rule " + (ruleIndex + 1) + " 
of " + rules.size(),
-                TextRulerLearnerState.ML_RUNNING, false);
-        TextRulerStatisticsCollector sumC = new TextRulerStatisticsCollector();
-        prepareTestCas(theTestCAS, goldDoc, additionalDoc);
-        testRuleOnDocument((TrabalRule) rules.get(ruleIndex), goldDoc, 
additionalDoc, sumC,
-                theTestCAS);
-        if (sums.size() > ruleIndex) {
-          sums.get(ruleIndex).add(sumC);
-        } else {
-          sums.add(sumC);
+    int counter = 0;
+    for (TrabalRule rule : rules) {
+      counter++;
+      String ruleString = rule.getRuleString();
+      String ruleInfo = getRuleInfo(rule);
+      System.out.println("testing: " + ruleString);
+      if (inducedRules.containsKey(ruleString)) {
+        rule.setCoveringStatistics(inducedRules.get(ruleString));
+        System.out.println("skipped...");
+      } else {
+        for (int i = 0; i < goldDocs.size(); i++) {
+          TextRulerExampleDocument goldDoc = goldDocs.get(i);
+          TextRulerExampleDocument additionalDoc = additionalDocs.get(i);
+          sendStatusUpdateToDelegate("Testing " + ruleSet + ruleInfo + " on 
document " + (i + 1)
+                  + " of " + goldDocs.size() + " : rule " + counter + " of " + 
rules.size(),
+                  TextRulerLearnerState.ML_RUNNING, false);
+          TextRulerStatisticsCollector sumC = new 
TextRulerStatisticsCollector();
+          prepareTestCas(theTestCAS, goldDoc, additionalDoc);
+          testRuleOnDocument(rule, goldDoc, additionalDoc, sumC, theTestCAS);
+          sums.get(counter - 1).add(sumC);
+          int n = sumC.getCoveredNegativesCount();
+          int p = sumC.getCoveredPositivesCount();
+          int pnorm = p;
+          if (pnorm == 0) {
+            pnorm = 1;
+          }
+          if (n / pnorm > maxErrorRate) {
+            System.out.println("stopped:" + sumC);
+            break;
+          }
+
+          if (shouldAbort())
+            return rules;
         }
-        if (shouldAbort())
-          return rules;
+        TextRulerStatisticsCollector c = sums.get(counter - 1);
+        rule.setCoveringStatistics(sums.get(counter - 1));
+        inducedRules.put(ruleString, c);
       }
     }
     for (int ruleIndex = 0; ruleIndex < rules.size(); ruleIndex++) {
       rules.get(ruleIndex).setCoveringStatistics(sums.get(ruleIndex));
     }
+    GlobalCASSource.releaseCAS(theTestCAS);
     sums.clear();
     return rules;
   }
 
+  private String getRuleInfo(TrabalRule rule) {
+    String ruleInfo;
+    if (rule.getAnnotation() != null && rule.getTargetAnnotation() != null) {
+      ruleInfo = " " + rule.getAnnotation().getType().getShortName() + "("
+              + rule.getAnnotation().getBegin() + "," + 
rule.getAnnotation().getEnd() + ") -> "
+              + rule.getTargetAnnotation().getType().getShortName() + "("
+              + rule.getTargetAnnotation().getBegin() + "," + 
rule.getTargetAnnotation().getEnd()
+              + ")";
+    } else if (rule.getTargetAnnotation() != null) {
+      ruleInfo = " Annotate " + 
rule.getTargetAnnotation().getType().getShortName() + "("
+              + rule.getTargetAnnotation().getBegin() + "," + 
rule.getTargetAnnotation().getEnd()
+              + ")";
+    } else {
+      ruleInfo = " Delete " + rule.getAnnotation().getType().getShortName() + 
"("
+              + rule.getAnnotation().getBegin() + "," + 
rule.getAnnotation().getEnd() + ")";
+    }
+    return ruleInfo;
+  }
+
   /**
    * Resets the test CAS and adds the annotations of the additional data.
    * 
@@ -1628,6 +1656,7 @@ public class TrabalLearner extends TextR
     for (AnnotationFS fs : additionalCas.getAnnotationIndex()) {
       Type t = testCas.getTypeSystem().getType(fs.getType().getName());
       if (t != null) {
+        // TODO what about the features!!
         AnnotationFS createAnnotation = testCas.createAnnotation(t, 
fs.getBegin(), fs.getEnd());
         testCas.addFsToIndexes(createAnnotation);
       } else {
@@ -1823,7 +1852,6 @@ public class TrabalLearner extends TextR
     return getFileHeaderString(true) + result;
   }
 
-
   // // @Override
   // public AnalysisEngine getAnalysisEngine() {
   // if (ae == null) {
@@ -1859,7 +1887,7 @@ public class TrabalLearner extends TextR
    */
   public TextRulerExampleDocumentSet getAdditionalDocuments() {
     if (additionalDocuments == null) {
-      if(!StringUtils.isBlank(additionalFolderPath)) {
+      if (!StringUtils.isBlank(additionalFolderPath)) {
         additionalDocuments = new 
TextRulerExampleDocumentSet(additionalFolderPath, casCache);
       }
     }


Reply via email to