Author: pkluegl
Date: Tue Jun 18 08:12:20 2013
New Revision: 1494062

URL: http://svn.apache.org/r1494062
Log:
UIMA-3005
- use # wildcard with CONTAINS condition - not really better but maybe a bit faster
- add shortcuts in rule evaluation (for testing)

Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/AnnotationRule.java uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ExpansionRule.java uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ShiftAllRule.java uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/TrabalLearner.java Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/AnnotationRule.java URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/AnnotationRule.java?rev=1494062&r1=1494061&r2=1494062&view=diff ============================================================================== --- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/AnnotationRule.java (original) +++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/AnnotationRule.java Tue Jun 18 08:12:20 2013 @@ -100,8 +100,10 @@ public class AnnotationRule extends Trab ruleString += frontBoundaryItem + after + " ("; // We include all tokens between the boundaries. - ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? but faster - + //ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? but faster + ruleString += "#{-CONTAINS(" + rearBoundaryItem + ")}"; // like ... but faster + + // Check, if the rear item should be included and mark all tokens between the brackets as the // new annotation. 
if (rearItemInBorders) Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ExpansionRule.java URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ExpansionRule.java?rev=1494062&r1=1494061&r2=1494062&view=diff ============================================================================== --- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ExpansionRule.java (original) +++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ExpansionRule.java Tue Jun 18 08:12:20 2013 @@ -89,9 +89,10 @@ public class ExpansionRule extends Shift ruleString += frontBoundaryItem + after + " ("; // We include all tokens between the boundaries. - ruleString += "ANY*{-PARTOF(" + annotation.getType().getShortName() + ")} "; // like ANY*? but + //ruleString += "ANY*{-PARTOF(" + annotation.getType().getShortName() + ")} "; // like ANY*? but // faster - + ruleString += "#{-CONTAINS(" + annotation.getType().getShortName() + ")} "; + // these strings will be attached to the brackets String mark; if (((TrabalLearner) algorithm).getEnableFeatures()) @@ -133,8 +134,9 @@ public class ExpansionRule extends Shift ruleString += "(" + annotation.getType().getShortName() + unmark + " "; // We include all tokens between the boundaries. - ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? but faster - +// ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? but faster + ruleString += "#{-CONTAINS(" + rearBoundaryItem + ")}"; // like ... 
but faster + // these strings will be attached to the brackets String mark; if (((TrabalLearner) algorithm).getEnableFeatures()) Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ShiftAllRule.java URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ShiftAllRule.java?rev=1494062&r1=1494061&r2=1494062&view=diff ============================================================================== --- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ShiftAllRule.java (original) +++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/ShiftAllRule.java Tue Jun 18 08:12:20 2013 @@ -75,7 +75,7 @@ public class ShiftAllRule extends Shifti after = "{" + after + "}"; // this part will delete the original annotation - String unmark = errorBoundaryItem + "{STARTSWITH(" + annotation.getType().getShortName() + String unmark = errorBoundaryItem + (frontItemInBorders ? "" : " ANY") +"{STARTSWITH(" + annotation.getType().getShortName() + ") -> UNMARK(" + annotation.getType().getShortName() + ", true)}"; if (shiftToLeft) @@ -96,9 +96,12 @@ public class ShiftAllRule extends Shifti ruleString += frontBoundaryItem + after + " ("; // We include all tokens between the boundaries. - ruleString += "ANY*{-PARTOF(" + errorBoundaryItem + ")} " + unmark + " ANY*{-PARTOF(" +// ruleString += "ANY*{-PARTOF(" + errorBoundaryItem + ")} " + unmark + " ANY*{-PARTOF(" +// + rearBoundaryItem + ")}"; // like ANY*? but faster + ruleString += "#{-CONTAINS(" + errorBoundaryItem + ")} " + unmark + " #{-CONTAINS(" + rearBoundaryItem + ")}"; // like ANY*? but faster - + + // Check, if the rear item should be included and mark all tokens between the brackets as // the // new annotation. 
@@ -111,7 +114,8 @@ public class ShiftAllRule extends Shifti private void compileShiftToRight(boolean frontItemInBorders, boolean rearItemInBorders, String before, String after, String action, String unmark) { // The old annotation begins before the new annotation - ruleString += unmark + " ANY*{-PARTOF(" + frontBoundaryItem + ")} "; +// ruleString += unmark + " ANY*{-PARTOF(" + frontBoundaryItem + ")} "; + ruleString += unmark + " #{-CONTAINS(" + frontBoundaryItem + ")} "; // If the front item is part of the future annotation, it has to be included in the // brackets. @@ -121,8 +125,11 @@ public class ShiftAllRule extends Shifti ruleString += frontBoundaryItem + after + " ("; // We include all tokens between the boundaries. - ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? but faster +// ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? but faster + ruleString += "#{-CONTAINS(" + rearBoundaryItem + ")}"; // like ANY*? but faster + + // Check, if the rear item should be included and mark all tokens between the brackets as // the // new annotation. @@ -148,7 +155,8 @@ public class ShiftAllRule extends Shifti else ruleString += frontBoundaryItem + unmark + " ("; - ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? but faster +// ruleString += "ANY*{-PARTOF(" + rearBoundaryItem + ")}"; // like ANY*? but faster + ruleString += "#{-CONTAINS(" + rearBoundaryItem + ")}"; // like ANY*? but faster // Check, if the rear item should be included and mark all tokens between the brackets as the // new annotation. 
Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/TrabalLearner.java URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/TrabalLearner.java?rev=1494062&r1=1494061&r2=1494062&view=diff ============================================================================== --- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/TrabalLearner.java (original) +++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/trabal/TrabalLearner.java Tue Jun 18 08:12:20 2013 @@ -15,7 +15,7 @@ * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. -*/ + */ package org.apache.uima.ruta.textruler.learner.trabal; @@ -31,6 +31,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.TreeMap; import org.apache.commons.lang3.StringUtils; import org.apache.uima.cas.CAS; @@ -40,6 +41,7 @@ import org.apache.uima.cas.FeatureStruct import org.apache.uima.cas.Type; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.ruta.textruler.core.GlobalCASSource; import org.apache.uima.ruta.textruler.core.TextRulerBasicLearner; import org.apache.uima.ruta.textruler.core.TextRulerExample; import org.apache.uima.ruta.textruler.core.TextRulerExampleDocument; @@ -126,11 +128,13 @@ public class TrabalLearner extends TextR private Map<String, Double> idf; + private Map<String, TextRulerStatisticsCollector> inducedRules = new TreeMap<String, TextRulerStatisticsCollector>(); + public TrabalLearner(String inputFolderPath, String additionalFolderPath, String preprocessorTMfile, String tempFolderPath, String[] fullSlotTypeNames, Set<String> filterSet, boolean skip, 
TextRulerLearnerDelegate delegate) { - super(inputFolderPath, preprocessorTMfile, tempFolderPath, fullSlotTypeNames, filterSet, - skip, delegate); + super(inputFolderPath, preprocessorTMfile, tempFolderPath, fullSlotTypeNames, filterSet, skip, + delegate); this.inputDirectory = inputFolderPath; this.additionalFolderPath = additionalFolderPath; } @@ -515,6 +519,8 @@ public class TrabalLearner extends TextR */ private List<TrabalRule> runAlgorithm(Map<String, List<AnnotationError>> errorGrps) { removeBasics(); + inducedRules.clear(); + List<TrabalRule> rules = new ArrayList<TrabalRule>(); bestRulesForStatus.clear(); int i = 1; @@ -618,11 +624,12 @@ public class TrabalLearner extends TextR conditions = createConditions(learntRules); for (int i = 0; i < learntRules.size(); i++) { rules.add(learntRules.get(i)); -// TODO amount of conditions? parameter for 50! + // TODO amount of conditions? parameter for 50! for (int j = 0; j < conditions.size() && j < 50; j++) { TrabalRule newRule = learntRules.get(i).copy(); if (!newRule.getConditions().contains(conditions.get(j))) { newRule.addCondition(conditions.get(j), (j + 1)); + newRule.getRuleString(); newRules.add(newRule); } } @@ -633,8 +640,6 @@ public class TrabalLearner extends TextR return learntRules; } - - /** * Chooses the best final rules from the results of runAlgorithm(). 
* @@ -1560,57 +1565,80 @@ public class TrabalLearner extends TextR return rules; List<TextRulerStatisticsCollector> sums = new ArrayList<TextRulerStatisticsCollector>(); + for (TrabalRule each : rules) { + sums.add(new TextRulerStatisticsCollector()); + } List<TextRulerExampleDocument> goldDocs; List<TextRulerExampleDocument> additionalDocs; goldDocs = documents.getDocuments(); additionalDocs = additionalDocuments.getDocuments(); CAS theTestCAS = getTestCAS(); - for (int ruleIndex = 0; ruleIndex < rules.size(); ruleIndex++) { - for (int i = 0; i < goldDocs.size(); i++) { - String ruleInfo; - if (rules.get(ruleIndex).getAnnotation() != null - && rules.get(ruleIndex).getTargetAnnotation() != null) { - ruleInfo = " " + rules.get(ruleIndex).getAnnotation().getType().getShortName() + "(" - + rules.get(ruleIndex).getAnnotation().getBegin() + "," - + rules.get(ruleIndex).getAnnotation().getEnd() + ") -> " - + rules.get(ruleIndex).getTargetAnnotation().getType().getShortName() + "(" - + rules.get(ruleIndex).getTargetAnnotation().getBegin() + "," - + rules.get(ruleIndex).getTargetAnnotation().getEnd() + ")"; - } else if (rules.get(ruleIndex).getTargetAnnotation() != null) { - ruleInfo = " Annotate " - + rules.get(ruleIndex).getTargetAnnotation().getType().getShortName() + "(" - + rules.get(ruleIndex).getTargetAnnotation().getBegin() + "," - + rules.get(ruleIndex).getTargetAnnotation().getEnd() + ")"; - } else { - ruleInfo = " Delete " + rules.get(ruleIndex).getAnnotation().getType().getShortName() - + "(" + rules.get(ruleIndex).getAnnotation().getBegin() + "," - + rules.get(ruleIndex).getAnnotation().getEnd() + ")"; - } - TextRulerExampleDocument goldDoc = goldDocs.get(i); - TextRulerExampleDocument additionalDoc = additionalDocs.get(i); - sendStatusUpdateToDelegate("Testing " + ruleSet + ruleInfo + " on document " + (i + 1) - + " of " + goldDocs.size() + " : rule " + (ruleIndex + 1) + " of " + rules.size(), - TextRulerLearnerState.ML_RUNNING, false); - 
TextRulerStatisticsCollector sumC = new TextRulerStatisticsCollector(); - prepareTestCas(theTestCAS, goldDoc, additionalDoc); - testRuleOnDocument((TrabalRule) rules.get(ruleIndex), goldDoc, additionalDoc, sumC, - theTestCAS); - if (sums.size() > ruleIndex) { - sums.get(ruleIndex).add(sumC); - } else { - sums.add(sumC); + int counter = 0; + for (TrabalRule rule : rules) { + counter++; + String ruleString = rule.getRuleString(); + String ruleInfo = getRuleInfo(rule); + System.out.println("testing: " + ruleString); + if (inducedRules.containsKey(ruleString)) { + rule.setCoveringStatistics(inducedRules.get(ruleString)); + System.out.println("skipped..."); + } else { + for (int i = 0; i < goldDocs.size(); i++) { + TextRulerExampleDocument goldDoc = goldDocs.get(i); + TextRulerExampleDocument additionalDoc = additionalDocs.get(i); + sendStatusUpdateToDelegate("Testing " + ruleSet + ruleInfo + " on document " + (i + 1) + + " of " + goldDocs.size() + " : rule " + counter + " of " + rules.size(), + TextRulerLearnerState.ML_RUNNING, false); + TextRulerStatisticsCollector sumC = new TextRulerStatisticsCollector(); + prepareTestCas(theTestCAS, goldDoc, additionalDoc); + testRuleOnDocument(rule, goldDoc, additionalDoc, sumC, theTestCAS); + sums.get(counter - 1).add(sumC); + int n = sumC.getCoveredNegativesCount(); + int p = sumC.getCoveredPositivesCount(); + int pnorm = p; + if (pnorm == 0) { + pnorm = 1; + } + if (n / pnorm > maxErrorRate) { + System.out.println("stopped:" + sumC); + break; + } + + if (shouldAbort()) + return rules; } - if (shouldAbort()) - return rules; + TextRulerStatisticsCollector c = sums.get(counter - 1); + rule.setCoveringStatistics(sums.get(counter - 1)); + inducedRules.put(ruleString, c); } } for (int ruleIndex = 0; ruleIndex < rules.size(); ruleIndex++) { rules.get(ruleIndex).setCoveringStatistics(sums.get(ruleIndex)); } + GlobalCASSource.releaseCAS(theTestCAS); sums.clear(); return rules; } + private String getRuleInfo(TrabalRule rule) { + String 
ruleInfo; + if (rule.getAnnotation() != null && rule.getTargetAnnotation() != null) { + ruleInfo = " " + rule.getAnnotation().getType().getShortName() + "(" + + rule.getAnnotation().getBegin() + "," + rule.getAnnotation().getEnd() + ") -> " + + rule.getTargetAnnotation().getType().getShortName() + "(" + + rule.getTargetAnnotation().getBegin() + "," + rule.getTargetAnnotation().getEnd() + + ")"; + } else if (rule.getTargetAnnotation() != null) { + ruleInfo = " Annotate " + rule.getTargetAnnotation().getType().getShortName() + "(" + + rule.getTargetAnnotation().getBegin() + "," + rule.getTargetAnnotation().getEnd() + + ")"; + } else { + ruleInfo = " Delete " + rule.getAnnotation().getType().getShortName() + "(" + + rule.getAnnotation().getBegin() + "," + rule.getAnnotation().getEnd() + ")"; + } + return ruleInfo; + } + /** * Resets the test CAS and adds the annotations of the additional data. * @@ -1628,6 +1656,7 @@ public class TrabalLearner extends TextR for (AnnotationFS fs : additionalCas.getAnnotationIndex()) { Type t = testCas.getTypeSystem().getType(fs.getType().getName()); if (t != null) { + // TODO what about the features!! AnnotationFS createAnnotation = testCas.createAnnotation(t, fs.getBegin(), fs.getEnd()); testCas.addFsToIndexes(createAnnotation); } else { @@ -1823,7 +1852,6 @@ public class TrabalLearner extends TextR return getFileHeaderString(true) + result; } - // // @Override // public AnalysisEngine getAnalysisEngine() { // if (ae == null) { @@ -1859,7 +1887,7 @@ public class TrabalLearner extends TextR */ public TextRulerExampleDocumentSet getAdditionalDocuments() { if (additionalDocuments == null) { - if(!StringUtils.isBlank(additionalFolderPath)) { + if (!StringUtils.isBlank(additionalFolderPath)) { additionalDocuments = new TextRulerExampleDocumentSet(additionalFolderPath, casCache); } }