Author: rec
Date: Tue Aug  9 10:51:04 2016
New Revision: 1755595

URL: http://svn.apache.org/viewvc?rev=1755595&view=rev
Log:
[UIMA-5046] Faster indexCovered implementation
- Added new implementation (typically approx. twice as fast as previous)
- Code currently still contains a lot of debugging lines (inactive) and an 
alternative purging method (inactive) to be removed later
- Unit test improved to report performance also for indexing and index lookups
- Increased number of iterations in randomized unit test to check new 
implementation more thoroughly

Modified:
    
uima/uimafit/trunk/uimafit-core/src/main/java/org/apache/uima/fit/util/CasUtil.java
    
uima/uimafit/trunk/uimafit-core/src/test/java/org/apache/uima/fit/util/JCasUtilTest.java

Modified: 
uima/uimafit/trunk/uimafit-core/src/main/java/org/apache/uima/fit/util/CasUtil.java
URL: 
http://svn.apache.org/viewvc/uima/uimafit/trunk/uimafit-core/src/main/java/org/apache/uima/fit/util/CasUtil.java?rev=1755595&r1=1755594&r2=1755595&view=diff
==============================================================================
--- 
uima/uimafit/trunk/uimafit-core/src/main/java/org/apache/uima/fit/util/CasUtil.java
 (original)
+++ 
uima/uimafit/trunk/uimafit-core/src/main/java/org/apache/uima/fit/util/CasUtil.java
 Tue Aug  9 10:51:04 2016
@@ -26,15 +26,16 @@ import static java.util.Collections.empt
 import static java.util.Collections.unmodifiableList;
 import static java.util.Collections.unmodifiableMap;
 
+import java.util.ArrayDeque;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.Deque;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
-
 import org.apache.uima.cas.ArrayFS;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.CASRuntimeException;
@@ -130,7 +131,13 @@ public final class CasUtil {
     }
     final Type type = aCas.getTypeSystem().getType(typeName);
     if (type == null) {
-      throw new IllegalArgumentException("Undeclared type [" + aTypename + 
"]");
+      StringBuilder sb = new StringBuilder();
+      Iterator<Type> i = aCas.getTypeSystem().getTypeIterator();
+      while (i.hasNext()) {
+        sb.append(i.next().getName()).append('\n');
+      }
+      throw new IllegalArgumentException("Undeclared type [" + aTypename + "]. 
Available types: "
+              + sb);
     }
     return type;
   }
@@ -772,16 +779,159 @@ public final class CasUtil {
         }
       }
     };
-    for (AnnotationFS s : select(cas, type)) {
-      for (AnnotationFS u : selectCovered(cas, coveredType, s)) {
-        Collection<AnnotationFS> c = index.get(s);
-        if (c == EMPTY_LIST) {
-          c = new LinkedList<AnnotationFS>();
-          index.put(s, c);
+    
+    // Get the covering and covered annotations
+    Collection<AnnotationFS> collType = select(cas, type);
+    Collection<AnnotationFS> collCoveredType = select(cas, coveredType);
+    
+    // Convert them into array for faster access
+    AnnotationFS[] typeArray = collType.toArray(new 
AnnotationFS[collType.size()]);
+    AnnotationFS[] coveredTypeArray = collCoveredType
+            .toArray(new AnnotationFS[collCoveredType.size()]);
+    
+    // Keeps currently "open" annotations in a sorted order
+    Deque<AnnotationFS> memory = new ArrayDeque<>();
+    Deque<AnnotationFS> memory2 = new ArrayDeque<>();
+    
+    // Array cursors
+    int o = 0;
+    int i = 0;
+    
+    // Whether to log debug information
+    boolean debug = false;
+    
+    while ((o < typeArray.length || !memory.isEmpty()) && i < 
coveredTypeArray.length) {
+      // Always make sure the memory contains at least one covering annotation 
to compare against
+      if (memory.isEmpty()) {
+        memory.push(typeArray[o]);
+        if (debug) {
+          System.out.printf("NEXT OUTER%n");
+        }
+        o++;
+      }
+      AnnotationFS bottom = memory.peek();
+      
+      // Fast-forward over annotations which cannot be covered by the bottom 
element because they
+      // start earlier.
+      AnnotationFS iFS = coveredTypeArray[i];
+      while (i < coveredTypeArray.length-1 && iFS.getBegin() < 
bottom.getBegin()) {
+        if (debug) {
+          System.out.printf("Skipping inner: %d %d%n", iFS.getBegin(), 
iFS.getEnd());
         }
-        c.add(u);
+        i++;
+        iFS = coveredTypeArray[i];
+      }
+      
+      // Cache begin/end of current covered annotation for faster access
+      int iFSbegin = iFS.getBegin();
+      int iFSend = iFS.getEnd();
+
+      // If there is any chance that the covering annotations in the memory 
contains the current
+      // covered ...
+      if (bottom.getBegin() <= iFS.getBegin()) {
+        // ... collect as many potentially covering annotations as possible 
for the current covered
+        // annotation
+        while (o < typeArray.length && typeArray[o].getBegin() <= iFSbegin) {
+          memory.push(typeArray[o]);
+          o++;
+        }
+        
+        if (debug) {
+          StringBuilder sb2 = new StringBuilder();
+          for (AnnotationFS f : memory) {
+            sb2.append(f.getBegin()).append("-").append(f.getEnd()).append(" 
");
+          }
+          System.out.printf("OUTER 1: %s%n", sb2);
+          System.out.printf("INNER  : %d-%d%n", iFSbegin, iFSend);
+        }
+        
+        // Record covered annotations
+        for (AnnotationFS covering : memory) {
+          if (covering.getBegin() <= iFSbegin && iFS.getEnd() <= 
covering.getEnd()) {
+            Collection<AnnotationFS> c = index.get(covering);
+            if (c == EMPTY_LIST) {
+              c = new LinkedList<AnnotationFS>();
+              index.put(covering, c);
+            }
+            c.add(iFS);
+            if (debug) {
+              System.out.printf("-- %d-%d covers %d-%d%n", 
covering.getBegin(), covering.getEnd(),
+                      iFSbegin, iFSend);
+            }
+          }
+          else {
+            if (debug) {
+              System.out.printf("-- %d-%d NOT covers %d-%d%n", 
covering.getBegin(),
+                      covering.getEnd(), iFSbegin, iFSend);
+            }
+          }
+        }
+        
+        if (debug) {
+          System.out.printf("NEXT INNER 1%n");
+        }
+        i++;
+      }
+      else {
+        if (debug) {
+          StringBuilder sb = new StringBuilder();
+          for (AnnotationFS f : memory) {
+            sb.append(f.getBegin()).append("-").append(f.getEnd()).append(" ");
+          }
+          System.out.printf("OUTER 2: %s%n", sb);
+          System.out.printf("INNER  : %d-%d%n", iFSbegin, iFS.getEnd());
+          System.out.printf("HOPELESS!%n");
+          
+          System.out.printf("NEXT INNER 2%n");
+        }
+        i++;
+      }
+        
+      // Purge covering annotations from memory that cannot match anymore 
given the currently
+      int purgeImpl = 0;
+      // exampled covered annotation
+      if (purgeImpl == 0) {
+        // Alternative implementation: re-uses memory
+        Iterator<AnnotationFS> purgeIterator = memory.iterator();
+        while (purgeIterator.hasNext()) {
+          AnnotationFS purgeCandidate = purgeIterator.next();
+          if (purgeCandidate.getEnd() < iFS.getBegin()) {
+            if (debug) {
+              System.out.printf("Dropping: %d-%d%n", purgeCandidate.getBegin(),
+                      purgeCandidate.getEnd());
+            }
+            purgeIterator.remove();
+          }
+        }
+      }
+      
+      // Alternative implementation: uses more memory but faster?
+      if (purgeImpl == 1) {
+        memory2.clear();
+        for (AnnotationFS purgeCandidate : memory) {
+          if (purgeCandidate.getEnd() < iFS.getBegin()) {
+            if (debug) {
+              System.out.printf("Dropping: %d-%d%n", purgeCandidate.getBegin(),
+                      purgeCandidate.getEnd());
+            }
+          }
+          else {
+            memory2.add(purgeCandidate);
+          }
+        }
+        
+        // Swap
+        Deque<AnnotationFS> buf = memory;
+        memory = memory2;
+        memory2 = buf;
+      }
+      
+      if (debug) {
+        System.out.printf("outer: %d/%d  inner: %d/%d%n", o, typeArray.length, 
i,
+                coveredTypeArray.length);
       }
     }
+
     return unmodifiableMap(index);
   }
 

Modified: 
uima/uimafit/trunk/uimafit-core/src/test/java/org/apache/uima/fit/util/JCasUtilTest.java
URL: 
http://svn.apache.org/viewvc/uima/uimafit/trunk/uimafit-core/src/test/java/org/apache/uima/fit/util/JCasUtilTest.java?rev=1755595&r1=1755594&r2=1755595&view=diff
==============================================================================
--- 
uima/uimafit/trunk/uimafit-core/src/test/java/org/apache/uima/fit/util/JCasUtilTest.java
 (original)
+++ 
uima/uimafit/trunk/uimafit-core/src/test/java/org/apache/uima/fit/util/JCasUtilTest.java
 Tue Aug  9 10:51:04 2016
@@ -125,7 +125,7 @@ public class JCasUtilTest extends Compon
 
   @Test
   public void testSelectCoverRandom() throws Exception {
-    final int ITERATIONS = 10;
+    final int ITERATIONS = 25;
 
     for (int i = 0; i < ITERATIONS; i++) {
       CAS cas = jCas.getCas();
@@ -134,24 +134,41 @@ public class JCasUtilTest extends Compon
       JCas jcas = cas.getJCas();
       long timeNaive = 0;
       long timeOptimized = 0;
-      for (Token t : select(jcas, Token.class)) {
+      
+      // Prepare the index
+      long timeIndexed = System.currentTimeMillis();
+      Map<Sentence, Collection<Token>> index = indexCovered(jcas, 
Sentence.class, Token.class);
+      timeIndexed = System.currentTimeMillis() - timeIndexed;
+      
+      for (Sentence t : select(jcas, Sentence.class)) {
         long ti = System.currentTimeMillis();
         // The naive approach is assumed to be correct
-        List<Sentence> stem1 = selectCovered(jcas, Sentence.class, 
t.getBegin(), t.getEnd());
+        List<Token> expected = selectCovered(jcas, Token.class, t.getBegin(), 
t.getEnd());
         timeNaive += System.currentTimeMillis() - ti;
 
+        // Record time for optimized selectCovered
         ti = System.currentTimeMillis();
-        List<Sentence> stem2 = selectCovered(jcas, Sentence.class, t);
+        List<Token> actual1 = selectCovered(jcas, Token.class, t);
         timeOptimized += System.currentTimeMillis() - ti;
 
-        Collection<Sentence> stem3 = indexCovered(jcas, Token.class, 
Sentence.class).get(t);
+        // Record index lookup time
+        ti = System.currentTimeMillis();
+        Collection<Token> actual2 = index.get(t);
+        timeIndexed += System.currentTimeMillis() - ti;
 
-        check(jcas, t, stem1, stem2);
-        check(jcas, t, stem1, stem3);
+        check(jcas, t, expected, actual1);
+        check(jcas, t, expected, actual2);
+        
+        // System.out.printf("%n--- OK ---------------%n%n");
       }
-      System.out.format("Speed up factor %.2f [naive:%d optimized:%d 
diff:%d]\n",
-              (double) timeNaive / (double) timeOptimized, timeNaive, 
timeOptimized, timeNaive
-                      - timeOptimized);
+      System.out.printf(
+              "%3d Optimized: speed up factor %3.2f [naive:%4d optimized:%4d 
(diff:%4d)]%n", i,
+              (double) timeNaive / (double) timeOptimized, timeNaive, 
timeOptimized,
+              timeNaive - timeOptimized);
+      System.out.printf(
+              "%3d Indexed:   speed up factor %3.2f [naive:%4d indexed  :%4d 
(diff:%4d)]%n%n", i,
+              (double) timeNaive / (double) timeIndexed, timeNaive, 
timeIndexed,
+              timeNaive - timeIndexed);
     }
   }
 
@@ -274,7 +291,7 @@ public class JCasUtilTest extends Compon
     return t;
   }
 
-  private void check(JCas jcas, Token t, Collection<? extends Annotation> a1,
+  private void check(JCas jcas, Annotation t, Collection<? extends Annotation> 
a1,
           Collection<? extends Annotation> a2) {
     // List<Annotation> annos = new ArrayList<Annotation>();
     // FSIterator fs = jcas.getAnnotationIndex().iterator();


Reply via email to