Revision: 15986
          http://gate.svn.sourceforge.net/gate/?rev=15986&view=rev
Author:   adamfunk
Date:     2012-07-27 16:03:25 +0000 (Fri, 27 Jul 2012)
Log Message:
-----------
The FG can now successfully map Lookups from the temp document back to
the real one even if they do not exactly line up with the input
annotation (e.g., Token) boundaries.  This allows matching to
TreeTagger output like Token.lemma == "Spur|Spuren".

Searching is still done with Arrays.binarySearch(...) for speed, but
this is carried out in the dedicated data structure
FlexGazMappingTable (which stores temp/real offsets & calculates
output offsets that must line up with input ones).  A match
transferred from the temp document is extended left/right as necessary
to match input annotations' start/end offsets (respectively) in the
real document.

Restored the suppression of output Lookups that fall entirely outside
the input annotations' boundaries (to deal with sections of text that
have been deliberately omitted from the AST, for example).

Modified Paths:
--------------
    gate/trunk/src/gate/creole/gazetteer/FlexibleGazetteer.java
    gate/trunk/src/gate/creole/gazetteer/NodePosition.java

Added Paths:
-----------
    gate/trunk/src/gate/creole/gazetteer/FlexGazMappingTable.java

Added: gate/trunk/src/gate/creole/gazetteer/FlexGazMappingTable.java
===================================================================
--- gate/trunk/src/gate/creole/gazetteer/FlexGazMappingTable.java               
                (rev 0)
+++ gate/trunk/src/gate/creole/gazetteer/FlexGazMappingTable.java       
2012-07-27 16:03:25 UTC (rev 15986)
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2012, The University of Sheffield.
+ * 
+ * This file is part of GATE (see http://gate.ac.uk/), and is free software,
+ * licenced under the GNU Library General Public License, Version 2, June1991.
+ * 
+ * A copy of this licence is included in the distribution in the file
+ * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
+ * $Id$
+ */
+package gate.creole.gazetteer;
+
+import java.util.*;
+
+public class FlexGazMappingTable {
+  /** Mappings keyed by temp start offset; on a clash the latest one wins. */
+  private final Map<Long, NodePosition> startMap;
+  /** Mappings keyed by temp end offset; on a clash the earliest one wins. */
+  private final Map<Long, NodePosition> endMap;
+  /** Sorted keys of startMap and endMap; only valid while updated is true. */
+  private long[] tempStartOffsets;
+  private long[] tempEndOffsets;
+  /** True when the sorted arrays reflect the current map contents. */
+  private boolean updated;
+  /** Number of mappings added so far (clashing offsets included). */
+  private int size;
+
+  /** Creates an empty table. */
+  public FlexGazMappingTable() {
+    startMap = new HashMap<Long, NodePosition>();
+    endMap = new HashMap<Long, NodePosition>();
+    tempStartOffsets = null;
+    tempEndOffsets = null;
+    size = 0;
+    updated = false;
+  }
+
+  /** Stores a mapping under both of its temp offsets and marks the sorted
+   *  search arrays as stale.  Assumes mappings arrive in document order (as
+   *  FlexibleGazetteer adds them): for equal temp starts the latest mapping
+   *  is kept, for equal temp ends the first one, which is what the two
+   *  getBestOriginal... look-ups are documented to return.  */
+  private void add(NodePosition mapping) {
+    startMap.put(mapping.getTempStartOffset(), mapping);
+    Long tempEnd = mapping.getTempEndOffset();
+    if (!endMap.containsKey(tempEnd)) {
+      endMap.put(tempEnd, mapping);
+    }
+    size++;
+    updated = false;
+  }
+
+  /** Returns a live view of the stored mappings, one per distinct temp
+   *  start offset, in no particular order.  */
+  public Collection<NodePosition> getMappings() {
+    return startMap.values();
+  }
+
+  /** Records that the original span (originalStart, originalEnd)
+   *  corresponds to the span (tempStart, tempEnd) in the temp document. */
+  public void add(long originalStart, long originalEnd, long tempStart, long tempEnd) {
+    add(new NodePosition(originalStart, originalEnd, tempStart, tempEnd));
+  }
+
+  /** Returns the number of mappings added so far. */
+  public int size() {
+    return this.size;
+  }
+
+  /** Returns true if no mapping has been added yet. */
+  public boolean isEmpty() {
+    return this.size == 0;
+  }
+
+  /** Rebuilds the sorted offset arrays if mappings were added since the
+   *  last rebuild.  Each array is sized from its own map: sizing both
+   *  from the add counter left spurious 0 entries whenever two mappings
+   *  shared a temp start offset, which corrupted the binary searches. */
+  private void update() {
+    if (updated) {
+      return;
+    }
+    tempStartOffsets = sortedKeys(startMap);
+    tempEndOffsets = sortedKeys(endMap);
+    updated = true;
+  }
+
+  /** Returns the keys of the given map as an ascending array. */
+  private static long[] sortedKeys(Map<Long, NodePosition> map) {
+    long[] keys = new long[map.size()];
+    int i = 0;
+    for (Long key : map.keySet()) {
+      keys[i++] = key.longValue();
+    }
+    Arrays.sort(keys);
+    return keys;
+  }
+
+  /** Find the start offset of the latest original annotation
+   *  that starts at or before this temporary annotation.
+   *  This method MUST return a valid original annotation
+   *  start offset or -1.
+   * @param tempStartOffset start offset of a match in the temp document
+   * @return the original start offset, or -1 if the match starts before
+   *  the first mapped annotation (or the table is empty)
+   */
+  public long getBestOriginalStart(long tempStartOffset) {
+    update();
+    int i = Arrays.binarySearch(tempStartOffsets, tempStartOffset);
+    // According to the binarySearch API, i = - insPt - 1
+    if (i == -1) {
+      // insPt == 0: we've undershot the first original annotation
+      return -1L;
+    }
+    if (i < 0) {
+      // No exact hit: take the position before the insertion point
+      i = - i - 2;
+    }
+    return startMap.get(tempStartOffsets[i]).getOriginalStartOffset();
+  }
+
+  /** Find the end offset of the first original annotation
+   *  that ends at or after this temporary annotation.  This method
+   *  MUST return a valid original annotation end offset or -1.
+   *
+   * @param tempEndOffset end offset of a match in the temp document
+   * @return the original end offset, or -1 if the match ends after the
+   *  last mapped annotation (or the table is empty)
+   */
+  public long getBestOriginalEnd(long tempEndOffset) {
+    update();
+    int i = Arrays.binarySearch(tempEndOffsets, tempEndOffset);
+    /* Acc. to the binarySearch API, i = - insPt - 1 when there is no
+     * exact hit.  An exact hit is any i >= 0: index 0 is a hit too, and
+     * treating it as a miss produced the array index -1 below.   */
+    if (i < 0) {
+      i = - i - 1;
+    }
+    if (i >= tempEndOffsets.length) {
+      // insertion point past the end: overshot the last annotation
+      return -1L;
+    }
+    return endMap.get(tempEndOffsets[i]).getOriginalEndOffset();
+  }
+
+  /** Debug dump to stdout; end shows -1 where the end array is shorter.  */
+  public void dump() {
+    update();
+    for (int i = 0 ; i < tempStartOffsets.length ; i++) {
+      long start = tempStartOffsets[i];
+      long end = (i < tempEndOffsets.length) ? tempEndOffsets[i] : -1L;
+      NodePosition m = startMap.get(start);
+      System.out.format("FGMT: %d, %d : o(%d, %d) t(%d, %d)\n", start, end,
+          m.getOriginalStartOffset(), m.getOriginalEndOffset(),
+          m.getTempStartOffset(), m.getTempEndOffset() );
+    }
+  }
+}


Property changes on: 
gate/trunk/src/gate/creole/gazetteer/FlexGazMappingTable.java
___________________________________________________________________
Added: svn:keywords
   + Id
Added: svn:eol-style
   + native

Modified: gate/trunk/src/gate/creole/gazetteer/FlexibleGazetteer.java
===================================================================
--- gate/trunk/src/gate/creole/gazetteer/FlexibleGazetteer.java 2012-07-27 
01:18:55 UTC (rev 15985)
+++ gate/trunk/src/gate/creole/gazetteer/FlexibleGazetteer.java 2012-07-27 
16:03:25 UTC (rev 15986)
@@ -1,7 +1,7 @@
 /*
  * FlexibleGazetteer.java
  * 
- * Copyright (c) 2004-2011, The University of Sheffield.
+ * Copyright (c) 2004-2012, The University of Sheffield.
  * 
  * This file is part of GATE (see http://gate.ac.uk/), and is free software,
  * licenced under the GNU Library General Public License, Version 2, June1991.
@@ -9,8 +9,8 @@
  * A copy of this licence is included in the distribution in the file
  * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
  * 
- * Niraj Aswani 02/2002 $Id: FlexibleGazetteer.java 14808 2011-12-19 13:42:09Z
- * adamfunk $
+ * Niraj Aswani 02/2002
+ * $Id$
  */
 package gate.creole.gazetteer;
 
@@ -28,12 +28,9 @@
 import gate.creole.ExecutionException;
 import gate.creole.ResourceInstantiationException;
 import gate.util.InvalidOffsetException;
-
-import java.util.Arrays;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 
+
 /**
  * <p>
  * Title: Flexible Gazetteer
@@ -55,9 +52,15 @@
  * @author niraj aswani
  * @version 1.0
  */
-public class FlexibleGazetteer extends AbstractLanguageAnalyser implements
-                                                               
ProcessingResource {
+public class FlexibleGazetteer extends AbstractLanguageAnalyser 
+  implements ProcessingResource {
+  
   private static final long serialVersionUID = -1023682327651886920L;
+  private static final String wrappedOutputASName = "Output";
+  private static final String wrappedInputASName = "Input";
+  
+  // SET TO false BEFORE CHECKING IN
+  private static final boolean DEBUG = false;
 
   /**
    * Does the actual loading and parsing of the lists. This method must be
@@ -96,15 +99,16 @@
       // keyVal[0] = annotation type
       // keyVal[1] = feature name
       // holds mapping for newly created annotations
-      Map<Long, NodePosition> annotationMappings =
-          new HashMap<Long, NodePosition>();
+      FlexGazMappingTable mappingTable = new FlexGazMappingTable();
       fireStatusChanged("Creating temporary Document for feature " + aFeature);
       StringBuilder newdocString =
           new StringBuilder(document.getContent().toString());
       // sort annotations
       List<Annotation> annotations =
           Utils.inDocumentOrder(inputAS.get(keyVal[0]));
+
       // remove duplicate annotations
+      // (this makes the reverse mapping much easier)
       removeOverlappingAnnotations(annotations);
       // initially no space is deducted
       int totalDeductedSpaces = 0;
@@ -129,31 +133,22 @@
         long newStartOffset = startOffset - totalDeductedSpaces;
         long newEndOffset = newStartOffset + newTokenValue.length();
         totalDeductedSpaces += lengthDifference;
-        // only include node if there's some difference in the offsets
-        if(startOffset != newStartOffset || endOffset != newEndOffset) {
-          // and make the entry for this
-          NodePosition mapping =
-              new NodePosition(startOffset, endOffset, newStartOffset,
-                  newEndOffset);
-          annotationMappings.put(newEndOffset, mapping);
-        }
+
+        mappingTable.add(startOffset, endOffset, newStartOffset, newEndOffset);
+        
         // and finally replace the actual string in the document
         // with the new document
         newdocString.replace((int)newStartOffset, (int)newStartOffset
             + (int)actualLength, newTokenValue);
       }
+
       // proceed only if there was any replacement Map
-      if(annotationMappings.isEmpty()) continue;
-      // storing end offsets of annotations in an array for quick
-      // lookup later on
-      long[] offsets = new long[annotationMappings.size()];
-      int index = 0;
-      for(Long aKey : annotationMappings.keySet()) {
-        offsets[index] = aKey;
-        index++;
-      }
-      // for binary search, offsets need to be in ascending order
-      Arrays.sort(offsets);
+      if(mappingTable.isEmpty()) continue;
+      
+      /* All the binary search stuff is done inside FlexGazMappingTable
+       * now, so it's guaranteed to return valid original annotation start
+       * and end offsets.       */
+      
       // otherwise create a temporary document for the new text
       Document tempDoc = null;
       // update the status
@@ -171,94 +166,73 @@
         tempDoc =
             (Document)Factory.createResource("gate.corpora.DocumentImpl",
                 params, features);
-      } catch(ResourceInstantiationException rie) {
-        throw new ExecutionException("Temporary document cannot be created",
-            rie);
+
+        /* Mark the temp document with the locations of the input annotations 
so
+         * that we can later eliminate Lookups that are out of scope.       */
+        for (NodePosition mapping : mappingTable.getMappings()) {
+          
tempDoc.getAnnotations(wrappedInputASName).add(mapping.getTempStartOffset(), 
+              mapping.getTempEndOffset(), "Input", Factory.newFeatureMap());
+        }
+      } 
+      catch(ResourceInstantiationException rie) {
+        throw new ExecutionException("Temporary document cannot be created", 
rie);
+      } 
+      catch(InvalidOffsetException e) {
+        throw new ExecutionException("Error duplicating Input annotations", e);
       }
       try {
         // lets create the gazetteer based on the provided gazetteer name
         gazetteerInst.setDocument(tempDoc);
-        gazetteerInst.setAnnotationSetName(this.outputASName);
+        gazetteerInst.setAnnotationSetName(wrappedOutputASName);
         fireStatusChanged("Executing Gazetteer...");
         gazetteerInst.execute();
         // now the tempDoc has been looked up, we need to shift the annotations
         // from this temp document to the original document
         fireStatusChanged("Transfering new annotations to the original 
one...");
-        AnnotationSet original = document.getAnnotations(outputASName);
-        // okay iterate over new annotations and transfer them back to
-        // the original document
-        for(Annotation currentLookup : tempDoc.getAnnotations(outputASName)) {
+        AnnotationSet originalDocOutput = 
document.getAnnotations(outputASName);
+        
+        if (DEBUG) {
+          mappingTable.dump();
+        }
+        
+        // Now iterate over the new annotations and transfer them from the 
+        // temp document back to the real one
+        for(Annotation currentLookup : 
tempDoc.getAnnotations(wrappedOutputASName)) {
           long tempStartOffset = Utils.start(currentLookup);
           long tempEndOffset = Utils.end(currentLookup);
-          long newStartOffset = tempStartOffset;
-          long newEndOffset = tempEndOffset;
-          long addedSpaces = 0;
-          // we find out the node before the current annotation's startoffset
-          // and it to find out the number of extra characters added
-          index = Arrays.binarySearch(offsets, newStartOffset);
-          // if index <0, the absolute position of it refers to the
-          // position after the node we want to access to
-          // find out the no. of extra characters added before the
-          // current position
-          if(index < 0) {
-            index = Math.abs(index) - 1;
-          }
-          if(index > 0) {
-            // go back one node
-            index--;
-            NodePosition node = annotationMappings.get(offsets[index]);
-            long oldEnd = node.getOriginalEndOffset();
-            addedSpaces = node.getNewEndOffset() - oldEnd;
-            newStartOffset -= addedSpaces;
-          }
-          // we are trying to find a node which holds information
-          // about the number of new characters added before
-          // the new end offset
-          index = Arrays.binarySearch(offsets, newEndOffset);
-          if(index < 0) {
-            index = Math.abs(index) - 1;
-          }
-          if(index >= 0) {
-            // if the index 0
-            // it means
-            // if points to the length of the array, it means,
-            // we need to refer to the last element
-            if(index == offsets.length) index--;
-            NodePosition node = annotationMappings.get(offsets[index]);
-            if(offsets[index] <= newEndOffset) {
-              long oldEnd = node.getOriginalEndOffset();
-              addedSpaces = node.getNewEndOffset() - oldEnd;
-            } else {
-              long oldStart = node.getOriginalStartOffset();
-              addedSpaces = node.getNewStartOffset() - oldStart;
-            }
-          }
-          newEndOffset -= addedSpaces;
-          try {
-            // before we do this, make sure there is no other annotation like
-            // this
-            AnnotationSet tempSet =
-                original.getContained(newStartOffset, newEndOffset).get(
-                    currentLookup.getType(), currentLookup.getFeatures());
-            boolean found = false;
-            for(Annotation annot : tempSet) {
-              if(Utils.start(annot) == newStartOffset
-                  && Utils.end(annot) == newEndOffset
-                  && annot.getFeatures().size() == currentLookup.getFeatures()
-                      .size()) {
-                found = true;
-                break;
+
+          /* Ignore annotations that fall entirely outside the input 
annotations,
+           * so that we don't get dodgy Lookups outside the area covered by
+           * Tokens copied into a restricted working set by the AST PR
+           * (for example)           */
+          if (coveredByInput(tempStartOffset, tempEndOffset, 
tempDoc.getAnnotations(wrappedInputASName)))  {
+            long destinationStart = 
mappingTable.getBestOriginalStart(tempStartOffset);
+            long destinationEnd = 
mappingTable.getBestOriginalEnd(tempEndOffset);
+
+            boolean valid = (destinationStart >= 0) && (destinationEnd >= 0);  
+
+            if (valid) {
+              // Now make sure there is no other annotation like this
+              AnnotationSet testSet = 
originalDocOutput.getContained(destinationStart, destinationEnd).get(
+                  currentLookup.getType(), currentLookup.getFeatures());
+              for(Annotation annot : testSet) {
+                if(Utils.start(annot) == destinationStart
+                    && Utils.end(annot) == destinationEnd
+                    && annot.getFeatures().size() == 
currentLookup.getFeatures().size()) {
+                  valid = false;
+                  break;
+                }
               }
             }
-            if(!found) {
-              original.add(newStartOffset, newEndOffset,
-                  currentLookup.getType(), currentLookup.getFeatures());
+            
+            if(valid) {
+              addToOriginal(originalDocOutput, destinationStart, 
destinationEnd, 
+                  tempStartOffset, tempEndOffset, currentLookup, tempDoc);
             }
-          } catch(InvalidOffsetException e) {
-            throw new ExecutionException(e);
-          }
+          } // END if coveredByInput(...)
         } // END for OVER ALL THE Lookups
-      } finally {
+      } 
+      finally {
         gazetteerInst.setDocument(null);
         if(tempDoc != null) {
           // now remove the newDoc
@@ -269,9 +243,10 @@
     fireProcessFinished();
   } // END execute METHOD
 
+  
   /**
    * Removes the overlapping annotations. preserves the one that appears first
-   * in the list
+   * in the list.  This assumes the list has been sorted already.
    * 
    * @param annotations
    */
@@ -288,6 +263,40 @@
     }
   }
 
+  
+  /* Adds an annotation with tempLookup's type and features to the original
+   * set at originalStart..originalEnd.  An InvalidOffsetException is rethrown
+   * as an ExecutionException whose message shows the temp and original spans. */
+  private void addToOriginal(AnnotationSet original, long originalStart, long 
originalEnd, 
+      long tempStart, long tempEnd, Annotation tempLookup, Document tempDoc) 
throws ExecutionException {
+    try {
+      original.add(originalStart, originalEnd, tempLookup.getType(), 
tempLookup.getFeatures());
+    }
+    catch(InvalidOffsetException ioe) {
+      String errorDetails = String.format("temp %d, %d [%s]-> original %d, %d  
", tempStart, tempEnd, Utils.stringFor(tempDoc, tempLookup), 
+          originalStart, originalEnd);
+      throw new ExecutionException(errorDetails, ioe);
+    }
+  }
+
+  
+  
+  /* Is this Lookup within the scope of the input annotations?  It might not
+   * be, if Token annotations have been copied by AST only over the significant
+   * sections of the document.  The start and the end point are checked in
+   * turn: getCovering must return a non-empty result for each of them.
+   */
+  private boolean coveredByInput(long tempStart, long tempEnd, AnnotationSet tempInputAS) {
+    boolean startCovered =
+        !tempInputAS.getCovering(wrappedInputASName, tempStart, tempStart).isEmpty();
+    if (!startCovered) {
+      return false;
+    }
+    // the start point is covered, so the end point alone decides the answer
+    return !tempInputAS.getCovering(wrappedInputASName, tempEnd, tempEnd).isEmpty();
+  }
+
+  
   /**
    * Sets the document to work on
    * 

Modified: gate/trunk/src/gate/creole/gazetteer/NodePosition.java
===================================================================
--- gate/trunk/src/gate/creole/gazetteer/NodePosition.java      2012-07-27 
01:18:55 UTC (rev 15985)
+++ gate/trunk/src/gate/creole/gazetteer/NodePosition.java      2012-07-27 
16:03:25 UTC (rev 15986)
@@ -37,10 +37,10 @@
   private long originalEndOffset;
 
   /** The new start offset after the changes */
-  private long newStartOffset;
+  private long tempStartOffset;
 
   /** The new end offset after the changes */
-  private long newEndOffset;
+  private long tempEndOffset;
 
   /**
    * constructor
@@ -52,8 +52,8 @@
   public NodePosition(long osn, long oen, long nsn, long nen) {
     originalStartOffset = osn;
     originalEndOffset = oen;
-    newStartOffset = nsn;
-    newEndOffset = nen;
+    tempStartOffset = nsn;
+    tempEndOffset = nen;
   }
 
   /**
@@ -76,16 +76,16 @@
    * Returns new start offset
    * @return  a <tt>long</tt> value.
    */
-  public long getNewStartOffset() {
-    return newStartOffset;
+  public long getTempStartOffset() {
+    return tempStartOffset;
   }
 
   /**
    * Returns the new end offset
    * @return a <tt>long</tt> value.
    */
-  public long getNewEndOffset() {
-    return newEndOffset;
+  public long getTempEndOffset() {
+    return tempEndOffset;
   }
 
 }
@@ -94,12 +94,12 @@
 class NodePositionComparator implements Comparator<NodePosition> {
 
   public int compare(NodePosition arg0, NodePosition arg1) {
-    long diff = arg0.getNewStartOffset() - arg1.getNewStartOffset();
+    long diff = arg0.getTempStartOffset() - arg1.getTempStartOffset();
     if (diff != 0L) {
       return (int) Long.signum(diff);
     }
     // implied else
-    diff = arg0.getNewEndOffset() - arg1.getNewEndOffset();
+    diff = arg0.getTempEndOffset() - arg1.getTempEndOffset();
     if (diff != 0L) {
       return (int) Long.signum(diff);
     }

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to