OriginalMarkupMetadataHelper.java

valyt Sat, 15 Feb 2014 10:25:05 -0800

Revision: 17314
          http://sourceforge.net/p/gate/code/17314
Author:   valyt
Date:     2014-02-15 18:24:09 +0000 (Sat, 15 Feb 2014)
Log Message:
-----------
Patch to allow the storing of zero-length tags.


Modified Paths:
--------------
    
mimir/trunk/mimir-core/src/gate/mimir/index/OriginalMarkupMetadataHelper.java

Modified: 
mimir/trunk/mimir-core/src/gate/mimir/index/OriginalMarkupMetadataHelper.java
===================================================================
--- 
mimir/trunk/mimir-core/src/gate/mimir/index/OriginalMarkupMetadataHelper.java   
    2014-02-15 16:00:06 UTC (rev 17313)
+++ 
mimir/trunk/mimir-core/src/gate/mimir/index/OriginalMarkupMetadataHelper.java   
    2014-02-15 18:24:09 UTC (rev 17314)
@@ -29,7 +29,6 @@
 import java.util.SortedMap;
 import java.util.TreeMap;
 
-
 import gate.Annotation;
 import gate.AnnotationSet;
 import gate.GateConstants;
@@ -89,22 +88,22 @@
     //key = token offset for close tag
     //value: list of tag IDs that end at that location
     SortedMap<Integer, LinkedList<String>> spansToEnd = 
-      new TreeMap<Integer, LinkedList<String>>();
-    Iterator<int[]> tagIter = docTags.tags != null ? 
-            docTags.tags.iterator() : null;
+        new TreeMap<Integer, LinkedList<String>>();
+    Iterator<int[]> tagIter =
+        docTags.tags != null ? docTags.tags.iterator() : null;
     int[] currentTag = (tagIter != null && tagIter.hasNext()) ? 
             tagIter.next() : null;
     Iterator<Binding> hitIter = hits != null ? hits.iterator() : null;
     Binding currentHit = (hitIter != null && hitIter.hasNext()) ? 
             hitIter.next() : null;
-    for(int tokIdx = 0; tokIdx < tokens.length; tokIdx++){
-      if(docTags != null){
+    for(int tokIdx = 0; tokIdx < tokens.length; tokIdx++) {
+      if(docTags != null) {
         //check if we need to open any tags here
         while((currentTag != null && currentTag[1] == tokIdx) ||
-              (currentHit != null && currentHit.getTermPosition() == tokIdx)){
+              (currentHit != null && currentHit.getTermPosition() == tokIdx)) {
           //we need to open a tag or a hit
           if(currentTag != null && currentTag[1] == tokIdx &&
-             currentHit != null && currentHit.getTermPosition() == tokIdx){
+             currentHit != null && currentHit.getTermPosition() == tokIdx) {
             //we have both a tag and a hit, starting at the same position
             //we start the one that ends later, with a preference for a tag
             //(as hits should be inner-most)
@@ -114,19 +113,25 @@
               output.append(openingTag);
               
               String closingTag = getClosingTag(openingTag);
-              LinkedList<String> spans = spansToEnd.get(currentTag[2]);
-              if(spans == null){
-                spans = new LinkedList<String>();
-                spansToEnd.put(currentTag[2], spans);
+              if(currentTag[1] == currentTag[2]) {
+                // zero-length tag
+                output.append(closingTag);
+              } else {
+                // queue the closing tag for later
+                LinkedList<String> spans = spansToEnd.get(currentTag[2]);
+                if(spans == null){
+                  spans = new LinkedList<String>();
+                  spansToEnd.put(currentTag[2], spans);
+                }
+                spans.addFirst(closingTag);                
               }
-              spans.addFirst(closingTag);
               //consume the tag
               currentTag = (tagIter != null && tagIter.hasNext()) ? 
                       tagIter.next() : null;
             }else{
               //consume the HIT
               output.append(HIT_OPENING_TAG);
-              int spanEnd = currentHit.getTermPosition() + 
currentHit.getLength() -1; 
+              int spanEnd = currentHit.getTermPosition() + 
currentHit.getLength(); 
               LinkedList<String> spans = spansToEnd.get(spanEnd);
               if(spans == null){
                 spans = new LinkedList<String>();
@@ -142,19 +147,25 @@
             String openingTag = docTags.tagDescriptors.get(currentTag[0]);
             output.append(openingTag);
             String closingTag = getClosingTag(openingTag);
-            LinkedList<String> spans = spansToEnd.get(currentTag[2]);
-            if(spans == null){
-              spans = new LinkedList<String>();
-              spansToEnd.put(currentTag[2], spans);
+            if(currentTag[1] == currentTag[2]) {
+              // zero-length tag
+              output.append(closingTag);
+            } else {
+              // queue the closing tag for later
+              LinkedList<String> spans = spansToEnd.get(currentTag[2]);
+              if(spans == null){
+                spans = new LinkedList<String>();
+                spansToEnd.put(currentTag[2], spans);
+              }
+              spans.addFirst(closingTag);                
             }
-            spans.addFirst(closingTag);
             //consume the tag
             currentTag = (tagIter != null && tagIter.hasNext()) ? 
                     tagIter.next() : null;
           } else {
             //we only have a HIT to use
             output.append(HIT_OPENING_TAG);
-            int spanEnd = currentHit.getTermPosition() + 
currentHit.getLength() -1;
+            int spanEnd = currentHit.getTermPosition() + 
currentHit.getLength();
             LinkedList<String> spans = spansToEnd.get(spanEnd);
             if(spans == null){
               spans = new LinkedList<String>();
@@ -167,11 +178,10 @@
           }
         }
       }
-      //write the token
+      // write the token
       output.append(tokens[tokIdx]);
-      
-      //check if we need to close any spans here
-      while(spansToEnd.size() > 0 && spansToEnd.firstKey() == tokIdx){
+      // check if we need to close any tags here
+      while(spansToEnd.size() > 0 && spansToEnd.firstKey() == tokIdx + 1){
         LinkedList<String> closingTags = 
spansToEnd.remove(spansToEnd.firstKey());
         for(String aTag : closingTags){
           output.append(aTag);
@@ -179,8 +189,11 @@
       }
       //write the non-token, if any
       if(tokIdx < nonTokens.length) output.append(nonTokens[tokIdx]);
-
     }
+    // write the last nonToken, if any
+    if(tokens.length <= nonTokens.length){
+      output.append(nonTokens[tokens.length - 1]);
+    }
   }
 
   /* (non-Javadoc)
@@ -211,27 +224,36 @@
     for(int tokIdx = 0; tokIdx < tokens.length; tokIdx++){
       long tokStart = tokens[tokIdx].getStartNode().getOffset();
       long tokEnd = tokens[tokIdx].getEndNode().getOffset();
-      //see if there are any tags to close at this offset
-      while(tagsToEnd.size() > 0 && tagsToEnd.firstKey() <= tokStart){
+      // see if there are any tags to close at this offset
+      Long firstTagEnd = tagsToEnd.isEmpty() ? null : tagsToEnd.firstKey();
+      while(tagsToEnd.size() > 0 && firstTagEnd <= tokStart) {
         //get all tags ending inside the previous token or the space before the
         //current token
-        LinkedList<Integer> tags = tagsToEnd.remove(tagsToEnd.firstKey());
+        LinkedList<Integer> tags = tagsToEnd.remove(firstTagEnd);
         for(int aTag : tags){
-          documentTags.tags.get(aTag)[2] = tokIdx -1;
+          documentTags.tags.get(aTag)[2] = tokIdx;
         }
+        firstTagEnd = tagsToEnd.isEmpty() ? null : tagsToEnd.firstKey();
       }
       //see if we need to save any tags at this offset
       while(currentTag != null){
         if(tagStart < tokEnd){
           //the current tag starts within the current token
           int tagDescId = getTagId(currentTag, documentTags);
-          documentTags.tags.add(new int[]{tagDescId, tokIdx, -1});
-          LinkedList<Integer> tagsEnding = tagsToEnd.get(tagEnd);
-          if(tagsEnding == null){
-            tagsEnding = new LinkedList<Integer>();
-            tagsToEnd.put(tagEnd, tagsEnding);
+          int[] newTag = new int[]{tagDescId, tokIdx, -1};
+          documentTags.tags.add(newTag);
+          // if the new tag is zero-length, we actually know its ending position
+          if(tagEnd <= tokStart) {
+            newTag[2] = tokIdx;  
+          } else {
+            // we queue it, and we'll find the end position later
+            LinkedList<Integer> tagsEnding = tagsToEnd.get(tagEnd);
+            if(tagsEnding == null){
+              tagsEnding = new LinkedList<Integer>();
+              tagsToEnd.put(tagEnd, tagsEnding);
+            }
+            tagsEnding.addFirst(documentTags.tags.size() -1);            
           }
-          tagsEnding.addFirst(documentTags.tags.size() -1);
           //update the current tag
           currentTag = tagsiter.hasNext() ? tagsiter.next() : null;
           tagStart = currentTag == null ? -1 : 
currentTag.getStartNode().getOffset();
@@ -244,7 +266,7 @@
     }//for tokens
     while(tagsToEnd.size() > 0){
       //we did not close all tags yet
-      int tokIdx = tokens.length -1;
+      int tokIdx = tokens.length;
       LinkedList<Integer> tags = tagsToEnd.remove(tagsToEnd.firstKey());
       for(int aTag : tags){
         documentTags.tags.get(aTag)[2] = tokIdx;
@@ -256,7 +278,7 @@
       //token
       int tokIdx = tokens.length -1;
       int tagDescId = getTagId(currentTag, documentTags);
-      documentTags.tags.add(new int[]{tagDescId, tokIdx, tokIdx});
+      documentTags.tags.add(new int[]{tagDescId, tokIdx, tokIdx + 1});
       //update the current tag
       currentTag = tagsiter.hasNext() ? tagsiter.next() : null;
       tagStart = currentTag == null ? -1 : 
currentTag.getStartNode().getOffset();
@@ -408,7 +430,10 @@
    * <ol>
    *   <li>the index in the {@link #tagDescriptors} array for the tag</li>
    *   <li>the start offset for the tag (in terms of token position);</li>
-   *   <li>the end offset for the tag (in terms of token position);</li>
+   *   <li>the end offset for the tag (in terms of token position); This 
+   *   corresponds to the first token that is <strong>not</strong> part of the
+   *   tag, hence it could point to a non-existent token for tags that include
+   *   the last token in the document.</li>
    * </ol>
    * 
    */
@@ -442,6 +467,19 @@
       }
     }
     
+    @Override
+    public String toString() {
+      StringBuffer str = new StringBuffer();
+      boolean first = true;
+      for(int[] aTag : tags) {
+        if(first) first = false;
+        else str.append(' ');
+        str.append(tagDescriptors.get(aTag[0])).append('(').append(aTag[1])
+            .append(':').append(aTag[2]).append(')');
+      }
+      return str.toString();
+    }
+
     /**
      * A set used internally to ensure uniqueness of the tag descriptors. 
      */

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Android apps run on BlackBerry 10
Introducing the new BlackBerry 10.2.1 Runtime for Android apps.
Now with support for Jelly Bean, Bluetooth, Mapview and more.
Get your Android app in front of a whole new audience.  Start now.
http://pubads.g.doubleclick.net/gampad/clk?id=124407151&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

[gate-cvs] SF.net SVN: gate:[17314] mimir/trunk/mimir-core/src/gate/mimir/index/ OriginalMarkupMetadataHelper.java

Reply via email to