MetaMapPR.java

philgooch Wed, 20 Apr 2011 11:01:28 -0700

Revision: 13729
          http://gate.svn.sourceforge.net/gate/?rev=13729&view=rev
Author:   philgooch
Date:     2011-04-20 18:01:19 +0000 (Wed, 20 Apr 2011)


Log Message:
-----------
Removed lineBreakCount parameter as no longer required (text is now 
pre-chunked). Modified processWithMetaMap() to pre-chunk text into chunks 
separated by 2 or more line breaks, and strip leading space but keep track of 
start offsets for each chunk.

Modified Paths:
--------------
    gate/trunk/plugins/Tagger_MetaMap/src/gate/metamap/MetaMapPR.java

Modified: gate/trunk/plugins/Tagger_MetaMap/src/gate/metamap/MetaMapPR.java
===================================================================
--- gate/trunk/plugins/Tagger_MetaMap/src/gate/metamap/MetaMapPR.java   
2011-04-20 17:59:22 UTC (rev 13728)
+++ gate/trunk/plugins/Tagger_MetaMap/src/gate/metamap/MetaMapPR.java   
2011-04-20 18:01:19 UTC (rev 13729)
@@ -37,6 +37,7 @@
 import info.olteanu.interfaces.StringFilter;
 
 import java.util.regex.Pattern;
+import java.util.regex.Matcher;
 
 /** 
  * This class is the implementation of the resource METAMAP.
@@ -56,10 +57,8 @@
     private Boolean annotatePhrases; // annotate MetaMap phrase chunks with 
outputASType
     private String metaMapOptions;   // MetaMap command line string
     private Boolean annotateNegEx;  // output NegEx results as features to 
outputASType
-    private Integer linebreakCount;    // number of linebreak/whitespace 
characters between paragraph chunks
     private TaggerMode taggerMode;  // term tagger behaviour: all instances, 
first only, or first+coreferences
-    
-    
+
     @Override
     public Resource init() throws ResourceInstantiationException {
 
@@ -76,7 +75,6 @@
         return this;
     }
 
-    
     @Override
     public void execute() throws ExecutionException {
         MetaMapApi mmInst;
@@ -92,9 +90,6 @@
             throw new ExecutionException("outputASType parameter must be 
set!");
         }
 
-        if (linebreakCount == null || !(linebreakCount instanceof Integer)) {
-            linebreakCount = 2;
-        }
 
         // set up MetaMap API instance
         try {
@@ -123,7 +118,7 @@
                 // once through MetaMap
                 HashMap<Integer, ArrayList<Integer>> termMapById = new 
HashMap<Integer, ArrayList<Integer>>();        // map of annots with content 
duplicated by other annots, indexed by annot id
                 HashMap<String, Integer> termMapByString = new HashMap<String, 
Integer>();    // as above, but indexed by string content
-                
+
                 // iterate over each annot of type inputAnn
                 while (itr.hasNext()) {
 
@@ -175,8 +170,8 @@
                         } else {
                             // string annContent already processed, so keep a 
note
                             // of the annotation id - we'll copy over the 
MetaMap annotations later
-                            Integer uniqueId =  
termMapByString.get(annContent);
-                            ArrayList<Integer> dupAnnIds =  
termMapById.get(uniqueId);
+                            Integer uniqueId = termMapByString.get(annContent);
+                            ArrayList<Integer> dupAnnIds = 
termMapById.get(uniqueId);
                             dupAnnIds.add(annId);
                         }
                     } else {
@@ -232,7 +227,7 @@
             return;
         }
         Set keys = termMapById.keySet();
-        // System.out.println("Keys: " + keys);
+        
         Iterator keyIter = keys.iterator();
 
         while (keyIter.hasNext()) {
@@ -245,8 +240,6 @@
             fm.put("coreferences:", dupAnnIds);
             AnnotationSet mmAnns = 
outputAS.getContained(pAnn.getStartNode().getOffset(), 
pAnn.getEndNode().getOffset()).get(outputASType);
 
-            //System.out.println(pAnn.getId() + ":" + mmAnns.toString());
-
             // Copy all the MetaMap annots within mmAnns to the
             // unprocessed annotations that have the same string value
             // as the annots we processed through MetaMap
@@ -358,7 +351,6 @@
         return api;
     }
 
-
     /**
      * 
      * @param pcm - phrase chunk returned by MetaMap
@@ -452,8 +444,6 @@
         } // end for
     }
 
-
-
     /**
      *
      * @param pcm - phrase chunk returned by MetaMap
@@ -463,12 +453,12 @@
      * @throws Exception
      */
     public void processMappings(PCM pcm, List<Negation> negList, Long 
lngInitialOffset, Long lngEndOffset) throws Exception {
-        
+
         List<Mapping> mappings = pcm.getMappingList();
 
         // Mappings are ordered by score, so outputting the first item will 
give the mapping
         // with the highest score, if requested
-        if ( outputMode.equals(OutputMode.HighestMappingOnly) && ! 
(mappings.isEmpty()) ) {
+        if (outputMode.equals(OutputMode.HighestMappingOnly) && 
!(mappings.isEmpty())) {
             processEvents(mappings.get(0).getEvList(), negList, "Mapping", 
lngInitialOffset, lngEndOffset);
         } else {
             for (Mapping map : mappings) {
@@ -477,8 +467,6 @@
         }
     }
 
-
-    
     /**
      *
      * @param pcm - phrase chunk returned by MetaMap
@@ -528,8 +516,6 @@
         } // end for
     }
 
-
-
     /**
      * @param api - MetaMapApi instance
      * @param text - text to be annotated
@@ -543,24 +529,44 @@
          */
         List<Result> resultList = null;
 
-        String asciiText = filterNonAscii(normalizeString(text));
+        String asciiText = filterNonAscii(normalizeString(text)) + "\n\n";
 
-        resultList = api.processCitationsFromString(asciiText);
+        // Create a pattern that strips leading whitespace and chunks
+        // the text at boundaries of 2 or more line breaks
+        Pattern pattern = 
Pattern.compile("(?s)((\\p{Space}|\\p{Cntrl})*)(.+?)\n[\\s]*\n");
 
-        int resultLength = 0;
+        Matcher m = pattern.matcher(asciiText);
+        int iStart = 0;
+        String chunk = "";
+        int chunkLength = 0;
 
-        if (resultList != null) {
-            for (Result result : resultList) {
-                if (result != null) {
-                    this.processUtterances(result, lngInitialOffset + 
resultLength, (lngEndOffset == null) ? null : lngEndOffset + resultLength);
-                    // Need to assume linebreakCount line breaks between each 
result chunk.
-                    // The API processCitationsFromString() method chunks text 
in linebreak separated fragments,
-                    // but does not keep track of the offset of each fragment.
-                    resultLength = resultLength + 
result.getInputText().length() + linebreakCount;
-                } else {
-                    throw new Exception("NULL result instance! ");
+        while (m.find(iStart)) {
+            lngInitialOffset += m.group(1).length();
+            chunk = m.group(3);
+            chunkLength = chunk.length();
+
+            if (!chunk.trim().isEmpty()) {
+                resultList = api.processCitationsFromString(chunk);
+
+                int resultLength = 0;
+
+                if (resultList != null) {
+                    for (Result result : resultList) {
+                        if (result != null) {
+                            this.processUtterances(result, lngInitialOffset + 
resultLength, (lngEndOffset == null) ? null : lngEndOffset + resultLength);
+                            // We've pre-chunked the text now so this loop 
should only iterate once
+                            // but, if for some reason it iterates more often, 
we need to keep track
+                            // of the length of the previous result to update 
the start offset
+                            resultLength = resultLength + 
result.getInputText().length();
+                        } else {
+                            throw new Exception("NULL result instance! ");
+                        }
+                    }
                 }
             }
+
+            lngInitialOffset += chunkLength;
+            iStart = m.end(3);
         }
     }
 
@@ -696,17 +702,7 @@
         return annotateNegEx;
     }
 
-    @RunTime
-    @CreoleParameter(defaultValue = "2",
-    comment = "Number of linebreak/whitespace characters between paragraphs")
-    public void setLinebreakCount(Integer linebreakCount) {
-        this.linebreakCount = linebreakCount;
-    }
 
-    public Integer getLinebreakCount() {
-        return linebreakCount;
-    }
-
     @RunTime
     @CreoleParameter(defaultValue = "CoReference",
     comment = "Map first instance of a term only, first instance plus 
coreferences, or all instances independently")


This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.

------------------------------------------------------------------------------
Benefiting from Server Virtualization: Beyond Initial Workload 
Consolidation -- Increasing the use of server virtualization is a top
priority. Virtualization can reduce costs, simplify management, and improve 
application availability and disaster protection. Learn more about boosting 
the value of server virtualization. http://p.sf.net/sfu/vmware-sfdev2dev
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

[gate-cvs] SF.net SVN: gate:[13729] gate/trunk/plugins/Tagger_MetaMap/src/gate/metamap/MetaMapPR.java

Reply via email to