Revision: 19047
          http://sourceforge.net/p/gate/code/19047
Author:   ian_roberts
Date:     2015-12-23 14:08:05 +0000 (Wed, 23 Dec 2015)
Log Message:
-----------
Pulled out the common parts of pubmed and cochrane formats into something 
parameterizable, so we can use it as a basis for other formats that use the 
same database-style "KEY: value" type of syntax.

Modified Paths:
--------------
    gate/trunk/plugins/Format_PubMed/build.xml
    gate/trunk/plugins/Format_PubMed/src/gate/corpora/CochraneTextDocumentFormat.java
    gate/trunk/plugins/Format_PubMed/src/gate/corpora/PubmedTextDocumentFormat.java

Modified: gate/trunk/plugins/Format_PubMed/build.xml
===================================================================
--- gate/trunk/plugins/Format_PubMed/build.xml  2015-12-23 02:22:20 UTC (rev 19046)
+++ gate/trunk/plugins/Format_PubMed/build.xml  2015-12-23 14:08:05 UTC (rev 19047)
@@ -37,8 +37,8 @@
   <target name="compile" depends="init"
         description="compile the source " >
     <!-- Compile the java code from ${src} into ${build} -->
-    <javac srcdir="${src}" destdir="${build}" source="1.6" target="1.6"
-           classpathref="compile.classpath">
+    <javac srcdir="${src}" destdir="${build}" source="1.7" target="1.7"
+           classpathref="compile.classpath" debug="true" debuglevel="lines,source">
        <compilerarg value="-Xmaxwarns" />
                <compilerarg value="${gate.compile.maxwarnings}" />
                <compilerarg value="-Xlint:all" />
@@ -55,7 +55,7 @@
              classpathref="compile.classpath"
              encoding="UTF-8"
              windowtitle="${plugin.name} JavaDoc"
-             source="1.6">
+             source="1.7">
       <sourcepath>
         <pathelement location="${src}" />
       </sourcepath>

Modified: gate/trunk/plugins/Format_PubMed/src/gate/corpora/CochraneTextDocumentFormat.java
===================================================================
--- gate/trunk/plugins/Format_PubMed/src/gate/corpora/CochraneTextDocumentFormat.java   2015-12-23 02:22:20 UTC (rev 19046)
+++ gate/trunk/plugins/Format_PubMed/src/gate/corpora/CochraneTextDocumentFormat.java   2015-12-23 14:08:05 UTC (rev 19047)
@@ -15,29 +15,15 @@
  */
 package gate.corpora;
 
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.Serializable;
-import java.io.StringReader;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.log4j.Logger;
-
-import gate.AnnotationSet;
-import gate.Document;
-import gate.Factory;
-import gate.GateConstants;
 import gate.Resource;
 import gate.creole.ResourceInstantiationException;
 import gate.creole.metadata.AutoInstance;
+import gate.creole.metadata.CreoleParameter;
 import gate.creole.metadata.CreoleResource;
 import gate.util.DocumentFormatException;
-import gate.util.InvalidOffsetException;
-import gate.util.Strings;
 
+import java.util.List;
+
 /**
  * A document format analyser for Cochrane text documents. Use mime type value 
  * "text/x-cochrane", or file extension ".cochrane.txt" to access this document
@@ -49,7 +35,7 @@
       "the correct file extension.", 
   autoinstances = {@AutoInstance(hidden=true)},
   isPrivate = true)
-public class CochraneTextDocumentFormat extends TextualDocumentFormat {
+public class CochraneTextDocumentFormat extends PubmedTextDocumentFormat {
 
   private static final long serialVersionUID = 8362288605943414676L;
 
@@ -61,19 +47,7 @@
   
   private static final String COCHRANE_ID = "ID";
   
-  protected static final Logger logger = Logger.getLogger(
-      CochraneTextDocumentFormat.class);
-  
   /* (non-Javadoc)
-   * @see gate.DocumentFormat#supportsRepositioning()
-   */
-  @Override
-  public Boolean supportsRepositioning() {
-    return false;
-  }
-
-  
-  /* (non-Javadoc)
    * @see gate.corpora.TextualDocumentFormat#init()
    */
   @Override
@@ -104,127 +78,23 @@
     mimeString2mimeTypeMap.remove(mime.getType() + "/" + mime.getSubtype());
     suffixes2mimeTypeMap.remove("cochrane.txt");
   }
+
+  @CreoleParameter(defaultValue = "(?<CODE>[A-Z]+): (?<VALUE>.*)")
+  public void setFieldPattern(String fieldPattern) {
+    super.setFieldPattern(fieldPattern);
+  }
   
-  /* (non-Javadoc)
-   * @see gate.corpora.TextualDocumentFormat#unpackMarkup(gate.Document)
-   */
-  @Override
-  public void unpackMarkup(Document doc) throws DocumentFormatException {
-    try {
-      BufferedReader content = new BufferedReader(new StringReader(
-          doc.getContent().toString()));
-      Map<String, Serializable> fields = new HashMap<String, Serializable>();
-      String line = content.readLine();
-      String key = null;
-      StringBuilder value = new StringBuilder();
-      Pattern linePatt =  Pattern.compile("([A-Z]+): (.*)");
-      while(line!= null) {
-        Matcher matcher = linePatt.matcher(line);
-        if(matcher.matches()) {
-          // new field
-          if(key != null) {
-            // save old value
-            PubmedUtils.addFieldValue(key, value.toString(), fields);
-          }
-          key = matcher.group(1).trim();
-          value.delete(0, value.length());
-          value.append(matcher.group(2));
-        } else {
-          // a non-assignment line -> append to previous value
-          if(value.length() == 0) {
-            logger.warn("Ignoring invalid input line:\""  +
-              line +  "\"");
-          } else {
-            value.append(Strings.getNl()).append(line.trim());
-          }
-        }
-        line = content.readLine();
-      }
-      if(key != null) {
-        // save old value
-        PubmedUtils.addFieldValue(key, value.toString(), fields);
-      }
-      StringBuilder docText = new StringBuilder();
-      // add document title
-      int titleStart = docText.length();
-      int titleEnd = titleStart;
-      Serializable aField = fields.remove(COCHRANE_TITLE);
-      if(aField != null) {
-        docText.append(PubmedUtils.getFieldValueString(aField));
-        titleEnd = docText.length();
-        docText.append(Strings.getNl()).append(Strings.getNl());
-      } else {
-        String docName = doc.getName();  
-        logger.warn("Could not find document title in document " + 
-            (docName != null ? docName : ""));
-      }
-      // add ID
-      int idStart = docText.length();
-      int idEnd = idStart;
-      aField = fields.get(COCHRANE_ID);
-      if(aField != null) {
-        docText.append(PubmedUtils.getFieldValueString(aField));
-        idEnd = docText.length();
-        docText.append(Strings.getNl()).append(Strings.getNl());
-      } else {
-        String docName = doc.getName();  
-        logger.warn("Could not find document ID in document " + 
-            (docName != null ? docName : ""));
-      }      
-      // add authors
-      int authorStart = docText.length();
-      int authorEnd = authorStart;
-      aField = fields.get(COCHRANE_AUTHORS);
-      if(aField != null) {
-        docText.append(PubmedUtils.getFieldValueString(aField));
-        authorEnd = docText.length();
-        docText.append(Strings.getNl()).append(Strings.getNl());
-      } else {
-        String docName = doc.getName();  
-        logger.warn("Could not find document authors in document " + 
-            (docName != null ? docName : ""));
-      }
-      // and the document abstract
-      aField = fields.remove(COCHRANE_ABSTRACT);
-      int absStart = docText.length();
-      if(aField != null) {
-        docText.append(PubmedUtils.getFieldValueString(aField));
-      } else {
-        String docName = doc.getName();  
-        logger.warn("Could not find document abstract in document " + 
-            (docName != null ? docName : ""));
-      }
-      int absEnd = docText.length();
-      doc.setContent(new DocumentContentImpl(docText.toString()));
-      
-      AnnotationSet origMkups = doc.getAnnotations(
-          GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
-      if(titleEnd > titleStart){
-        origMkups.add((long)titleStart, (long)titleEnd, "title", 
-            Factory.newFeatureMap());
-      }
-      if(idEnd > idStart){
-        origMkups.add((long)idStart, (long)idEnd, "id", 
-            Factory.newFeatureMap());
-      }      
-      if(authorEnd > authorStart) {
-        origMkups.add((long)authorStart, (long)authorEnd, "authors", 
-            Factory.newFeatureMap());
-      }      
-      if(absEnd > absStart) {
-        origMkups.add((long)absStart, (long)absEnd, "abstract", 
-            Factory.newFeatureMap());
-      }
-      // everything else becomes document features
-      doc.getFeatures().putAll(fields);
-    } catch(IOException e) {
-      throw new DocumentFormatException("Error while unpacking markup",e); 
-    } catch(InvalidOffsetException e) {
-      throw new DocumentFormatException("Error while unpacking markup",e);
-    }
-    
-    // now let the text unpacker also do its job
-    super.unpackMarkup(doc);
-  }  
-  
+
+  @CreoleParameter(defaultValue = COCHRANE_TITLE + "=title;" + COCHRANE_ID +
+      "=id;" + COCHRANE_AUTHORS + "=authors;" + COCHRANE_ABSTRACT +
+      "=abstract")
+  public void setFieldsForText(List<String> fieldsForText) {
+    super.setFieldsForText(fieldsForText);
+  }
+
+  @CreoleParameter(defaultValue = COCHRANE_TITLE + ";" + COCHRANE_ABSTRACT)
+  public void setExcludeFromFeatures(List<String> excludeFromFeatures) {
+    super.setExcludeFromFeatures(excludeFromFeatures);
+  }
+
 }

Modified: gate/trunk/plugins/Format_PubMed/src/gate/corpora/PubmedTextDocumentFormat.java
===================================================================
--- gate/trunk/plugins/Format_PubMed/src/gate/corpora/PubmedTextDocumentFormat.java     2015-12-23 02:22:20 UTC (rev 19046)
+++ gate/trunk/plugins/Format_PubMed/src/gate/corpora/PubmedTextDocumentFormat.java     2015-12-23 14:08:05 UTC (rev 19047)
@@ -22,7 +22,9 @@
 import gate.Resource;
 import gate.creole.ResourceInstantiationException;
 import gate.creole.metadata.AutoInstance;
+import gate.creole.metadata.CreoleParameter;
 import gate.creole.metadata.CreoleResource;
+import gate.creole.metadata.Optional;
 import gate.util.DocumentFormatException;
 import gate.util.InvalidOffsetException;
 import gate.util.Strings;
@@ -32,6 +34,7 @@
 import java.io.Serializable;
 import java.io.StringReader;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -64,6 +67,14 @@
   protected static final Logger logger = Logger.getLogger(
       PubmedTextDocumentFormat.class);
   
+  protected String fieldPattern;
+
+  protected String ignorePattern;
+
+  protected List<String> fieldsForText;
+
+  protected List<String> excludeFromFeatures;
+
   /* (non-Javadoc)
    * @see gate.DocumentFormat#supportsRepositioning()
    */
@@ -72,6 +83,57 @@
     return false;
   }
 
+  @CreoleParameter(comment = "Regular expression that matches the (whole of the) "
+      + "first line of a new field. The expression should include two named "
+      + "capturing groups, <CODE> for the field code and <VALUE> for the "
+      + "field value.", defaultValue = "(?<CODE>....)- (?<VALUE>.*)")
+  public void setFieldPattern(String fieldPattern) {
+    this.fieldPattern = fieldPattern;
+  }
+
+  public String getFieldPattern() {
+    return fieldPattern;
+  }
+
+  @Optional
+  @CreoleParameter(comment = "Regular expression that matches (the whole of) "
+      + "any lines that should be silently ignored.  If unspecified, all "
+      + "lines are considered.")
+  public void setIgnorePattern(String ignorePattern) {
+    this.ignorePattern = ignorePattern;
+  }
+
+  public String getIgnorePattern() {
+    return ignorePattern;
+  }
+
+  @CreoleParameter(comment = "Fields which should be mapped into the document "
+      + "text.  Each entry in this list should be a string of the form "
+      + "fieldcode=annotationtype, the corresponding fields will be "
+      + "concatenated together, separated by blank lines, to form the content "
+      + "of the unpacked document, and each will be covered by an annotation of "
+      + "the appropriate type in the Original markups set.",
+      defaultValue = PUBMED_TITLE + "=title;" + PUBMED_ID + "=id;" +
+                PUBMED_AUTHORS + "=authors;" + PUBMED_ABSTRACT + "=abstract")
+  public void setFieldsForText(List<String> fieldsForText) {
+    this.fieldsForText = fieldsForText;
+  }
+
+  public List<String> getFieldsForText() {
+    return fieldsForText;
+  }
+
+  @CreoleParameter(comment = "Fields which should not be mapped to document "
+      + "features. All fields found in the text which are not mentioned here "
+      + "will be stored as features on the document.",
+      defaultValue = PUBMED_TITLE + ";" + PUBMED_ABSTRACT)
+  public void setExcludeFromFeatures(List<String> excludeFromFeatures) {
+    this.excludeFromFeatures = excludeFromFeatures;
+  }
+
+  public List<String> getExcludeFromFeatures() {
+    return excludeFromFeatures;
+  }
   
   /* (non-Javadoc)
    * @see gate.corpora.TextualDocumentFormat#init()
@@ -118,25 +180,32 @@
       String line = content.readLine();
       String key = null;
       StringBuilder value = new StringBuilder();
-      Pattern linePatt =  Pattern.compile("(....)- (.*)");
+      Pattern ignorePatt = null;
+      if(ignorePattern != null) {
+        ignorePatt = Pattern.compile(ignorePattern);
+      }
+      Pattern linePatt =  Pattern.compile(fieldPattern);
       while(line!= null) {
-        Matcher matcher = linePatt.matcher(line);
-        if(matcher.matches()) {
-          // new field
-          if(key != null) {
-            // save old value
-            PubmedUtils.addFieldValue(key, value.toString(), fields);
-          }
-          key = matcher.group(1).trim();
-          value.delete(0, value.length());
-          value.append(matcher.group(2));
-        } else {
-          // a non-assignment line -> append to previous value
-          if(value.length() == 0) {
-            logger.warn("Ignoring invalid input line:\""  +
-               line +  "\"");
+        // skip ignorable lines
+        if(ignorePatt == null || !ignorePatt.matcher(line).matches()) {
+          Matcher matcher = linePatt.matcher(line);
+          if(matcher.matches()) {
+            // new field
+            if(key != null) {
+              // save old value
+              PubmedUtils.addFieldValue(key, value.toString(), fields);
+            }
+            key = matcher.group("CODE").trim();
+            value.delete(0, value.length());
+            value.append(matcher.group("VALUE"));
           } else {
-            value.append(Strings.getNl()).append(line.trim());
+            // a non-assignment line -> append to previous value
+            if(value.length() == 0) {
+              logger.warn("Ignoring invalid input line:\""  +
+                  line +       "\"");
+            } else {
+              value.append(Strings.getNl()).append(line.trim());
+            }
           }
         }
         line = content.readLine();
@@ -146,77 +215,41 @@
         PubmedUtils.addFieldValue(key, value.toString(), fields);
       }
       StringBuilder docText = new StringBuilder();
-      // add document title
-      int titleStart = docText.length();
-      int titleEnd = titleStart;
-      Serializable aField = fields.remove(PUBMED_TITLE);
-      if(aField != null) {
-        docText.append(PubmedUtils.getFieldValueString(aField));
-        titleEnd = docText.length();
-        docText.append(Strings.getNl()).append(Strings.getNl());
-      } else {
-        String docName = doc.getName();  
-        logger.warn("Could not find document title in document " + 
-            (docName != null ? docName : ""));
+      // build document text
+      int[] starts = new int[fieldsForText.size()];
+      int[] ends = new int[fieldsForText.size()];
+      for(int i = 0; i < fieldsForText.size(); i++) {
+        String[] field = fieldsForText.get(i).split("=", 2);
+        starts[i] = docText.length();
+        ends[i] = starts[i];
+        Serializable aField = fields.get(field[0]);
+        if(aField != null) {
+          docText.append(PubmedUtils.getFieldValueString(aField));
+          ends[i] = docText.length();
+          docText.append(Strings.getNl()).append(Strings.getNl());
+        } else {
+          String docName = doc.getName();  
+          logger.warn("Could not find " + field[1] + " in document " + 
+              (docName != null ? docName : ""));
+        }
       }
-      // add ID
-      int idStart = docText.length();
-      int idEnd = idStart;
-      aField = fields.get(PUBMED_ID);
-      if(aField != null) {
-        docText.append(PubmedUtils.getFieldValueString(aField));
-        idEnd = docText.length();
-        docText.append(Strings.getNl()).append(Strings.getNl());
-      } else {
-        String docName = doc.getName();  
-        logger.warn("Could not find document ID in document " + 
-            (docName != null ? docName : ""));
-      }
-      // add authors
-      int authorStart = docText.length();
-      int authorEnd = authorStart;
-      aField = fields.get(PUBMED_AUTHORS);
-      if(aField != null) {
-        docText.append(PubmedUtils.getFieldValueString(aField));
-        authorEnd = docText.length();
-        docText.append(Strings.getNl()).append(Strings.getNl());
-      } else {
-        String docName = doc.getName();  
-        logger.warn("Could not find document authors in document " + 
-            (docName != null ? docName : ""));
-      }
-      // and the document abstract
-      aField = fields.remove(PUBMED_ABSTRACT);
-      int absStart = docText.length();
-      if(aField != null) {
-        docText.append(PubmedUtils.getFieldValueString(aField));
-      } else {
-        String docName = doc.getName();  
-        logger.warn("Could not find document abstract in document " + 
-            (docName != null ? docName : ""));
-      }
-      int absEnd = docText.length();
+
       doc.setContent(new DocumentContentImpl(docText.toString()));
       
       AnnotationSet origMkups = doc.getAnnotations(
           GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
-      if(titleEnd > titleStart){
-        origMkups.add((long)titleStart, (long)titleEnd, "title", 
-            Factory.newFeatureMap());
+      for(int i = 0; i < fieldsForText.size(); i++) {
+        String[] field = fieldsForText.get(i).split("=", 2);
+        if(ends[i] > starts[i]) {
+          origMkups.add((long)starts[i], (long)ends[i], field[1], 
+              Factory.newFeatureMap());
+        }
       }
-      if(idEnd > idStart){
-        origMkups.add((long)idStart, (long)idEnd, "id", 
-            Factory.newFeatureMap());
+
+      // everything else becomes document features
+      for(String keyToExclude : excludeFromFeatures) {
+        fields.remove(keyToExclude);
       }
-      if(authorEnd > authorStart) {
-        origMkups.add((long)authorStart, (long)authorEnd, "authors", 
-            Factory.newFeatureMap());
-      }
-      if(absEnd > absStart) {
-        origMkups.add((long)absStart, (long)absEnd, "abstract", 
-            Factory.newFeatureMap());
-      }
-      // everything else becomes document features
       doc.getFeatures().putAll(fields);
     } catch(IOException e) {
       throw new DocumentFormatException("Error while unpacking markup",e); 

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to