Revision: 19047 http://sourceforge.net/p/gate/code/19047 Author: ian_roberts Date: 2015-12-23 14:08:05 +0000 (Wed, 23 Dec 2015) Log Message: ----------- Pulled out the common parts of pubmed and cochrane formats into something parameterizable, so we can use it as a basis for other formats that use the same database-style "KEY: value" type of syntax.
Modified Paths: -------------- gate/trunk/plugins/Format_PubMed/build.xml gate/trunk/plugins/Format_PubMed/src/gate/corpora/CochraneTextDocumentFormat.java gate/trunk/plugins/Format_PubMed/src/gate/corpora/PubmedTextDocumentFormat.java Modified: gate/trunk/plugins/Format_PubMed/build.xml =================================================================== --- gate/trunk/plugins/Format_PubMed/build.xml 2015-12-23 02:22:20 UTC (rev 19046) +++ gate/trunk/plugins/Format_PubMed/build.xml 2015-12-23 14:08:05 UTC (rev 19047) @@ -37,8 +37,8 @@ <target name="compile" depends="init" description="compile the source " > <!-- Compile the java code from ${src} into ${build} --> - <javac srcdir="${src}" destdir="${build}" source="1.6" target="1.6" - classpathref="compile.classpath"> + <javac srcdir="${src}" destdir="${build}" source="1.7" target="1.7" + classpathref="compile.classpath" debug="true" debuglevel="lines,source"> <compilerarg value="-Xmaxwarns" /> <compilerarg value="${gate.compile.maxwarnings}" /> <compilerarg value="-Xlint:all" /> @@ -55,7 +55,7 @@ classpathref="compile.classpath" encoding="UTF-8" windowtitle="${plugin.name} JavaDoc" - source="1.6"> + source="1.7"> <sourcepath> <pathelement location="${src}" /> </sourcepath> Modified: gate/trunk/plugins/Format_PubMed/src/gate/corpora/CochraneTextDocumentFormat.java =================================================================== --- gate/trunk/plugins/Format_PubMed/src/gate/corpora/CochraneTextDocumentFormat.java 2015-12-23 02:22:20 UTC (rev 19046) +++ gate/trunk/plugins/Format_PubMed/src/gate/corpora/CochraneTextDocumentFormat.java 2015-12-23 14:08:05 UTC (rev 19047) @@ -15,29 +15,15 @@ */ package gate.corpora; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.Serializable; -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.log4j.Logger; - -import gate.AnnotationSet; -import gate.Document; -import gate.Factory; -import gate.GateConstants; import gate.Resource; import gate.creole.ResourceInstantiationException; import gate.creole.metadata.AutoInstance; +import gate.creole.metadata.CreoleParameter; import gate.creole.metadata.CreoleResource; import gate.util.DocumentFormatException; -import gate.util.InvalidOffsetException; -import gate.util.Strings; +import java.util.List; + /** * A document format analyser for Cochrane text documents. Use mime type value * "text/x-cochrane", or file extension ".cochrane.txt" to access this document @@ -49,7 +35,7 @@ "the correct file extension.", autoinstances = {@AutoInstance(hidden=true)}, isPrivate = true) -public class CochraneTextDocumentFormat extends TextualDocumentFormat { +public class CochraneTextDocumentFormat extends PubmedTextDocumentFormat { private static final long serialVersionUID = 8362288605943414676L; @@ -61,19 +47,7 @@ private static final String COCHRANE_ID = "ID"; - protected static final Logger logger = Logger.getLogger( - CochraneTextDocumentFormat.class); - /* (non-Javadoc) - * @see gate.DocumentFormat#supportsRepositioning() - */ - @Override - public Boolean supportsRepositioning() { - return false; - } - - - /* (non-Javadoc) * @see gate.corpora.TextualDocumentFormat#init() */ @Override @@ -104,127 +78,23 @@ mimeString2mimeTypeMap.remove(mime.getType() + "/" + mime.getSubtype()); suffixes2mimeTypeMap.remove("cochrane.txt"); } + + @CreoleParameter(defaultValue = "(?<CODE>[A-Z]+): (?<VALUE>.*)") + public void setFieldPattern(String fieldPattern) { + super.setFieldPattern(fieldPattern); + } - /* (non-Javadoc) - * @see gate.corpora.TextualDocumentFormat#unpackMarkup(gate.Document) - */ - @Override - public void unpackMarkup(Document doc) throws DocumentFormatException { - try { - BufferedReader content = new BufferedReader(new StringReader( - doc.getContent().toString())); - Map<String, Serializable> fields = new HashMap<String, Serializable>(); - String line = content.readLine(); - String key = null; - StringBuilder value = new StringBuilder(); - Pattern linePatt = Pattern.compile("([A-Z]+): (.*)"); - while(line!= null) { - Matcher matcher = linePatt.matcher(line); - if(matcher.matches()) { - // new field - if(key != null) { - // save old value - PubmedUtils.addFieldValue(key, value.toString(), fields); - } - key = matcher.group(1).trim(); - value.delete(0, value.length()); - value.append(matcher.group(2)); - } else { - // a non-assignment line -> append to previous value - if(value.length() == 0) { - logger.warn("Ignoring invalid input line:\"" + - line + "\""); - } else { - value.append(Strings.getNl()).append(line.trim()); - } - } - line = content.readLine(); - } - if(key != null) { - // save old value - PubmedUtils.addFieldValue(key, value.toString(), fields); - } - StringBuilder docText = new StringBuilder(); - // add document title - int titleStart = docText.length(); - int titleEnd = titleStart; - Serializable aField = fields.remove(COCHRANE_TITLE); - if(aField != null) { - docText.append(PubmedUtils.getFieldValueString(aField)); - titleEnd = docText.length(); - docText.append(Strings.getNl()).append(Strings.getNl()); - } else { - String docName = doc.getName(); - logger.warn("Could not find document title in document " + - (docName != null ? docName : "")); - } - // add ID - int idStart = docText.length(); - int idEnd = idStart; - aField = fields.get(COCHRANE_ID); - if(aField != null) { - docText.append(PubmedUtils.getFieldValueString(aField)); - idEnd = docText.length(); - docText.append(Strings.getNl()).append(Strings.getNl()); - } else { - String docName = doc.getName(); - logger.warn("Could not find document ID in document " + - (docName != null ? docName : "")); - } - // add authors - int authorStart = docText.length(); - int authorEnd = authorStart; - aField = fields.get(COCHRANE_AUTHORS); - if(aField != null) { - docText.append(PubmedUtils.getFieldValueString(aField)); - authorEnd = docText.length(); - docText.append(Strings.getNl()).append(Strings.getNl()); - } else { - String docName = doc.getName(); - logger.warn("Could not find document authors in document " + - (docName != null ? docName : "")); - } - // and the document abstract - aField = fields.remove(COCHRANE_ABSTRACT); - int absStart = docText.length(); - if(aField != null) { - docText.append(PubmedUtils.getFieldValueString(aField)); - } else { - String docName = doc.getName(); - logger.warn("Could not find document abstract in document " + - (docName != null ? docName : "")); - } - int absEnd = docText.length(); - doc.setContent(new DocumentContentImpl(docText.toString())); - - AnnotationSet origMkups = doc.getAnnotations( - GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); - if(titleEnd > titleStart){ - origMkups.add((long)titleStart, (long)titleEnd, "title", - Factory.newFeatureMap()); - } - if(idEnd > idStart){ - origMkups.add((long)idStart, (long)idEnd, "id", - Factory.newFeatureMap()); - } - if(authorEnd > authorStart) { - origMkups.add((long)authorStart, (long)authorEnd, "authors", - Factory.newFeatureMap()); - } - if(absEnd > absStart) { - origMkups.add((long)absStart, (long)absEnd, "abstract", - Factory.newFeatureMap()); - } - // everything else becomes document features - doc.getFeatures().putAll(fields); - } catch(IOException e) { - throw new DocumentFormatException("Error while unpacking markup",e); - } catch(InvalidOffsetException e) { - throw new DocumentFormatException("Error while unpacking markup",e); - } - - // now let the text unpacker also do its job - super.unpackMarkup(doc); - } - + + @CreoleParameter(defaultValue = COCHRANE_TITLE + "=title;" + COCHRANE_ID + + "=id;" + COCHRANE_AUTHORS + "=authors;" + COCHRANE_ABSTRACT + + "=abstract") + public void setFieldsForText(List<String> fieldsForText) { + super.setFieldsForText(fieldsForText); + } + + @CreoleParameter(defaultValue = COCHRANE_TITLE + ";" + COCHRANE_ABSTRACT) + public void setExcludeFromFeatures(List<String> excludeFromFeatures) { + super.setExcludeFromFeatures(excludeFromFeatures); + } + } Modified: gate/trunk/plugins/Format_PubMed/src/gate/corpora/PubmedTextDocumentFormat.java =================================================================== --- gate/trunk/plugins/Format_PubMed/src/gate/corpora/PubmedTextDocumentFormat.java 2015-12-23 02:22:20 UTC (rev 19046) +++ gate/trunk/plugins/Format_PubMed/src/gate/corpora/PubmedTextDocumentFormat.java 2015-12-23 14:08:05 UTC (rev 19047) @@ -22,7 +22,9 @@ import gate.Resource; import gate.creole.ResourceInstantiationException; import gate.creole.metadata.AutoInstance; +import gate.creole.metadata.CreoleParameter; import gate.creole.metadata.CreoleResource; +import gate.creole.metadata.Optional; import gate.util.DocumentFormatException; import gate.util.InvalidOffsetException; import gate.util.Strings; @@ -32,6 +34,7 @@ import java.io.Serializable; import java.io.StringReader; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -64,6 +67,14 @@ protected static final Logger logger = Logger.getLogger( PubmedTextDocumentFormat.class); + protected String fieldPattern; + + protected String ignorePattern; + + protected List<String> fieldsForText; + + protected List<String> excludeFromFeatures; + /* (non-Javadoc) * @see gate.DocumentFormat#supportsRepositioning() */ @@ -72,6 +83,57 @@ return false; } + @CreoleParameter(comment = "Regular expression that matches the (whole of the) " + + "first line of a new field. The expression should include two named " + + "capturing groups, <CODE> for the field code and <VALUE> for the " + + "field value.", defaultValue = "(?<CODE>....)- (?<VALUE>.*)") + public void setFieldPattern(String fieldPattern) { + this.fieldPattern = fieldPattern; + } + + public String getFieldPattern() { + return fieldPattern; + } + + @Optional + @CreoleParameter(comment = "Regular expression that matches (the whole of) " + + "any lines that should be silently ignored. If unspecified, all " + + "lines are considered.") + public void setIgnorePattern(String ignorePattern) { + this.ignorePattern = ignorePattern; + } + + public String getIgnorePattern() { + return ignorePattern; + } + + @CreoleParameter(comment = "Fields which should be mapped into the document " + + "text. Each entry in this list should be a string of the form " + + "fieldcode=annotationtype, the corresponding fields will be " + + "concatenated together, separated by blank lines, to form the content " + + "of the unpacked document, and each will be covered by an annotation of " + + "the appropriate type in the Original markups set.", + defaultValue = PUBMED_TITLE + "=title;" + PUBMED_ID + "=id;" + + PUBMED_AUTHORS + "=authors;" + PUBMED_ABSTRACT + "=abstract") + public void setFieldsForText(List<String> fieldsForText) { + this.fieldsForText = fieldsForText; + } + + public List<String> getFieldsForText() { + return fieldsForText; + } + + @CreoleParameter(comment = "Fields which should not be mapped to document " + + "features. All fields found in the text which are not mentioned here " + + "will be stored as features on the document.", + defaultValue = PUBMED_TITLE + ";" + PUBMED_ABSTRACT) + public void setExcludeFromFeatures(List<String> excludeFromFeatures) { + this.excludeFromFeatures = excludeFromFeatures; + } + + public List<String> getExcludeFromFeatures() { + return excludeFromFeatures; + } /* (non-Javadoc) * @see gate.corpora.TextualDocumentFormat#init() @@ -118,25 +180,32 @@ String line = content.readLine(); String key = null; StringBuilder value = new StringBuilder(); - Pattern linePatt = Pattern.compile("(....)- (.*)"); + Pattern ignorePatt = null; + if(ignorePattern != null) { + ignorePatt = Pattern.compile(ignorePattern); + } + Pattern linePatt = Pattern.compile(fieldPattern); while(line!= null) { - Matcher matcher = linePatt.matcher(line); - if(matcher.matches()) { - // new field - if(key != null) { - // save old value - PubmedUtils.addFieldValue(key, value.toString(), fields); - } - key = matcher.group(1).trim(); - value.delete(0, value.length()); - value.append(matcher.group(2)); - } else { - // a non-assignment line -> append to previous value - if(value.length() == 0) { - logger.warn("Ignoring invalid input line:\"" + - line + "\""); + // skip ignorable lines + if(ignorePatt == null || !ignorePatt.matcher(line).matches()) { + Matcher matcher = linePatt.matcher(line); + if(matcher.matches()) { + // new field + if(key != null) { + // save old value + PubmedUtils.addFieldValue(key, value.toString(), fields); + } + key = matcher.group("CODE").trim(); + value.delete(0, value.length()); + value.append(matcher.group("VALUE")); } else { - value.append(Strings.getNl()).append(line.trim()); + // a non-assignment line -> append to previous value + if(value.length() == 0) { + logger.warn("Ignoring invalid input line:\"" + + line + "\""); + } else { + value.append(Strings.getNl()).append(line.trim()); + } } } line = content.readLine(); @@ -146,77 +215,41 @@ PubmedUtils.addFieldValue(key, value.toString(), fields); } StringBuilder docText = new StringBuilder(); - // add document title - int titleStart = docText.length(); - int titleEnd = titleStart; - Serializable aField = fields.remove(PUBMED_TITLE); - if(aField != null) { - docText.append(PubmedUtils.getFieldValueString(aField)); - titleEnd = docText.length(); - docText.append(Strings.getNl()).append(Strings.getNl()); - } else { - String docName = doc.getName(); - logger.warn("Could not find document title in document " + - (docName != null ? docName : "")); + // build document text + int[] starts = new int[fieldsForText.size()]; + int[] ends = new int[fieldsForText.size()]; + for(int i = 0; i < fieldsForText.size(); i++) { + String[] field = fieldsForText.get(i).split("=", 2); + starts[i] = docText.length(); + ends[i] = starts[i]; + Serializable aField = fields.get(field[0]); + if(aField != null) { + docText.append(PubmedUtils.getFieldValueString(aField)); + ends[i] = docText.length(); + docText.append(Strings.getNl()).append(Strings.getNl()); + } else { + String docName = doc.getName(); + logger.warn("Could not find " + field[1] + " in document " + + (docName != null ? docName : "")); + } } - // add ID - int idStart = docText.length(); - int idEnd = idStart; - aField = fields.get(PUBMED_ID); - if(aField != null) { - docText.append(PubmedUtils.getFieldValueString(aField)); - idEnd = docText.length(); - docText.append(Strings.getNl()).append(Strings.getNl()); - } else { - String docName = doc.getName(); - logger.warn("Could not find document ID in document " + - (docName != null ? docName : "")); - } - // add authors - int authorStart = docText.length(); - int authorEnd = authorStart; - aField = fields.get(PUBMED_AUTHORS); - if(aField != null) { - docText.append(PubmedUtils.getFieldValueString(aField)); - authorEnd = docText.length(); - docText.append(Strings.getNl()).append(Strings.getNl()); - } else { - String docName = doc.getName(); - logger.warn("Could not find document authors in document " + - (docName != null ? docName : "")); - } - // and the document abstract - aField = fields.remove(PUBMED_ABSTRACT); - int absStart = docText.length(); - if(aField != null) { - docText.append(PubmedUtils.getFieldValueString(aField)); - } else { - String docName = doc.getName(); - logger.warn("Could not find document abstract in document " + - (docName != null ? docName : "")); - } - int absEnd = docText.length(); + doc.setContent(new DocumentContentImpl(docText.toString())); AnnotationSet origMkups = doc.getAnnotations( GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); - if(titleEnd > titleStart){ - origMkups.add((long)titleStart, (long)titleEnd, "title", - Factory.newFeatureMap()); + for(int i = 0; i < fieldsForText.size(); i++) { + String[] field = fieldsForText.get(i).split("=", 2); + if(ends[i] > starts[i]) { + origMkups.add((long)starts[i], (long)ends[i], field[1], + Factory.newFeatureMap()); + } } - if(idEnd > idStart){ - origMkups.add((long)idStart, (long)idEnd, "id", - Factory.newFeatureMap()); + + // everything else becomes document features + for(String keyToExclude : excludeFromFeatures) { + fields.remove(keyToExclude); } - if(authorEnd > authorStart) { - origMkups.add((long)authorStart, (long)authorEnd, "authors", - Factory.newFeatureMap()); - } - if(absEnd > absStart) { - origMkups.add((long)absStart, (long)absEnd, "abstract", - Factory.newFeatureMap()); - } - // everything else becomes document features doc.getFeatures().putAll(fields); } catch(IOException e) { throw new DocumentFormatException("Error while unpacking markup",e); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs