Revision: 17222
          http://sourceforge.net/p/gate/code/17222
Author:   markagreenwood
Date:     2014-01-08 17:16:30 +0000 (Wed, 08 Jan 2014)
Log Message:
-----------
the new document normalizer plugin

Added Paths:
-----------
    gate/trunk/plugins/DocumentNormalizer/
    gate/trunk/plugins/DocumentNormalizer/build.xml
    gate/trunk/plugins/DocumentNormalizer/creole.xml
    gate/trunk/plugins/DocumentNormalizer/doc/
    gate/trunk/plugins/DocumentNormalizer/doc/README.TXT
    gate/trunk/plugins/DocumentNormalizer/resources/
    gate/trunk/plugins/DocumentNormalizer/resources/replacements.lst
    gate/trunk/plugins/DocumentNormalizer/src/
    gate/trunk/plugins/DocumentNormalizer/src/gate/
    gate/trunk/plugins/DocumentNormalizer/src/gate/creole/
    
gate/trunk/plugins/DocumentNormalizer/src/gate/creole/DocumentNormalizer.java

Added: gate/trunk/plugins/DocumentNormalizer/build.xml
===================================================================
--- gate/trunk/plugins/DocumentNormalizer/build.xml                             
(rev 0)
+++ gate/trunk/plugins/DocumentNormalizer/build.xml     2014-01-08 17:16:30 UTC 
(rev 17222)
@@ -0,0 +1,77 @@
+<project name="DocumentNormalizer" basedir=".">
+       <property file="build.properties" />
+
+       <property name="gate.home" location="../.." />
+       <property name="gate.lib" location="${gate.home}/lib" />
+       <property name="src.dir" location="src" />
+       <property name="classes.dir" location="classes" />
+       <property name="jar.location" location="doc-normalizer.jar" />
+       <property name="doc.dir" location="doc" />
+       <property name="javadoc.dir" location="${doc.dir}/javadoc" />
+
+       <!-- Path to compile - includes gate.jar and GATE/lib/*.jar -->
+       <path id="compile.classpath">
+               <pathelement location="${gate.home}/bin/gate.jar" />
+               <fileset dir="${gate.lib}">
+                       <include name="**/*.jar" />
+                       <include name="**/*.zip" />
+               </fileset>
+       </path>
+
+       <!-- create build directory structure -->
+       <target name="prepare">
+               <mkdir dir="${classes.dir}" />
+       </target>
+
+       <!-- compile the source -->
+       <target name="compile" depends="prepare">
+               <javac classpathref="compile.classpath" srcdir="${src.dir}" 
destdir="${classes.dir}" debug="true" debuglevel="lines,source" 
encoding="UTF-8" source="1.5" target="1.5" />
+       </target>
+
+       <target name="resources" depends="prepare">
+               <!-- <copy todir="${classes.dir}/gate/resources" 
includeEmptyDirs="true">
+                       <fileset dir="${src.dir}/gate/resources" />
+               </copy> -->
+       </target>
+
+       <!-- create the JAR file -->
+       <target name="jar" depends="compile, resources">
+               <jar destfile="${jar.location}" update="false" 
basedir="${classes.dir}" />
+       </target>
+
+       <!-- remove the generated .class files -->
+       <target name="clean.classes">
+               <delete dir="${classes.dir}" />
+       </target>
+
+       <!-- Clean up - remove .class and .jar files -->
+       <target name="clean" depends="clean.classes">
+               <delete file="${jar.location}" />
+       </target>
+
+       <!-- Targets used by the main GATE build file:
+         build: build the plugin - just calls "jar" target
+         test : run the unit tests - there aren't any
+         distro.prepare: remove intermediate files that shouldn't be in the
+                         distribution
+       -->
+
+       <!-- Build JavaDoc documentation -->
+       <target name="doc.prepare">
+               <mkdir dir="${javadoc.dir}" />
+       </target>
+
+       <!-- Generate the API documentation for this plugin into ${javadoc.dir},
+            cross-linked against the JDK and GATE javadoc. -->
+       <target name="javadoc" depends="doc.prepare">
+               <javadoc destdir="${javadoc.dir}" packagenames="*" classpathref="compile.classpath" encoding="UTF-8" windowtitle="Document Normalizer JavaDoc" source="1.5">
+                       <sourcepath>
+                               <pathelement location="${src.dir}" />
+                       </sourcepath>
+                       <link href="http://java.sun.com/j2se/1.5.0/docs/api/" />
+                       <link href="http://gate.ac.uk/gate/doc/javadoc/" />
+               </javadoc>
+       </target>
+
+       <target name="build" depends="jar" />
+       <target name="test" />
+       <target name="distro.prepare" depends="clean.classes" />
+</project>

Added: gate/trunk/plugins/DocumentNormalizer/creole.xml
===================================================================
--- gate/trunk/plugins/DocumentNormalizer/creole.xml                            
(rev 0)
+++ gate/trunk/plugins/DocumentNormalizer/creole.xml    2014-01-08 17:16:30 UTC 
(rev 17222)
@@ -0,0 +1,3 @@
+<CREOLE-DIRECTORY>
+  <JAR SCAN="true">doc-normalizer.jar</JAR>
+</CREOLE-DIRECTORY>

Added: gate/trunk/plugins/DocumentNormalizer/doc/README.TXT
===================================================================
--- gate/trunk/plugins/DocumentNormalizer/doc/README.TXT                        
        (rev 0)
+++ gate/trunk/plugins/DocumentNormalizer/doc/README.TXT        2014-01-08 
17:16:30 UTC (rev 17222)
@@ -0,0 +1,16 @@
+A simple PR to allow for basic document normalization. Should usually be run as
+the first PR in a pipeline after Document Reset. The PR edits the document
+content and so once it has been run over a document once, future executions
+will have no effect, although they will still require processing time.
+
+The PR works from a file of replacements. Essentially this file consists of
+pairs of lines. The first line specifies the text to replace, while the second
+line signifies what will be substituted in its place. The first line can be a
+regular expression, but back references cannot be used within the second line.
+
+The most common use for this PR is to normalise punctuation symbols as WYSIWYG
+editors often automatically replace standard apostrophe and hyphen symbols with
+more fancy versions. This makes processing text difficult as gazetteer lists,
+JAPE grammars and other resources usually assume the use of the standard
+symbols, i.e. the ones on the keyboard. The default config file is aimed at
+normalizing such cases.

Added: gate/trunk/plugins/DocumentNormalizer/resources/replacements.lst
===================================================================
--- gate/trunk/plugins/DocumentNormalizer/resources/replacements.lst            
                (rev 0)
+++ gate/trunk/plugins/DocumentNormalizer/resources/replacements.lst    
2014-01-08 17:16:30 UTC (rev 17222)
@@ -0,0 +1,6 @@
+’
+'
+—
+-
+–
+-

Added: 
gate/trunk/plugins/DocumentNormalizer/src/gate/creole/DocumentNormalizer.java
===================================================================
--- 
gate/trunk/plugins/DocumentNormalizer/src/gate/creole/DocumentNormalizer.java   
                            (rev 0)
+++ 
gate/trunk/plugins/DocumentNormalizer/src/gate/creole/DocumentNormalizer.java   
    2014-01-08 17:16:30 UTC (rev 17222)
@@ -0,0 +1,139 @@
+/*
+ * DocumentNormalizer.java
+ *
+ * Copyright (c) 2011-2013, The University of Sheffield.
+ * 
+ * This file is part of GATE (see http://gate.ac.uk/), and is free software,
+ * Licensed under the GNU Library General Public License, Version 3, June 2007
+ * (in the distribution as file licence.html, and also available at
+ * http://gate.ac.uk/gate/licence.html).
+ *
+ * Mark A. Greenwood, 10/11/2011
+ */
+
+package gate.creole;
+
+import gate.Resource;
+import gate.corpora.DocumentContentImpl;
+import gate.creole.metadata.CreoleParameter;
+import gate.creole.metadata.CreoleResource;
+import gate.util.InvalidOffsetException;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A processing resource that normalizes document text by applying an ordered
+ * list of regular-expression replacements loaded from a configuration file.
+ * <p>
+ * The replacements file consists of pairs of lines: the first line of each
+ * pair is compiled as a regular expression, the second line is the text
+ * substituted for every match. The substitution is inserted as literal text,
+ * so back references to groups in the pattern are not expanded.
+ * <p>
+ * Executing the PR edits the content of the current document in place.
+ */
+@CreoleResource
+public class DocumentNormalizer extends AbstractLanguageAnalyser {
+
+  private static final long serialVersionUID = -6780562970645480555L;
+
+  /** The replacements to apply, in the order they appear in the file. */
+  private List<Replacement> replacements = new ArrayList<Replacement>();
+
+  /** Location of the replacements file. */
+  private URL listURL;
+
+  /** Character encoding used to read the replacements file. */
+  private String encoding;
+
+  /**
+   * Sets the location of the file that defines the replacements.
+   *
+   * @param listURL the URL of the replacements file
+   */
+  @CreoleParameter(defaultValue = "resources/replacements.lst",
+    comment = "the file controlling the replacements to be made")
+  public void setReplacementsURL(URL listURL) {
+    this.listURL = listURL;
+  }
+
+  /**
+   * @return the URL of the replacements file
+   */
+  public URL getReplacementsURL() {
+    return listURL;
+  }
+
+  /**
+   * Sets the character encoding used to read the replacements file.
+   *
+   * @param encoding the name of the encoding
+   */
+  @CreoleParameter(defaultValue = "UTF-8",
+    comment = "The encoding of the replacements file")
+  public void setEncoding(String encoding) {
+    this.encoding = encoding;
+  }
+
+  /**
+   * @return the name of the encoding used to read the replacements file
+   */
+  public String getEncoding() {
+    return encoding;
+  }
+
+  /**
+   * Loads the replacements file, compiling the first line of every pair as a
+   * regular expression and keeping the second line as its substitution.
+   *
+   * @return this resource
+   * @throws ResourceInstantiationException if the encoding or URL is missing,
+   *           the file has an odd number of lines, a pattern does not
+   *           compile, or the file cannot be read
+   */
+  @Override
+  public Resource init() throws ResourceInstantiationException {
+    if(encoding == null)
+      throw new ResourceInstantiationException("Encoding must be specified!");
+    if(listURL == null)
+      throw new ResourceInstantiationException(
+              "URL of replacements file must be specified!");
+
+    // load into a local list first so that a failure part way through the
+    // file does not leave this resource holding a half-loaded set
+    List<Replacement> loaded = new ArrayList<Replacement>();
+
+    java.io.InputStream stream = null;
+    try {
+      stream = listURL.openStream();
+      BufferedReader in =
+              new BufferedReader(new InputStreamReader(stream, encoding));
+
+      String from = in.readLine();
+      while(from != null) {
+        String to = in.readLine();
+
+        if(to == null)
+          throw new ResourceInstantiationException("Non-Matching Replacement!");
+
+        loaded.add(new Replacement(Pattern.compile(from), to));
+
+        from = in.readLine();
+      }
+    } catch(ResourceInstantiationException e) {
+      // already carries a meaningful message, so don't wrap it a second time
+      throw e;
+    } catch(Exception e) {
+      throw new ResourceInstantiationException(e);
+    } finally {
+      // closing the underlying stream is enough to release the file or
+      // connection, even if wrapping it in a reader failed
+      if(stream != null) {
+        try {
+          stream.close();
+        } catch(java.io.IOException ignored) {
+          // nothing useful can be done if closing a stream we only read fails
+        }
+      }
+    }
+
+    replacements.clear();
+    replacements.addAll(loaded);
+
+    return this;
+  }
+
+  /**
+   * Applies every loaded replacement, in order, to the current document.
+   * Each replacement is matched against the document text as left by the
+   * previous replacements.
+   *
+   * @throws ExecutionException if no document has been set or an edit is
+   *           rejected because of an invalid offset
+   */
+  @Override
+  public void execute() throws ExecutionException {
+
+    if(document == null)
+      throw new ExecutionException("No document to process!");
+
+    try {
+      for(Replacement r : replacements) {
+
+        // take a fresh snapshot: earlier replacements have changed the text
+        String docContent = document.getContent().toString();
+
+        Matcher m = r.from.matcher(docContent);
+
+        String replacement = r.to;
+        int rl = replacement.length();
+
+        // match offsets refer to the snapshot, whereas the document changes
+        // length with every edit, so track the accumulated difference
+        long offset = 0;
+
+        while(m.find()) {
+
+          long start = m.start() + offset;
+          long end = m.end() + offset;
+
+          document.edit(start, end, new DocumentContentImpl(replacement));
+
+          offset += rl - (m.end() - m.start());
+
+        }
+      }
+    } catch(InvalidOffsetException e) {
+      throw new ExecutionException(e);
+    }
+  }
+
+  /**
+   * A single replacement rule: a compiled pattern and the literal text that
+   * is substituted for each of its matches.
+   */
+  private static class Replacement {
+    private final Pattern from;
+
+    private final String to;
+
+    public Replacement(Pattern from, String to) {
+      this.from = from;
+      this.to = to;
+    }
+
+    @Override
+    public String toString() {
+      return from + " --> " + to;
+    }
+  }
+}

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Rapidly troubleshoot problems before they affect your business. Most IT 
organizations don't have a clear picture of how application performance 
affects their revenue. With AppDynamics, you get 100% visibility into your 
Java,.NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to