Revision: 17222
http://sourceforge.net/p/gate/code/17222
Author: markagreenwood
Date: 2014-01-08 17:16:30 +0000 (Wed, 08 Jan 2014)
Log Message:
-----------
the new document normalizer plugin
Added Paths:
-----------
gate/trunk/plugins/DocumentNormalizer/
gate/trunk/plugins/DocumentNormalizer/build.xml
gate/trunk/plugins/DocumentNormalizer/creole.xml
gate/trunk/plugins/DocumentNormalizer/doc/
gate/trunk/plugins/DocumentNormalizer/doc/README.TXT
gate/trunk/plugins/DocumentNormalizer/resources/
gate/trunk/plugins/DocumentNormalizer/resources/replacements.lst
gate/trunk/plugins/DocumentNormalizer/src/
gate/trunk/plugins/DocumentNormalizer/src/gate/
gate/trunk/plugins/DocumentNormalizer/src/gate/creole/
gate/trunk/plugins/DocumentNormalizer/src/gate/creole/DocumentNormalizer.java
Added: gate/trunk/plugins/DocumentNormalizer/build.xml
===================================================================
--- gate/trunk/plugins/DocumentNormalizer/build.xml
(rev 0)
+++ gate/trunk/plugins/DocumentNormalizer/build.xml 2014-01-08 17:16:30 UTC
(rev 17222)
@@ -0,0 +1,77 @@
+<project name="DocumentNormalizer" basedir=".">
+ <property file="build.properties" />
+
+ <property name="gate.home" location="../.." />
+ <property name="gate.lib" location="${gate.home}/lib" />
+ <property name="src.dir" location="src" />
+ <property name="classes.dir" location="classes" />
+ <property name="jar.location" location="doc-normalizer.jar" />
+ <property name="doc.dir" location="doc" />
+ <property name="javadoc.dir" location="${doc.dir}/javadoc" />
+
+ <!-- Path to compile - includes gate.jar and GATE/lib/*.jar -->
+ <path id="compile.classpath">
+ <pathelement location="${gate.home}/bin/gate.jar" />
+ <fileset dir="${gate.lib}">
+ <include name="**/*.jar" />
+ <include name="**/*.zip" />
+ </fileset>
+ </path>
+
+ <!-- create build directory structure -->
+ <target name="prepare">
+ <mkdir dir="${classes.dir}" />
+ </target>
+
+ <!-- compile the source -->
+ <target name="compile" depends="prepare">
+ <javac classpathref="compile.classpath" srcdir="${src.dir}" destdir="${classes.dir}" debug="true" debuglevel="lines,source" encoding="UTF-8" source="1.5" target="1.5" />
+ </target>
+
+ <target name="resources" depends="prepare">
+ <!-- <copy todir="${classes.dir}/gate/resources" includeEmptyDirs="true">
+ <fileset dir="${src.dir}/gate/resources" />
+ </copy> -->
+ </target>
+
+ <!-- create the JAR file -->
+ <target name="jar" depends="compile, resources">
+ <jar destfile="${jar.location}" update="false" basedir="${classes.dir}" />
+ </target>
+
+ <!-- remove the generated .class files -->
+ <target name="clean.classes">
+ <delete dir="${classes.dir}" />
+ </target>
+
+ <!-- Clean up - remove .class and .jar files -->
+ <target name="clean" depends="clean.classes">
+ <delete file="${jar.location}" />
+ </target>
+
+ <!-- Targets used by the main GATE build file:
+ build: build the plugin - just calls "jar" target
+ test : run the unit tests - there aren't any
+ distro.prepare: remove intermediate files that shouldn't be in the
+ distribution
+ -->
+
+ <!-- Build JavaDoc documentation -->
+ <target name="doc.prepare">
+ <mkdir dir="${javadoc.dir}" />
+ </target>
+
+ <target name="javadoc" depends="doc.prepare">
+ <javadoc destdir="${javadoc.dir}" packagenames="*" classpathref="compile.classpath" encoding="UTF-8" windowtitle="Document Normalizer JavaDoc" source="1.5">
+ <sourcepath>
+ <pathelement location="${src.dir}" />
+ </sourcepath>
+ <link href="http://java.sun.com/j2se/1.5.0/docs/api/" />
+ <link href="http://gate.ac.uk/gate/doc/javadoc/" />
+ </javadoc>
+ </target>
+
+ <target name="build" depends="jar" />
+ <target name="test" />
+ <target name="distro.prepare" depends="clean.classes" />
+</project>
Added: gate/trunk/plugins/DocumentNormalizer/creole.xml
===================================================================
--- gate/trunk/plugins/DocumentNormalizer/creole.xml
(rev 0)
+++ gate/trunk/plugins/DocumentNormalizer/creole.xml 2014-01-08 17:16:30 UTC
(rev 17222)
@@ -0,0 +1,3 @@
+<CREOLE-DIRECTORY>
+ <!-- doc-normalizer.jar is the JAR written by this plugin's build.xml "jar" target. NOTE(review): SCAN="true" presumably makes GATE discover the @CreoleResource-annotated classes inside it - confirm against the GATE documentation. -->
+ <JAR SCAN="true">doc-normalizer.jar</JAR>
+</CREOLE-DIRECTORY>
Added: gate/trunk/plugins/DocumentNormalizer/doc/README.TXT
===================================================================
--- gate/trunk/plugins/DocumentNormalizer/doc/README.TXT
(rev 0)
+++ gate/trunk/plugins/DocumentNormalizer/doc/README.TXT 2014-01-08
17:16:30 UTC (rev 17222)
@@ -0,0 +1,16 @@
+A simple PR to allow for basic document normalization. Should usually be run as
+the first PR in a pipeline after Document Reset. The PR edits the document
+content, so once it has been run over a document, further executions will
+have no effect, although they will still take processing time.
+
+The PR works from a file of replacements. Essentially this file consists of
+pairs of lines. The first line specifies the text to replace, while the second
+line signifies what will be substituted in its place. The first line can be a
+regular expression, but back references cannot be used within the second line.
+
+The most common use for this PR is to normalise punctuation symbols as WYSIWYG
+editors often automatically replace standard apostrophe and hyphen symbols with
+more fancy versions. This makes processing text difficult as gazetteer lists,
+JAPE grammars and other resources usually assume the use of the standard
+symbols, i.e. the ones on the keyboard. The default config file is aimed at
+normalizing such cases.
Added: gate/trunk/plugins/DocumentNormalizer/resources/replacements.lst
===================================================================
--- gate/trunk/plugins/DocumentNormalizer/resources/replacements.lst
(rev 0)
+++ gate/trunk/plugins/DocumentNormalizer/resources/replacements.lst
2014-01-08 17:16:30 UTC (rev 17222)
@@ -0,0 +1,6 @@
+’
+'
+—
+-
+–
+-
Added:
gate/trunk/plugins/DocumentNormalizer/src/gate/creole/DocumentNormalizer.java
===================================================================
---
gate/trunk/plugins/DocumentNormalizer/src/gate/creole/DocumentNormalizer.java
(rev 0)
+++
gate/trunk/plugins/DocumentNormalizer/src/gate/creole/DocumentNormalizer.java
2014-01-08 17:16:30 UTC (rev 17222)
@@ -0,0 +1,139 @@
+/*
+ * DocumentNormalizer.java
+ *
+ * Copyright (c) 2011-2013, The University of Sheffield.
+ *
+ * This file is part of GATE (see http://gate.ac.uk/), and is free software,
+ * Licensed under the GNU Library General Public License, Version 3, June 2007
+ * (in the distribution as file licence.html, and also available at
+ * http://gate.ac.uk/gate/licence.html).
+ *
+ * Mark A. Greenwood, 10/11/2011
+ */
+
+package gate.creole;
+
+import gate.Resource;
+import gate.corpora.DocumentContentImpl;
+import gate.creole.metadata.CreoleParameter;
+import gate.creole.metadata.CreoleResource;
+import gate.util.InvalidOffsetException;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A processing resource that normalizes the content of a document by applying
+ * a list of pattern/replacement pairs loaded from a replacements file. The
+ * file is read as consecutive pairs of lines: the first line of each pair is
+ * compiled as a regular expression and the second line is the text that
+ * replaces every match of that expression.
+ */
+@CreoleResource
+public class DocumentNormalizer extends AbstractLanguageAnalyser {
+
+  private static final long serialVersionUID = -6780562970645480555L;
+
+  /** The replacements to apply, in the order they appear in the file. */
+  private List<Replacement> replacements = new ArrayList<Replacement>();
+
+  /** Location of the file the replacements are loaded from. */
+  private URL listURL;
+
+  /** Character encoding used to read the replacements file. */
+  private String encoding;
+
+  /**
+   * Sets the location of the replacements file.
+   *
+   * @param listURL URL of the file controlling the replacements to be made
+   */
+  @CreoleParameter(defaultValue = "resources/replacements.lst",
+          comment = "the file controlling the replacements to be made")
+  public void setReplacementsURL(URL listURL) {
+    this.listURL = listURL;
+  }
+
+  /**
+   * @return the URL of the replacements file
+   */
+  public URL getReplacementsURL() {
+    return listURL;
+  }
+
+  /**
+   * Sets the encoding used to read the replacements file.
+   *
+   * @param encoding the name of the character encoding
+   */
+  @CreoleParameter(defaultValue = "UTF-8",
+          comment = "The encoding of the replacements file")
+  public void setEncoding(String encoding) {
+    this.encoding = encoding;
+  }
+
+  /**
+   * @return the encoding used to read the replacements file
+   */
+  public String getEncoding() {
+    return encoding;
+  }
+
+  /**
+   * Loads the replacements file, discarding any previously loaded
+   * replacements.
+   *
+   * @return this resource
+   * @throws ResourceInstantiationException if the encoding or URL is not
+   *           set, if the file has an odd number of lines (a pattern with
+   *           no replacement), or if the file cannot be read or a pattern
+   *           cannot be compiled
+   */
+  @Override
+  public Resource init() throws ResourceInstantiationException {
+    if(encoding == null)
+      throw new ResourceInstantiationException("Encoding must be specified!");
+    if(listURL == null)
+      throw new ResourceInstantiationException(
+              "URL of replacements file must be specified!");
+
+    replacements.clear();
+
+    BufferedReader in = null;
+    try {
+      in =
+              new BufferedReader(new InputStreamReader(listURL.openStream(),
+                      encoding));
+      String from = in.readLine();
+      while(from != null) {
+        String to = in.readLine();
+
+        // every pattern line must be followed by a replacement line
+        if(to == null)
+          throw new ResourceInstantiationException("Non-Matching Replacement!");
+
+        replacements.add(new Replacement(Pattern.compile(from), to));
+
+        from = in.readLine();
+      }
+    } catch(ResourceInstantiationException e) {
+      // already the right type, so don't wrap it in a second one
+      throw e;
+    } catch(Exception e) {
+      throw new ResourceInstantiationException(e);
+    } finally {
+      // explicit finally rather than try-with-resources: the plugin's
+      // build.xml compiles with source/target 1.5
+      if(in != null) {
+        try {
+          in.close();
+        } catch(java.io.IOException ignored) {
+          // nothing useful can be done if closing the reader fails
+        }
+      }
+    }
+
+    return this;
+  }
+
+  /**
+   * Applies each loaded replacement, in order, to the current document by
+   * editing the document content in place.
+   *
+   * @throws ExecutionException if an edit is attempted at an invalid offset
+   */
+  @Override
+  public void execute() throws ExecutionException {
+
+    try {
+      for(Replacement r : replacements) {
+
+        // snapshot of the content as left by the previous replacements; all
+        // matches for this replacement are found against this snapshot
+        String docContent = document.getContent().toString();
+
+        Matcher m = r.from.matcher(docContent);
+
+        String replacement = r.to;
+        int rl = replacement.length();
+
+        // cumulative change in document length caused by the edits made so
+        // far for this replacement; maps snapshot positions onto the
+        // current document offsets
+        long offset = 0;
+
+        while(m.find()) {
+
+          long start = m.start() + offset;
+          long end = m.end() + offset;
+
+          // the replacement text is inserted literally, it is never
+          // interpreted as a regex replacement string
+          document.edit(start, end, new DocumentContentImpl(replacement));
+
+          offset += rl - (m.end() - m.start());
+
+        }
+      }
+    } catch(InvalidOffsetException e) {
+      throw new ExecutionException(e);
+    }
+  }
+
+  /**
+   * A single replacement rule: a compiled pattern and the text that is
+   * substituted for each match.
+   */
+  private static class Replacement {
+    /** The pattern whose matches are replaced. */
+    protected Pattern from;
+
+    /** The text substituted for each match. */
+    protected String to;
+
+    public Replacement(Pattern from, String to) {
+      this.from = from;
+      this.to = to;
+    }
+
+    @Override
+    public String toString() {
+      return from + " --> " + to;
+    }
+  }
+}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Rapidly troubleshoot problems before they affect your business. Most IT
organizations don't have a clear picture of how application performance
affects their revenue. With AppDynamics, you get 100% visibility into your
Java,.NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs