Revision: 17142
http://sourceforge.net/p/gate/code/17142
Author: markagreenwood
Date: 2013-11-29 17:10:22 +0000 (Fri, 29 Nov 2013)
Log Message:
-----------
a new plugin that supports loading documents from CSV files via a
ResourceHelper which adds a new right-click option to instances of Corpus
Added Paths:
-----------
gate/trunk/plugins/Format_CSV/
gate/trunk/plugins/Format_CSV/.classpath
gate/trunk/plugins/Format_CSV/.project
gate/trunk/plugins/Format_CSV/build.xml
gate/trunk/plugins/Format_CSV/creole.xml
gate/trunk/plugins/Format_CSV/lib/
gate/trunk/plugins/Format_CSV/lib/opencsv-2.3.jar
gate/trunk/plugins/Format_CSV/src/
gate/trunk/plugins/Format_CSV/src/gate/
gate/trunk/plugins/Format_CSV/src/gate/corpora/
gate/trunk/plugins/Format_CSV/src/gate/corpora/CSVImporter.java
Added: gate/trunk/plugins/Format_CSV/.classpath
===================================================================
--- gate/trunk/plugins/Format_CSV/.classpath (rev 0)
+++ gate/trunk/plugins/Format_CSV/.classpath 2013-11-29 17:10:22 UTC (rev 17142)
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+ <classpathentry kind="src" path="src"/>
+ <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+ <classpathentry combineaccessrules="false" kind="src" path="/GATE"/>
+ <classpathentry kind="lib" path="lib/opencsv-2.3.jar"/>
+ <classpathentry kind="output" path="classes"/>
+</classpath>
Added: gate/trunk/plugins/Format_CSV/.project
===================================================================
--- gate/trunk/plugins/Format_CSV/.project (rev 0)
+++ gate/trunk/plugins/Format_CSV/.project 2013-11-29 17:10:22 UTC (rev 17142)
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+ <name>Format_CSV</name>
+ <comment></comment>
+ <projects>
+ </projects>
+ <buildSpec>
+ <buildCommand>
+ <name>org.eclipse.jdt.core.javabuilder</name>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ </buildSpec>
+ <natures>
+ <nature>org.eclipse.jdt.core.javanature</nature>
+ </natures>
+</projectDescription>
Added: gate/trunk/plugins/Format_CSV/build.xml
===================================================================
--- gate/trunk/plugins/Format_CSV/build.xml (rev 0)
+++ gate/trunk/plugins/Format_CSV/build.xml 2013-11-29 17:10:22 UTC (rev 17142)
@@ -0,0 +1,104 @@
+<project name="Format_CSV" basedir="." default="all" >
+ <!-- Prevent Ant from warning about includeantruntime not being set -->
+ <property name="build.sysclasspath" value="ignore" />
+
+ <property file="build.properties" />
+ <property name="plugin.name" value="Format_CSV"/>
+
+ <!-- Make environment variables available -->
+ <property environment="env" />
+
+ <!-- If environment variable GATE_HOME is set, use it for
+ gate.home (unless it was already set in build.properties) -->
+ <condition property="gate.home" value="${env.GATE_HOME}">
+ <isset property="env.GATE_HOME"/>
+ </condition>
+
+ <property name="gate.home" location="../.." />
+ <property name="gate.lib" location="${gate.home}/lib" />
+ <property name="gate.jar" location="${gate.home}/bin/gate.jar" />
+ <property name="src.dir" location="src" />
+ <property name="lib.dir" location="lib" />
+ <property name="classes.dir" location="classes" />
+ <property name="jar.location" location="format-csv.jar" />
+ <property name="doc.dir" location="doc" />
+ <property name="javadoc.dir" location="${doc.dir}/javadoc" />
+
+ <!-- Path to compile - includes gate.jar and GATE/lib/*.jar -->
+ <path id="compile.classpath">
+ <fileset dir="${lib.dir}">
+ <include name="**/*.jar" />
+ </fileset>
+ <pathelement location="${gate.jar}" />
+ <fileset dir="${gate.lib}">
+ <include name="**/*.jar" />
+ <include name="**/*.zip" />
+ </fileset>
+ </path>
+
+ <!-- create build directory structure -->
+ <target name="prepare">
+ <mkdir dir="${classes.dir}" />
+ </target>
+
+ <!-- compile the source -->
+ <target name="compile" depends="prepare">
+ <javac classpathref="compile.classpath"
+ srcdir="${src.dir}"
+ destdir="${classes.dir}"
+ debug="true"
+ debuglevel="lines,source"
+ source="1.6"
+ target="1.6" />
+ </target>
+
+ <!-- Build JavaDoc documentation -->
+ <target name="doc.prepare">
+ <mkdir dir="${javadoc.dir}" />
+ </target>
+
+ <target name="javadoc" depends="doc.prepare">
+ <javadoc destdir="${javadoc.dir}" packagenames="*"
+ classpathref="compile.classpath"
+ encoding="UTF-8"
+ windowtitle="${plugin.name} JavaDoc"
+ source="1.6">
+ <sourcepath>
+ <pathelement location="${src.dir}" />
+ </sourcepath>
+ <link href="http://docs.oracle.com/javase/6/docs/api/" />
+ <link href="http://gate.ac.uk/gate/doc/javadoc/" />
+ </javadoc>
+ </target>
+
+
+ <!-- create the JAR file -->
+ <target name="jar" depends="compile" >
+ <jar destfile="${jar.location}"
+ update="false"
+ basedir="${classes.dir}" />
+ </target>
+
+ <!-- remove the generated .class files -->
+ <target name="clean.classes" >
+ <delete dir="${classes.dir}" />
+ </target>
+
+ <!-- Clean up - remove .class and .jar files -->
+ <target name="clean" depends="clean.classes" >
+ <delete file="${jar.location}" />
+ </target>
+
+ <!-- Build everything - the code and JavaDoc -->
+ <target name="all" depends="jar, javadoc" />
+
+ <!-- Targets used by the main GATE build file:
+ build: build the plugin - just calls "jar" target
+ test : run the unit tests - there aren't any
+ distro.prepare: remove intermediate files that shouldn't be in the
+ distribution
+ -->
+ <target name="build" depends="jar" />
+ <target name="test" />
+ <target name="distro.prepare" depends="clean.classes" />
+</project>
Added: gate/trunk/plugins/Format_CSV/creole.xml
===================================================================
--- gate/trunk/plugins/Format_CSV/creole.xml (rev 0)
+++ gate/trunk/plugins/Format_CSV/creole.xml 2013-11-29 17:10:22 UTC (rev 17142)
@@ -0,0 +1,5 @@
+<?xml version="1.0"?>
+<CREOLE-DIRECTORY>
+ <JAR SCAN="true">format-csv.jar</JAR>
+ <JAR>lib/opencsv-2.3.jar</JAR>
+</CREOLE-DIRECTORY>
Added: gate/trunk/plugins/Format_CSV/lib/opencsv-2.3.jar
===================================================================
(Binary files differ)
Index: gate/trunk/plugins/Format_CSV/lib/opencsv-2.3.jar
===================================================================
--- gate/trunk/plugins/Format_CSV/lib/opencsv-2.3.jar 2013-11-29 15:39:42 UTC (rev 17141)
+++ gate/trunk/plugins/Format_CSV/lib/opencsv-2.3.jar 2013-11-29 17:10:22 UTC (rev 17142)
Property changes on: gate/trunk/plugins/Format_CSV/lib/opencsv-2.3.jar
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Added: gate/trunk/plugins/Format_CSV/src/gate/corpora/CSVImporter.java
===================================================================
--- gate/trunk/plugins/Format_CSV/src/gate/corpora/CSVImporter.java (rev 0)
+++ gate/trunk/plugins/Format_CSV/src/gate/corpora/CSVImporter.java 2013-11-29 17:10:22 UTC (rev 17142)
@@ -0,0 +1,442 @@
+/*
+ * CSVImporter.java
+ *
+ * Copyright (c) 2013, The University of Sheffield. See the file COPYRIGHT.txt
+ * in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
+ *
+ * This file is part of GATE (see http://gate.ac.uk/), and is free software,
+ * licenced under the GNU Library General Public License, Version 2, June 1991
+ * (in the distribution as file licence.html, and also available at
+ * http://gate.ac.uk/gate/licence.html).
+ *
+ * Mark A. Greenwood, 10/09/2013
+ */
+
+package gate.corpora;
+
+import gate.Corpus;
+import gate.Document;
+import gate.Factory;
+import gate.FeatureMap;
+import gate.creole.metadata.AutoInstance;
+import gate.creole.metadata.CreoleResource;
+import gate.gui.MainFrame;
+import gate.gui.NameBearerHandle;
+import gate.gui.ResourceHelper;
+import gate.util.ExtensionFileFilter;
+import gate.util.Files;
+
+import java.awt.GridBagConstraints;
+import java.awt.GridBagLayout;
+import java.awt.Insets;
+import java.awt.event.ActionEvent;
+import java.awt.event.ActionListener;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.swing.AbstractAction;
+import javax.swing.Action;
+import javax.swing.JButton;
+import javax.swing.JCheckBox;
+import javax.swing.JComponent;
+import javax.swing.JFileChooser;
+import javax.swing.JLabel;
+import javax.swing.JOptionPane;
+import javax.swing.JPanel;
+import javax.swing.JSpinner;
+import javax.swing.JTextField;
+import javax.swing.SpinnerNumberModel;
+
+import org.apache.commons.io.IOUtils;
+
+import au.com.bytecode.opencsv.CSVReader;
+
+@SuppressWarnings("serial")
+@CreoleResource(name = "CSV Corpus Populater", tool = true, autoinstances = @AutoInstance)
+public class CSVImporter extends ResourceHelper {
+
+ private static JComponent dialog = new JPanel();
+
+ private static SpinnerNumberModel textColModel = new SpinnerNumberModel(0, 0,
+ Integer.MAX_VALUE, 1);
+
+ private static JCheckBox cboFeatures = new JCheckBox(
+ "1st Row Contains Column Labels", true);
+
+ private static JCheckBox cboDocuments = new JCheckBox(
+ "Create One Document Per Row", false);
+
+ private static JTextField txtURL = new JTextField(30);
+
+ private static FileFilter CSV_FILE_FILTER = new ExtensionFileFilter(
+ "CSV Files (*.csv)", "csv");
+
+ static {
+ // we'll use the same dialog instance regardless of the corpus we are
+ // populating so we'll create a single static instance
+
+ dialog.setLayout(new GridBagLayout());
+
+ GridBagConstraints constraints = new GridBagConstraints();
+ constraints.gridx = GridBagConstraints.RELATIVE;
+ constraints.gridy = 0;
+ constraints.gridwidth = 2;
+ constraints.anchor = GridBagConstraints.WEST;
+ constraints.fill = GridBagConstraints.NONE;
+ constraints.insets = new Insets(0, 0, 0, 5);
+ dialog.add(new JLabel("CSV File URL:"), constraints);
+
+ constraints = new GridBagConstraints();
+ constraints.gridx = GridBagConstraints.RELATIVE;
+ constraints.gridy = 0;
+ constraints.gridwidth = 3;
+ constraints.fill = GridBagConstraints.HORIZONTAL;
+ constraints.insets = new Insets(0, 0, 0, 10);
+ dialog.add(txtURL, constraints);
+
+ constraints = new GridBagConstraints();
+ constraints.gridx = GridBagConstraints.RELATIVE;
+ constraints.gridy = 0;
+ constraints.gridwidth = 1;
+ constraints.anchor = GridBagConstraints.NORTHWEST;
+ JButton btnCSVURL = new JButton(MainFrame.getIcon("open-file"));
+ dialog.add(btnCSVURL, constraints);
+
+ constraints = new GridBagConstraints();
+ constraints.gridx = GridBagConstraints.RELATIVE;
+ constraints.gridy = 1;
+ constraints.gridwidth = 3;
+ constraints.anchor = GridBagConstraints.NORTHWEST;
+ constraints.insets = new Insets(0, 0, 15, 5);
+ dialog.add(new JLabel("Document Content Is In Column"), constraints);
+
+ constraints = new GridBagConstraints();
+ constraints.gridx = GridBagConstraints.RELATIVE;
+ constraints.gridy = 1;
+ constraints.gridwidth = 3;
+ constraints.anchor = GridBagConstraints.NORTHWEST;
+ dialog.add(new JSpinner(textColModel), constraints);
+
+ constraints = new GridBagConstraints();
+ constraints.gridx = GridBagConstraints.RELATIVE;
+ constraints.gridy = 2;
+ constraints.gridwidth = GridBagConstraints.RELATIVE;
+ constraints.anchor = GridBagConstraints.NORTHWEST;
+ dialog.add(cboFeatures, constraints);
+
+ constraints = new GridBagConstraints();
+ constraints.gridx = GridBagConstraints.RELATIVE;
+ constraints.gridy = 3;
+ constraints.gridwidth = GridBagConstraints.RELATIVE;
+ constraints.anchor = GridBagConstraints.NORTHWEST;
+ dialog.add(cboDocuments, constraints);
+
+ btnCSVURL.addActionListener(new ActionListener() {
+ @Override
+ public void actionPerformed(ActionEvent e) {
+ JFileChooser filer = MainFrame.getFileChooser();
+
+ filer.setFileSelectionMode(JFileChooser.FILES_AND_DIRECTORIES);
+ filer.setDialogTitle("Select a CSV File");
+ filer.resetChoosableFileFilters();
+ filer.setAcceptAllFileFilterUsed(false);
+ filer
+ .addChoosableFileFilter((javax.swing.filechooser.FileFilter)CSV_FILE_FILTER);
+ filer
+ .setFileFilter((javax.swing.filechooser.FileFilter)CSV_FILE_FILTER);
+
+ if(filer.showOpenDialog(dialog) != JFileChooser.APPROVE_OPTION) return;
+ try {
+ txtURL.setText(filer.getSelectedFile().toURI().toURL()
+ .toExternalForm());
+ } catch(IOException ioe) {
+ // do nothing here
+ }
+ }
+ });
+ }
+
+ @Override
+ protected List<Action> buildActions(final NameBearerHandle handle) {
+ List<Action> actions = new ArrayList<Action>();
+
+ if(!(handle.getTarget() instanceof Corpus)) return actions;
+
+ actions.add(new AbstractAction("Populate from CSV File") {
+ @Override
+ public void actionPerformed(ActionEvent e) {
+
+ // display the populater dialog and return if it is cancelled
+ if(JOptionPane.showConfirmDialog(null, dialog,
+ "Populate From CSV File", JOptionPane.OK_CANCEL_OPTION,
+ JOptionPane.PLAIN_MESSAGE) != JOptionPane.OK_OPTION) return;
+
+ // we want to run the population in a separate thread so we don't lock
+ // up the GUI
+ Thread thread =
+ new Thread(Thread.currentThread().getThreadGroup(),
+ "CSV Corpus Populater") {
+
+ public void run() {
+ try {
+ // see if we can convert the URL to a File instance
+ File file = null;
+ try {
+ file = Files.fileFromURL(new URL(txtURL.getText()));
+ } catch(IllegalArgumentException iae) {
+ // this will happen if someone enters an actual URL, but we
+ // handle that later so we can just ignore the exception for
+ // now and keep going
+ }
+
+ if(file != null && file.isDirectory()) {
+ // if we have a File instance and that points at a directory
+ // then....
+
+ // get all the CSV files in the directory structure
+ File[] files =
+ Files.listFilesRecursively(file, CSV_FILE_FILTER);
+
+ for(File f : files) {
+ // for each file...
+
+ // skip directories as we don't want to handle those
+ if(f.isDirectory()) continue;
+
+ if(cboDocuments.isSelected()) {
+ // if we are creating lots of documents from a single file
+ // then call the populate method passing through all the
+ // options from the GUI
+ populate((Corpus)handle.getTarget(), f.toURI().toURL(),
+ (Integer)textColModel.getValue(),
+ cboFeatures.isSelected());
+ } else {
+ // if we are creating a single document from a single file
+ // then call the createDoc method passing through all the
+ // options from the GUI
+ createDoc((Corpus)handle.getTarget(), f.toURI().toURL(),
+ (Integer)textColModel.getValue(),
+ cboFeatures.isSelected());
+ }
+ }
+ } else {
+ // we have a single URL to process so...
+
+ if(cboDocuments.isSelected()) {
+ // if we are creating lots of documents from a single file
+ // then call the populate method passing through all the
+ // options from the GUI
+ populate((Corpus)handle.getTarget(),
+ new URL(txtURL.getText()),
+ (Integer)textColModel.getValue(),
+ cboFeatures.isSelected());
+ } else {
+ // if we are creating a single document from a single file
+ // then call the createDoc method passing through all the
+ // options from the GUI
+ createDoc((Corpus)handle.getTarget(),
+ new URL(txtURL.getText()),
+ (Integer)textColModel.getValue(),
+ cboFeatures.isSelected());
+ }
+ }
+ } catch(Exception e) {
+ // TODO give a sensible error message
+ e.printStackTrace();
+ }
+ }
+ };
+
+ // let's leave the GUI nice and responsive
+ thread.setPriority(Thread.MIN_PRIORITY);
+
+ // lets get to it and do some actual work!
+ thread.start();
+
+ }
+ });
+
+ return actions;
+ }
+
+ /**
+ * Create a new document from each row and push it into the specified corpus
+ *
+ * @param corpus
+ * the Corpus to add documents to
+ * @param csv
+ * the URL of the CSV file to process
+ * @param column
+ * the (zero index based) column which contains the text content
+ * @param colLabels
+ * true if the first row contains column labels, false otherwise
+ */
+ public static void populate(Corpus corpus, URL csv, int column,
+ boolean colLabels) {
+ CSVReader reader = null;
+ try {
+ // open a CSVReader over the URL
+ reader = new CSVReader(new InputStreamReader(csv.openStream()));
+
+ // if we are adding features read the first line
+ String[] features = (colLabels ? reader.readNext() : null);
+
+ String[] nextLine;
+ while((nextLine = reader.readNext()) != null) {
+ // for each line in the file...
+
+ // skip the line if there are less columns than we need to get to the
+ // content
+ if(column >= nextLine.length) continue;
+
+ // skip the line if the column with the content is empty
+ if(nextLine[column].trim().equals("")) continue;
+
+ FeatureMap fmap = Factory.newFeatureMap();
+ if(colLabels) {
+ // copy all the features from the row into a FeatureMap using the
+ // labels from the first line
+ for(int i = 0; i < features.length; ++i) {
+ if(i != column && i < nextLine.length) {
+ fmap.put(features[i], nextLine[i]);
+ }
+ }
+ }
+
+ // setup the initialization params for the document
+ FeatureMap params = Factory.newFeatureMap();
+ params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME,
+ nextLine[column]);
+
+ // create the document
+ Document doc =
+ (Document)Factory.createResource(
+ gate.corpora.DocumentImpl.class.getName(), params, fmap);
+
+ // add the document to the corpus
+ corpus.add(doc);
+
+ if(corpus.getLRPersistenceId() != null) {
+ // persistent corpus -> unload the document
+ corpus.unloadDocument(doc);
+ Factory.deleteResource(doc);
+ }
+
+ }
+
+ if(corpus.getDataStore() != null) {
+ // if this corpus is in a datastore make sure we sync it back
+ corpus.getDataStore().sync(corpus);
+ }
+ } catch(Exception e) {
+ // not much we can do other than report the exception
+ throw new RuntimeException("Unable to open CSV file: " + csv, e);
+ } finally {
+ // if we opened the reader successfully then close it so we don't leak
+ // file handles
+ if(reader != null) IOUtils.closeQuietly(reader);
+ }
+ }
+
+ /**
+ * Creates a single document from the CSV file
+ *
+ * @param corpus
+ * the Corpus to add documents to
+ * @param csv
+ * the URL of the CSV file to process
+ * @param column
+ * the (zero index based) column which contains the text content
+ * @param colLabels
+ * true if the first row contains column labels, false otherwise
+ */
+ public static void createDoc(Corpus corpus, URL csv, int column,
+ boolean colLabels) {
+ CSVReader reader = null;
+ Document doc = null;
+ try {
+ // open a CSVReader over the URL
+ reader = new CSVReader(new InputStreamReader(csv.openStream()));
+
+ // if we are adding features read the first line
+ String[] features = (colLabels ? reader.readNext() : null);
+
+ // create an empty document to which we will add the content as we go
+ doc = Factory.newDocument("");
+
+ String[] nextLine;
+ while((nextLine = reader.readNext()) != null) {
+ // for each line in the file...
+
+ // skip the line if there are less columns than we need to get to the
+ // content
+ if(column >= nextLine.length) continue;
+
+ // skip the line if the column with the doc content is empty
+ if(nextLine[column].trim().equals("")) continue;
+
+ FeatureMap fmap = Factory.newFeatureMap();
+ if(colLabels) {
+ // put the data from the other columns into a FeatureMap using the
+ // labels from the first row
+ for(int i = 0; i < features.length; ++i) {
+ if(i != column && i < nextLine.length) {
+ fmap.put(features[i], nextLine[i]);
+ }
+ }
+ }
+
+ // find out how long the document currently is
+ // TODO can we keep a running track of this to avoid this call?
+ long length = doc.getContent().size();
+
+ // add the new text to the document
+ doc.edit(length, length, new DocumentContentImpl(nextLine[column] +
+ "\n\n"));
+
+ // add the spanning annotation to the Original markups set, we use the
+ // type "Text" if the columns don't have labels
+ doc.getAnnotations("Original markups").add(length,
+ length + nextLine[column].length(),
+ (colLabels ? features[column] : "Text"), fmap);
+ }
+
+ // store the original csv file URL as a document feature
+ doc.getFeatures().put("csvURL", csv.toExternalForm());
+
+ // so that the doc gets recreated properly put the XML for the doc we just
+ // created into the init param that will be used if the document is
+ // recreated
+ doc.setParameterValue(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME,
+ doc.toXml());
+
+ // add the document to the corpus
+ corpus.add(doc);
+
+ if(corpus.getLRPersistenceId() != null) {
+ // persistent corpus -> unload the document
+ corpus.unloadDocument(doc);
+ Factory.deleteResource(doc);
+ }
+
+ if(corpus.getDataStore() != null) {
+ // if this corpus is in a datastore make sure we sync it back
+ corpus.getDataStore().sync(corpus);
+ }
+ } catch(Exception e) {
+ // if we failed somewhere then delete the part built document
+ if(doc != null) Factory.deleteResource(doc);
+
+ // throw a "helpful" exception
+ throw new RuntimeException("Unable to open CSV file: " + csv, e);
+ } finally {
+ // if we got as far as opening a reader over the file then close it
+ if(reader != null) IOUtils.closeQuietly(reader);
+ }
+ }
+}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Rapidly troubleshoot problems before they affect your business. Most IT
organizations don't have a clear picture of how application performance
affects their revenue. With AppDynamics, you get 100% visibility into your
Java,.NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349351&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs