Revision: 17142
          http://sourceforge.net/p/gate/code/17142
Author:   markagreenwood
Date:     2013-11-29 17:10:22 +0000 (Fri, 29 Nov 2013)
Log Message:
-----------
a new plugin that supports loading documents from CSV files via a 
ResourceHelper which adds a new right-click option to instances of Corpus

Added Paths:
-----------
    gate/trunk/plugins/Format_CSV/
    gate/trunk/plugins/Format_CSV/.classpath
    gate/trunk/plugins/Format_CSV/.project
    gate/trunk/plugins/Format_CSV/build.xml
    gate/trunk/plugins/Format_CSV/creole.xml
    gate/trunk/plugins/Format_CSV/lib/
    gate/trunk/plugins/Format_CSV/lib/opencsv-2.3.jar
    gate/trunk/plugins/Format_CSV/src/
    gate/trunk/plugins/Format_CSV/src/gate/
    gate/trunk/plugins/Format_CSV/src/gate/corpora/
    gate/trunk/plugins/Format_CSV/src/gate/corpora/CSVImporter.java

Added: gate/trunk/plugins/Format_CSV/.classpath
===================================================================
--- gate/trunk/plugins/Format_CSV/.classpath                            (rev 0)
+++ gate/trunk/plugins/Format_CSV/.classpath    2013-11-29 17:10:22 UTC (rev 
17142)
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Eclipse build path for the Format_CSV plugin -->
+<classpath>
+       <!-- plugin sources -->
+       <classpathentry kind="src" path="src"/>
+       <!-- the JRE configured in Eclipse -->
+       <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+       <!-- the /GATE entry (presumably the GATE core project in the same workspace) -->
+       <classpathentry combineaccessrules="false" kind="src" path="/GATE"/>
+       <!-- bundled CSV parsing library -->
+       <classpathentry kind="lib" path="lib/opencsv-2.3.jar"/>
+       <!-- compiled classes go to the same directory the Ant build uses -->
+       <classpathentry kind="output" path="classes"/>
+</classpath>

Added: gate/trunk/plugins/Format_CSV/.project
===================================================================
--- gate/trunk/plugins/Format_CSV/.project                              (rev 0)
+++ gate/trunk/plugins/Format_CSV/.project      2013-11-29 17:10:22 UTC (rev 
17142)
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Eclipse project descriptor: a plain Java project named Format_CSV that
+     uses the JDT Java builder and has the Java nature; it declares no
+     referenced projects here -->
+<projectDescription>
+       <name>Format_CSV</name>
+       <comment></comment>
+       <projects>
+       </projects>
+       <buildSpec>
+               <buildCommand>
+                       <name>org.eclipse.jdt.core.javabuilder</name>
+                       <arguments>
+                       </arguments>
+               </buildCommand>
+       </buildSpec>
+       <natures>
+               <nature>org.eclipse.jdt.core.javanature</nature>
+       </natures>
+</projectDescription>

Added: gate/trunk/plugins/Format_CSV/build.xml
===================================================================
--- gate/trunk/plugins/Format_CSV/build.xml                             (rev 0)
+++ gate/trunk/plugins/Format_CSV/build.xml     2013-11-29 17:10:22 UTC (rev 
17142)
@@ -0,0 +1,104 @@
+<project name="Format_CSV" basedir="." default="all" >
+       <!-- Ant build for the Format_CSV plugin: compiles src/ against the
+       plugin's own libraries and GATE, packages format-csv.jar and
+       generates the JavaDoc -->
+
+       <!-- Prevent Ant from warning about includeantruntime not being set -->
+       <property name="build.sysclasspath" value="ignore" />
+
+       <property file="build.properties" />
+       <property name="plugin.name" value="Format_CSV"/>
+
+       <!-- Make environment variables available -->
+       <property environment="env" />
+
+       <!-- If environment variable GATE_HOME is set, use it for
+       gate.home (unless it was already set in build.properties) -->
+       <condition property="gate.home" value="${env.GATE_HOME}">
+               <isset property="env.GATE_HOME"/>
+       </condition>
+
+       <!-- Ant properties are immutable, so this default of two directories
+       above the plugin only applies if gate.home has not been set above -->
+       <property name="gate.home" location="../.." />
+       <property name="gate.lib" location="${gate.home}/lib" />
+       <property name="gate.jar" location="${gate.home}/bin/gate.jar" />
+       <property name="src.dir" location="src" />
+       <property name="lib.dir" location="lib" />
+       <property name="classes.dir" location="classes" />
+       <property name="jar.location" location="format-csv.jar" />
+       <property name="doc.dir" location="doc" />
+       <property name="javadoc.dir" location="${doc.dir}/javadoc" />
+
+       <!-- Path to compile - includes the plugin's own lib/*.jar, gate.jar
+       and the *.jar and *.zip files under GATE/lib -->
+       <path id="compile.classpath">
+               <fileset dir="${lib.dir}">
+                       <include name="**/*.jar" />
+               </fileset>
+               <pathelement location="${gate.jar}" />
+               <fileset dir="${gate.lib}">
+                       <include name="**/*.jar" />
+                       <include name="**/*.zip" />
+               </fileset>
+       </path>
+
+       <!-- create build directory structure -->
+       <target name="prepare">
+               <mkdir dir="${classes.dir}" />
+       </target>
+
+       <!-- compile the source -->
+       <target name="compile" depends="prepare">
+               <javac classpathref="compile.classpath"
+           srcdir="${src.dir}"
+           destdir="${classes.dir}"
+           debug="true"
+           debuglevel="lines,source"
+           source="1.6"
+           target="1.6" />
+       </target>
+
+       <!-- Build JavaDoc documentation -->
+       <target name="doc.prepare">
+               <mkdir dir="${javadoc.dir}" />
+       </target>
+
+       <target name="javadoc" depends="doc.prepare">
+               <javadoc destdir="${javadoc.dir}" packagenames="*"
+             classpathref="compile.classpath"
+             encoding="UTF-8"
+             windowtitle="${plugin.name} JavaDoc"
+             source="1.6">
+                       <sourcepath>
+                               <pathelement location="${src.dir}" />
+                       </sourcepath>
+                       <!-- cross-link to the Java SE 6 and GATE API documentation -->
+                       <link href="http://docs.oracle.com/javase/6/docs/api/" />
+                       <link href="http://gate.ac.uk/gate/doc/javadoc/" />
+               </javadoc>
+       </target>
+
+
+       <!-- create the JAR file -->
+       <target name="jar" depends="compile" >
+               <jar destfile="${jar.location}"
+         update="false"
+         basedir="${classes.dir}" />
+       </target>
+
+       <!-- remove the generated .class files -->
+       <target name="clean.classes" >
+               <delete dir="${classes.dir}" />
+       </target>
+
+       <!-- Clean up - remove .class and .jar files -->
+       <target name="clean" depends="clean.classes" >
+               <delete file="${jar.location}" />
+       </target>
+
+       <!-- Build everything - the code and JavaDoc -->
+       <target name="all" depends="jar, javadoc" />
+
+       <!-- Targets used by the main GATE build file:
+         build: build the plugin - just calls "jar" target
+         test : run the unit tests - there aren't any
+         distro.prepare: remove intermediate files that shouldn't be in the
+                         distribution
+  -->
+       <target name="build" depends="jar" />
+       <target name="test" />
+       <target name="distro.prepare" depends="clean.classes" />
+</project>

Added: gate/trunk/plugins/Format_CSV/creole.xml
===================================================================
--- gate/trunk/plugins/Format_CSV/creole.xml                            (rev 0)
+++ gate/trunk/plugins/Format_CSV/creole.xml    2013-11-29 17:10:22 UTC (rev 
17142)
@@ -0,0 +1,5 @@
+<?xml version="1.0"?>
+<!-- CREOLE descriptor listing the JAR files that make up the Format_CSV plugin -->
+<CREOLE-DIRECTORY>
+  <!-- the plugin's own classes, built by build.xml as format-csv.jar;
+       SCAN="true" presumably asks GATE to scan the JAR for annotated
+       resources such as CSVImporter (confirm against the CREOLE docs) -->
+  <JAR SCAN="true">format-csv.jar</JAR>
+  <!-- bundled opencsv library providing the CSVReader used by CSVImporter -->
+  <JAR>lib/opencsv-2.3.jar</JAR>
+</CREOLE-DIRECTORY>

Added: gate/trunk/plugins/Format_CSV/lib/opencsv-2.3.jar
===================================================================
(Binary files differ)

Index: gate/trunk/plugins/Format_CSV/lib/opencsv-2.3.jar
===================================================================
--- gate/trunk/plugins/Format_CSV/lib/opencsv-2.3.jar   2013-11-29 15:39:42 UTC 
(rev 17141)
+++ gate/trunk/plugins/Format_CSV/lib/opencsv-2.3.jar   2013-11-29 17:10:22 UTC 
(rev 17142)

Property changes on: gate/trunk/plugins/Format_CSV/lib/opencsv-2.3.jar
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Added: gate/trunk/plugins/Format_CSV/src/gate/corpora/CSVImporter.java
===================================================================
--- gate/trunk/plugins/Format_CSV/src/gate/corpora/CSVImporter.java             
                (rev 0)
+++ gate/trunk/plugins/Format_CSV/src/gate/corpora/CSVImporter.java     
2013-11-29 17:10:22 UTC (rev 17142)
@@ -0,0 +1,442 @@
+/*
+ * CSVImporter.java
+ * 
+ * Copyright (c) 2013, The University of Sheffield. See the file COPYRIGHT.txt
+ * in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
+ * 
+ * This file is part of GATE (see http://gate.ac.uk/), and is free software,
+ * licenced under the GNU Library General Public License, Version 2, June 1991
+ * (in the distribution as file licence.html, and also available at
+ * http://gate.ac.uk/gate/licence.html).
+ * 
+ * Mark A. Greenwood, 10/09/2013
+ */
+
+package gate.corpora;
+
+import gate.Corpus;
+import gate.Document;
+import gate.Factory;
+import gate.FeatureMap;
+import gate.creole.metadata.AutoInstance;
+import gate.creole.metadata.CreoleResource;
+import gate.gui.MainFrame;
+import gate.gui.NameBearerHandle;
+import gate.gui.ResourceHelper;
+import gate.util.ExtensionFileFilter;
+import gate.util.Files;
+
+import java.awt.GridBagConstraints;
+import java.awt.GridBagLayout;
+import java.awt.Insets;
+import java.awt.event.ActionEvent;
+import java.awt.event.ActionListener;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.swing.AbstractAction;
+import javax.swing.Action;
+import javax.swing.JButton;
+import javax.swing.JCheckBox;
+import javax.swing.JComponent;
+import javax.swing.JFileChooser;
+import javax.swing.JLabel;
+import javax.swing.JOptionPane;
+import javax.swing.JPanel;
+import javax.swing.JSpinner;
+import javax.swing.JTextField;
+import javax.swing.SpinnerNumberModel;
+
+import org.apache.commons.io.IOUtils;
+
+import au.com.bytecode.opencsv.CSVReader;
+
+@SuppressWarnings("serial")
+@CreoleResource(name = "CSV Corpus Populater", tool = true, autoinstances = 
@AutoInstance)
+public class CSVImporter extends ResourceHelper {
+
+  /**
+   * The options panel shown (inside a JOptionPane) when the user picks the
+   * populate action; a single instance is shared by every corpus.
+   */
+  private static JComponent dialog = new JPanel();
+
+  /**
+   * Model behind the "content column" spinner: starts at 0, ranges from 0 to
+   * Integer.MAX_VALUE and moves in steps of 1.
+   */
+  private static SpinnerNumberModel textColModel = new SpinnerNumberModel(0, 0,
+    Integer.MAX_VALUE, 1);
+
+  /**
+   * Whether the first row of the CSV file holds column labels (ticked by
+   * default).
+   */
+  private static JCheckBox cboFeatures = new JCheckBox(
+    "1st Row Contains Column Labels", true);
+
+  /**
+   * Whether each row becomes its own document rather than all rows being
+   * combined into one document (unticked by default).
+   */
+  private static JCheckBox cboDocuments = new JCheckBox(
+    "Create One Document Per Row", false);
+
+  /** Text field holding the URL of the CSV file (or directory) to import. */
+  private static JTextField txtURL = new JTextField(30);
+
+  /**
+   * Filter matching files with a "csv" extension. It is declared as a
+   * java.io.FileFilter but is also cast to javax.swing.filechooser.FileFilter
+   * where it is handed to the file chooser.
+   */
+  private static FileFilter CSV_FILE_FILTER = new ExtensionFileFilter(
+    "CSV Files (*.csv)", "csv");
+
+  static {
+    // we'll use the same dialog instance regardless of the corpus we are
+    // populating so we'll create a single static instance
+
+    // every component is placed with gridx = RELATIVE, i.e. to the right of
+    // the component added before it, so the order of the add() calls below
+    // determines the left-to-right order within each row
+    dialog.setLayout(new GridBagLayout());
+
+    // row 0: the label for the URL field
+    GridBagConstraints constraints = new GridBagConstraints();
+    constraints.gridx = GridBagConstraints.RELATIVE;
+    constraints.gridy = 0;
+    constraints.gridwidth = 2;
+    constraints.anchor = GridBagConstraints.WEST;
+    constraints.fill = GridBagConstraints.NONE;
+    constraints.insets = new Insets(0, 0, 0, 5);
+    dialog.add(new JLabel("CSV File URL:"), constraints);
+
+    // row 0: the URL text field, stretched horizontally
+    constraints = new GridBagConstraints();
+    constraints.gridx = GridBagConstraints.RELATIVE;
+    constraints.gridy = 0;
+    constraints.gridwidth = 3;
+    constraints.fill = GridBagConstraints.HORIZONTAL;
+    constraints.insets = new Insets(0, 0, 0, 10);
+    dialog.add(txtURL, constraints);
+
+    // row 0: the button that opens a file chooser to fill in the URL field
+    constraints = new GridBagConstraints();
+    constraints.gridx = GridBagConstraints.RELATIVE;
+    constraints.gridy = 0;
+    constraints.gridwidth = 1;
+    constraints.anchor = GridBagConstraints.NORTHWEST;
+    JButton btnCSVURL = new JButton(MainFrame.getIcon("open-file"));
+    dialog.add(btnCSVURL, constraints);
+
+    // row 1: the label for the content column spinner, with a 15 pixel gap
+    // below it
+    constraints = new GridBagConstraints();
+    constraints.gridx = GridBagConstraints.RELATIVE;
+    constraints.gridy = 1;
+    constraints.gridwidth = 3;
+    constraints.anchor = GridBagConstraints.NORTHWEST;
+    constraints.insets = new Insets(0, 0, 15, 5);
+    dialog.add(new JLabel("Document Content Is In Column"), constraints);
+
+    // row 1: the spinner used to pick the (zero based) content column
+    constraints = new GridBagConstraints();
+    constraints.gridx = GridBagConstraints.RELATIVE;
+    constraints.gridy = 1;
+    constraints.gridwidth = 3;
+    constraints.anchor = GridBagConstraints.NORTHWEST;
+    dialog.add(new JSpinner(textColModel), constraints);
+
+    // row 2: the "1st Row Contains Column Labels" checkbox
+    constraints = new GridBagConstraints();
+    constraints.gridx = GridBagConstraints.RELATIVE;
+    constraints.gridy = 2;
+    constraints.gridwidth = GridBagConstraints.RELATIVE;
+    constraints.anchor = GridBagConstraints.NORTHWEST;
+    dialog.add(cboFeatures, constraints);
+
+    // row 3: the "Create One Document Per Row" checkbox
+    constraints = new GridBagConstraints();
+    constraints.gridx = GridBagConstraints.RELATIVE;
+    constraints.gridy = 3;
+    constraints.gridwidth = GridBagConstraints.RELATIVE;
+    constraints.anchor = GridBagConstraints.NORTHWEST;
+    dialog.add(cboDocuments, constraints);
+
+    // clicking the button lets the user browse for a CSV file or a directory
+    btnCSVURL.addActionListener(new ActionListener() {
+      @Override
+      public void actionPerformed(ActionEvent e) {
+        // the chooser comes from MainFrame rather than being created here, so
+        // its selection mode, title and filters are all (re)set before use
+        JFileChooser filer = MainFrame.getFileChooser();
+
+        // directories can be selected as well as files; buildActions walks a
+        // selected directory looking for CSV files
+        filer.setFileSelectionMode(JFileChooser.FILES_AND_DIRECTORIES);
+        filer.setDialogTitle("Select a CSV File");
+        filer.resetChoosableFileFilters();
+        filer.setAcceptAllFileFilterUsed(false);
+        filer
+          
.addChoosableFileFilter((javax.swing.filechooser.FileFilter)CSV_FILE_FILTER);
+        filer
+          .setFileFilter((javax.swing.filechooser.FileFilter)CSV_FILE_FILTER);
+
+        // leave the URL field untouched if the chooser was cancelled
+        if(filer.showOpenDialog(dialog) != JFileChooser.APPROVE_OPTION) return;
+        try {
+          txtURL.setText(filer.getSelectedFile().toURI().toURL()
+            .toExternalForm());
+        } catch(IOException ioe) {
+          // do nothing here; if the selected file cannot be turned into a URL
+          // the text field simply keeps its previous content
+        }
+      }
+    });
+  }
+
+  /**
+   * Supplies the extra right-click actions for a resource handle. Handles
+   * whose target is not a Corpus get no actions; a Corpus gets a single
+   * "Populate from CSV File" action which shows the options dialog and then
+   * imports the chosen CSV file (or every CSV file below the chosen directory)
+   * in a background thread.
+   * 
+   * @param handle
+   *          the handle of the resource the menu is being built for
+   * @return the (possibly empty) list of actions to offer
+   */
+  @Override
+  protected List<Action> buildActions(final NameBearerHandle handle) {
+    List<Action> actions = new ArrayList<Action>();
+
+    if(!(handle.getTarget() instanceof Corpus)) return actions;
+
+    actions.add(new AbstractAction("Populate from CSV File") {
+      @Override
+      public void actionPerformed(ActionEvent e) {
+
+        // display the populater dialog and return if it is cancelled
+        if(JOptionPane.showConfirmDialog(null, dialog,
+          "Populate From CSV File", JOptionPane.OK_CANCEL_OPTION,
+          JOptionPane.PLAIN_MESSAGE) != JOptionPane.OK_OPTION) return;
+
+        // take a snapshot of the options now, while we are still on the event
+        // dispatch thread. The dialog is shared between all corpora, so if the
+        // worker thread read the components itself the settings could change
+        // part way through an import when the dialog is opened again, and
+        // Swing components should not be accessed from other threads anyway
+        final Corpus corpus = (Corpus)handle.getTarget();
+        final String location = txtURL.getText();
+        final int column = (Integer)textColModel.getValue();
+        final boolean colLabels = cboFeatures.isSelected();
+        final boolean docPerRow = cboDocuments.isSelected();
+
+        // we want to run the population in a separate thread so we don't lock
+        // up the GUI
+        Thread thread =
+          new Thread(Thread.currentThread().getThreadGroup(),
+            "CSV Corpus Populater") {
+
+            @Override
+            public void run() {
+              try {
+                URL url = new URL(location);
+
+                // see if we can convert the URL to a File instance
+                File file = null;
+                try {
+                  file = Files.fileFromURL(url);
+                } catch(IllegalArgumentException iae) {
+                  // this will happen if someone enters an actual URL, but we
+                  // handle that below so we can just ignore the exception
+                  // and keep going
+                }
+
+                if(file != null && file.isDirectory()) {
+                  // we have been pointed at a directory so import every CSV
+                  // file found anywhere in the directory structure
+                  File[] files =
+                    Files.listFilesRecursively(file, CSV_FILE_FILTER);
+
+                  for(File f : files) {
+                    // skip directories as we don't want to handle those
+                    if(f.isDirectory()) continue;
+
+                    importCSV(corpus, f.toURI().toURL(), column, colLabels,
+                      docPerRow);
+                  }
+                } else {
+                  // we have a single URL to process
+                  importCSV(corpus, url, column, colLabels, docPerRow);
+                }
+              } catch(Exception e) {
+                // TODO give a sensible error message
+                e.printStackTrace();
+              }
+            }
+          };
+
+        // let's leave the GUI nice and responsive
+        thread.setPriority(Thread.MIN_PRIORITY);
+
+        // lets get to it and do some actual work!
+        thread.start();
+
+      }
+    });
+
+    return actions;
+  }
+
+  /**
+   * Imports a single CSV file using whichever strategy was selected in the
+   * dialog.
+   * 
+   * @param corpus
+   *          the Corpus to add documents to
+   * @param csv
+   *          the URL of the CSV file to process
+   * @param column
+   *          the (zero index based) column which contains the text content
+   * @param colLabels
+   *          true if the first row contains column labels, false otherwise
+   * @param docPerRow
+   *          true to create one document per row (see populate), false to
+   *          create a single document for the whole file (see createDoc)
+   */
+  private static void importCSV(Corpus corpus, URL csv, int column,
+                                boolean colLabels, boolean docPerRow) {
+    if(docPerRow) {
+      populate(corpus, csv, column, colLabels);
+    } else {
+      createDoc(corpus, csv, column, colLabels);
+    }
+  }
+
+  /**
+   * Reads a CSV file and adds one new document to the corpus for each row
+   * whose content column is present and not blank. When the file has a header
+   * row, the remaining cells of a row become features of its document, keyed
+   * by the matching column label.
+   * 
+   * @param corpus
+   *          the Corpus to add documents to
+   * @param csv
+   *          the URL of the CSV file to process
+   * @param column
+   *          the (zero index based) column which contains the text content
+   * @param colLabels
+   *          true if the first row contains column labels, false otherwise
+   * @throws RuntimeException
+   *           wrapping any exception raised while reading the file or
+   *           creating and storing the documents
+   */
+  public static void populate(Corpus corpus, URL csv, int column,
+                              boolean colLabels) {
+    CSVReader csvReader = null;
+    try {
+      csvReader = new CSVReader(new InputStreamReader(csv.openStream()));
+
+      // the header row, if there is one, is consumed before any data rows
+      String[] labels = null;
+      if(colLabels) labels = csvReader.readNext();
+
+      for(String[] row = csvReader.readNext(); row != null; row =
+        csvReader.readNext()) {
+
+        // ignore rows that are too short to have a content cell as well as
+        // those where the content cell is blank
+        boolean hasContent =
+          column < row.length && !row[column].trim().isEmpty();
+        if(!hasContent) continue;
+
+        // every labelled cell other than the content cell becomes a feature
+        FeatureMap docFeatures = Factory.newFeatureMap();
+        if(colLabels) {
+          int limit = Math.min(labels.length, row.length);
+          for(int cell = 0; cell < limit; cell++) {
+            if(cell == column) continue;
+            docFeatures.put(labels[cell], row[cell]);
+          }
+        }
+
+        // the content cell is passed in as the document's string content
+        FeatureMap initParams = Factory.newFeatureMap();
+        initParams.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME,
+          row[column]);
+
+        Document rowDoc =
+          (Document)Factory.createResource(DocumentImpl.class.getName(),
+            initParams, docFeatures);
+
+        corpus.add(rowDoc);
+
+        // with a persistent corpus the document is unloaded and deleted
+        // again once it has been added
+        if(corpus.getLRPersistenceId() != null) {
+          corpus.unloadDocument(rowDoc);
+          Factory.deleteResource(rowDoc);
+        }
+      }
+
+      // a corpus that lives in a datastore is synced once at the very end
+      if(corpus.getDataStore() != null) {
+        corpus.getDataStore().sync(corpus);
+      }
+    } catch(Exception e) {
+      // all we can do is pass the problem on to the caller
+      throw new RuntimeException("Unable to open CSV file: " + csv, e);
+    } finally {
+      // always release the reader, if we got as far as opening it
+      if(csvReader != null) IOUtils.closeQuietly(csvReader);
+    }
+  }
+
+  /**
+   * Creates a single document from the CSV file. The content cell of every
+   * usable row is appended to the document (followed by a blank line) and
+   * covered by an annotation in the "Original markups" set whose features
+   * hold the other labelled cells of that row. The annotation type is the
+   * label of the content column or, if there is no such label, "Text".
+   * 
+   * @param corpus
+   *          the Corpus to add documents to
+   * @param csv
+   *          the URL of the CSV file to process
+   * @param column
+   *          the (zero index based) column which contains the text content
+   * @param colLabels
+   *          true if the first row contains column labels, false otherwise
+   * @throws RuntimeException
+   *           wrapping any exception raised while reading the file or
+   *           building and storing the document
+   */
+  public static void createDoc(Corpus corpus, URL csv, int column,
+                               boolean colLabels) {
+    CSVReader reader = null;
+    Document doc = null;
+    try {
+      // open a CSVReader over the URL
+      reader = new CSVReader(new InputStreamReader(csv.openStream()));
+
+      // if we are adding features read the first line
+      String[] features = (colLabels ? reader.readNext() : null);
+
+      // work out the annotation type once up front. We fall back to "Text"
+      // not only when there are no labels but also when the header row is
+      // too short to have a label for the content column, as indexing past
+      // the end of the header would otherwise abort the whole import
+      String annotationType = "Text";
+      if(features != null && column >= 0 && column < features.length) {
+        annotationType = features[column];
+      }
+
+      // create an empty document to which we will add the content as we go
+      doc = Factory.newDocument("");
+
+      String[] nextLine;
+      while((nextLine = reader.readNext()) != null) {
+        // for each line in the file...
+
+        // skip the line if there are less columns than we need to get to the
+        // content
+        if(column >= nextLine.length) continue;
+
+        // skip the line if the column with the doc content is empty
+        if(nextLine[column].trim().equals("")) continue;
+
+        FeatureMap fmap = Factory.newFeatureMap();
+        if(colLabels) {
+          // put the data from the other columns into a FeatureMap using the
+          // labels from the first row
+          for(int i = 0; i < features.length; ++i) {
+            if(i != column && i < nextLine.length) {
+              fmap.put(features[i], nextLine[i]);
+            }
+          }
+        }
+
+        // find out how long the document currently is
+        // TODO can we keep a running track of this to avoid this call?
+        long length = doc.getContent().size();
+
+        // add the new text to the document
+        doc.edit(length, length, new DocumentContentImpl(nextLine[column] +
+          "\n\n"));
+
+        // add the spanning annotation to the Original markups set; it covers
+        // the text we just added but not the blank line that follows it
+        doc.getAnnotations("Original markups").add(length,
+          length + nextLine[column].length(), annotationType, fmap);
+      }
+
+      // store the original csv file URL as a document feature
+      doc.getFeatures().put("csvURL", csv.toExternalForm());
+
+      // so that the doc gets recreated properly put the XML for the doc we
+      // just created into the init param that will be used if the document
+      // is recreated
+      doc.setParameterValue(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME,
+        doc.toXml());
+
+      // add the document to the corpus
+      corpus.add(doc);
+
+      if(corpus.getLRPersistenceId() != null) {
+        // persistent corpus -> unload the document
+        corpus.unloadDocument(doc);
+        Factory.deleteResource(doc);
+
+        // the document has now been deleted, so forget about it to make sure
+        // the catch block below can't try to delete it a second time should
+        // the sync fail
+        doc = null;
+      }
+
+      if(corpus.getDataStore() != null) {
+        // if this corpus is in a datastore make sure we sync it back
+        corpus.getDataStore().sync(corpus);
+      }
+    } catch(Exception e) {
+      // if we failed somewhere then delete the part built document
+      if(doc != null) Factory.deleteResource(doc);
+
+      // throw a "helpful" exception
+      throw new RuntimeException("Unable to open CSV file: " + csv, e);
+    } finally {
+      // if we got as far as opening a reader over the file then close it
+      if(reader != null) IOUtils.closeQuietly(reader);
+    }
+  }
+}

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Rapidly troubleshoot problems before they affect your business. Most IT 
organizations don't have a clear picture of how application performance 
affects their revenue. With AppDynamics, you get 100% visibility into your 
Java, .NET, & PHP applications. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349351&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to