Author: tommaso
Date: Mon Sep 20 05:42:35 2010
New Revision: 998787
URL: http://svn.apache.org/viewvc?rev=998787&view=rev
Log:
[UIMA-1878] - Applied patch from Greg Holmberg to handle spaces in path strings (files are now passed as java.net.URI instead of java.net.URL)
Modified:
uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/FileSystemCollectionReader.java
uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/TIKAWrapper.java
Modified:
uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/FileSystemCollectionReader.java
URL:
http://svn.apache.org/viewvc/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/FileSystemCollectionReader.java?rev=998787&r1=998786&r2=998787&view=diff
==============================================================================
---
uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/FileSystemCollectionReader.java
(original)
+++
uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/FileSystemCollectionReader.java
Mon Sep 20 05:42:35 2010
@@ -96,11 +96,11 @@ public class FileSystemCollectionReader
// call Tika wrapper
try {
- tika.populateCASfromURL(aCAS, file.toURL(), this.mMIME,
this.mLanguage);
+ tika.populateCASfromURI(aCAS, file.toURI(), this.mMIME,
this.mLanguage);
} catch (CASException e) {
getLogger().log(Level.WARNING,"Problem converting file
: "+file.toURL()+"\t"+e.getMessage());
- jcas.setDocumentText(" ");
- return;
+ throw new IOException(e);
+ //jcas.setDocumentText(" "); return;
}
}
Modified:
uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/TIKAWrapper.java
URL:
http://svn.apache.org/viewvc/uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/TIKAWrapper.java?rev=998787&r1=998786&r2=998787&view=diff
==============================================================================
---
uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/TIKAWrapper.java
(original)
+++
uima/sandbox/trunk/TikaAnnotator/src/main/java/org/apache/uima/tika/TIKAWrapper.java
Mon Sep 20 05:42:35 2010
@@ -22,7 +22,7 @@ package org.apache.uima.tika;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.net.URL;
+import java.net.URI;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
@@ -33,6 +33,7 @@ import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.util.Level;
public class TIKAWrapper {
@@ -54,16 +55,16 @@ public class TIKAWrapper {
}
- public void populateCASfromURL(CAS cas, URL url, String language)
throws CASException{
- populateCASfromURL(cas, url, null, language);
+ public void populateCASfromURI(CAS cas, URI uri, String language)
throws CASException{
+ populateCASfromURI(cas, uri, null, language);
}
- public void populateCASfromURL(CAS cas, URL url, String mime, String
language) throws CASException{
+ public void populateCASfromURI(CAS cas, URI uri, String mime, String
language) throws CASException{
InputStream originalStream=null;
try {
- originalStream = new BufferedInputStream(url
- .openStream());
+ originalStream = new BufferedInputStream(
+ uri.toURL().openStream());
} catch (IOException e1) {
new CASException(e1);
}
@@ -86,8 +87,8 @@ public class TIKAWrapper {
catch (Exception e){
// if we have a problem just dump the message and continue
// getLogger().log(Level.WARNING,"Problem converting file :
"+URI+"\t"+e.getMessage());
- cas.setDocumentText("");
- return;
+ // cas.setDocumentText(""); return;
+ throw new CASException(e);
}
finally {
// set language if it was explicitly specified as a
configuration
@@ -126,7 +127,7 @@ public class TIKAWrapper {
FeatureValue fv = new FeatureValue(jcas);
fv.setName("uri");
- fv.setValue(url.toString());
+ fv.setValue(uri.toString());
docAnnotation.setFeatures(i,fv);
docAnnotation.addToIndexes();