OK -- I have rewritten my simple output formatter in Java (see below). The
Unicode problems were perhaps Java/Python bridge problems, as you suspected.
The real problem is that Tika passes an "Attributes" object for which
getValue() returns null (for the "href" attribute, in the error below).
Code and error follow (apologies if the Java is inelegant -- I'm somewhat
rusty):
-------------------------
Code:
-------------------------
package com.factfiber.connect.tika;
import static java.lang.Math.*;
import java.io.BufferedOutputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import org.xml.sax.SAXException;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParser;
class Tika_XLS {
static class XLSContentHandler extends DefaultHandler {
protected OutputStreamWriter ous;
protected int indent = 0;
private boolean prevStart = false;
private boolean prevEnd = false;
public XLSContentHandler( OutputStream outf ) throws
UnsupportedEncodingException {
ous = new OutputStreamWriter( new BufferedOutputStream(
outf ), "utf-8" );
}
public void characters(char[] ch, int start, int length) throws
SAXException {
try { ous.write( ch, start, length ); }
catch ( IOException e ) { throw new SAXException( e ); }
}
public void startElement(String uri,
String localName,
String qName,
Attributes attributes) throws SAXException {
try {
if ( prevStart ) ous.write( "\n" );
for ( int i = 0; i < indent; i++ ) ous.write( "
" );
ous.write( "<");
ous.write( qName );
for ( int i = 0; i < attributes.getLength();
i++ ) {
ous.write( " " );
ous.write( attributes.getQName( i ));
ous.write( "=\"");
try {
ous.write( attributes.getValue( i
));
}
catch( NullPointerException e ) {
ous.flush();
System.err.println(
"ERR " +
String.valueOf( i ) + " "
+ attributes.getQName(
i ) + "=" + attributes.getValue( i ) );
throw e;
}
ous.write( "\"");
}
ous.write( " >" );
}
catch( IOException e ) {
throw new SAXException( e );
}
indent += 1;
prevStart = true;
prevEnd = false;
}
public void endElement( String uri, String localName, String
qName ) throws SAXException {
indent = max( 0, indent - 1 );
try {
if ( prevEnd )
for ( int i = 0; i < indent; i++ )
ous.write( " " );
ous.write( "</" + qName + ">\n" );
}
catch( IOException e ) { throw new SAXException( e ); }
prevStart = false;
prevEnd = true;
}
}
public static void main(String[] args) throws
FileNotFoundException, IOException, TikaException, SAXException {
InputStream inf;
OutputStream outf;
if ( args.length < 1 )
inf = System.in;
else
inf = new FileInputStream( args[ 0 ] );
if ( args.length < 2 )
outf = System.out;
else
outf = new FileOutputStream( args[ 1 ] );
parse( inf, outf );
}
public static void parse( InputStream inf, OutputStream outf )
throws IOException, TikaException, SAXException {
Metadata metadata = new Metadata();
ParseContext context = new ParseContext();
XLSContentHandler handler = new XLSContentHandler( outf );
OfficeParser parser = new OfficeParser();
try {
parser.parse(inf, handler, metadata, context);
}
finally {
inf.close();
}
}
}
-------------------------
Error:
-------------------------
ERR 0 href=null
Exception in thread "main" java.lang.NullPointerException
at java.io.Writer.write(Writer.java:140)
at
com.factfiber.connect.tika.Tika_XLS$XLSContentHandler.startElement(tika_xls.java:52)
at
org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at
org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:237)
at
org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:274)
at
org.apache.tika.parser.microsoft.LinkedCell.render(LinkedCell.java:35)
at
org.apache.tika.parser.microsoft.ExcelExtractor$TikaHSSFListener.processExtraText(ExcelExtractor.java:423)
at
org.apache.tika.parser.microsoft.ExcelExtractor$TikaHSSFListener.processSheet(ExcelExtractor.java:522)
at
org.apache.tika.parser.microsoft.ExcelExtractor$TikaHSSFListener.internalProcessRecord(ExcelExtractor.java:346)
at
org.apache.tika.parser.microsoft.ExcelExtractor$TikaHSSFListener.processRecord(ExcelExtractor.java:297)
at
org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener.processRecord(FormatTrackingHSSFListener.java:82)
at
org.apache.poi.hssf.eventusermodel.HSSFRequest.processRecord(HSSFRequest.java:112)
at
org.apache.poi.hssf.eventusermodel.HSSFEventFactory.genericProcessEvents(HSSFEventFactory.java:147)
at
org.apache.poi.hssf.eventusermodel.HSSFEventFactory.processEvents(HSSFEventFactory.java:106)
at
org.apache.tika.parser.microsoft.ExcelExtractor$TikaHSSFListener.processFile(ExcelExtractor.java:276)
at
org.apache.tika.parser.microsoft.ExcelExtractor.parse(ExcelExtractor.java:136)
at
org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:189)
at com.factfiber.connect.tika.Tika_XLS.parse(tika_xls.java:111)
at com.factfiber.connect.tika.Tika_XLS.main(tika_xls.java:100)
On Dec 21, 2010, at 5:28 AM, Nick Burch wrote:
> On Tue, 21 Dec 2010, Shaun Cutts wrote:
>> ok, but in when I call parse, then my ContentHandler.characters() callback
>> gets a char [], and this is passed as:
>>
>> (Pdb) ch
>> array('c', '\xa9 2010 Crane Data LLC. All rights reserved.')
>>
>> so when I try unicode I get an error:
>>
>> (Pdb) ch.tounicode()
>> *** ValueError: tounicode() may only be called on type 'u' arrays
>
> You sure there isn't a problem with your python-java bridge? All Java strings
> are always unicode
>
>> So it would seem to me that in fact I'm not getting a unicode string here.
>> When I try to decode in various codecs, I get problems. One question is what
>> is the standard name for "UCS-2" -- as when I try to use that name it fails;
>> is it a subset of utf-16?
>
> UCS-2 is a predecessor to UTF-16, which doesn't handle supplementary code
> points so can't hold the whole of the unicode range.
> http://en.wikipedia.org/wiki/UTF-16/UCS-2
>
> Nick