Hi,
During some tests with Tika-1.0-SNAPSHOT, I encountered an exception when parsing this MS PowerPoint file: <http://jeanferrette.free.fr/MS8.ppt>. The problem is also reproducible with Tika 0.9. Should I create an issue on JIRA for that? In other words, is this the kind of exception that Tika should be able to recover from? Here is the complete stack trace (the message is localized by my French JVM; in English it reads "Invalid UTF-16 surrogate detected"): java.io.IOException: Substitut UTF-16 non valide détecté : db00 bfff ? at com.sun.org.apache.xml.internal.serializer.ToStream.endElement(ToStream.java:2060) at com.sun.org.apache.xalan.internal.xsltc.trax.TransformerHandlerImpl.endElement(TransformerHandlerImpl.java:273) at org.apache.tika.sax.TeeContentHandler.endElement(TeeContentHandler.java:94) at org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136) at org.apache.tika.sax.SecureContentHandler.endElement(SecureContentHandler.java:215) at org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136) at org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136) at org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136) at org.apache.tika.sax.XHTMLContentHandler.lazyEndHead(XHTMLContentHandler.java:169) at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:234) at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:271) at org.apache.tika.sax.XHTMLContentHandler.element(XHTMLContentHandler.java:308) at org.apache.tika.parser.microsoft.HSLFExtractor.parse(HSLFExtractor.java:41) at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:201) at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242) at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242) at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:129) at org.apache.tika.gui.TikaGUI.handleStream(TikaGUI.java:320) at org.apache.tika.gui.TikaGUI.openFile(TikaGUI.java:279) at 
org.apache.tika.gui.ParsingTransferHandler.importFiles(ParsingTransferHandler.java:94) at org.apache.tika.gui.ParsingTransferHandler.importData(ParsingTransferHandler.java:77) at javax.swing.TransferHandler.importData(TransferHandler.java:755) at javax.swing.TransferHandler$DropHandler.drop(TransferHandler.java:1478) at java.awt.dnd.DropTarget.drop(DropTarget.java:434) at javax.swing.TransferHandler$SwingDropTarget.drop(TransferHandler.java:1203) at sun.awt.dnd.SunDropTargetContextPeer.processDropMessage(SunDropTargetContextPeer.java:519) at sun.awt.dnd.SunDropTargetContextPeer$EventDispatcher.dispatchDropEvent(SunDropTargetContextPeer.java:832) at sun.awt.dnd.SunDropTargetContextPeer$EventDispatcher.dispatchEvent(SunDropTargetContextPeer.java:756) at sun.awt.dnd.SunDropTargetEvent.dispatch(SunDropTargetEvent.java:30) at java.awt.Component.dispatchEventImpl(Component.java:4508) at java.awt.Container.dispatchEventImpl(Container.java:2099) at java.awt.Component.dispatchEvent(Component.java:4481) at java.awt.LightweightDispatcher.retargetMouseEvent(Container.java:4577) at java.awt.LightweightDispatcher.processDropTargetEvent(Container.java:4312) at java.awt.LightweightDispatcher.dispatchEvent(Container.java:4163) at java.awt.Container.dispatchEventImpl(Container.java:2085) at java.awt.Window.dispatchEventImpl(Window.java:2478) at java.awt.Component.dispatchEvent(Component.java:4481) at java.awt.EventQueue.dispatchEventImpl(EventQueue.java:643) at java.awt.EventQueue.access$000(EventQueue.java:84) at java.awt.EventQueue$1.run(EventQueue.java:602) at java.awt.EventQueue$1.run(EventQueue.java:600) at java.security.AccessController.doPrivileged(Native Method) at java.security.AccessControlContext$1.doIntersectionPrivilege(AccessControlContext.java:87) at java.security.AccessControlContext$1.doIntersectionPrivilege(AccessControlContext.java:98) at java.awt.EventQueue$2.run(EventQueue.java:616) at java.awt.EventQueue$2.run(EventQueue.java:614) at 
java.security.AccessController.doPrivileged(Native Method) at java.security.AccessControlContext$1.doIntersectionPrivilege(AccessControlContext.java:87) at java.awt.EventQueue.dispatchEvent(EventQueue.java:613) at java.awt.EventDispatchThread.pumpOneEventForFilters(EventDispatchThread.java:269) at java.awt.EventDispatchThread.pumpEventsForFilter(EventDispatchThread.java:184) at java.awt.EventDispatchThread.pumpEventsForHierarchy(EventDispatchThread.java:174) at java.awt.EventDispatchThread.pumpEvents(EventDispatchThread.java:169) at java.awt.EventDispatchThread.pumpEvents(EventDispatchThread.java:161) at java.awt.EventDispatchThread.run(EventDispatchThread.java:122) Caused by: java.io.IOException: Substitut UTF-16 non valide détecté : db00 bfff ? at com.sun.org.apache.xml.internal.serializer.ToStream.accumDefaultEscape(ToStream.java:1671) at com.sun.org.apache.xml.internal.serializer.ToStream.writeAttrString(ToStream.java:1988) at com.sun.org.apache.xml.internal.serializer.ToStream.processAttributes(ToStream.java:1942) at com.sun.org.apache.xml.internal.serializer.ToStream.endElement(ToStream.java:2031) ... 55 more Thanks. Pablo.
