Author: tallison Date: Mon Mar 30 13:29:11 2015 New Revision: 1670090 URL: http://svn.apache.org/r1670090 Log: TIKA-1512 temporary workaround. Currently not including test docs or tests that derive from govdocs1
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_closingSmartQInHyperLink.doc (with props) Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1670090&r1=1670089&r2=1670090&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Mon Mar 30 13:29:11 2015 @@ -405,10 +405,13 @@ public class WordExtractor extends Abstr if((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK")) && text.indexOf('"') > -1) { - String url = text.substring( - text.indexOf('"') + 1, - text.lastIndexOf('"') - ); + int start = text.indexOf('"') + 1; + int end = findHyperlinkEnd(text, start); + String url = ""; + if (start >= 0 && start < end && end <= text.length()) { + url = text.substring(start, end); + } + xhtml.startElement("a", "href", url); for(CharacterRun cr : texts) { handleCharacterRun(cr, skipStyling, xhtml); @@ -437,6 +440,31 @@ public class WordExtractor extends Abstr return i-index; } + //temporary work around for TIKA-1512 + private int findHyperlinkEnd(String text, int start) { + int end = text.lastIndexOf('"'); + if (end > start) { + return end; + } + end = text.lastIndexOf('\u201D');//smart right double quote + if (end > start) { + return end; + } + end = text.lastIndexOf('\r'); + if (end > start) { + return end; + } + //if nothing so far, take the full length of the string + //If the full string is > 256 characters, it appears + //that the url is truncated in the .doc file. This + //will return the value as it is in the file, which + //may be incorrect; but it is the same behavior as opening + //the link in MSWord. + //This code does not currently check that length is actually >= 256. + //we might want to add that? + return text.length(); + } + private void handlePictureCharacterRun(CharacterRun cr, Picture picture, PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { if(!isRendered(cr) || picture == null) { Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1670090&r1=1670089&r2=1670090&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Mon Mar 30 13:29:11 2015 @@ -33,6 +33,7 @@ import org.apache.tika.metadata.OfficeOp import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; +import org.junit.Ignore; import org.junit.Test; import org.xml.sax.ContentHandler; @@ -397,4 +398,30 @@ public class WordParserTest extends Tika assertContains("<p>1. Organisering av vakten:</p>", xml); } + + @Test + public void testHyperlinkStringIOOBESmartQuote() throws Exception { + //TIKA-1512, one cause: closing double quote is a smart quote + //test file contributed by user + XMLResult result = getXML("testWORD_closingSmartQInHyperLink.doc"); + assertContains("href=\"https://issues.apache.org/jira/browse/TIKA-1512", result.xml); + } + + @Test + @Ignore //until we determine whether we can include test docs or not + public void testHyperlinkStringLongNoCloseQuote() throws Exception { + //TIKA-1512, one cause: no closing quote on really long string + //test file derived from govdocs1 012152.doc + XMLResult result = getXML("testWORD_longHyperLinkNoCloseQuote.doc"); + assertContains("href=\"http://www.lexis.com", result.xml); + } + + @Test + @Ignore //until we determine whether we can include test docs or not + public void testHyperlinkStringLongCarriageReturn() throws Exception { + //TIKA-1512, one cause: no closing quote, but carriage return + //test file derived from govdocs1 040044.doc + XMLResult result = getXML("testWORD_hyperLinkCarriageReturn.doc"); + assertContains("href=\"http://www.nib.org", result.xml); + } } Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_closingSmartQInHyperLink.doc URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_closingSmartQInHyperLink.doc?rev=1670090&view=auto ============================================================================== Binary file - no diff available. Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_closingSmartQInHyperLink.doc ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream