Author: tallison
Date: Mon Mar 30 13:29:11 2015
New Revision: 1670090
URL: http://svn.apache.org/r1670090
Log:
TIKA-1512 temporary workaround. The test docs derived from govdocs1 are not
included yet, so the tests that depend on them are marked @Ignore.
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_closingSmartQInHyperLink.doc
(with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1670090&r1=1670089&r2=1670090&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
Mon Mar 30 13:29:11 2015
@@ -405,10 +405,13 @@ public class WordExtractor extends Abstr
if((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK"))
&& text.indexOf('"') > -1) {
- String url = text.substring(
- text.indexOf('"') + 1,
- text.lastIndexOf('"')
- );
+ int start = text.indexOf('"') + 1;
+ int end = findHyperlinkEnd(text, start);
+ String url = "";
+ if (start >= 0 && start < end && end <= text.length()) {
+ url = text.substring(start, end);
+ }
+
xhtml.startElement("a", "href", url);
for(CharacterRun cr : texts) {
handleCharacterRun(cr, skipStyling, xhtml);
@@ -437,6 +440,31 @@ public class WordExtractor extends Abstr
return i-index;
}
+ //temporary workaround for TIKA-1512
+ private int findHyperlinkEnd(String text, int start) {
+ int end = text.lastIndexOf('"');
+ if (end > start) {
+ return end;
+ }
+ end = text.lastIndexOf('\u201D');//smart right double quote
+ if (end > start) {
+ return end;
+ }
+ end = text.lastIndexOf('\r');
+ if (end > start) {
+ return end;
+ }
+ //If nothing has matched so far, take the full length of the string.
+ //If the full string is > 256 characters, it appears
+ //that the url is truncated in the .doc file. This
+ //will return the value as it is in the file, which
+ //may be incorrect, but it is the same behavior as opening
+ //the link in MSWord.
+ //This code does not currently check that the length is actually >= 256;
+ //we might want to add that.
+ return text.length();
+ }
+
private void handlePictureCharacterRun(CharacterRun cr, Picture picture,
PicturesSource pictures, XHTMLContentHandler xhtml)
throws SAXException, IOException, TikaException {
if(!isRendered(cr) || picture == null) {
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1670090&r1=1670089&r2=1670090&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Mon Mar 30 13:29:11 2015
@@ -33,6 +33,7 @@ import org.apache.tika.metadata.OfficeOp
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -397,4 +398,30 @@ public class WordParserTest extends Tika
assertContains("<p>1. Organisering av vakten:</p>", xml);
}
+
+ @Test
+ public void testHyperlinkStringIOOBESmartQuote() throws Exception {
+ //TIKA-1512, one cause: closing double quote is a smart quote
+ //test file contributed by user
+ XMLResult result = getXML("testWORD_closingSmartQInHyperLink.doc");
+ assertContains("href=\"https://issues.apache.org/jira/browse/TIKA-1512", result.xml);
+ }
+
+ @Test
+ @Ignore //until we determine whether we can include test docs or not
+ public void testHyperlinkStringLongNoCloseQuote() throws Exception {
+ //TIKA-1512, one cause: no closing quote on really long string
+ //test file derived from govdocs1 012152.doc
+ XMLResult result = getXML("testWORD_longHyperLinkNoCloseQuote.doc");
+ assertContains("href=\"http://www.lexis.com", result.xml);
+ }
+
+ @Test
+ @Ignore //until we determine whether we can include test docs or not
+ public void testHyperlinkStringLongCarriageReturn() throws Exception {
+ //TIKA-1512, one cause: no closing quote, but carriage return
+ //test file derived from govdocs1 040044.doc
+ XMLResult result = getXML("testWORD_hyperLinkCarriageReturn.doc");
+ assertContains("href=\"http://www.nib.org", result.xml);
+ }
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_closingSmartQInHyperLink.doc
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_closingSmartQInHyperLink.doc?rev=1670090&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_closingSmartQInHyperLink.doc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream