Author: tallison
Date: Mon Mar 30 13:29:11 2015
New Revision: 1670090

URL: http://svn.apache.org/r1670090
Log:
TIKA-1512 temporary workaround.  Currently not including test docs or tests 
that derive from govdocs1

Added:
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_closingSmartQInHyperLink.doc
   (with props)
Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1670090&r1=1670089&r2=1670090&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
 Mon Mar 30 13:29:11 2015
@@ -405,10 +405,13 @@ public class WordExtractor extends Abstr
 
           if((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK"))
                  && text.indexOf('"') > -1) {
-             String url = text.substring(
-                   text.indexOf('"') + 1,
-                   text.lastIndexOf('"')
-             );
+              int start = text.indexOf('"') + 1;
+              int end = findHyperlinkEnd(text, start);
+              String url = "";
+              if (start >= 0 && start < end && end <= text.length()) {
+                  url = text.substring(start, end);
+              }
+
              xhtml.startElement("a", "href", url);
              for(CharacterRun cr : texts) {
                 handleCharacterRun(cr, skipStyling, xhtml);
@@ -437,6 +440,31 @@ public class WordExtractor extends Abstr
        return i-index;
     }
 
+    //temporary work around for TIKA-1512
+    private int findHyperlinkEnd(String text, int start) {
+        int end = text.lastIndexOf('"');
+        if (end > start) {
+            return end;
+        }
+        end = text.lastIndexOf('\u201D');//smart right double quote
+        if (end > start) {
+            return end;
+        }
+        end = text.lastIndexOf('\r');
+        if (end > start) {
+            return end;
+        }
+        //if nothing so far, take the full length of the string
+        //If the full string is > 256 characters, it appears
+        //that the url is truncated in the .doc file.  This
+        //will return the value as it is in the file, which
+        //may be incorrect; but it is the same behavior as opening
+        //the link in MSWord.
+        //This code does not currently check that length is actually >= 256.
+        //we might want to add that?
+        return text.length();
+    }
+
     private void handlePictureCharacterRun(CharacterRun cr, Picture picture, 
PicturesSource pictures, XHTMLContentHandler xhtml)
           throws SAXException, IOException, TikaException {
        if(!isRendered(cr) || picture == null) {

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1670090&r1=1670089&r2=1670090&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
 Mon Mar 30 13:29:11 2015
@@ -33,6 +33,7 @@ import org.apache.tika.metadata.OfficeOp
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
 
@@ -397,4 +398,30 @@ public class WordParserTest extends Tika
         assertContains("<p>1. Organisering av vakten:</p>", xml);
 
     }
+
+    @Test
+    public void testHyperlinkStringIOOBESmartQuote() throws Exception {
+        //TIKA-1512, one cause: closing double quote is a smart quote
+        //test file contributed by user
+        XMLResult result = getXML("testWORD_closingSmartQInHyperLink.doc");
+        
assertContains("href=\"https://issues.apache.org/jira/browse/TIKA-1512", 
result.xml);
+    }
+
+    @Test
+    @Ignore //until we determine whether we can include test docs or not
+    public void testHyperlinkStringLongNoCloseQuote() throws Exception {
+        //TIKA-1512, one cause: no closing quote on really long string
+        //test file derived from govdocs1 012152.doc
+        XMLResult result = getXML("testWORD_longHyperLinkNoCloseQuote.doc");
+        assertContains("href=\"http://www.lexis.com", result.xml);
+    }
+
+    @Test
+    @Ignore //until we determine whether we can include test docs or not
+    public void testHyperlinkStringLongCarriageReturn() throws Exception {
+        //TIKA-1512, one cause: no closing quote, but carriage return
+        //test file derived from govdocs1 040044.doc
+        XMLResult result = getXML("testWORD_hyperLinkCarriageReturn.doc");
+        assertContains("href=\"http://www.nib.org", result.xml);
+    }
 }

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_closingSmartQInHyperLink.doc
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_closingSmartQInHyperLink.doc?rev=1670090&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_closingSmartQInHyperLink.doc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to