Tim Barrett created TIKA-4124:
---------------------------------
Summary: embedded html of type
http://schemas.openxmlformats.org/officeDocument/2006/relationships/aFChunk is
not parsed
Key: TIKA-4124
URL: https://issues.apache.org/jira/browse/TIKA-4124
Project: Tika
Issue Type: Bug
Components: parser
Reporter: Tim Barrett
Word documents that may have been created using third-party programs such as
docx4j sometimes contain embedded HTML. Tika does not parse this embedded
content. The embedded HTML file usually resides within the main folder of the
docx internal structure.
Changing the code in
org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.handleEmbeddedPart()
as follows handles this case (see the final else-if branch):
{code:java}
if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)
        && TYPE_OLE_OBJECT.equals(target.getContentType())) {
    handleEmbeddedOLE(target, xhtml, sourceDesc + rel.getId(), parentMetadata);
    if (targetURI != null) {
        handledTarget.add(targetURI.toString());
    }
} else if (RELATION_MEDIA.equals(type)
        || RELATION_VIDEO.equals(type)
        || RELATION_AUDIO.equals(type)
        || PackageRelationshipTypes.IMAGE_PART.equals(type)
        || POIXMLDocument.PACK_OBJECT_REL_TYPE.equals(type)
        || POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) {
    handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId());
    if (targetURI != null) {
        handledTarget.add(targetURI.toString());
    }
} else if (XSSFRelation.VBA_MACROS.getRelation().equals(type)) {
    handleMacros(target, xhtml);
    if (targetURI != null) {
        handledTarget.add(targetURI.toString());
    }
} else if (type.endsWith("aFChunk")) {
    handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId());
}
{code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)