Jack Krupansky created SOLR-4809:
------------------------------------

             Summary: OpenOffice document body is not indexed by SolrCell
                 Key: SOLR-4809
                 URL: https://issues.apache.org/jira/browse/SOLR-4809
             Project: Solr
          Issue Type: Bug
          Components: contrib - Solr Cell (Tika extraction)
    Affects Versions: 4.3, 3.6.1
            Reporter: Jack Krupansky


As reported on the solr user mailing list, SolrCell is not indexing document 
body content for OpenOffice documents.

I tested with Apache Open Office 3.4.1 on Solr 4.3 and 3.6.1, for both 
OpenWriter (.ODT) and Impress (.ODS).

The extractOnly option does return the document body text, but Solr does not 
index the document body text. In my test cases (.ODS and .ODT), all I see for 
the "content" field in Solr is a few spaces.

Using the example schema, I indexed HelloWorld.odt using:

{code}
 curl "http://localhost:8983/solr/update/extract?literal.id=doc-1&uprefix=attr_&commit=true" \
   -F "myfile=@HelloWorld.odt"
{code}

It queries as:

{code}
<?xml version="1.0" encoding="UTF-8"?>
<response>

<lst name="responseHeader">
  <int name="status">0</int>
  <int name="QTime">2</int>
  <lst name="params">
    <str name="indent">true</str>
    <str name="q">id:doc-1</str>
  </lst>
</lst>
<result name="response" numFound="1" start="0">
  <doc>
    <str name="id">doc-1</str>
    <arr name="attr_image_count">
      <str>0</str>
    </arr>
    <arr name="attr_editing_cycles">
      <str>1</str>
    </arr>
    <arr name="attr_stream_source_info">
      <str>myfile</str>
    </arr>
    <arr name="attr_meta_save_date">
      <str>2013-05-10T17:15:40.99</str>
    </arr>
    <arr name="attr_dc_subject">
      <str>Hello, World</str>
    </arr>
    <str name="subject">Hello World - subject</str>
    <arr name="attr_dcterms_created">
      <str>2013-05-10T17:11:58.88</str>
    </arr>
    <arr name="attr_date">
      <str>2013-05-10T17:15:40.99</str>
    </arr>
    <arr name="attr_dc_description">
      <str>This is a test of SolrCell using OpenOffice 3.4.1 - OpenWriter.</str>
    </arr>
    <arr name="attr_nbobject">
      <str>0</str>
    </arr>
    <arr name="attr_word_count">
      <str>10</str>
    </arr>
    <arr name="attr_edit_time">
      <str>PT3M44S</str>
    </arr>
    <arr name="attr_meta_paragraph_count">
      <str>4</str>
    </arr>
    <arr name="attr_creation_date">
      <str>2013-05-10T17:11:58.88</str>
    </arr>
    <arr name="title">
      <str>Hello World SolrCell Test - title</str>
    </arr>
    <arr name="attr_object_count">
      <str>0</str>
    </arr>
    <arr name="attr_stream_content_type">
      <str>application/octet-stream</str>
    </arr>
    <arr name="attr_nbimg">
      <str>0</str>
    </arr>
    <str name="description">This is a test of SolrCell using OpenOffice 3.4.1 - OpenWriter.</str>
    <arr name="attr_stream_size">
      <str>8960</str>
    </arr>
    <arr name="attr_meta_object_count">
      <str>0</str>
    </arr>
    <arr name="attr_cp_subject">
      <str>Hello World - subject</str>
    </arr>
    <arr name="attr_stream_name">
      <str>HelloWorld.odt</str>
    </arr>
    <arr name="attr_generator">
      <str>OpenOffice.org/3.4.1$Win32 OpenOffice.org_project/341m1$Build-9593</str>
    </arr>
    <str name="keywords">Hello, World</str>
    <arr name="attr_last_save_date">
      <str>2013-05-10T17:15:40.99</str>
    </arr>
    <arr name="attr_paragraph_count">
      <str>4</str>
    </arr>
    <arr name="attr_dc_title">
      <str>Hello World SolrCell Test - title</str>
    </arr>
    <arr name="attr_dcterms_modified">
      <str>2013-05-10T17:15:40.99</str>
    </arr>
    <arr name="attr_meta_creation_date">
      <str>2013-05-10T17:11:58.88</str>
    </arr>
    <arr name="attr_page_count">
      <str>1</str>
    </arr>
    <arr name="attr_meta_character_count">
      <str>60</str>
    </arr>
    <date name="last_modified">2013-05-10T17:15:40Z</date>
    <arr name="attr_nbtab">
      <str>0</str>
    </arr>
    <arr name="attr_meta_word_count">
      <str>10</str>
    </arr>
    <arr name="attr_meta_table_count">
      <str>0</str>
    </arr>
    <arr name="attr_modified">
      <str>2013-05-10T17:15:40.99</str>
    </arr>
    <arr name="attr_meta_image_count">
      <str>0</str>
    </arr>
    <arr name="attr_xmptpg_npages">
      <str>1</str>
    </arr>
    <arr name="attr_table_count">
      <str>0</str>
    </arr>
    <arr name="attr_nbpara">
      <str>4</str>
    </arr>
    <arr name="attr_character_count">
      <str>60</str>
    </arr>
    <arr name="attr_meta_page_count">
      <str>1</str>
    </arr>
    <arr name="attr_nbword">
      <str>10</str>
    </arr>
    <arr name="attr_nbpage">
      <str>1</str>
    </arr>
    <arr name="content_type">
      <str>application/vnd.oasis.opendocument.text</str>
    </arr>
    <arr name="attr_nbcharacter">
      <str>60</str>
    </arr>
    <arr name="content">
      <str>  </str>
    </arr>
    <long name="_version_">1434688567598120960</long></doc>
</result>
</response>
{code}

Command to extract as text:

{code}
curl "http://localhost:8983/solr/update/extract?literal.id=doc-1&indent=true&extractOnly=true&extractFormat=text&commit=true" \
  -F "myfile=@HelloWorld.odt"
{code}

The response:

{code}
<?xml version="1.0" encoding="UTF-8"?>
<response>

<lst name="responseHeader">
  <int name="status">0</int>
  <int name="QTime">124</int>
</lst>
<str name="HelloWorld.odt">








Hello World, from OpenOffice!

Third line.
Fourth line.
The end.


</str>
<lst name="HelloWorld.odt_metadata">
  <arr name="Image-Count">
    <str>0</str>
  </arr>
  <arr name="editing-cycles">
    <str>1</str>
  </arr>
  <arr name="stream_source_info">
    <str>myfile</str>
  </arr>
  <arr name="meta:save-date">
    <str>2013-05-10T17:15:40.99</str>
  </arr>
  <arr name="dc:subject">
    <str>Hello, World</str>
  </arr>
  <arr name="subject">
    <str>Hello World - subject</str>
  </arr>
  <arr name="dcterms:created">
    <str>2013-05-10T17:11:58.88</str>
  </arr>
  <arr name="date">
    <str>2013-05-10T17:15:40.99</str>
  </arr>
  <arr name="dc:description">
    <str>This is a test of SolrCell using OpenOffice 3.4.1 - OpenWriter.</str>
  </arr>
  <arr name="nbObject">
    <str>0</str>
  </arr>
  <arr name="Word-Count">
    <str>10</str>
  </arr>
  <arr name="Edit-Time">
    <str>PT3M44S</str>
  </arr>
  <arr name="meta:paragraph-count">
    <str>4</str>
  </arr>
  <arr name="Creation-Date">
    <str>2013-05-10T17:11:58.88</str>
  </arr>
  <arr name="title">
    <str>Hello World SolrCell Test - title</str>
  </arr>
  <arr name="Object-Count">
    <str>0</str>
  </arr>
  <arr name="stream_content_type">
    <str>application/octet-stream</str>
  </arr>
  <arr name="nbImg">
    <str>0</str>
  </arr>
  <arr name="description">
    <str>This is a test of SolrCell using OpenOffice 3.4.1 - OpenWriter.</str>
  </arr>
  <arr name="stream_size">
    <str>8960</str>
  </arr>
  <arr name="meta:object-count">
    <str>0</str>
  </arr>
  <arr name="cp:subject">
    <str>Hello World - subject</str>
  </arr>
  <arr name="stream_name">
    <str>HelloWorld.odt</str>
  </arr>
  <arr name="generator">
    <str>OpenOffice.org/3.4.1$Win32 OpenOffice.org_project/341m1$Build-9593</str>
  </arr>
  <arr name="Keywords">
    <str>Hello, World</str>
  </arr>
  <arr name="Last-Save-Date">
    <str>2013-05-10T17:15:40.99</str>
  </arr>
  <arr name="Paragraph-Count">
    <str>4</str>
  </arr>
  <arr name="dc:title">
    <str>Hello World SolrCell Test - title</str>
  </arr>
  <arr name="dcterms:modified">
    <str>2013-05-10T17:15:40.99</str>
  </arr>
  <arr name="meta:creation-date">
    <str>2013-05-10T17:11:58.88</str>
  </arr>
  <arr name="Page-Count">
    <str>1</str>
  </arr>
  <arr name="meta:character-count">
    <str>60</str>
  </arr>
  <arr name="Last-Modified">
    <str>2013-05-10T17:15:40.99</str>
  </arr>
  <arr name="nbTab">
    <str>0</str>
  </arr>
  <arr name="meta:word-count">
    <str>10</str>
  </arr>
  <arr name="meta:table-count">
    <str>0</str>
  </arr>
  <arr name="modified">
    <str>2013-05-10T17:15:40.99</str>
  </arr>
  <arr name="meta:image-count">
    <str>0</str>
  </arr>
  <arr name="xmpTPg:NPages">
    <str>1</str>
  </arr>
  <arr name="Table-Count">
    <str>0</str>
  </arr>
  <arr name="nbPara">
    <str>4</str>
  </arr>
  <arr name="Character Count">
    <str>60</str>
  </arr>
  <arr name="meta:page-count">
    <str>1</str>
  </arr>
  <arr name="nbWord">
    <str>10</str>
  </arr>
  <arr name="nbPage">
    <str>1</str>
  </arr>
  <arr name="Content-Type">
    <str>application/vnd.oasis.opendocument.text</str>
  </arr>
  <arr name="nbCharacter">
    <str>60</str>
  </arr>
</lst>
</response>
{code}



--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators.
For more information on JIRA, see: http://www.atlassian.com/software/jira

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org
For additional commands, e-mail: dev-h...@lucene.apache.org

Reply via email to