Author: nick
Date: Tue Jan 24 16:10:34 2012
New Revision: 1235321

URL: http://svn.apache.org/viewvc?rev=1235321&view=rev
Log:
TIKA-770 Convert the remaining ODF document statistics to be defined 
properties, and update all of the Office Count statistics to be integer typed 
properties

Modified:
    tika/trunk/tika-core/pom.xml
    tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java

Modified: tika/trunk/tika-core/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/pom.xml?rev=1235321&r1=1235320&r2=1235321&view=diff
==============================================================================
--- tika/trunk/tika-core/pom.xml (original)
+++ tika/trunk/tika-core/pom.xml Tue Jan 24 16:10:34 2012
@@ -94,6 +94,7 @@
               <excludes>
                 <exlude>org/apache/tika/metadata/Property$PropertyType</exlude>
                 <exlude>org/apache/tika/metadata/Property$ValueType</exlude>
+                <exlude>org/apache/tika/metadata/MSOffice</exlude>
                 <exlude>org/apache/tika/parser/EmptyParser</exlude>
               </excludes>
               <comparisonArtifacts>

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java?rev=1235321&r1=1235320&r2=1235321&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java Tue Jan 24 16:10:34 2012
@@ -29,26 +29,16 @@ public interface MSOffice {
 
     String APPLICATION_NAME = "Application-Name";
 
-    String CHARACTER_COUNT = "Character Count";
-
-    String PAGE_COUNT = "Page-Count";
-
     String REVISION_NUMBER = "Revision-Number";
 
-    String WORD_COUNT = "Word-Count";
-
     String TEMPLATE = "Template";
 
     String AUTHOR = "Author";
 
     String TOTAL_TIME = "Total-Time";
 
-    String SLIDE_COUNT = "Slide-Count";
-
     String PRESENTATION_FORMAT = "Presentation-Format";
 
-    String PARAGRAPH_COUNT = "Paragraph-Count";
-
     String NOTES = "Notes";
 
     String MANAGER = "Manager";
@@ -69,6 +59,44 @@ public interface MSOffice {
 
     String SECURITY = "Security";
 
+    
+    /** The number of Slides in the (presentation) document */
+    Property SLIDE_COUNT = 
+       Property.internalInteger("Slide-Count");
+    
+    /** The number of Pages in the (paged) document */
+    Property PAGE_COUNT = 
+       Property.internalInteger("Page-Count");
+
+    /** The number of individual Paragraphs in the document */ 
+    Property PARAGRAPH_COUNT = 
+       Property.internalInteger("Paragraph-Count");
+
+    /** The number of Words in the document */
+    Property WORD_COUNT = 
+       Property.internalInteger("Word-Count");
+
+    /** The number of Characters in the document */
+    Property CHARACTER_COUNT = 
+       Property.internalInteger("Character Count");
+    
+    /** The number of Tables in the document */
+    Property TABLE_COUNT = 
+       Property.internalInteger("Table-Count");
+    
+    /** The number of Images in the document */
+    Property IMAGE_COUNT = 
+       Property.internalInteger("Image-Count");
+    
+    /** 
+     * The number of Objects in the document.
+     * This is typically non-Image resources embedded in the
+     *  document, such as other documents or non-Image media. 
+     */
+    Property OBJECT_COUNT = 
+       Property.internalInteger("Object-Count");
+
+    
     /** How long has been spent editing the document? */ 
     String EDIT_TIME = "Edit-Time"; 
 

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java?rev=1235321&r1=1235320&r2=1235321&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java Tue Jan 24 16:10:34 2012
@@ -83,23 +83,30 @@ public class OpenDocumentMetaParser exte
         ch = getMeta(ch, md, "editing-cycles", "editing-cycles");
         ch = getMeta(ch, md, "initial-creator", "initial-creator");
         ch = getMeta(ch, md, "generator", "generator");
+        
         // Process the user defined Meta Attributes
         ch = getUserDefined(ch, md);
+        
         // Process the OO Statistics Attributes
-        ch = getStatistic(ch, md, "nbTab", "table-count");
-        ch = getStatistic(ch, md, "nbObject", "object-count");
-        ch = getStatistic(ch, md, "nbImg", "image-count");
-        ch = getStatistic(ch, md, Metadata.PAGE_COUNT, "page-count");
-        ch = getStatistic(ch, md, PagedText.N_PAGES.getName(), "page-count");
-        ch = getStatistic(ch, md, Metadata.PARAGRAPH_COUNT, "paragraph-count");
-        ch = getStatistic(ch, md, Metadata.WORD_COUNT, "word-count");
-        ch = getStatistic(ch, md, Metadata.CHARACTER_COUNT, "character-count");
+        ch = getStatistic(ch, md, Metadata.OBJECT_COUNT.getName(), "object-count");
+        ch = getStatistic(ch, md, Metadata.IMAGE_COUNT.getName(),  "image-count");
+        ch = getStatistic(ch, md, Metadata.PAGE_COUNT.getName(),   "page-count");
+        ch = getStatistic(ch, md, PagedText.N_PAGES.getName(),     "page-count");
+        ch = getStatistic(ch, md, Metadata.TABLE_COUNT.getName(),  "table-count");
+        ch = getStatistic(ch, md, Metadata.PARAGRAPH_COUNT.getName(), "paragraph-count");
+        ch = getStatistic(ch, md, Metadata.WORD_COUNT.getName(),      "word-count");
+        ch = getStatistic(ch, md, Metadata.CHARACTER_COUNT.getName(), "character-count");
+        
         // Legacy Statistics Attributes, replaced with real keys above
-        // TODO remove these soon!
+        // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
         ch = getStatistic(ch, md, "nbPage", "page-count");
         ch = getStatistic(ch, md, "nbPara", "paragraph-count");
         ch = getStatistic(ch, md, "nbWord", "word-count");
         ch = getStatistic(ch, md, "nbCharacter", "character-count");
+        ch = getStatistic(ch, md, "nbTab", "table-count");
+        ch = getStatistic(ch, md, "nbObject", "object-count");
+        ch = getStatistic(ch, md, "nbImg", "image-count");
+        
         // Normalise the rest
         ch = new NSNormalizerContentHandler(ch);
         return ch;

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java?rev=1235321&r1=1235320&r2=1235321&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java Tue Jan 24 16:10:34 2012
@@ -89,6 +89,9 @@ public class ODFParserTest extends TikaT
              assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT));
              assertEquals("14", metadata.get(Metadata.WORD_COUNT));
              assertEquals("78", metadata.get(Metadata.CHARACTER_COUNT));
+             assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
+             assertEquals("0", metadata.get(Metadata.OBJECT_COUNT));
+             assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
              
              // Check the old style statistics (these will be removed shortly)
              assertEquals("0", metadata.get("nbTab"));
@@ -152,6 +155,9 @@ public class ODFParserTest extends TikaT
            assertEquals(null, metadata.get(Metadata.PARAGRAPH_COUNT));
            assertEquals(null, metadata.get(Metadata.WORD_COUNT));
            assertEquals(null, metadata.get(Metadata.CHARACTER_COUNT));
+           assertEquals(null, metadata.get(Metadata.TABLE_COUNT));
+           assertEquals(null, metadata.get(Metadata.OBJECT_COUNT));
+           assertEquals(null, metadata.get(Metadata.IMAGE_COUNT));
            assertEquals(null, metadata.get("nbTab"));
            assertEquals(null, metadata.get("nbObject"));
            assertEquals(null, metadata.get("nbImg"));
@@ -207,6 +213,9 @@ public class ODFParserTest extends TikaT
            assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT));
            assertEquals("54", metadata.get(Metadata.WORD_COUNT));
            assertEquals("351", metadata.get(Metadata.CHARACTER_COUNT));
+           assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
+           assertEquals("2", metadata.get(Metadata.OBJECT_COUNT));
+           assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
            
            // Check the old style statistics (these will be removed shortly)
            assertEquals("0", metadata.get("nbTab"));


Reply via email to