Author: nick Date: Tue Jan 24 16:10:34 2012 New Revision: 1235321 URL: http://svn.apache.org/viewvc?rev=1235321&view=rev Log: TIKA-770 Convert the remaining ODF document statistics to be defined properties, and update all of the Office Count statistics to be integer typed properties
Modified: tika/trunk/tika-core/pom.xml tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java Modified: tika/trunk/tika-core/pom.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/pom.xml?rev=1235321&r1=1235320&r2=1235321&view=diff ============================================================================== --- tika/trunk/tika-core/pom.xml (original) +++ tika/trunk/tika-core/pom.xml Tue Jan 24 16:10:34 2012 @@ -94,6 +94,7 @@ <excludes> <exlude>org/apache/tika/metadata/Property$PropertyType</exlude> <exlude>org/apache/tika/metadata/Property$ValueType</exlude> + <exlude>org/apache/tika/metadata/MSOffice</exlude> <exlude>org/apache/tika/parser/EmptyParser</exlude> </excludes> <comparisonArtifacts> Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java?rev=1235321&r1=1235320&r2=1235321&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java Tue Jan 24 16:10:34 2012 @@ -29,26 +29,16 @@ public interface MSOffice { String APPLICATION_NAME = "Application-Name"; - String CHARACTER_COUNT = "Character Count"; - - String PAGE_COUNT = "Page-Count"; - String REVISION_NUMBER = "Revision-Number"; - String WORD_COUNT = "Word-Count"; - String TEMPLATE = "Template"; String AUTHOR = "Author"; String TOTAL_TIME = "Total-Time"; - String SLIDE_COUNT = "Slide-Count"; - String PRESENTATION_FORMAT = "Presentation-Format"; - String PARAGRAPH_COUNT = "Paragraph-Count"; - String NOTES = "Notes"; String MANAGER = "Manager"; @@ -69,6 +59,44 @@ public interface 
MSOffice { String SECURITY = "Security"; + + /** The number of Slides in the (presentation) document */ + Property SLIDE_COUNT = + Property.internalInteger("Slide-Count"); + + /** The number of Pages in the (paged) document */ + Property PAGE_COUNT = + Property.internalInteger("Page-Count"); + + /** The number of individual Paragraphs in the document */ + Property PARAGRAPH_COUNT = + Property.internalInteger("Paragraph-Count"); + + /** The number of Words in the document */ + Property WORD_COUNT = + Property.internalInteger("Word-Count"); + + /** The number of Characters in the document */ + Property CHARACTER_COUNT = + Property.internalInteger("Character Count"); + + /** The number of Tables in the document */ + Property TABLE_COUNT = + Property.internalInteger("Table-Count"); + + /** The number of Images in the document */ + Property IMAGE_COUNT = + Property.internalInteger("Image-Count"); + + /** + * The number of Objects in the document. + * These are typically non-Image resources embedded in the + * document, such as other documents or non-Image media. + */ + Property OBJECT_COUNT = + Property.internalInteger("Object-Count"); + + /** How long has been spent editing the document? 
*/ String EDIT_TIME = "Edit-Time"; Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java?rev=1235321&r1=1235320&r2=1235321&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java Tue Jan 24 16:10:34 2012 @@ -83,23 +83,30 @@ public class OpenDocumentMetaParser exte ch = getMeta(ch, md, "editing-cycles", "editing-cycles"); ch = getMeta(ch, md, "initial-creator", "initial-creator"); ch = getMeta(ch, md, "generator", "generator"); + // Process the user defined Meta Attributes ch = getUserDefined(ch, md); + // Process the OO Statistics Attributes - ch = getStatistic(ch, md, "nbTab", "table-count"); - ch = getStatistic(ch, md, "nbObject", "object-count"); - ch = getStatistic(ch, md, "nbImg", "image-count"); - ch = getStatistic(ch, md, Metadata.PAGE_COUNT, "page-count"); - ch = getStatistic(ch, md, PagedText.N_PAGES.getName(), "page-count"); - ch = getStatistic(ch, md, Metadata.PARAGRAPH_COUNT, "paragraph-count"); - ch = getStatistic(ch, md, Metadata.WORD_COUNT, "word-count"); - ch = getStatistic(ch, md, Metadata.CHARACTER_COUNT, "character-count"); + ch = getStatistic(ch, md, Metadata.OBJECT_COUNT.getName(), "object-count"); + ch = getStatistic(ch, md, Metadata.IMAGE_COUNT.getName(), "image-count"); + ch = getStatistic(ch, md, Metadata.PAGE_COUNT.getName(), "page-count"); + ch = getStatistic(ch, md, PagedText.N_PAGES.getName(), "page-count"); + ch = getStatistic(ch, md, Metadata.TABLE_COUNT.getName(), "table-count"); + ch = getStatistic(ch, md, Metadata.PARAGRAPH_COUNT.getName(), "paragraph-count"); + ch = getStatistic(ch, md, Metadata.WORD_COUNT.getName(), 
"word-count"); + ch = getStatistic(ch, md, Metadata.CHARACTER_COUNT.getName(), "character-count"); + // Legacy Statistics Attributes, replaced with real keys above - // TODO remove these soon! + // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770) ch = getStatistic(ch, md, "nbPage", "page-count"); ch = getStatistic(ch, md, "nbPara", "paragraph-count"); ch = getStatistic(ch, md, "nbWord", "word-count"); ch = getStatistic(ch, md, "nbCharacter", "character-count"); + ch = getStatistic(ch, md, "nbTab", "table-count"); + ch = getStatistic(ch, md, "nbObject", "object-count"); + ch = getStatistic(ch, md, "nbImg", "image-count"); + // Normalise the rest ch = new NSNormalizerContentHandler(ch); return ch; Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java?rev=1235321&r1=1235320&r2=1235321&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java Tue Jan 24 16:10:34 2012 @@ -89,6 +89,9 @@ public class ODFParserTest extends TikaT assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT)); assertEquals("14", metadata.get(Metadata.WORD_COUNT)); assertEquals("78", metadata.get(Metadata.CHARACTER_COUNT)); + assertEquals("0", metadata.get(Metadata.TABLE_COUNT)); + assertEquals("0", metadata.get(Metadata.OBJECT_COUNT)); + assertEquals("0", metadata.get(Metadata.IMAGE_COUNT)); // Check the old style statistics (these will be removed shortly) assertEquals("0", metadata.get("nbTab")); @@ -152,6 +155,9 @@ public class ODFParserTest extends TikaT assertEquals(null, metadata.get(Metadata.PARAGRAPH_COUNT)); assertEquals(null, metadata.get(Metadata.WORD_COUNT)); assertEquals(null, 
metadata.get(Metadata.CHARACTER_COUNT)); + assertEquals(null, metadata.get(Metadata.TABLE_COUNT)); + assertEquals(null, metadata.get(Metadata.OBJECT_COUNT)); + assertEquals(null, metadata.get(Metadata.IMAGE_COUNT)); assertEquals(null, metadata.get("nbTab")); assertEquals(null, metadata.get("nbObject")); assertEquals(null, metadata.get("nbImg")); @@ -207,6 +213,9 @@ public class ODFParserTest extends TikaT assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT)); assertEquals("54", metadata.get(Metadata.WORD_COUNT)); assertEquals("351", metadata.get(Metadata.CHARACTER_COUNT)); + assertEquals("0", metadata.get(Metadata.TABLE_COUNT)); + assertEquals("2", metadata.get(Metadata.OBJECT_COUNT)); + assertEquals("0", metadata.get(Metadata.IMAGE_COUNT)); // Check the old style statistics (these will be removed shortly) assertEquals("0", metadata.get("nbTab"));