This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git

commit 79f313d85ee558a2ba2e61477ec669ab0281dcb6 Author: Nick Burch <[email protected]> AuthorDate: Thu May 3 16:52:27 2018 +0100 More SAS7BDAT metadata --- .../java/org/apache/tika/metadata/Database.java | 5 +- .../org/apache/tika/parser/sas/SAS7BDATParser.java | 82 ++++++++++------------ .../apache/tika/parser/sas/SAS7BDATParserTest.java | 28 +++++++- 3 files changed, 67 insertions(+), 48 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Database.java b/tika-core/src/main/java/org/apache/tika/metadata/Database.java index bab983b..240d6ab 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Database.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Database.java @@ -20,6 +20,7 @@ public interface Database { final static String PREFIX = "database"+TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; Property TABLE_NAME = Property.externalTextBag(PREFIX+"table_name"); - Property COLUMN_COUNT = Property.externalText(PREFIX+"column_count"); + Property ROW_COUNT = Property.externalInteger(PREFIX+"row_count"); + Property COLUMN_COUNT = Property.externalInteger(PREFIX+"column_count"); Property COLUMN_NAME = Property.externalTextBag(PREFIX+"column_name"); -} \ No newline at end of file +} diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java index 5992e15..56260ca 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java @@ -22,11 +22,16 @@ import java.util.Collections; import java.util.Set; import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Database; +import org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.PagedText; import org.apache.tika.metadata.TikaCoreProperties; 
import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.executable.MachineMetadata; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -71,51 +76,40 @@ public class SAS7BDATParser extends AbstractParser { metadata.set(TikaCoreProperties.CREATED, props.getDateCreated()); metadata.set(TikaCoreProperties.MODIFIED, props.getDateModified()); - // TODO What about these? -/* -u64 - false -compressionMethod - null -endianness - 1 -encoding - windows-1252 -sessionEncoding - null -fileType - DATA -sasRelease - 9.0101M3 -serverType - XP_PRO -osName - -osType - -headerLength - 1024 -pageLength - 8192 -pageCount - 1 -rowLength - 96 -rowCount - 31 -mixPageRowCount - 69 -columnsCount - 5 -*/ + metadata.set(PagedText.N_PAGES, (int)props.getPageCount()); + metadata.set(Database.COLUMN_COUNT, (int)props.getColumnsCount()); + metadata.set(Database.ROW_COUNT, (int)props.getRowCount()); + + // TODO Can we find more general properties for these / move + // these to more general places? + metadata.set(HttpHeaders.CONTENT_ENCODING, props.getEncoding()); + metadata.set(OfficeOpenXMLExtended.APPLICATION, props.getServerType()); + metadata.set(OfficeOpenXMLExtended.APP_VERSION, props.getSasRelease()); + metadata.set(MachineMetadata.ARCHITECTURE_BITS, + props.isU64() ? "64" : "32"); + metadata.set(MachineMetadata.ENDIAN, props.getEndianness() == 1 ? 
+ MachineMetadata.Endian.LITTLE.getName() : + MachineMetadata.Endian.BIG.getName()); + + // The following SAS Metadata fields are currently ignored: + // compressionMethod + // sessionEncoding + // fileType + // osName - + // osType - + // mixPageRowCount + // headerLength + // pageLength + // rowLength + + // Process the column metadata + // TODO Find keys to record the format and the type + for (Column c : sas.getColumns()) { + String name = c.getLabel(); + if (name == null || name.isEmpty()) name = c.getName(); + metadata.add(Database.COLUMN_NAME, name); + } - // TODO Should we output more Column info as metadata? -/* -5 Columns defined: - 1 - A - Label: A - Format: $58. - Size 58 of java.lang.String - 2 - B - Label: B - Format: - Size 8 of java.lang.Number - 3 - C - Label: C - Format: DATE8. - Size 8 of java.lang.Number - 4 - D - Label: D - Format: DATETIME17. - Size 8 of java.lang.Number - 5 - E - Label: E - Format: - Size 8 of java.lang.Number -*/ // Output file contents as a table xhtml.element("h1", props.getName()); diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java index 2f29a13..c2a74a7 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java @@ -21,13 +21,19 @@ import static org.junit.Assert.assertNull; import java.io.IOException; import java.io.InputStream; +import java.util.Arrays; import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Database; +import org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.PagedText; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import 
org.apache.tika.parser.Parser; +import org.apache.tika.parser.executable.MachineMetadata; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.WriteOutContentHandler; import org.junit.Test; @@ -54,7 +60,16 @@ public class SAS7BDATParserTest extends TikaTest { assertEquals("2017-01-30T07:31:47Z", metadata.get(TikaCoreProperties.CREATED)); assertEquals("2017-01-30T07:31:47Z", metadata.get(TikaCoreProperties.MODIFIED)); - // TODO Test the rest of the metadata + assertEquals("1", metadata.get(PagedText.N_PAGES)); + assertEquals("2", metadata.get(Database.COLUMN_COUNT)); + assertEquals("11", metadata.get(Database.ROW_COUNT)); + assertEquals("windows-1252", metadata.get(HttpHeaders.CONTENT_ENCODING)); + assertEquals("W32_7PRO", metadata.get(OfficeOpenXMLExtended.APPLICATION)); + assertEquals("9.0301M2", metadata.get(OfficeOpenXMLExtended.APP_VERSION)); + assertEquals("32", metadata.get(MachineMetadata.ARCHITECTURE_BITS)); + assertEquals("Little", metadata.get(MachineMetadata.ENDIAN)); + assertEquals(Arrays.asList("recnum","label"), + Arrays.asList(metadata.getValues(Database.COLUMN_NAME))); String content = handler.toString(); assertContains("TESTING", content); @@ -82,7 +97,16 @@ public class SAS7BDATParserTest extends TikaTest { assertEquals("2015-03-06T19:10:19Z", metadata.get(TikaCoreProperties.CREATED)); assertEquals("2015-03-06T19:10:19Z", metadata.get(TikaCoreProperties.MODIFIED)); - // TODO Test the rest of the metadata + assertEquals("1", metadata.get(PagedText.N_PAGES)); + assertEquals("5", metadata.get(Database.COLUMN_COUNT)); + assertEquals("31", metadata.get(Database.ROW_COUNT)); + assertEquals("windows-1252", metadata.get(HttpHeaders.CONTENT_ENCODING)); + assertEquals("XP_PRO", metadata.get(OfficeOpenXMLExtended.APPLICATION)); + assertEquals("9.0101M3", metadata.get(OfficeOpenXMLExtended.APP_VERSION)); + assertEquals("32", metadata.get(MachineMetadata.ARCHITECTURE_BITS)); + assertEquals("Little", metadata.get(MachineMetadata.ENDIAN)); + 
assertEquals(Arrays.asList("A","B","C","D","E"), + Arrays.asList(metadata.getValues(Database.COLUMN_NAME))); String content = handler.toString(); assertContains("SHEET1", content); -- To stop receiving notification emails like this one, please contact [email protected].
