This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 79f313d85ee558a2ba2e61477ec669ab0281dcb6
Author: Nick Burch <[email protected]>
AuthorDate: Thu May 3 16:52:27 2018 +0100

    More SAS7BDAT metadata
---
 .../java/org/apache/tika/metadata/Database.java    |  5 +-
 .../org/apache/tika/parser/sas/SAS7BDATParser.java | 82 ++++++++++------------
 .../apache/tika/parser/sas/SAS7BDATParserTest.java | 28 +++++++-
 3 files changed, 67 insertions(+), 48 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Database.java 
b/tika-core/src/main/java/org/apache/tika/metadata/Database.java
index bab983b..240d6ab 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Database.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Database.java
@@ -20,6 +20,7 @@ public interface Database {
     final static String PREFIX = 
"database"+TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
 
     Property TABLE_NAME = Property.externalTextBag(PREFIX+"table_name");
-    Property COLUMN_COUNT = Property.externalText(PREFIX+"column_count");
+    Property ROW_COUNT = Property.externalInteger(PREFIX+"row_count");
+    Property COLUMN_COUNT = Property.externalInteger(PREFIX+"column_count");
     Property COLUMN_NAME = Property.externalTextBag(PREFIX+"column_name");
-}
\ No newline at end of file
+}
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
index 5992e15..56260ca 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java
@@ -22,11 +22,16 @@ import java.util.Collections;
 import java.util.Set;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Database;
+import org.apache.tika.metadata.HttpHeaders;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.executable.MachineMetadata;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -71,51 +76,40 @@ public class SAS7BDATParser extends AbstractParser {
         metadata.set(TikaCoreProperties.CREATED, props.getDateCreated());
         metadata.set(TikaCoreProperties.MODIFIED, props.getDateModified());
 
-        // TODO What about these?
-/*
-u64 - false
-compressionMethod - null
-endianness - 1
-encoding - windows-1252
-sessionEncoding - null
-fileType - DATA
-sasRelease - 9.0101M3
-serverType - XP_PRO
-osName - 
-osType - 
-headerLength - 1024
-pageLength - 8192
-pageCount - 1
-rowLength - 96
-rowCount - 31
-mixPageRowCount - 69
-columnsCount - 5
-*/
+        metadata.set(PagedText.N_PAGES,     (int)props.getPageCount());
+        metadata.set(Database.COLUMN_COUNT, (int)props.getColumnsCount());
+        metadata.set(Database.ROW_COUNT,    (int)props.getRowCount());
+
+        // TODO Can we find more general properties for these / move
+        //  these to more general places?
+        metadata.set(HttpHeaders.CONTENT_ENCODING, props.getEncoding());
+        metadata.set(OfficeOpenXMLExtended.APPLICATION, props.getServerType());
+        metadata.set(OfficeOpenXMLExtended.APP_VERSION, props.getSasRelease());
+        metadata.set(MachineMetadata.ARCHITECTURE_BITS, 
+                     props.isU64() ? "64" : "32");
+        metadata.set(MachineMetadata.ENDIAN, props.getEndianness() == 1 ? 
+                     MachineMetadata.Endian.LITTLE.getName() : 
+                     MachineMetadata.Endian.BIG.getName());
+
+        // The following SAS Metadata fields are currently ignored:
+        // compressionMethod
+        // sessionEncoding
+        // fileType
+        // osName
+        // osType
+        // mixPageRowCount
+        // headerLength
+        // pageLength
+        // rowLength
+
+        // Process the column metadata
+        // TODO Find keys to record the format and the type
+        for (Column c : sas.getColumns()) {
+            String name = c.getLabel();
+            if (name == null || name.isEmpty()) name = c.getName();
+            metadata.add(Database.COLUMN_NAME, name);
+        }
 
-        // TODO Should we output more Column info as metadata?
-/*
-5 Columns defined:
- 1 - A
-  Label: A
-  Format: $58.
-  Size 58 of java.lang.String
- 2 - B
-  Label: B
-  Format: 
-  Size 8 of java.lang.Number
- 3 - C
-  Label: C
-  Format: DATE8.
-  Size 8 of java.lang.Number
- 4 - D
-  Label: D
-  Format: DATETIME17.
-  Size 8 of java.lang.Number
- 5 - E
-  Label: E
-  Format: 
-  Size 8 of java.lang.Number
-*/
 
         // Output file contents as a table
         xhtml.element("h1", props.getName());
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java 
b/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java
index 2f29a13..c2a74a7 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java
@@ -21,13 +21,19 @@ import static org.junit.Assert.assertNull;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Arrays;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Database;
+import org.apache.tika.metadata.HttpHeaders;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.executable.MachineMetadata;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.WriteOutContentHandler;
 import org.junit.Test;
@@ -54,7 +60,16 @@ public class SAS7BDATParserTest extends TikaTest {
         assertEquals("2017-01-30T07:31:47Z", 
metadata.get(TikaCoreProperties.CREATED));
         assertEquals("2017-01-30T07:31:47Z", 
metadata.get(TikaCoreProperties.MODIFIED));
         
-        // TODO Test the rest of the metadata
+        assertEquals("1", metadata.get(PagedText.N_PAGES));
+        assertEquals("2", metadata.get(Database.COLUMN_COUNT));
+        assertEquals("11", metadata.get(Database.ROW_COUNT));
+        assertEquals("windows-1252", 
metadata.get(HttpHeaders.CONTENT_ENCODING));
+        assertEquals("W32_7PRO", 
metadata.get(OfficeOpenXMLExtended.APPLICATION));
+        assertEquals("9.0301M2", 
metadata.get(OfficeOpenXMLExtended.APP_VERSION));
+        assertEquals("32", metadata.get(MachineMetadata.ARCHITECTURE_BITS));
+        assertEquals("Little", metadata.get(MachineMetadata.ENDIAN));
+        assertEquals(Arrays.asList("recnum","label"),
+                     Arrays.asList(metadata.getValues(Database.COLUMN_NAME)));
         
         String content = handler.toString();
         assertContains("TESTING", content);
@@ -82,7 +97,16 @@ public class SAS7BDATParserTest extends TikaTest {
         assertEquals("2015-03-06T19:10:19Z", 
metadata.get(TikaCoreProperties.CREATED));
         assertEquals("2015-03-06T19:10:19Z", 
metadata.get(TikaCoreProperties.MODIFIED));
         
-        // TODO Test the rest of the metadata
+        assertEquals("1", metadata.get(PagedText.N_PAGES));
+        assertEquals("5", metadata.get(Database.COLUMN_COUNT));
+        assertEquals("31", metadata.get(Database.ROW_COUNT));
+        assertEquals("windows-1252", 
metadata.get(HttpHeaders.CONTENT_ENCODING));
+        assertEquals("XP_PRO", 
metadata.get(OfficeOpenXMLExtended.APPLICATION));
+        assertEquals("9.0101M3", 
metadata.get(OfficeOpenXMLExtended.APP_VERSION));
+        assertEquals("32", metadata.get(MachineMetadata.ARCHITECTURE_BITS));
+        assertEquals("Little", metadata.get(MachineMetadata.ENDIAN));
+        assertEquals(Arrays.asList("A","B","C","D","E"),
+                     Arrays.asList(metadata.getValues(Database.COLUMN_NAME)));
         
         String content = handler.toString();
         assertContains("SHEET1", content);

-- 
To stop receiving notification emails like this one, please contact
[email protected].

Reply via email to