This is an automated email from the ASF dual-hosted git repository. nick pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
commit d0324f8e4fa70fce67d56dc70f611f5535fe229b Author: Nick Burch <[email protected]> AuthorDate: Wed May 9 18:19:34 2018 +0100 Add a test .sas7bdat file with labels, and generate the columnar/tabular test file in a few more formats --- .../apache/tika/parser/sas/SAS7BDATParserTest.java | 51 +++++++---- .../resources/test-documents/test-columnar.sas.xml | 102 +++++++++++++++++++++ .../test-documents/test-columnar.sas7bdat | Bin 9216 -> 17408 bytes .../resources/test-documents/test-columnar.xpt | Bin 0 -> 4560 bytes .../src/test/resources/test-documents/testSAS2.sas | 48 ++++++++++ 5 files changed, 182 insertions(+), 19 deletions(-) diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java index 2657ac2..3bb3e01 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java @@ -82,36 +82,36 @@ public class SAS7BDATParserTest extends TikaTest { Metadata metadata = new Metadata(); try (InputStream stream = SAS7BDATParserTest.class.getResourceAsStream( - "/test-documents/test-columnar.sas7bdat")) { + "/test-documents/test-columnar.sas7bdat")) { parser.parse(stream, handler, metadata, new ParseContext()); } assertEquals("application/x-sas-data", metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("SHEET1", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("TESTING", metadata.get(TikaCoreProperties.TITLE)); - // Fri Mar 06 19:10:19 GMT 2015 - assertEquals("2015-03-06T19:10:19Z", metadata.get(TikaCoreProperties.CREATED)); - assertEquals("2015-03-06T19:10:19Z", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2018-05-09T16:42:04Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2018-05-09T16:42:04Z", metadata.get(TikaCoreProperties.MODIFIED)); assertEquals("1", metadata.get(PagedText.N_PAGES)); - assertEquals("5", metadata.get(Database.COLUMN_COUNT)); - assertEquals("31", metadata.get(Database.ROW_COUNT)); + assertEquals("7", metadata.get(Database.COLUMN_COUNT)); + assertEquals("11", metadata.get(Database.ROW_COUNT)); assertEquals("windows-1252", metadata.get(HttpHeaders.CONTENT_ENCODING)); - assertEquals("XP_PRO", metadata.get(OfficeOpenXMLExtended.APPLICATION)); - assertEquals("9.0101M3", metadata.get(OfficeOpenXMLExtended.APP_VERSION)); + assertEquals("W32_7PRO", metadata.get(OfficeOpenXMLExtended.APPLICATION)); + assertEquals("9.0301M2", metadata.get(OfficeOpenXMLExtended.APP_VERSION)); assertEquals("32", metadata.get(MachineMetadata.ARCHITECTURE_BITS)); assertEquals("Little", metadata.get(MachineMetadata.ENDIAN)); - assertEquals(Arrays.asList("A","B","C","D","E"), + assertEquals(Arrays.asList("Record Number","Square of the Record Number", + "Description of the Row","Percent Done", + "Percent Increment","date","datetime"), Arrays.asList(metadata.getValues(Database.COLUMN_NAME))); String content = handler.toString(); - assertContains("SHEET1", content); - assertContains("A\tB\tC", content); - assertContains("Num=0\t", content); - assertContains("Num=404242\t", content); - assertContains("\t0\t", content); - assertContains("\t404242\t", content); - assertContains("\t08Feb1904\t", content); + assertContains("TESTING", content); + assertContains("0\t0\tThis", content); + assertContains("2\t4\tThis", content); + assertContains("4\t16\tThis", content); + assertContains("\t01-01-1960\t", content); + assertContains("\t01Jan1960:00:00", content); } @Test @@ -129,7 +129,20 @@ public class SAS7BDATParserTest extends TikaTest { assertContains("<td>This is row", xml); assertContains("10</td>", xml); } + + @Test + public void testHTML2() throws Exception { + XMLResult result = getXML("test-columnar.sas7bdat"); + String xml = result.xml; - // TODO Column names vs labels, with a different test file - // TODO Columnar consistency test + // Check the title came through + assertContains("<h1>TESTING</h1>", xml); + // Check the headings + assertContains("<th title=\"recnum\">Record Number</th>", xml); + assertContains("<th title=\"square\">Square of the Record Number</th>", xml); + assertContains("<th title=\"date\">date</th>", xml); + // Check formatting of dates + assertContains("<td>01-01-1960</td>", xml); + assertContains("<td>01Jan1960:00:00:10.00</td>", xml); + } } diff --git a/tika-parsers/src/test/resources/test-documents/test-columnar.sas.xml b/tika-parsers/src/test/resources/test-documents/test-columnar.sas.xml new file mode 100644 index 0000000..ae12fc5 --- /dev/null +++ b/tika-parsers/src/test/resources/test-documents/test-columnar.sas.xml @@ -0,0 +1,102 @@ +<?xml version="1.0" encoding="windows-1252" ?> +<TABLE> + <TESTXML> + <recnum>0</recnum> + <square>0</square> + <desc>This is row 0 of 10</desc> + <pctdone>0</pctdone> + <pctincr missing="M" /> + <date>0</date> + <datetime>1960-01-01T00:00:01</datetime> + </TESTXML> + <TESTXML> + <recnum>1</recnum> + <square>1</square> + <desc>This is row 1 of 10</desc> + <pctdone>0.1</pctdone> + <pctincr>0</pctincr> + <date>1</date> + <datetime>1960-01-01T00:00:10</datetime> + </TESTXML> + <TESTXML> + <recnum>2</recnum> + <square>4</square> + <desc>This is row 2 of 10</desc> + <pctdone>0.2</pctdone> + <pctincr>0.5</pctincr> + <date>16</date> + <datetime>1960-01-01T00:01:40</datetime> + </TESTXML> + <TESTXML> + <recnum>3</recnum> + <square>9</square> + <desc>This is row 3 of 10</desc> + <pctdone>0.3</pctdone> + <pctincr>0.6666666667</pctincr> + <date>81</date> + <datetime>1960-01-01T00:16:40</datetime> + </TESTXML> + <TESTXML> + <recnum>4</recnum> + <square>16</square> + <desc>This is row 4 of 10</desc> + <pctdone>0.4</pctdone> + <pctincr>0.75</pctincr> + <date>256</date> + <datetime>1960-01-01T02:46:40</datetime> + </TESTXML> + <TESTXML> + <recnum>5</recnum> + <square>25</square> + <desc>This is row 5 of 10</desc> + <pctdone>0.5</pctdone> + <pctincr>0.8</pctincr> + <date>625</date> + <datetime>1960-01-02T03:46:40</datetime> + </TESTXML> + <TESTXML> + <recnum>6</recnum> + <square>36</square> + <desc>This is row 6 of 10</desc> + <pctdone>0.6</pctdone> + <pctincr>0.8333333333</pctincr> + <date>1296</date> + <datetime>1960-01-12T13:46:40</datetime> + </TESTXML> + <TESTXML> + <recnum>7</recnum> + <square>49</square> + <desc>This is row 7 of 10</desc> + <pctdone>0.7</pctdone> + <pctincr>0.8571428571</pctincr> + <date>2401</date> + <datetime>1960-04-25T17:46:40</datetime> + </TESTXML> + <TESTXML> + <recnum>8</recnum> + <square>64</square> + <desc>This is row 8 of 10</desc> + <pctdone>0.8</pctdone> + <pctincr>0.875</pctincr> + <date>4096</date> + <datetime>1963-03-03T09:46:40</datetime> + </TESTXML> + <TESTXML> + <recnum>9</recnum> + <square>81</square> + <desc>This is row 9 of 10</desc> + <pctdone>0.9</pctdone> + <pctincr>0.8888888889</pctincr> + <date>6561</date> + <datetime>1991-09-09T01:46:40</datetime> + </TESTXML> + <TESTXML> + <recnum>10</recnum> + <square>100</square> + <desc>This is row 10 of 10</desc> + <pctdone>1</pctdone> + <pctincr>0.9</pctincr> + <date>10000</date> + <datetime>2276-11-19T17:46:40</datetime> + </TESTXML> +</TABLE> diff --git a/tika-parsers/src/test/resources/test-documents/test-columnar.sas7bdat b/tika-parsers/src/test/resources/test-documents/test-columnar.sas7bdat index 250b3b8..553c45c 100644 Binary files a/tika-parsers/src/test/resources/test-documents/test-columnar.sas7bdat and b/tika-parsers/src/test/resources/test-documents/test-columnar.sas7bdat differ diff --git a/tika-parsers/src/test/resources/test-documents/test-columnar.xpt b/tika-parsers/src/test/resources/test-documents/test-columnar.xpt new file mode 100644 index 0000000..d908228 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/test-columnar.xpt differ diff --git a/tika-parsers/src/test/resources/test-documents/testSAS2.sas b/tika-parsers/src/test/resources/test-documents/testSAS2.sas new file mode 100644 index 0000000..bc8c1fe --- /dev/null +++ b/tika-parsers/src/test/resources/test-documents/testSAS2.sas @@ -0,0 +1,48 @@ +data testing; +begin=0; +end=10; +msg="This is row %x of %y"; +do i = begin to end by 1; +drop msg begin end i; +recnum=i; +square=i*i; +desc=tranwrd(tranwrd(msg,"%x",i),"%y",end); +format pctdone percent8.0; +format pctincr percent7.1; +pctdone=divide(i,end); +pctincr=divide(i-1,i); +format date ddmmyyd10.; +format datetime datetime.; +date=i**4; +datetime=10**i; +output; +end; +label recnum="Record Number" + square="Square of the Record Number" + desc="Description of the Row" + pctdone="Percent Done" + pctincr="Percent Increment"; +run; + +libname out '/home/tika/testing/sas'; +libname outxpt XPORT '/home/tika/testing/sas/testing.xpt'; +libname outv6 v6 '/home/tika/testing/sas'; +libname outxml xmlv2 '/home/tika/testing/sas'; + +data out.testing; +set testing; +run; +data outv6.testv6; +set testing; +run; +data outxml.testxml; +set testing; +run; +proc copy in=out out=outxpt; +select testing; +run; + + +proc print data=testing; +run; + -- To stop receiving notification emails like this one, please contact [email protected].
