This is an automated email from the ASF dual-hosted git repository. nick pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
commit 1d7a113cdecf64a97a349d8ff74cad1ecd9127d3
Author: Nick Burch <[email protected]>
AuthorDate: Thu May 10 13:48:03 2018 +0100

    CSV assert as best we can (no dedicated parser), start on XLS and SAS7BDAT consistency tests
---
 .../org/apache/tika/parser/TabularFormatsTest.java | 65 ++++++++++++++++++++--
 1 file changed, 59 insertions(+), 6 deletions(-)

diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
index 4dc7336..8574d37 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
@@ -17,6 +17,10 @@
 package org.apache.tika.parser;
 
 
+import static org.junit.Assert.assertEquals;
+
+import java.util.Arrays;
+
 import org.apache.tika.TikaTest;
 import org.junit.Test;
 
@@ -57,21 +61,70 @@ public class TabularFormatsTest extends TikaTest {
         }
     };
 
-    protected void assertHeaders(String xml, boolean isTH) {
-        // TODO Check for the first row, then TR or TH
+    protected void assertHeaders(String xml, boolean isTH, boolean hasLabel, boolean hasName) {
+        // Find the first row
+        int splitAt = xml.indexOf("</tr>");
+        String hRow = xml.substring(0, splitAt);
+        splitAt = xml.indexOf("<tr>");
+        hRow = hRow.substring(splitAt+4);
+
+        // Split into cells, ignoring stuff before first cell
+        String[] cells;
+        if (isTH) {
+            cells = hRow.split("<th");
+        } else {
+            cells = hRow.split("<td");
+        }
+        cells = Arrays.copyOfRange(cells, 1, cells.length);
+        for (int i=0; i<cells.length; i++) {
+            splitAt = cells[i].lastIndexOf("</");
+            cells[i] = cells[i].substring(0, splitAt).trim();
+        }
+
+        // Check we got the right number
+        assertEquals("Wrong number of cells in header row " + hRow,
+                columnLabels.length, cells.length);
+
+        // Check we got the right stuff
+        // TODO
     }
 
     protected void assertContents(String xml, boolean hasHeader) {
         // TODO Check the rows
     }
 
     @Test
+    public void testSAS7BDAT() throws Exception {
+        XMLResult result = getXML("test-columnar.sas7bdat");
+        String xml = result.xml;
+        assertHeaders(xml, true, true, true);
+        assertContents(xml, true);
+    }
+    @Test
+    public void testXLS() throws Exception {
+        XMLResult result = getXML("test-columnar.xls");
+        String xml = result.xml;
+        assertHeaders(xml, false, true, false);
+        assertContents(xml, true);
+    }
+    // TODO Other formats
+
+    /**
+     * Note - we don't have a dedicated CSV parser
+     * 
+     * This means we don't get proper HTML out...
+     */
+    @Test
     public void testCSV() throws Exception {
         XMLResult result = getXML("test-columnar.csv");
         String xml = result.xml;
-        assertHeaders(xml, false);
-        assertContents(xml, true);
+        for (String label : columnLabels) {
+            assertContains(label, xml);
+        }
+        for (String[] vals : table) {
+            for (String val : vals) {
+                assertContains(val, xml);
+            }
+        }
     }
-    // TODO SAS7BDAT
-    // TODO Other formats
 }

-- 
To stop receiving notification emails like this one, please contact
[email protected].
