This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 7f89db35d066e6c4ae35490c5bad67d376e5365e Author: Nick Burch <[email protected]> AuthorDate: Thu May 10 15:13:43 2018 +0100 Check header contents, check data rows count, add XLSX test --- .../org/apache/tika/parser/TabularFormatsTest.java | 77 +++++++++++++++++----- 1 file changed, 61 insertions(+), 16 deletions(-) diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java index 8574d37..023f49d 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java @@ -31,7 +31,7 @@ import org.junit.Test; */ public class TabularFormatsTest extends TikaTest { protected static final String[] columnNames = new String[] { - "recnum","square","desc","pctdone","pctinc", + "recnum","square","desc","pctdone","pctincr", "date","datetime","time" }; protected static final String[] columnLabels = new String[] { @@ -49,8 +49,9 @@ public class TabularFormatsTest extends TikaTest { "0","1","2","3","4","5","6","7","8","9","10" }, new String[] { - "0","1","4" // etc + "0","1","4","9","16","25","36","49","64","81","100" }, +/* new String[] { // etc "01-01-1960" }, @@ -59,37 +60,72 @@ public class TabularFormatsTest extends TikaTest { new String[] { "" } +*/ }; - - protected void assertHeaders(String xml, boolean isTH, boolean hasLabel, boolean hasName) { - // Find the first row - int splitAt = xml.indexOf("</tr>"); - String hRow = xml.substring(0, splitAt); - splitAt = xml.indexOf("<tr>"); - hRow = hRow.substring(splitAt+4); - + + protected static String[] toCells(String row, boolean isTH) { // Split into cells, ignoring stuff before first cell String[] cells; if (isTH) { - cells = hRow.split("<th"); + cells = row.split("<th"); } else { - cells = hRow.split("<td"); + cells = row.split("<td"); } cells = Arrays.copyOfRange(cells, 1, cells.length); for (int i=0; i<cells.length; i++) 
{ - splitAt = cells[i].lastIndexOf("</"); + int splitAt = cells[i].lastIndexOf("</"); cells[i] = cells[i].substring(0, splitAt).trim(); } + return cells; + } + + protected void assertHeaders(String xml, boolean isTH, boolean hasLabel, boolean hasName) { + // Find the first row + int splitAt = xml.indexOf("</tr>"); + String hRow = xml.substring(0, splitAt); + splitAt = xml.indexOf("<tr>"); + hRow = hRow.substring(splitAt+4); + + // Split into cells, ignoring stuff before first cell + String[] cells = toCells(hRow, isTH); // Check we got the right number assertEquals("Wrong number of cells in header row " + hRow, columnLabels.length, cells.length); // Check we got the right stuff - // TODO + for (int i=0; i<cells.length; i++) { + if (hasLabel && hasName) { + assertContains("title=\"" + columnNames[i] + "\"", cells[i]); + assertContains(">" + columnLabels[i], cells[i]); + } else if (hasName) { + assertContains(">" + columnNames[i], cells[i]); + } else { + assertContains(">" + columnLabels[i], cells[i]); + } + } } protected void assertContents(String xml, boolean hasHeader) { - // TODO Check the rows + // Ignore anything before the first <tr> + // Ignore the header row if there is one + int ignores = 1; + if (hasHeader) ignores++; + + // Split into rows, and discard the row closing (and anything after) + String[] rows = xml.split("<tr>"); + rows = Arrays.copyOfRange(rows, ignores, rows.length); + for (int i=0; i<rows.length; i++) { + rows[i] = rows[i].split("</tr>")[0].trim(); + } + + // Check we got the right number of rows + for (int cn=0; cn<table.length; cn++) { + assertEquals("Wrong number of rows found compared to column " + (cn+1), + table[cn].length, rows.length); + } + + // Check each row's values + // TODO } @Test @@ -106,7 +142,16 @@ public class TabularFormatsTest extends TikaTest { assertHeaders(xml, false, true, false); assertContents(xml, true); } - // TODO Other formats + @Test + public void testXLSX() throws Exception { + XMLResult result = 
getXML("test-columnar.xlsx"); + String xml = result.xml; + assertHeaders(xml, false, true, false); + assertContents(xml, true); + } + // TODO Test ODS + + // TODO Test other formats, eg Database formats /** * Note - we don't have a dedicated CSV parser -- To stop receiving notification emails like this one, please contact [email protected].
