Repository: nifi Updated Branches: refs/heads/master 8acee0239 -> 5c48655e6
NIFI-4112: Fix ConvertExcelToCSV to handle empty cells. Signed-off-by: Matt Burgess <[email protected]> This closes #1973 Project: http://git-wip-us.apache.org/repos/asf/nifi/repo Commit: http://git-wip-us.apache.org/repos/asf/nifi/commit/5c48655e Tree: http://git-wip-us.apache.org/repos/asf/nifi/tree/5c48655e Diff: http://git-wip-us.apache.org/repos/asf/nifi/diff/5c48655e Branch: refs/heads/master Commit: 5c48655e69d1e1d9745a02099357f6b8b3d9d874 Parents: 8acee02 Author: Koji Kawamura <[email protected]> Authored: Tue Jul 4 16:30:19 2017 +0900 Committer: Matt Burgess <[email protected]> Committed: Thu Jul 13 10:40:01 2017 -0400 ---------------------------------------------------------------------- .../nifi-poi-bundle/nifi-poi-processors/pom.xml | 13 +++ .../poi/ConvertExcelToCSVProcessor.java | 89 ++++++++++++++--- .../additionalDetails.html | 97 +++++++++++++++++++ .../poi/ConvertExcelToCSVProcessorTest.java | 36 +++++++ .../src/test/resources/with-blank-cells.csv | 8 ++ .../src/test/resources/with-blank-cells.xlsx | Bin 0 -> 8489 bytes 6 files changed, 227 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml ---------------------------------------------------------------------- diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml index 85b38ac..d6db9c7 100644 --- a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml +++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml @@ -29,6 +29,19 @@ <artifactId>nifi-poi-processors</artifactId> <packaging>jar</packaging> + <build> + <plugins> + <plugin> + <groupId>org.apache.rat</groupId> + <artifactId>apache-rat-plugin</artifactId> + <configuration> + <excludes combine.children="append"> + <exclude>src/test/resources/with-blank-cells.csv</exclude> + </excludes> + </configuration> + </plugin> + </plugins> + </build> <dependencies> <!-- https://mvnrepository.com/artifact/xerces/xerces --> <dependency> http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java ---------------------------------------------------------------------- diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java index b881c69..6d8274b 100644 --- a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java +++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java @@ -24,6 +24,8 @@ import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.StringUtils; @@ -78,6 +80,7 @@ public class ConvertExcelToCSVProcessor public static final String SOURCE_FILE_NAME = "sourcefilename"; private static final String SAX_CELL_REF = "c"; private static final String SAX_CELL_TYPE = "t"; + private static final String SAX_CELL_ADDRESS = "r"; private static final String SAX_CELL_STRING = "s"; private static final String SAX_CELL_CONTENT_REF = "v"; private static final String SAX_ROW_REF = "row"; @@ -85,6 +88,7 @@ public class ConvertExcelToCSVProcessor private static final String DESIRED_SHEETS_DELIMITER = ","; private static final String UNKNOWN_SHEET_NAME = "UNKNOWN"; private static final String SAX_PARSER = "org.apache.xerces.parsers.SAXParser"; + private static final Pattern CELL_ADDRESS_REGEX = Pattern.compile("^([a-zA-Z]+)([\\d]+)$"); public static final PropertyDescriptor DESIRED_SHEETS = new PropertyDescriptor .Builder().name("extract-sheets") @@ -279,6 +283,27 @@ public class ConvertExcelToCSVProcessor } } + static Integer columnToIndex(String col) { + int length = col.length(); + int accumulator = 0; + for (int i = length; i > 0; i--) { + char c = col.charAt(i - 1); + int x = ((int) c) - 64; + accumulator += x * Math.pow(26, length - i); + } + // Make it to start with 0. + return accumulator - 1; + } + + private static class CellAddress { + final int row; + final int col; + + private CellAddress(int row, int col) { + this.row = row; + this.col = col; + } + } /** * Extracts every row from an Excel Sheet and generates a corresponding JSONObject whose key is the Excel CellAddress and value @@ -290,6 +315,10 @@ public class ConvertExcelToCSVProcessor private SharedStringsTable sst; private String currentContent; private boolean nextIsString; + private CellAddress firstCellAddress; + private CellAddress firstRowLastCellAddress; + private CellAddress previousCellAddress; + private CellAddress nextCellAddress; private OutputStream outputStream; private boolean firstColInRow; long rowCount; @@ -306,18 +335,35 @@ public class ConvertExcelToCSVProcessor this.outputStream = outputStream; } + public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException { if (name.equals(SAX_CELL_REF)) { String cellType = attributes.getValue(SAX_CELL_TYPE); - if(cellType != null && cellType.equals(SAX_CELL_STRING)) { + // Analyze cell address. + Matcher cellAddressMatcher = CELL_ADDRESS_REGEX.matcher(attributes.getValue(SAX_CELL_ADDRESS)); + if (cellAddressMatcher.matches()) { + String col = cellAddressMatcher.group(1); + String row = cellAddressMatcher.group(2); + nextCellAddress = new CellAddress(Integer.parseInt(row), columnToIndex(col)); + + if (firstCellAddress == null) { + firstCellAddress = nextCellAddress; + } + } + if (cellType != null && cellType.equals(SAX_CELL_STRING)) { nextIsString = true; } else { nextIsString = false; } } else if (name.equals(SAX_ROW_REF)) { + if (firstRowLastCellAddress == null) { + firstRowLastCellAddress = previousCellAddress; + } firstColInRow = true; + previousCellAddress = null; + nextCellAddress = null; } else if (name.equals(SAX_SHEET_NAME_REF)) { sheetName = attributes.getValue(0); } @@ -325,6 +371,16 @@ public class ConvertExcelToCSVProcessor currentContent = ""; } + private void fillEmptyColumns(int nextColumn) throws IOException { + final CellAddress previousCell = previousCellAddress != null ? previousCellAddress : firstCellAddress; + if (previousCell != null) { + for (int i = 0; i < (nextColumn - previousCell.col); i++) { + // Fill columns. + outputStream.write(",".getBytes()); + } + } + } + public void endElement(String uri, String localName, String name) throws SAXException { @@ -334,22 +390,20 @@ public class ConvertExcelToCSVProcessor nextIsString = false; } - if (name.equals(SAX_CELL_CONTENT_REF)) { - if (firstColInRow) { + if (name.equals(SAX_CELL_CONTENT_REF) + // Limit scanning from the first column, and up to the last column. + && (firstCellAddress == null || firstCellAddress.col <= nextCellAddress.col) + && (firstRowLastCellAddress == null || nextCellAddress.col <= firstRowLastCellAddress.col)) { + try { + // A cell is found. + fillEmptyColumns(nextCellAddress.col); firstColInRow = false; - try { - outputStream.write(currentContent.getBytes()); - } catch (IOException e) { - getLogger().error("IO error encountered while writing content of parsed cell " + - "value from sheet {}", new Object[]{getSheetName()}, e); - } - } else { - try { - outputStream.write(("," + currentContent).getBytes()); - } catch (IOException e) { - getLogger().error("IO error encountered while writing content of parsed cell " + - "value from sheet {}", new Object[]{getSheetName()}, e); - } + outputStream.write(currentContent.getBytes()); + // Keep previously found cell address. + previousCellAddress = nextCellAddress; + } catch (IOException e) { + getLogger().error("IO error encountered while writing content of parsed cell " + + "value from sheet {}", new Object[]{getSheetName()}, e); } } @@ -357,6 +411,9 @@ public class ConvertExcelToCSVProcessor //If this is the first row and the end of the row element has been encountered then that means no columns were present. if (!firstColInRow) { try { + if (firstRowLastCellAddress != null) { + fillEmptyColumns(firstRowLastCellAddress.col); + } rowCount++; outputStream.write("\n".getBytes()); } catch (IOException e) { http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/resources/docs/org.apache.nifi.processors.poi.ConvertExcelToCSVProcessor/additionalDetails.html ---------------------------------------------------------------------- diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/resources/docs/org.apache.nifi.processors.poi.ConvertExcelToCSVProcessor/additionalDetails.html b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/resources/docs/org.apache.nifi.processors.poi.ConvertExcelToCSVProcessor/additionalDetails.html new file mode 100644 index 0000000..fbefa08 --- /dev/null +++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/resources/docs/org.apache.nifi.processors.poi.ConvertExcelToCSVProcessor/additionalDetails.html @@ -0,0 +1,97 @@ +<!DOCTYPE html> +<html lang="en"> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<head> + <meta charset="utf-8" /> + <title>ConvertExcelToCSVProcessor</title> + <style> +table { + border-collapse: collapse; +} + +table, th, td { + border: 1px solid #ccc; +} + +td.r { + text-align: right; +} + +td { + width: 50px; + padding: 5px; +} + </style> + <link rel="stylesheet" href="../../../../../css/component-usage.css" type="text/css" /> +</head> + +<body> +<h2>How it extracts CSV data from a sheet</h2> +<p> + ConvertExcelToCSVProcessor extracts CSV data with following rules: +</p> +<ul> + <li>Find the fist cell which has a value in it (the FirstCell).</li> + <li>Scan cells in the first row, starting from the FirstCell, + until it reaches to a cell after which no cell with a value can not be found in the row (the FirstRowLastCell).</li> + <li>Process the 2nd row and later, from the column of FirstCell to the column of FirstRowLastCell.</li> + <li>If a row does not have any cell that has a value, then the row is ignored.</li> +</ul> + +<p> + As an example, the sheet shown below will be: +</p> + +<table> + <tbody> + <tr><th>row </th><th>A</th><th>B</th><th>C</th><th>D</th><th>E</th><th>F</th><th>G</th></tr> + <tr><td class="r"> 1</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr> + <tr><td class="r"> 2</td><td> </td><td> </td><td>x</td><td>y</td><td>z</td><td> </td><td> </td></tr> + <tr><td class="r"> 3</td><td> </td><td> </td><td>1</td><td> </td><td> </td><td> </td><td> </td></tr> + <tr><td class="r"> 4</td><td>2</td><td> </td><td> </td><td>3</td><td> </td><td> </td><td> </td></tr> + <tr><td class="r"> 5</td><td> </td><td> </td><td> </td><td> </td><td>4</td><td> </td><td> </td></tr> + <tr><td class="r"> 6</td><td> </td><td> </td><td>5</td><td>6</td><td>7</td><td> </td><td> </td></tr> + <tr><td class="r"> 7</td><td> </td><td> </td><td> </td><td> </td><td> </td><td>8</td><td> </td></tr> + <tr><td class="r"> 8</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr> + <tr><td class="r"> 9</td><td> </td><td> </td><td> </td><td> </td><td>9</td><td> </td><td> </td></tr> + <tr><td class="r"> 10</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr> + <tr><td class="r"> 11</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr> + </tbody> +</table> + +<p> + converted to following CSV: +</p> + +<pre> +x,y,z +1,, +,3, +,,4 +5,6,7 +,,9 +</pre> + +<ul> + <li>C2(x) is the FirstCell, and E2(z) is the FirstRowLastCell.</li> + <li>A4(2) is ignored because it is out of range. So is F7(8).</li> + <li>Row 7 and 8 are ignored because those do not have a valid cell.</li> + <li>It is important to have a header row as shown in the example to define data area, + especially when a sheet includes empty cells.</li> +</ul> + +</body> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java ---------------------------------------------------------------------- diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java index 1972fbb..1df2568 100644 --- a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java +++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java @@ -16,9 +16,11 @@ */ package org.apache.nifi.processors.poi; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.File; +import java.nio.charset.StandardCharsets; import java.util.List; import org.apache.nifi.flowfile.attributes.CoreAttributes; @@ -40,6 +42,16 @@ public class ConvertExcelToCSVProcessorTest { } @Test + public void testColToIndex() { + assertEquals(Integer.valueOf(0), ConvertExcelToCSVProcessor.columnToIndex("A")); + assertEquals(Integer.valueOf(1), ConvertExcelToCSVProcessor.columnToIndex("B")); + assertEquals(Integer.valueOf(25), ConvertExcelToCSVProcessor.columnToIndex("Z")); + assertEquals(Integer.valueOf(29), ConvertExcelToCSVProcessor.columnToIndex("AD")); + assertEquals(Integer.valueOf(239), ConvertExcelToCSVProcessor.columnToIndex("IF")); + assertEquals(Integer.valueOf(16383), ConvertExcelToCSVProcessor.columnToIndex("XFD")); + } + + @Test public void testMultipleSheetsGeneratesMultipleFlowFiles() throws Exception { testRunner.enqueue(new File("src/test/resources/TwoSheets.xlsx").toPath()); @@ -150,6 +162,30 @@ public class ConvertExcelToCSVProcessorTest { } /** + * Validates that a sheet contains blank cells can be converted to a CSV without missing columns. + * + * @throws Exception + * Any exception thrown during execution. + */ + @Test + public void testProcessASheetWithBlankCells() throws Exception { + + testRunner.setProperty(ConvertExcelToCSVProcessor.DESIRED_SHEETS, "Sheet1"); + testRunner.enqueue(new File("src/test/resources/with-blank-cells.xlsx").toPath()); + testRunner.run(); + + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1); + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1); + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0); + + MockFlowFile ff = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0); + Long l = new Long(ff.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM)); + assertTrue(l == 8l); + ff.isContentEqual("test", StandardCharsets.UTF_8); + ff.assertContentEquals(new File("src/test/resources/with-blank-cells.csv")); + } + + /** * Tests for graceful handling and error messaging of unsupported .XLS files. */ @Test http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.csv ---------------------------------------------------------------------- diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.csv b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.csv new file mode 100644 index 0000000..ff3f706 --- /dev/null +++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.csv @@ -0,0 +1,8 @@ +A,B,C,D +A1,,, +,B2,C2, +,,C3, +,,C4,D4 +A5,,C5,D5 +A6,B6,,D6 +A7,B7,C7,D7 http://git-wip-us.apache.org/repos/asf/nifi/blob/5c48655e/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.xlsx ---------------------------------------------------------------------- diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.xlsx b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.xlsx new file mode 100644 index 0000000..a948246 Binary files /dev/null and b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/with-blank-cells.xlsx differ
