[
https://issues.apache.org/jira/browse/NIFI-4465?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16201611#comment-16201611
]
ASF GitHub Bot commented on NIFI-4465:
--------------------------------------
Github user ijokarumawak commented on a diff in the pull request:
https://github.com/apache/nifi/pull/2194#discussion_r143922622
--- Diff:
nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java
---
@@ -283,163 +348,162 @@ public void process(OutputStream out) throws
IOException {
}
}
- static Integer columnToIndex(String col) {
- int length = col.length();
- int accumulator = 0;
- for (int i = length; i > 0; i--) {
- char c = col.charAt(i - 1);
- int x = ((int) c) - 64;
- accumulator += x * Math.pow(26, length - i);
+ /**
+ * Uses the XSSF Event SAX helpers to do most of the work
+ * of parsing the Sheet XML, and outputs the contents
+ * as a (basic) CSV.
+ */
+ private class SheetToCSV implements
XSSFSheetXMLHandler.SheetContentsHandler {
+ private ExcelSheetReadConfig readConfig;
+ CSVFormat csvFormat;
+
+ private boolean firstCellOfRow;
+ private boolean skipRow;
+ private int currentRow = -1;
+ private int currentCol = -1;
+ private int rowCount = 0;
+ private boolean rowHasValues=false;
+ private int skippedColumns=0;
+
+ private CSVPrinter printer;
+
+ private boolean firstRow=false;
+
+ private ArrayList<Object> fieldValues;
+
+ public int getRowCount(){
+ return rowCount;
}
- // Make it to start with 0.
- return accumulator - 1;
- }
- private static class CellAddress {
- final int row;
- final int col;
+ public void setOutput(PrintStream output){
+ final OutputStreamWriter streamWriter = new
OutputStreamWriter(output);
- private CellAddress(int row, int col) {
- this.row = row;
- this.col = col;
+ try {
+ printer = new CSVPrinter(streamWriter, csvFormat);
+ } catch (IOException e) {
+ throw new ProcessException("Failed to create CSV
Printer.", e);
+ }
}
- }
- /**
- * Extracts every row from an Excel Sheet and generates a
corresponding JSONObject whose key is the Excel CellAddress and value
- * is the content of that CellAddress converted to a String
- */
- private class ExcelSheetRowHandler
- extends DefaultHandler {
-
- private SharedStringsTable sst;
- private String currentContent;
- private boolean nextIsString;
- private CellAddress firstCellAddress;
- private CellAddress firstRowLastCellAddress;
- private CellAddress previousCellAddress;
- private CellAddress nextCellAddress;
- private OutputStream outputStream;
- private boolean firstColInRow;
- long rowCount;
- String sheetName;
-
- private ExcelSheetRowHandler(SharedStringsTable sst) {
- this.sst = sst;
- this.firstColInRow = true;
- this.rowCount = 0l;
- this.sheetName = UNKNOWN_SHEET_NAME;
+ public SheetToCSV(ExcelSheetReadConfig readConfig, CSVFormat
csvFormat){
+ this.readConfig = readConfig;
+ this.csvFormat = csvFormat;
}
- public void setFlowFileOutputStream(OutputStream outputStream) {
- this.outputStream = outputStream;
+ @Override
+ public void startRow(int rowNum) {
+ if(rowNum <= readConfig.getOverrideFirstRow()) {
+ skipRow = true;
+ return;
+ }
+
+ // Prepare for this row
+ skipRow = false;
+ firstCellOfRow = true;
+ firstRow = currentRow==-1;
+ currentRow = rowNum;
+ currentCol = -1;
+ rowHasValues = false;
+
+ fieldValues = new ArrayList<>();
}
+ @Override
+ public void endRow(int rowNum) {
+ if(skipRow) {
+ return;
+ }
- public void startElement(String uri, String localName, String name,
- Attributes attributes) throws SAXException {
+ if(firstRow){
+ readConfig.setLastColumn(currentCol);
+ }
- if (name.equals(SAX_CELL_REF)) {
- String cellType = attributes.getValue(SAX_CELL_TYPE);
- // Analyze cell address.
- Matcher cellAddressMatcher =
CELL_ADDRESS_REGEX.matcher(attributes.getValue(SAX_CELL_ADDRESS));
- if (cellAddressMatcher.matches()) {
- String col = cellAddressMatcher.group(1);
- String row = cellAddressMatcher.group(2);
- nextCellAddress = new
CellAddress(Integer.parseInt(row), columnToIndex(col));
+ //if there was no data in this row, don't write it
+ if(!rowHasValues) {
+ return;
+ }
- if (firstCellAddress == null) {
- firstCellAddress = nextCellAddress;
- }
- }
- if (cellType != null && cellType.equals(SAX_CELL_STRING)) {
- nextIsString = true;
- } else {
- nextIsString = false;
- }
- } else if (name.equals(SAX_ROW_REF)) {
- if (firstRowLastCellAddress == null) {
- firstRowLastCellAddress = previousCellAddress;
- }
- firstColInRow = true;
- previousCellAddress = null;
- nextCellAddress = null;
- } else if (name.equals(SAX_SHEET_NAME_REF)) {
- sheetName = attributes.getValue(0);
+ // Ensure the correct number of columns
+ int columnsToAdd = (readConfig.getLastColumn() - currentCol) -
readConfig.getColumnsToSkip().size();
+ for (int i=0; i<columnsToAdd; i++) {
+ fieldValues.add(null);
}
- currentContent = "";
+ try {
+ printer.printRecord(fieldValues);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ rowCount++;
}
- private void fillEmptyColumns(int nextColumn) throws IOException {
- final CellAddress previousCell = previousCellAddress != null ?
previousCellAddress : firstCellAddress;
- if (previousCell != null) {
- for (int i = 0; i < (nextColumn - previousCell.col); i++) {
- // Fill columns.
- outputStream.write(",".getBytes());
- }
+ @Override
+ public void cell(String cellReference, String formattedValue,
+ XSSFComment comment) {
+ if(skipRow) {
+ return;
}
- }
- public void endElement(String uri, String localName, String name)
- throws SAXException {
+ // gracefully handle missing CellRef here in a similar way as
XSSFCell does
+ if(cellReference == null) {
+ cellReference = new CellAddress(currentRow,
currentCol).formatAsString();
+ }
+
+ // Did we miss any cells?
+ int thisCol = (new CellReference(cellReference)).getCol();
- if (nextIsString) {
- int idx = Integer.parseInt(currentContent);
- currentContent = new
XSSFRichTextString(sst.getEntryAt(idx)).toString();
- nextIsString = false;
+ // Should we skip this
+
+ //Use the first row of the file to decide on the area of data
to export
+ if(firstRow && firstCellOfRow){
+ readConfig.setFirstRow(currentRow);
+ readConfig.setFirstColumn(thisCol);
}
- if (name.equals(SAX_CELL_CONTENT_REF)
- // Limit scanning from the first column, and up to the
last column.
- && (firstCellAddress == null || firstCellAddress.col
<= nextCellAddress.col)
- && (firstRowLastCellAddress == null ||
nextCellAddress.col <= firstRowLastCellAddress.col)) {
- try {
- // A cell is found.
- fillEmptyColumns(nextCellAddress.col);
- firstColInRow = false;
- outputStream.write(currentContent.getBytes());
- // Keep previously found cell address.
- previousCellAddress = nextCellAddress;
- } catch (IOException e) {
- getLogger().error("IO error encountered while writing
content of parsed cell " +
- "value from sheet {}", new
Object[]{getSheetName()}, e);
- }
+ //if this cell falls outside our area, or has been explcitely
marked as a skipped column, return and don't write it out.
+ if(!firstRow && (thisCol < readConfig.getFirstColumn() ||
thisCol > readConfig.getLastColumn())){
+ return;
}
- if (name.equals(SAX_ROW_REF)) {
- //If this is the first row and the end of the row element
has been encountered then that means no columns were present.
- if (!firstColInRow) {
- try {
- if (firstRowLastCellAddress != null) {
- fillEmptyColumns(firstRowLastCellAddress.col);
- }
- rowCount++;
- outputStream.write("\n".getBytes());
- } catch (IOException e) {
- getLogger().error("IO error encountered while
writing new line indicator", e);
- }
- }
+ if(readConfig.getColumnsToSkip().contains(thisCol)){
+ skippedColumns++;
+ return;
}
- }
+ int missedCols = (thisCol - readConfig.getFirstColumn()) -
(currentCol - readConfig.getFirstColumn()) - 1;
+ if(firstCellOfRow){
+ missedCols = (thisCol - readConfig.getFirstColumn());
+ }
+
+ missedCols -= skippedColumns;
- public void characters(char[] ch, int start, int length)
- throws SAXException {
- currentContent += new String(ch, start, length);
+ if (firstCellOfRow) {
+ firstCellOfRow = false;
+ }
+
+ for (int i=0; i<missedCols; i++) {
+ fieldValues.add(null);
+ }
+ currentCol = thisCol;
+
+ fieldValues.add(formattedValue);
+
+ rowHasValues = true;
+ skippedColumns = 0;
}
- public long getRowCount() {
- return rowCount;
+ @Override
+ public void headerFooter(String s, boolean b, String s1) {
+
}
- public String getSheetName() {
- return sheetName;
+ public void close() throws IOException {
+ printer.close();
}
}
-
- /**
+ /**
--- End diff --
Nitpicking, but wrong indentation.
> ConvertExcelToCSV Data Formatting and Delimiters
> ------------------------------------------------
>
> Key: NIFI-4465
> URL: https://issues.apache.org/jira/browse/NIFI-4465
> Project: Apache NiFi
> Issue Type: Improvement
> Components: Core Framework
> Reporter: Peter Wicks
> Assignee: Peter Wicks
> Priority: Minor
> Fix For: 1.5.0
>
>
> The ConvertExcelToCSV Processor does not output cell values using the
> formatting set in Excel.
> There are also no delimiter options available for column/record delimiting.
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)