[jira] [Commented] (NIFI-4465) ConvertExcelToCSV Data Formatting and Delimiters

ASF GitHub Bot (JIRA) Thu, 12 Oct 2017 01:19:15 -0700

    [ 
https://issues.apache.org/jira/browse/NIFI-4465?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16201611#comment-16201611
 ]


ASF GitHub Bot commented on NIFI-4465:
--------------------------------------

Github user ijokarumawak commented on a diff in the pull request:

    https://github.com/apache/nifi/pull/2194#discussion_r143922622
  
    --- Diff: 
nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java
 ---
    @@ -283,163 +348,162 @@ public void process(OutputStream out) throws 
IOException {
             }
         }
     
    -    static Integer columnToIndex(String col) {
    -        int length = col.length();
    -        int accumulator = 0;
    -        for (int i = length; i > 0; i--) {
    -            char c = col.charAt(i - 1);
    -            int x = ((int) c) - 64;
    -            accumulator += x * Math.pow(26, length - i);
    +    /**
    +     * Uses the XSSF Event SAX helpers to do most of the work
    +     *  of parsing the Sheet XML, and outputs the contents
    +     *  as a (basic) CSV.
    +     */
    +    private class SheetToCSV implements 
XSSFSheetXMLHandler.SheetContentsHandler {
    +        private ExcelSheetReadConfig readConfig;
    +        CSVFormat csvFormat;
    +
    +        private boolean firstCellOfRow;
    +        private boolean skipRow;
    +        private int currentRow = -1;
    +        private int currentCol = -1;
    +        private int rowCount = 0;
    +        private boolean rowHasValues=false;
    +        private int skippedColumns=0;
    +
    +        private CSVPrinter printer;
    +
    +        private boolean firstRow=false;
    +
    +        private ArrayList<Object> fieldValues;
    +
    +        public int getRowCount(){
    +            return rowCount;
             }
    -        // Make it to start with 0.
    -        return accumulator - 1;
    -    }
     
    -    private static class CellAddress {
    -        final int row;
    -        final int col;
    +        public void setOutput(PrintStream output){
    +            final OutputStreamWriter streamWriter = new 
OutputStreamWriter(output);
     
    -        private CellAddress(int row, int col) {
    -            this.row = row;
    -            this.col = col;
    +            try {
    +                printer = new CSVPrinter(streamWriter, csvFormat);
    +            } catch (IOException e) {
    +                throw new ProcessException("Failed to create CSV 
Printer.", e);
    +            }
             }
    -    }
     
    -    /**
    -     * Extracts every row from an Excel Sheet and generates a 
corresponding JSONObject whose key is the Excel CellAddress and value
    -     * is the content of that CellAddress converted to a String
    -     */
    -    private class ExcelSheetRowHandler
    -            extends DefaultHandler {
    -
    -        private SharedStringsTable sst;
    -        private String currentContent;
    -        private boolean nextIsString;
    -        private CellAddress firstCellAddress;
    -        private CellAddress firstRowLastCellAddress;
    -        private CellAddress previousCellAddress;
    -        private CellAddress nextCellAddress;
    -        private OutputStream outputStream;
    -        private boolean firstColInRow;
    -        long rowCount;
    -        String sheetName;
    -
    -        private ExcelSheetRowHandler(SharedStringsTable sst) {
    -            this.sst = sst;
    -            this.firstColInRow = true;
    -            this.rowCount = 0l;
    -            this.sheetName = UNKNOWN_SHEET_NAME;
    +        public SheetToCSV(ExcelSheetReadConfig readConfig, CSVFormat 
csvFormat){
    +            this.readConfig = readConfig;
    +            this.csvFormat = csvFormat;
             }
     
    -        public void setFlowFileOutputStream(OutputStream outputStream) {
    -            this.outputStream = outputStream;
    +        @Override
    +        public void startRow(int rowNum) {
    +            if(rowNum <= readConfig.getOverrideFirstRow()) {
    +                skipRow = true;
    +                return;
    +            }
    +
    +            // Prepare for this row
    +            skipRow = false;
    +            firstCellOfRow = true;
    +            firstRow = currentRow==-1;
    +            currentRow = rowNum;
    +            currentCol = -1;
    +            rowHasValues = false;
    +
    +            fieldValues = new ArrayList<>();
             }
     
    +        @Override
    +        public void endRow(int rowNum) {
    +            if(skipRow) {
    +                return;
    +            }
     
    -        public void startElement(String uri, String localName, String name,
    -                Attributes attributes) throws SAXException {
    +            if(firstRow){
    +                readConfig.setLastColumn(currentCol);
    +            }
     
    -            if (name.equals(SAX_CELL_REF)) {
    -                String cellType = attributes.getValue(SAX_CELL_TYPE);
    -                // Analyze cell address.
    -                Matcher cellAddressMatcher = 
CELL_ADDRESS_REGEX.matcher(attributes.getValue(SAX_CELL_ADDRESS));
    -                if (cellAddressMatcher.matches()) {
    -                    String col = cellAddressMatcher.group(1);
    -                    String row = cellAddressMatcher.group(2);
    -                    nextCellAddress = new 
CellAddress(Integer.parseInt(row), columnToIndex(col));
    +            //if there was no data in this row, don't write it
    +            if(!rowHasValues) {
    +                return;
    +            }
     
    -                    if (firstCellAddress == null) {
    -                        firstCellAddress = nextCellAddress;
    -                    }
    -                }
    -                if (cellType != null && cellType.equals(SAX_CELL_STRING)) {
    -                    nextIsString = true;
    -                } else {
    -                    nextIsString = false;
    -                }
    -            } else if (name.equals(SAX_ROW_REF)) {
    -                if (firstRowLastCellAddress == null) {
    -                    firstRowLastCellAddress = previousCellAddress;
    -                }
    -                firstColInRow = true;
    -                previousCellAddress = null;
    -                nextCellAddress = null;
    -            } else if (name.equals(SAX_SHEET_NAME_REF)) {
    -                sheetName = attributes.getValue(0);
    +            // Ensure the correct number of columns
    +            int columnsToAdd = (readConfig.getLastColumn() - currentCol) - 
readConfig.getColumnsToSkip().size();
    +            for (int i=0; i<columnsToAdd; i++) {
    +                fieldValues.add(null);
                 }
     
    -            currentContent = "";
    +            try {
    +                printer.printRecord(fieldValues);
    +            } catch (IOException e) {
    +                e.printStackTrace();
    +            }
    +
    +            rowCount++;
             }
     
    -        private void fillEmptyColumns(int nextColumn) throws IOException {
    -            final CellAddress previousCell = previousCellAddress != null ? 
previousCellAddress : firstCellAddress;
    -            if (previousCell != null) {
    -                for (int i = 0; i < (nextColumn - previousCell.col); i++) {
    -                    // Fill columns.
    -                    outputStream.write(",".getBytes());
    -                }
    +        @Override
    +        public void cell(String cellReference, String formattedValue,
    +                         XSSFComment comment) {
    +            if(skipRow) {
    +                return;
                 }
    -        }
     
    -        public void endElement(String uri, String localName, String name)
    -                throws SAXException {
    +            // gracefully handle missing CellRef here in a similar way as 
XSSFCell does
    +            if(cellReference == null) {
    +                cellReference = new CellAddress(currentRow, 
currentCol).formatAsString();
    +            }
    +
    +            // Did we miss any cells?
    +            int thisCol = (new CellReference(cellReference)).getCol();
     
    -            if (nextIsString) {
    -                int idx = Integer.parseInt(currentContent);
    -                currentContent = new 
XSSFRichTextString(sst.getEntryAt(idx)).toString();
    -                nextIsString = false;
    +            // Should we skip this
    +
    +            //Use the first row of the file to decide on the area of data 
to export
    +            if(firstRow && firstCellOfRow){
    +                readConfig.setFirstRow(currentRow);
    +                readConfig.setFirstColumn(thisCol);
                 }
     
    -            if (name.equals(SAX_CELL_CONTENT_REF)
    -                    // Limit scanning from the first column, and up to the 
last column.
    -                    && (firstCellAddress == null || firstCellAddress.col 
<= nextCellAddress.col)
    -                    && (firstRowLastCellAddress == null || 
nextCellAddress.col <= firstRowLastCellAddress.col)) {
    -                try {
    -                    // A cell is found.
    -                    fillEmptyColumns(nextCellAddress.col);
    -                    firstColInRow = false;
    -                    outputStream.write(currentContent.getBytes());
    -                    // Keep previously found cell address.
    -                    previousCellAddress = nextCellAddress;
    -                } catch (IOException e) {
    -                    getLogger().error("IO error encountered while writing 
content of parsed cell " +
    -                            "value from sheet {}", new 
Object[]{getSheetName()}, e);
    -                }
    +            //if this cell falls outside our area, or has been explcitely 
marked as a skipped column, return and don't write it out.
    +            if(!firstRow && (thisCol < readConfig.getFirstColumn() || 
thisCol > readConfig.getLastColumn())){
    +                return;
                 }
     
    -            if (name.equals(SAX_ROW_REF)) {
    -                //If this is the first row and the end of the row element 
has been encountered then that means no columns were present.
    -                if (!firstColInRow) {
    -                    try {
    -                        if (firstRowLastCellAddress != null) {
    -                            fillEmptyColumns(firstRowLastCellAddress.col);
    -                        }
    -                        rowCount++;
    -                        outputStream.write("\n".getBytes());
    -                    } catch (IOException e) {
    -                        getLogger().error("IO error encountered while 
writing new line indicator", e);
    -                    }
    -                }
    +            if(readConfig.getColumnsToSkip().contains(thisCol)){
    +                skippedColumns++;
    +                return;
                 }
     
    -        }
    +            int missedCols = (thisCol - readConfig.getFirstColumn()) - 
(currentCol - readConfig.getFirstColumn()) - 1;
    +            if(firstCellOfRow){
    +                missedCols = (thisCol - readConfig.getFirstColumn());
    +            }
    +
    +            missedCols -= skippedColumns;
     
    -        public void characters(char[] ch, int start, int length)
    -                throws SAXException {
    -            currentContent += new String(ch, start, length);
    +            if (firstCellOfRow) {
    +                firstCellOfRow = false;
    +            }
    +
    +            for (int i=0; i<missedCols; i++) {
    +                fieldValues.add(null);
    +            }
    +            currentCol = thisCol;
    +
    +            fieldValues.add(formattedValue);
    +
    +            rowHasValues = true;
    +            skippedColumns = 0;
             }
     
    -        public long getRowCount() {
    -            return rowCount;
    +        @Override
    +        public void headerFooter(String s, boolean b, String s1) {
    +
             }
     
    -        public String getSheetName() {
    -            return sheetName;
    +        public void close() throws IOException {
    +            printer.close();
             }
         }
     
    -
    -    /**
    +        /**
    --- End diff --
    
    Nitpicking, but wrong indentation.


> ConvertExcelToCSV Data Formatting and Delimiters
> ------------------------------------------------
>
>                 Key: NIFI-4465
>                 URL: https://issues.apache.org/jira/browse/NIFI-4465
>             Project: Apache NiFi
>          Issue Type: Improvement
>          Components: Core Framework
>            Reporter: Peter Wicks
>            Assignee: Peter Wicks
>            Priority: Minor
>             Fix For: 1.5.0
>
>
> The ConvertExcelToCSV Processor does not output cell values using the 
> formatting set in Excel.
> There are also no delimiter options available for column/record delimiting.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

[jira] [Commented] (NIFI-4465) ConvertExcelToCSV Data Formatting and Delimiters

Reply via email to