CSV component
-------------

                 Key: IO-200
                 URL: https://issues.apache.org/jira/browse/IO-200
             Project: Commons IO
          Issue Type: New Feature
          Components: Utilities
            Reporter: haruhiko nishi
            Priority: Trivial


TableBuilder is a 'Builder' that creates a matrix from CSV data and allows the 
client to manipulate it (currently only by copying a column, as I could not 
think of any other operation that may be useful).
Within the TableBuilder, each column value of the CSV is represented as a byte[] 
and is validated against Rules, represented by the interface shown below. 
As TableBuilder runs "buildTable", each byte[] from each 'column' of the CSV 
is passed to isValid() of the pre-set Rules. You can add as many Rules as you 
need to the columns you want to apply rules to. The Rules of a column are 
executed in turn until one fails or all succeed. If any Rule fails, its 
replace() is called and the column value being worked on is replaced by the 
return value.

TableBuilder is RFC 4180 compliant; therefore it distinguishes a NL that ends a 
record from a NL found inside double quotes.
So you can add a Rule that removes all NL chars found in a value 
enclosed within double quotes. 

TableBuilder is just a "rough sketch" of an idea for CSV parsing. (The code below 
works fine, though.) It still needs a lot of refactoring and so on.
I appreciate any comments.
 
TableBuilder provides an addRule() method with which the client can apply Rules 
to a column. 
StringValueRuleAdapter is an adapter that converts the byte[] representation of 
the column value to a String, so that a Rule can be written against Strings.

Here is a simple example of using TableBuilder.

    public static void main(String[] args)throws Exception{
        TableBuilder tableBuilder=new TableBuilder("UTF-8",
                new MessageHandler(){
                    public void handleMessage(String message) {
                        System.err.println(message);
                    }
                },0,true);
        tableBuilder.addRule(3,new RemoveNLChars()); //removing NL cahracters 
found in value.
        tableBuilder.parse(new FileInputStream("test.txt"),TableBuilder.CSV);
        List<Record> list=tableBuilder.getRowAsListOf(Record.class);
        for(Record record:list)
            System.out.println(record.getA());

        tableBuilder.writeTo(new 
FileOutputStream("test_mod.txt"),TableBuilder.CSV);
    }

/**
 * Rule that rejects values containing the platform line separator and
 * replaces them with the same text with every separator removed.
 */
public class RemoveNLChars extends StringValueRuleAdapter {
    /** Value of the {@code line.separator} system property, read once. */
    private static final String LINE_SEPARATOR=System.getProperty("line.separator");

    /**
     * @param columnValue decoded column value
     * @return {@code true} when the value contains no line separator
     */
    protected boolean isValid(String columnValue) {
        return !columnValue.contains(LINE_SEPARATOR);
    }

    /**
     * @param columnValue decoded column value
     * @return the value with every line separator removed
     */
    protected String replace(String columnValue) {
        // String.replace matches the separator literally; replaceAll would
        // interpret it as a regular expression.
        return columnValue.replace(LINE_SEPARATOR,"");
    }

    /**
     * @return an empty message, so nothing is reported when the rule fires
     */
    public String getMessage() {
        return "";
    }
}

/**
 * A validation rule applied to a single column value held as raw bytes.
 */
public interface Rule {
    /**
     * Hands the rule the values of the row the next checked value belongs to.
     *
     * @param rowReference row values as raw bytes
     */
    void setRowReference(List<byte[]> rowReference);

    /**
     * Tells the rule which charset name to use when it interprets bytes as text.
     *
     * @param charsetName charset name, may be {@code null}
     */
    void setCharsetName(String charsetName);

    /**
     * @param columnValue raw bytes of the column value
     * @return {@code true} when the value satisfies this rule
     */
    boolean isValid(byte[] columnValue);

    /**
     * @param columnValue raw bytes of a value that failed {@link #isValid(byte[])}
     * @return the bytes to store in its place
     */
    byte[] replace(byte[] columnValue);

    /**
     * @return the message to report when the rule fires; an empty string
     *         suppresses reporting
     */
    String getMessage();
}

public  abstract class StringValueRuleAdapter implements Rule{
    private String charsetName;
    private List<byte[]> rowReference;
    
    public void setRowReference(List<byte[]> rowReference) {
        this.rowReference=rowReference;
    }

    public void setCharsetName(String charsetName) {
        this.charsetName=charsetName;
    }

    public final boolean isValid(final byte[] columnValue) {
        String strValue;
        try {
            if(columnValue.length>0)
                strValue=(charsetName!=null) ? new 
String(columnValue,charsetName) : new String(columnValue);
            else
                strValue="";
        } catch (UnsupportedEncodingException e) {
            if(columnValue.length>0)
                strValue=new String(columnValue);
            else
                 strValue="";
        }
        return isValid(strValue);
    }

    public final byte[] replace(final byte[] columnValue) {
        String strValue;
        try {
            if(columnValue.length>0)
                strValue=(charsetName!=null) ? new 
String(columnValue,charsetName):new String(columnValue);
            else
                strValue="";
            return (charsetName!=null) ? 
replace(strValue).getBytes(charsetName):replace(strValue).getBytes();
        } catch (UnsupportedEncodingException e) {
            if(columnValue.length>0)
                strValue=new String(columnValue);
            else
                strValue="";
            return replace(strValue).getBytes();
        }
    }

    protected String getRowValue(int column) {
        try {
            return (charsetName!=null) ? new 
String(rowReference.get(column),charsetName) :
                    new String(rowReference.get(column));
        } catch (UnsupportedEncodingException e) {
            return new String(rowReference.get(column));
        } catch(IndexOutOfBoundsException noListFound){
            throw new IllegalArgumentException("no value exists at the 
requested column.");
        }
    }

    protected String getPrecedingRowValue(){
        return getRowValue(rowReference.size()-1);
    }

    protected abstract boolean isValid(String columnValue);
    protected abstract String replace(String columnValue);
}


public class TableBuilder {
    /** Delimiter byte for comma-separated input (0x2c, ','). */
    public static final int CSV=0x2c;
    /** Delimiter byte for tab-separated input (0x09). */
    public static final int TSV=0x09;
    /** Rules registered per column index, kept in registration order. */
    private final Map<Integer,Set<Rule>> columnRule=new  HashMap<Integer,Set<Rule>>();
    /** Table produced by the last successful parse or column copy. */
    private Table currentTable;
    /** Line terminator bytes captured at the first line break of the parsed input. */
    private byte[] newLineChars;
    /** Whether the parsed input ended with a line break. */
    private boolean endsWithNL;
    /** Charset name handed to rules and used to decode names for messages; may be null. */
    private String charsetName;
    /** Row offset; rules are only applied to rows after the row at this offset. */
    private int rowOffset;
    /** When true, messages identify a row by its first column value instead of its number. */
    private boolean useFirstColumnAsRowName;
    /** Receiver of rule messages; prints to standard error unless replaced. */
    private MessageHandler msgHandler=new MessageHandler(){

        public void handleMessage(String message) {
            System.err.println(message);
        }
    };

    /**
     * Creates a fully configured builder.
     *
     * @param charsetName charset name handed to rules and used for messages, may be {@code null}
     * @param msgHandler receiver of rule messages; when {@code null} the default
     *                   handler (prints to standard error) stays in place
     * @param rowOffset row offset after which rules are applied
     * @param useFirstColumnAsRowName whether messages name a row by its first column value
     */
    public TableBuilder(String charsetName,MessageHandler msgHandler,int rowOffset,boolean useFirstColumnAsRowName){
        this.charsetName=charsetName;
        this.rowOffset=rowOffset;
        // A null handler would otherwise fail on the first rule message.
        if(msgHandler!=null)
            this.msgHandler=msgHandler;
        this.useFirstColumnAsRowName=useFirstColumnAsRowName;
    }

    /**
     * Creates a builder that uses the given charset name and default settings.
     *
     * @param charsetName charset name handed to rules and used for messages
     */
    public TableBuilder(String charsetName){
        this.charsetName=charsetName;
    }

    /**
     * Creates a builder with no charset name and default settings.
     */
    public TableBuilder(){

    }

    /**
     * Registers a rule for a column. The rule receives this builder's charset
     * name and is kept after the rules already registered for that column.
     *
     * @param column column index the rule applies to
     * @param rule rule to register
     */
    public void addRule(int column, Rule rule){
        Set<Rule> rulesForColumn=columnRule.get(column);
        if(rulesForColumn==null){
            // first rule for this column: create its ordered rule set
            rulesForColumn=new LinkedHashSet<Rule>();
            columnRule.put(column,rulesForColumn);
        }
        rule.setCharsetName(charsetName);
        rulesForColumn.add(rule);
    }

    /**
     * Reads the whole stream into memory, closes it and parses the bytes into
     * the current table.
     *
     * @param in stream to read; it is closed even when reading fails
     * @param delimiter byte value that separates columns, e.g. {@code CSV} or {@code TSV}
     * @throws Exception if reading fails or the content cannot be parsed
     */
    public void parse(InputStream in, int delimiter)throws Exception{
        ByteArrayOutputStream outbuf=new ByteArrayOutputStream(1024);
        try{
            byte buf[]=new byte[1024];
            int bytesRead;
            while((bytesRead=in.read(buf,0,buf.length))!=-1)
                outbuf.write(buf,0,bytesRead);
        }finally{
            in.close();
        }
        // wrap() yields position 0 and limit = length, which is what buildTable reads.
        ByteBuffer bytebuffer=ByteBuffer.wrap(outbuf.toByteArray());
        currentTable=buildTable(bytebuffer,delimiter);
    }

    private class Table {
        private List<byte[]>[] columnMatrix;
        private List<List<byte[]>> rowMatrix;
        
        Table(List<byte[]>[] columnMatrix,List<List<byte[]>> rowMatrix){
            this.columnMatrix=columnMatrix;
            this.rowMatrix=rowMatrix;
        }

        public int getNumOfColumns() {
            return columnMatrix.length;
        }

        public int getNumOfRows(){
            return rowMatrix.size();
        }

        public byte[] getValueAt(int row, int column) {
            return columnMatrix[column].get(row);
        }

        public byte[] getColumnName(int column){
            return columnMatrix[column].get(0);
        }

        public List<byte[]> getColumn(int column){
            return columnMatrix[column];
        }

        public List<byte[]> getRow(int row){
            return rowMatrix.get(row);
        }
        
    }
  //TODO  extract csv row as JavaBean
    /**
     * Creates one instance of {@code clazz} per data row (every row after the
     * first). Property binding is not implemented yet, so the instances are
     * returned as created by their no-argument constructor.
     *
     * @param clazz class to instantiate for each data row
     * @return one instance per data row that could be instantiated; empty,
     *         never {@code null}, when there are no data rows
     * @throws IllegalStateException if nothing has been parsed yet
     */
    public <E> List<E> getRowAsListOf(final Class<E> clazz){
        if(currentTable==null)
            throw new IllegalStateException("parse() must be called before getRowAsListOf()");
        List<E> list=new ArrayList<E>();
        for(int i=1;i<currentTable.getNumOfRows();i++){
            try {
                E instance=clazz.newInstance();
                // TODO bind each value of row i to the property named by the
                // matching value of row 0, e.g. BeanUtils.setProperty(instance,name,value)
                list.add(instance);
            } catch (IllegalAccessException e) {
                // best effort: report and skip this row
                e.printStackTrace();
            } catch (InstantiationException e) {
                // best effort: report and skip this row
                e.printStackTrace();
            }
        }
        return list;
    }

    /**
     * Writes the current table row by row and closes the stream. Values are
     * separated by {@code delimiter}; rows are separated by the line terminator
     * captured while parsing, and the last row is terminated only if the parsed
     * input ended with a line break.
     *
     * @param out stream to write to; it is closed even when writing fails
     * @param delimiter byte value written between columns
     * @throws IOException if writing or closing fails
     */
    public void writeTo(OutputStream out,int delimiter) throws IOException {
        try{
            final int rows=currentTable.getNumOfRows();
            for(int i=0;i<rows;i++){
                int written=0;
                for(byte[] value:currentTable.getRow(i)){
                    out.write(value);
                    if(++written<currentTable.getNumOfColumns())
                        out.write(delimiter);
                }
                boolean terminate=i<rows-1 || endsWithNL;
                if(terminate && newLineChars!=null)
                    out.write(newLineChars);
            }
        }finally{
            out.close();
        }
    }
     
     /**
      * Copies the values of column {@code from} into column position {@code to}
      * and replaces the current table with the result.
      *
      * With {@code override} the column count stays the same and the existing
      * value at index {@code to} of every row is removed before the copy is
      * inserted; without it the table gains one column and the copy is inserted
      * at index {@code to}.
      *
      * Rows with an index below {@code rowOffset} receive an empty value. For
      * rows with an index above {@code rowOffset}, a non-null {@code rule} is
      * checked against the copied value; when the check fails the value is
      * replaced by {@code rule.replace(value)} and a non-empty rule message is
      * reported with the placeholders {@code ${column_from}}, {@code ${columnName}},
      * {@code ${column_to}} and {@code ${row}} filled in.
      *
      * @param rule rule applied to the copied values, or {@code null} for a plain copy
      * @param from index of the source column in the current table
      * @param to index of the target column in the resulting table
      * @param override whether the column at {@code to} is replaced rather than shifted
      */
     public void copyColumn(Rule rule,int from,int to, boolean override) {
            int numOfColumns=override ? 
currentTable.getNumOfColumns():currentTable.getNumOfColumns()+1;
            List<byte[]>[] columnMatrix=(List<byte[]>[])new List[numOfColumns];
            columnMatrix[to]=new ArrayList<byte[]>();
            // i walks the new column layout, j walks the columns of the current table
            for(int i=0,j=0;i<columnMatrix.length;i++){
                if(i==to){
                    for(int row=0;row<currentTable.getNumOfRows();row++){
                        byte[] value;
                        if(row>=rowOffset)
                            value=currentTable.getValueAt(row,from);
                        else
                            value=new byte[0];
                        // NOTE(review): the copy uses row>=rowOffset but the rule uses row>rowOffset,
                        // so the row at rowOffset is copied unchecked - presumably a header row; confirm
                        if(rule!=null && row>rowOffset){
                            rule.setCharsetName(charsetName);
                            rule.setRowReference(currentTable.getRow(row));
                            if(!rule.isValid(value)){
                                // quoted, trimmed first value of the source column, used for ${columnName}
                                String columnName;
                                byte[] 
columnNameByte=currentTable.getColumnName(from);
                                if(columnNameByte.length>0){
                                    try {
                                        if(charsetName!=null)
                                            columnName="'"+new 
String(columnNameByte,charsetName).trim()+"'";
                                        else
                                            columnName="'"+new 
String(columnNameByte).trim()+"'";
                                    } catch (UnsupportedEncodingException e) {
                                        // unsupported charset name: decode with the platform default instead
                                        columnName="'"+new 
String(columnNameByte).trim()+"'";
                                    }
                                }else
                                    columnName="''";
                                value=rule.replace(value);
                                String msg=rule.getMessage();
                                    // an empty message means the rule does not want to be reported
                                    if(msg.length()>0)
                                        try {
                                            // ${column_from} is filled with the index as passed in,
                                            // ${column_to} and the numeric ${row} are filled with index+1
                                            // NOTE(review): charsetName can be null when the builder was
                                            // created without one - confirm String(byte[],String) is safe here
                                            handleMessage(msg
                                            .replace("${column_from}",""+from)
                                            .replace("${columnName}",columnName)
                                            .replace("${column_to}",""+(to+1))
                                            
.replace("${row}",useFirstColumnAsRowName ? new 
String(currentTable.getRow(row).get(0),charsetName) : ""+(row+1)));
                                        } catch (UnsupportedEncodingException 
ignored) {
                                            // the message is dropped when the row name cannot be decoded
                                            
                                        }
                            }
                        }
                    // store the value in the new column and in the shared row list
                    columnMatrix[i].add(value);
                    if(override)
                        currentTable.rowMatrix.get(row).remove(i);
                    currentTable.rowMatrix.get(row).add(i,value);
                    }
                    // when overriding, the old column at this position is skipped
                    if(override)
                        ++j;
                }else
                    columnMatrix[i]=currentTable.getColumn(j++);
            }
            // the row lists were updated in place, so the same row matrix is reused
            currentTable=new Table(columnMatrix,currentTable.rowMatrix);
    }
    
    private Table buildTable(ByteBuffer buf,int delimiter) throws 
ParseException {
        List<byte[]>[] columnMatrix=null;
        List<List<byte[]>> rowMatrix=new ArrayList<List<byte[]>>();
        int i=0,j,currentRow=0,rowIndex=0,column_count=0,column=0;
        endsWithNL=true;
        newLineChars=null;
        int limit=buf.limit();
        int pos=0;

        while(i<limit && 
((j=(buf.get(i)&0xff))==0x0d||(j=(buf.get(i)&0xff))==0x0a)){
            if(j==0x0a)
                ++currentRow;
            pos=++i;
        }
        
        int headRow=currentRow;
        while(i<limit){
            int tmp=buf.get(i) & 0xff;
                if(tmp==0x0a){
                    int k=i;
                    while(k>=0 
&&((buf.get(k)&0xff)==0x0d||(buf.get(k)&0xff)==0x0a))
                        --k;
                    byte[] prev=new byte[++k-pos];
                    
                    buf.position(pos);
                    buf.get(prev,0,prev.length);
                    List<byte[]> row;
                    try{
                        row=rowMatrix.get(rowIndex);
                    }catch(IndexOutOfBoundsException noListFound){
                        rowMatrix.add(new ArrayList<byte[]>());
                        row=rowMatrix.get(rowIndex);
                    }
                    if(currentRow==headRow){
                        column_count=column;
                        row.add(prev);
                        columnMatrix=(List<byte[]>[])new ArrayList[column+1];
                        Iterator<byte[]> itr;
                        for(j=0,itr=row.iterator();j<columnMatrix.length;j++){
                            columnMatrix[j]=new ArrayList<byte[]>();
                            columnMatrix[j].add(itr.next());
                        }
                    }else if(column_count!=column){
                        throw new ParseException("column count mismatch on row 
",currentRow+1);

                    }else{
                        Set<Rule> ruleset=columnRule.get(column);
                        if(ruleset!=null && currentRow>rowOffset+headRow){
                            byte[] 
columnNameByte=rowMatrix.get(rowOffset).get(column);
                            Rule rule=validate(ruleset,prev,row);
                            if(rule!=null){
                                String columnName;
                                if(columnNameByte.length>0){
                                    try {
                                        if(charsetName!=null)
                                            columnName="'"+new 
String(columnNameByte,charsetName).trim()+"'";
                                        else
                                            columnName="'"+new 
String(columnNameByte).trim()+"'";
                                    } catch (UnsupportedEncodingException e) {
                                        columnName="'"+new 
String(columnNameByte).trim()+"'";
                                    }
                                }else
                                    columnName="''";
                                prev=rule.replace(prev);
                                String msg=rule.getMessage();
                                if(msg.length()>0)
                                    try {
                                        handleMessage(msg
                                            .replace("${column}",""+column)
                                            
.replace("${columnName}",columnName.trim())
                                            
.replace("${row}",useFirstColumnAsRowName ? new 
String(rowMatrix.get(rowIndex).get(0),charsetName) : ""+(currentRow+1)));
                                    } catch (UnsupportedEncodingException 
ignored) {

                                    }
                            }
                        }
                        columnMatrix[column].add(prev);
                        row.add(prev);
                    }

                    if(newLineChars==null){
                        newLineChars=new byte[++i-k];
                        buf.position(k);
                        buf.get(newLineChars,0,newLineChars.length);
                    }else
                        ++i;
                while(i<limit && 
((j=(buf.get(i)&0xff))==0x0d||(j=(buf.get(i)&0xff))==0x0a)){
                    if(j==0x0a)
                        ++currentRow;
                    ++i;
                }
                column=0;
                ++currentRow;

                ++rowIndex;
                pos=i;
            }else if(tmp==delimiter){
                List<byte[]> row;
                try{
                    row=rowMatrix.get(rowIndex);
                }catch(IndexOutOfBoundsException noListFound){
                    rowMatrix.add(new ArrayList<byte[]>());
                    row=rowMatrix.get(rowIndex);
                }
                byte[] prev=new byte[i-pos];
                buf.position(pos);
                buf.get(prev,0,prev.length);
                if(currentRow==headRow)
                    row.add(prev);
                else{
                    Set<Rule> ruleset=columnRule.get(column);
                    if(ruleset!=null && currentRow>rowOffset+headRow){
                        byte[] 
columnNameByte=rowMatrix.get(rowOffset).get(column);
                            Rule rule=validate(ruleset,prev,row);
                            if(rule!=null){
                                String columnName;
                                if(columnNameByte.length>0){
                                    try {
                                        if(charsetName!=null)
                                            columnName="'"+new 
String(columnNameByte,charsetName).trim()+"'";
                                        else
                                            columnName="'"+new 
String(columnNameByte).trim()+"'";
                                    } catch (UnsupportedEncodingException e) {
                                        columnName="'"+new 
String(columnNameByte).trim()+"'";
                                    }
                                }else
                                    columnName="''";
                                prev=rule.replace(prev);
                        String msg=rule.getMessage();
                        if(msg.length()>0)
                            try {
                                handleMessage(msg
                                        .replace("${column}",""+column)
                                        
.replace("${columnName}",columnName.trim())
                                        
.replace("${row}",useFirstColumnAsRowName ? new 
String(rowMatrix.get(rowIndex).get(0),charsetName) : ""+(currentRow+1)));
                            } catch (UnsupportedEncodingException ignored) {

                            }

                            }
                    }
                    columnMatrix[column].add(prev);
                    row.add(prev);
                }
                ++column;
                pos=++i;
            }else
                if((i=_ESCAPED(buf,i))==i)
                    ++i;
        }

        if(pos!=limit){
            endsWithNL=false;
            byte[] remaining=new byte[limit-pos];
            buf.position(pos);
            buf.get(remaining,0,remaining.length);
            
            if(columnMatrix!=null){
                if(column_count!=column)
                    throw new ParseException("column count mismatch on row 
",+1+currentRow);
                List<byte[]> row=rowMatrix.get(rowIndex);
                row.add(remaining);
                Set<Rule> ruleset=columnRule.get(column);
                if(ruleset!=null && currentRow>rowOffset+headRow){
                    byte[] columnNameByte=rowMatrix.get(rowOffset).get(column);
                    Rule rule=validate(ruleset,remaining,row);
                    if(rule!=null){
                        String columnName;
                        if(columnNameByte.length>0){
                            try {
                                if(charsetName!=null)
                                    columnName="'"+new 
String(columnNameByte,charsetName).trim()+"'";
                                else
                                    columnName="'"+new 
String(columnNameByte).trim()+"'";
                            } catch (UnsupportedEncodingException e) {
                                columnName="'"+new 
String(columnNameByte).trim()+"'";
                            }
                        }else
                            columnName="''";
                        remaining=rule.replace(remaining);
                        String msg=rule.getMessage();
                        if(msg.length()>0)
                            try {
                                handleMessage(msg
                                .replace("${column}",""+column)
                                .replace("${columnName}",columnName.trim())
                                .replace("${row}",useFirstColumnAsRowName ? new 
String(rowMatrix.get(rowIndex).get(0),charsetName) : ""+(currentRow+1)));
                            } catch (UnsupportedEncodingException ignored) {

                            }

                    }
                }
                columnMatrix[column].add(remaining);
            }else{
                columnMatrix=(List<byte[]>[])new List[column+1];
                List<byte[]> row;
                try{
                    row=rowMatrix.get(rowIndex);
                }catch(IndexOutOfBoundsException noListFound){
                    rowMatrix.add(new ArrayList<byte[]>());
                    row=rowMatrix.get(rowIndex);
                }
                row.add(remaining);
                Iterator<byte[]> itr;
                for(j=0,itr=row.iterator();j<columnMatrix.length;j++){
                    columnMatrix[j]=new ArrayList<byte[]>(1);
                    columnMatrix[j].add(itr.next());
                }
            }
        }
        return new Table(columnMatrix,rowMatrix);
    }

    /**
     * Recognises a double-quoted field starting at {@code i}.
     *
     * @param src buffer to inspect
     * @param i index where the field may start
     * @return the index of the closing double quote when a complete quoted
     *         field starts at {@code i}; otherwise {@code i} unchanged
     */
    private int _ESCAPED(ByteBuffer src,int i){
        final int start=i;
        if(start==src.limit())
            return start;
        int cursor=_DQUOTE(src,start);
        if(cursor==start)
            return start;

        // Consume the quoted content: text, commas, CR, LF and doubled quotes.
        while(true){
            int next=_TEXTDATA(src,cursor);
            if(next==cursor)
                next=_COMMA(src,cursor);
            if(next==cursor)
                next=_CR(src,cursor);
            if(next==cursor)
                next=_LF(src,cursor);
            if(next==cursor)
                next=_2DQUOTE(src,cursor);
            if(next==cursor)
                break;
            cursor=next;
        }

        // No quote at the stop position means the field was never closed.
        boolean closed=_DQUOTE(src,cursor)!=cursor;
        return closed ? cursor : start;
    }

    /**
     * Matches one byte that is not a comma, CR, LF or double quote.
     *
     * @return {@code i+1} on a match, otherwise {@code i}
     */
    private int _TEXTDATA(ByteBuffer src,int i){
        if(i==src.limit())
            return i;
        boolean special=_COMMA(src,i)!=i
                || _CR(src,i)!=i
                || _LF(src,i)!=i
                || _DQUOTE(src,i)!=i;
        return special ? i : i+1;
    }

    /**
     * Matches two consecutive double quotes.
     *
     * @return {@code i+2} on a match, otherwise {@code i}
     */
    private int _2DQUOTE(ByteBuffer src,int i) {
        boolean firstQuote=i!=src.limit() && _DQUOTE(src,i)!=i;
        if(!firstQuote)
            return i;
        boolean secondQuote=_DQUOTE(src,i+1)!=i+1;
        return secondQuote ? i+2 : i;
    }

    /** Matches a double quote at {@code i}; returns {@code i+1} on a match, otherwise {@code i}. */
    private int _DQUOTE(ByteBuffer src,int i) {
        final int doubleQuote='"';
        return _CHAR(src,i,doubleQuote);
    }

    /** Matches a line feed at {@code i}; returns {@code i+1} on a match, otherwise {@code i}. */
    public int _LF(ByteBuffer src,int i) {
        final int lineFeed='\n';
        return _CHAR(src,i,lineFeed);
    }

    /** Matches a carriage return at {@code i}; returns {@code i+1} on a match, otherwise {@code i}. */
    private int _CR(ByteBuffer src,int i) {
        final int carriageReturn='\r';
        return _CHAR(src,i,carriageReturn);
    }

    /** Matches a comma at {@code i}; returns {@code i+1} on a match, otherwise {@code i}. */
    private int _COMMA(ByteBuffer src,int i) {
        final int comma=',';
        return _CHAR(src,i,comma);
    }

    /**
     * Matches a single byte value at {@code i}.
     *
     * @param src buffer to inspect
     * @param i index to test
     * @param token unsigned byte value to look for
     * @return {@code i+1} when the byte at {@code i} equals {@code token};
     *         {@code i} otherwise, including when {@code i} is the buffer limit
     */
    private int _CHAR(ByteBuffer src,int i,int token){
        boolean matches=i!=src.limit() && (src.get(i) & 0xff)==token;
        return matches ? i+1 : i;
    }
     
    /**
     * Forwards a rule message to the configured message handler.
     *
     * @param message text to report
     */
    private void handleMessage(String message) {
        this.msgHandler.handleMessage(message);
    }
    
    /**
     * Checks a value against the rules in iteration order and stops at the
     * first rule that rejects it. Each non-null rule is given the row reference
     * before it is asked.
     *
     * @param ruleset rules to try; {@code null} entries are skipped
     * @param value raw bytes of the value to check
     * @param rowReference values of the row the value belongs to
     * @return the first rule whose {@code isValid} returns {@code false}, or
     *         {@code null} when every rule accepts the value
     */
    public Rule validate(Set<Rule> ruleset,byte[] value, List<byte[]> rowReference) {
        Iterator<Rule> rules=ruleset.iterator();
        while(rules.hasNext()){
            Rule candidate=rules.next();
            if(candidate==null)
                continue;
            candidate.setRowReference(rowReference);
            if(!candidate.isValid(value))
                return candidate;
        }
        return null;
    }

}

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.

Reply via email to