This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 1697c7ba16 [MINOR] CSV frame reader refine csv parsing
1697c7ba16 is described below

commit 1697c7ba16792831de05c2d6ae08e3aeb2f38ff3
Author: Sebastian Baunsgaard <[email protected]>
AuthorDate: Tue Oct 24 17:53:23 2023 +0200

    [MINOR] CSV frame reader refine csv parsing
    
    This commit adds a few shortcuts in the CSV parsing to:
    
    1. Reduce the cost of trim by returning the input unchanged when it has
    no leading or trailing whitespace. This is a trade-off: it is slightly
    slower for strings with such whitespace, and faster for the common case
    of strings without it.
    2. Specialize the CSV split for the case of a single-character delimiter,
    which simplifies the splitting logic. This is only implemented for lines
    that contain no quotation marks, since quotations change the rules
    for CSV parsing.
    
    Closes 1932
---
 .../sysds/runtime/io/FrameReaderTextCSV.java       |  59 +++++++---
 .../apache/sysds/runtime/io/IOUtilFunctions.java   | 121 ++++++++++++++++-----
 .../runtime/util/FastBufferedDataOutputStream.java |   2 +-
 3 files changed, 140 insertions(+), 42 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSV.java 
b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSV.java
index d8de58f058..cfe4a5e45b 100644
--- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSV.java
+++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSV.java
@@ -144,9 +144,8 @@ public class FrameReaderTextCSV extends FrameReader {
                        String[] parts = null; // cache array for line reading.
                        while(reader.next(key, value)) // foreach line
                        {
-                               String cellStr = value.toString();
                                boolean emptyValuesFound = false;
-                               cellStr = IOUtilFunctions.trim(cellStr);
+                               String cellStr = 
IOUtilFunctions.trim(value.toString());
                                parts = IOUtilFunctions.splitCSV(cellStr, 
delim, parts);
                                // sanity checks for empty values and number of 
columns
 
@@ -154,13 +153,12 @@ public class FrameReaderTextCSV extends FrameReader {
                                final boolean mtdx = 
parts[0].equals(TfUtils.TXMTD_NDPREFIX);
                                // parse frame meta data (missing values / num 
distinct)
                                if(mtdP || mtdx) {
-                                       parts = 
IOUtilFunctions.splitCSV(cellStr, delim);
                                        if(parts.length != dest.getNumColumns() 
+ 1){
                                                LOG.warn("Invalid metadata ");
                                                parts = null;
                                                continue;
                                        }
-                                       if(mtdP)
+                                       else if(mtdP)
                                                for(int j = 0; j < 
dest.getNumColumns(); j++)
                                                        
dest.getColumnMetadata(j).setMvValue(parts[j + 1]);
                                        else if(mtdx)
@@ -169,17 +167,8 @@ public class FrameReaderTextCSV extends FrameReader {
                                        parts = null;
                                        continue;
                                }
-
-                               for(int col = 0; col < nCol; col++) {
-                                       String part = 
IOUtilFunctions.trim(parts[col]);
-                                       if(part.isEmpty() || (naValues != null 
&& naValues.contains(part))) {
-                                               if(isFill && dfillValue != 0)
-                                                       dest.set(row, col, 
sfillValue);
-                                               emptyValuesFound = true;
-                                       }
-                                       else
-                                               dest.set(row, col, part);
-                               }
+                               assignColumns(row, nCol, dest, parts, naValues, 
isFill, dfillValue, sfillValue);
+                               
                                
IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, isFill, 
emptyValuesFound);
                                
IOUtilFunctions.checkAndRaiseErrorCSVNumColumns("", cellStr, parts, clen);
                                row++;
@@ -195,6 +184,46 @@ public class FrameReaderTextCSV extends FrameReader {
                return row;
        }
 
+       private boolean assignColumns(int row, int nCol, FrameBlock dest, 
String[] parts, Set<String> naValues,
+               boolean isFill, double dfillValue, String sfillValue) {
+               if(!isFill && naValues == null)
+                       return assignColumnsNoFillNoNan(row, nCol, dest, parts);
+               else 
+                       return assignColumnsGeneric(row, nCol, dest, parts, 
naValues, isFill, dfillValue, sfillValue);
+       }
+
+       private boolean assignColumnsGeneric(int row, int nCol, FrameBlock 
dest, String[] parts, Set<String> naValues,
+               boolean isFill, double dfillValue, String sfillValue) {
+               boolean emptyValuesFound = false;
+               for(int col = 0; col < nCol; col++) {
+                       String part = IOUtilFunctions.trim(parts[col]);
+                       if(part.isEmpty() || (naValues != null && 
naValues.contains(part))) {
+                               if(isFill && dfillValue != 0)
+                                       dest.set(row, col, sfillValue);
+                               emptyValuesFound = true;
+                       }
+                       else
+                               dest.set(row, col, part);
+               }
+
+               return emptyValuesFound;
+       }
+
+       private boolean assignColumnsNoFillNoNan(int row, int nCol, FrameBlock 
dest, String[] parts){
+               
+               boolean emptyValuesFound = false;
+               for(int col = 0; col < nCol; col++) {
+                       String part = IOUtilFunctions.trim(parts[col]);
+                       if(part.isEmpty()) 
+                               emptyValuesFound = true;
+                       else
+                               dest.set(row, col, part);
+               }
+
+               return emptyValuesFound;
+       }
+
+
        protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, 
FileSystem fs) throws IOException {
                TextInputFormat informat = new TextInputFormat();
                informat.configure(job);
diff --git a/src/main/java/org/apache/sysds/runtime/io/IOUtilFunctions.java 
b/src/main/java/org/apache/sysds/runtime/io/IOUtilFunctions.java
index c188928ae0..9ce18d11b8 100644
--- a/src/main/java/org/apache/sysds/runtime/io/IOUtilFunctions.java
+++ b/src/main/java/org/apache/sysds/runtime/io/IOUtilFunctions.java
@@ -222,7 +222,7 @@ public class IOUtilFunctions {
                final ArrayList<String> tokens = new ArrayList<>();
 
                while(from < len) { // for all tokens
-                       to = getTo(str, from, delim);
+                       to = getTo(str, from, delim, len, delimLen);
                        tokens.add(str.substring(from, to));
                        from = to + delimLen;
                }
@@ -257,28 +257,60 @@ public class IOUtilFunctions {
                        return cache;
                }
                else
-                       return splitCSVNonNullWithCache(str,delim,cache);
+                       return splitCSVNonNullWithCache(str, delim, cache);
        }
 
        private static String[] splitCSVNonNullWithCache(final String str, 
final String delim, final String[] cache) {
+               
                final int len = str.length();
                final int delimLen = delim.length();
-               final boolean containsQuotationMarks = str.contains("\"");
+               
+               if(str.contains("\""))
+                       return splitCSVNonNullWithCacheWithQuote(str, delim, 
cache, len, delimLen);
+               else if(delimLen == 1)
+                       return splitCSVNonNullCacheNoQuoteCharDelim(str, 
delim.charAt(0), cache, len);
+               else 
+                       return splitCSVNonNullCacheNoQuote(str, delim, cache,  
len, delimLen);
+       }
+
+       private static String[] splitCSVNonNullWithCacheWithQuote(final String 
str, final String delim,
+               final String[] cache, final int len, final int delimLen) {
                int from = 0;
                int id = 0;
-               if(containsQuotationMarks){
-                       while(from < len) { // for all tokens
-                               final int to = getTo(str, from, delim);
-                               cache[id++] = str.substring(from, to);
-                               from = to + delimLen;
-                       }
+               while(from < len) { // for all tokens
+                       final int to = getTo(str, from, delim, len, delimLen);
+                       cache[id++] = str.substring(from, to);
+                       from = to + delimLen;
                }
-               else{
-                       while(from < len) { // for all tokens
-                               final int to = getToNoQuote(str, from, delim);
-                               cache[id++] = str.substring(from, to);
-                               from = to + delimLen;
-                       }
+
+               if(from == len)
+                       cache[id] = "";
+               return cache;
+       }
+
+       private static String[] splitCSVNonNullCacheNoQuote(final String str, 
final String delim, final String[] cache,final int len, final int delimLen) {
+               int from = 0;
+               int id = 0;
+               
+               while(from < len) { // for all tokens
+                       final int to = getToNoQuote(str, from, delim, len, 
delimLen);
+                       cache[id++] = str.substring(from, to);
+                       from = to + delimLen;
+               }
+               
+               if(from == len)
+                       cache[id] = "";
+               return cache;
+       }
+
+       private static String[] splitCSVNonNullCacheNoQuoteCharDelim(final 
String str, final char delim,
+               final String[] cache, final int len) {
+               int from = 0;
+               int id = 0;
+               while(from < len) { // for all tokens
+                       final int to = getToNoQuoteCharDelim(str, from, delim, 
len);
+                       cache[id++] = str.substring(from, to);
+                       from = to + 1;
                }
 
                if(from == len)
@@ -296,9 +328,18 @@ public class IOUtilFunctions {
                return true;
        }
 
-       private static int getTo(final String str, final int from, final String 
delim) {
-               final int len = str.length();
-               final int dLen = delim.length();
+       /**
+        * Get next index of substring after delim, while the string can 
contain Quotation marks
+        * 
+        * @param str   The string to get the index from
+        * @param from  The index to start searching from
+        * @param delim The delimiter to find
+        * @param len   The length of the str string argument
+        * @param dLen  The length of the delimiter string
+        * @return The next index.
+        */
+       private static int getTo(final String str, final int from, final String 
delim,
+               final int len, final int dLen) {
                final char cq = CSV_QUOTE_CHAR;
                final int fromP1 = from + 1;
                int to;
@@ -322,12 +363,21 @@ public class IOUtilFunctions {
                return to >= 0 ? to : len;
        }
 
-       private static int getToNoQuote(final String str, final int from, final 
String delim) {
-               final int len = str.length();
-               final int dLen = delim.length();
-               final int fromP1 = from + 1;
+       /**
+        * Get next index of substring after delim
+        * 
+        * @param str   The string to get the index from
+        * @param from  The index to start searching from
+        * @param delim The delimiter to find
+        * @param len   The length of the str string argument
+        * @param dLen  The length of the delimiter string
+        * @return The next index.
+        */
+       private static int getToNoQuote(final String str, final int from, final 
String delim, final int len,
+               final int dLen) {
+               
                int to;
-
+               final int fromP1 = from + 1;
                if(isEmptyMatch(str, from, delim, dLen, len))
                        return to = from; // empty string
                else // default: unquoted non-empty
@@ -335,10 +385,29 @@ public class IOUtilFunctions {
 
                // slice out token and advance position
                return to >= 0 ? to : len;
+               
+       }
+
+       private static int getToNoQuoteCharDelim(final String str, final int 
from, final char delim, final int len){
+               for(int i = from; i < len; i++)
+                       if(str.charAt(i) == delim)
+                               return i;
+               return len;
        }
 
        public static String trim(String str) {
-               return str.trim();
+               try{
+                       final int len = str.length();
+                       if(len == 0)
+                               return str;
+                       // short the call to return input if not whitespace in 
ends.
+                       else if(str.charAt(0) <= ' ' || str.charAt(len -1) <= ' 
')
+                               return str.trim();
+                       else 
+                               return str;
+               }catch(Exception e){
+                       throw new RuntimeException("failed trimming: " + str + 
" " + str.length(),e);
+               }
        }
 
        /**
@@ -366,7 +435,7 @@ public class IOUtilFunctions {
                int from = 0; 
                int pos = 0;
                while( from < len  ) { // for all tokens
-                       final int to = getTo(str, from, delim);
+                       final int to = getTo(str, from, delim, len, dLen);
                        final String curString = str.substring(from, to);
                        tokens[pos++] = naStrings.contains(curString) ? null : 
curString;
                        from = to + dLen;
@@ -401,7 +470,7 @@ public class IOUtilFunctions {
                int numTokens = 0;
                int from = 0; 
                while( from < len  ) { // for all tokens
-                       int to = getTo(str, from, delim);
+                       int to = getTo(str, from, delim, len, dlen);
                        from = to + dlen;
                        numTokens++;
                }
diff --git 
a/src/main/java/org/apache/sysds/runtime/util/FastBufferedDataOutputStream.java 
b/src/main/java/org/apache/sysds/runtime/util/FastBufferedDataOutputStream.java
index adf9f0abd5..1804bc78e0 100644
--- 
a/src/main/java/org/apache/sysds/runtime/util/FastBufferedDataOutputStream.java
+++ 
b/src/main/java/org/apache/sysds/runtime/util/FastBufferedDataOutputStream.java
@@ -191,7 +191,7 @@ public class FastBufferedDataOutputStream extends 
FilterOutputStream implements
                for( int i=0; i<slen; i++ ) {
                        if (_count+3 > _bufflen)
                                flushBuffer();
-                       char c = s.charAt(i);
+                       final char c = s.charAt(i);
                        if( c>= 0x0001 && c<=0x007F ) //1 byte range
                                _buff[_count++] = (byte) c;
                        else if( c>=0x0800 ) { //3 byte range

Reply via email to