Re: [sqlite] [csv extension] Error while reading long lines

gwenn Thu, 13 May 2010 12:28:49 -0700

Done!

Index: ext/csv/csv.c
===================================================================
--- ext/csv/csv.c
+++ ext/csv/csv.c
@@ -60,10 +60,11 @@
   char *zRow;                  /* Buffer for current CSV row */
   char cDelim;                 /* Character to use for delimiting columns
*/
   int nCol;                    /* Number of columns in current row */
   int maxCol;                  /* Size of aCols array */
   char **aCols;                /* Array of parsed columns */
+  int *aEscapedQuotes;         /* Number of escaped quotes for each column
in aCols */
 };



 /*
 ** An CSV cursor object.
@@ -120,10 +121,11 @@
 */
 static char *csv_getline( CSV *pCSV ){
   int n = 0;
   int bEol = 0;
   int bShrink = 0;
+  int bQuotedCol = 0;

   /* allocate initial row buffer */
   if( pCSV->maxRow < 1 ){
     pCSV->zRow = sqlite3_malloc( 100 );
     if( pCSV->zRow ){
@@ -135,10 +137,13 @@
   /* read until eol */
   while( !bEol ){
     /* grow row buffer as needed */
     if( n+100>pCSV->maxRow ){
       int newSize = pCSV->maxRow*2 + 100;
+      if( newSize>=pCSV->db->aLimit[SQLITE_LIMIT_LENGTH] ){
+        return 0;
+      }
       char *p = sqlite3_realloc(pCSV->zRow, newSize);
       if( !p ) return 0;
       pCSV->maxRow = newSize;
       pCSV->zRow = p;
       bShrink = -1;
@@ -150,19 +155,32 @@
       pCSV->zRow[n] = '\0';
       bEol = -1;
       break;
     }
     /* look for line delimiter */
-    while( pCSV->zRow[n] ){ n++; }
-    if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) ){
+    while( pCSV->zRow[n] ){
+      if( pCSV->zRow[n]=='\"' ){
+        if( bQuotedCol ) {
+          if( pCSV->zRow[n+1]=='\"' ) { /* escaped */
+            n++;
+          }else{
+            bQuotedCol = 0;
+          }
+        }else if( n==0 || pCSV->zRow[n-1]==pCSV->cDelim ){
+          bQuotedCol = 1;
+        }
+      }
+      n++;
+    }
+    if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) &&
!bQuotedCol ){
       pCSV->zRow[n-1] = '\n'; /* uniform line ending */
       pCSV->zRow[n] = '\0';
       bEol = -1;
     }
   }
   if( bShrink ){
-    pCSV->zRow = realloc( pCSV->zRow, n+1 );
+    pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
     pCSV->maxRow = n+1;
   }
   return bEol ? pCSV->zRow : 0;
 }

@@ -317,15 +335,16 @@
   /* allocate initial space for the column pointers */
   if( pCSV->maxCol < 1 ){
     /* take a guess */
     int maxCol = (int)(strlen(pCSV->zRow) / 5 + 1);
     pCSV->aCols = (char **)sqlite3_malloc( sizeof(char*) * maxCol );
+    pCSV->aEscapedQuotes = (int *)sqlite3_malloc( sizeof(int) * maxCol );
     if( pCSV->aCols ){
       pCSV->maxCol = maxCol;
     }
   }
-  if( !pCSV->aCols ) return SQLITE_NOMEM;
+  if( !pCSV->aCols || !pCSV->aEscapedQuotes ) return SQLITE_NOMEM;

   /* add custom delim character */
   zDelims[0] = pCSV->cDelim;

   /* parse the zRow into individual columns */
@@ -332,23 +351,32 @@
   do{
     /* if it begins with a quote, assume it's a quoted col */
     if( *s=='\"' ){
       s++;  /* skip quote */
       pCSV->aCols[nCol] = s; /* save pointer for this col */
-      /* TBD: handle escaped quotes "" */
+      pCSV->aEscapedQuotes[nCol] = 0;
       /* find closing quote */
-      s = strchr(s, '\"');
-      if( !s ){
-        /* no closing quote */
-        pCSV->eof = -1;
-        return SQLITE_ERROR;
+      while( 1 ){
+        s = strchr(s, '\"');
+        if( !s ){
+          /* no closing quote */
+          pCSV->eof = -1;
+          return SQLITE_ERROR;
+        }else if ( *(s+1)=='\"' ){
+          /* escaped quote */
+          pCSV->aEscapedQuotes[nCol]++;
+          s+=2;
+        }else{
+          break;
+        }
       }
       *s = '\0'; /* null terminate this col */
       /* fall through and look for following ",\n" */
       s++;
     }else{
       pCSV->aCols[nCol] = s; /* save pointer for this col */
+      pCSV->aEscapedQuotes[nCol] = 0;
     }
     s = strpbrk(s, zDelims);
     if( !s ){
       /* no col delimiter */
       pCSV->eof = -1;
@@ -362,18 +390,27 @@
     if( cDelim == '\n' ) break;
     /* move to start of next col */
     s++; /* skip delimiter */

     if(nCol >= pCSV->maxCol ){
+      if( nCol>=pCSV->db->aLimit[SQLITE_LIMIT_COLUMN] ){
+        return SQLITE_ERROR;
+      }
       /* we need to grow our col pointer array */
       char **p = (char **)sqlite3_realloc( pCSV->aCols, sizeof(char*) *
(nCol+5) );
       if( !p ){
         /* out of memory */
         return SQLITE_ERROR;
       }
       pCSV->maxCol = nCol + 5;
       pCSV->aCols = p;
+      int *p1 = (int *)sqlite3_realloc( pCSV->aEscapedQuotes, sizeof(int) *
(nCol+5) );
+      if( !p1 ){
+        /* out of memory */
+        return SQLITE_ERROR;
+      }
+      pCSV->aEscapedQuotes = p1;
     }

   }while( *s );

   pCSV->nCol = nCol;
@@ -402,13 +439,38 @@
   CSV *pCSV = (CSV *)pVtabCursor->pVtab;

   if( i<0 || i>=pCSV->nCol ){
     sqlite3_result_null( ctx );
   }else{
-    char *col = pCSV->aCols[i];
+    const char *col = pCSV->aCols[i];
     if( !col ){
       sqlite3_result_null( ctx );
+    }else if( pCSV->aEscapedQuotes[i] ){
+      char *z;
+
+      int nByte = (int)(strlen(col) - pCSV->aEscapedQuotes[i]);
+      if( nByte>pCSV->db->aLimit[SQLITE_LIMIT_LENGTH] ){
+        sqlite3_result_error_toobig( ctx );
+        z = 0;
+      }else{
+        z = sqlite3_malloc( nByte );
+        if( !z ){
+          sqlite3_result_error_nomem( ctx );
+        }
+      }
+      if( z ){
+        int j,k;
+        for(j=0, k=0; col[j]; j++){
+          z[k++] = col[j];
+          if( col[j]=='\"' ){
+            /* unescape quote */
+            j++;
+          }
+        }
+        z[k] = 0;
+        sqlite3_result_text( ctx, z, k, sqlite3_free );
+      }
     }else{
       sqlite3_result_text( ctx, col, -1, SQLITE_TRANSIENT );
     }
   }

@@ -473,10 +535,11 @@
     /* finalize any prepared statements here */

     csv_close( pCSV );
     if( pCSV->zRow ) sqlite3_free( pCSV->zRow );
     if( pCSV->aCols ) sqlite3_free( pCSV->aCols );
+    if( pCSV->aEscapedQuotes ) sqlite3_free( pCSV->aEscapedQuotes );
     sqlite3_free( pCSV );
   }
   return 0;
 }

@@ -539,10 +602,11 @@
     return SQLITE_NOMEM;
   }

   /* intialize virtual table object */
   memset(pCSV, 0, sizeof(CSV)+nDb+nName+nFile+3);
+  pCSV->db = db;
   pCSV->nBusy = 1;
   pCSV->base.pModule = &csvModule;
   pCSV->cDelim = cDelim;
   pCSV->zDb = (char *)&pCSV[1];
   pCSV->zName = &pCSV->zDb[nDb+1];
@@ -608,11 +672,11 @@
         *pzErr = sqlite3_mprintf("%s", aErrMsg[4]);
         sqlite3_free(zSql);
         csvRelease( pCSV );
         return SQLITE_ERROR;
       }
-      zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail);
+      zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail);
     }else{
       zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail);
     }
     sqlite3_free(zTmp);
   }

I'll add some testcases...

On Sat, May 8, 2010 at 9:44 PM, gwenn <[email protected]> wrote:

> Ok,
> I've just added support to embedded new lines and partial support to
> escaped double-quotes.
> By partial support, I mean they are not unescaped yet...
>
> Index: ext/csv/csv.c
> ===================================================================
> --- ext/csv/csv.c
> +++ ext/csv/csv.c
> @@ -120,10 +120,11 @@
>  */
>  static char *csv_getline( CSV *pCSV ){
>    int n = 0;
>    int bEol = 0;
>    int bShrink = 0;
> +  int bQuotedCol = 0;
>
>    /* allocate initial row buffer */
>    if( pCSV->maxRow < 1 ){
>      pCSV->zRow = sqlite3_malloc( 100 );
>      if( pCSV->zRow ){
> @@ -150,19 +151,32 @@
>        pCSV->zRow[n] = '\0';
>        bEol = -1;
>        break;
>      }
>      /* look for line delimiter */
> -    while( pCSV->zRow[n] ){ n++; }
> -    if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) ){
> +    while( pCSV->zRow[n] ){
> +      if( pCSV->zRow[n]=='\"' ){
> +        if( bQuotedCol ) {
> +          if( pCSV->zRow[n+1]=='\"' ) { /* escaped */
> +            n++;
> +          }else{
> +            bQuotedCol = 0;
> +          }
> +        }else if( n==0 || pCSV->zRow[n-1]==pCSV->cDelim ){
> +          bQuotedCol = 1;
> +        }
> +      }
> +      n++;
> +    }
> +    if( (n>0) && ((pCSV->zRow[n-1]=='\n') || (pCSV->zRow[n-1]=='\r')) &&
> !bQuotedCol ){
>        pCSV->zRow[n-1] = '\n'; /* uniform line ending */
>        pCSV->zRow[n] = '\0';
>        bEol = -1;
>      }
>    }
>    if( bShrink ){
> -    pCSV->zRow = realloc( pCSV->zRow, n+1 );
> +    pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
>      pCSV->maxRow = n+1;
>    }
>    return bEol ? pCSV->zRow : 0;
>  }
>
> @@ -332,17 +346,23 @@
>    do{
>      /* if it begins with a quote, assume it's a quoted col */
>      if( *s=='\"' ){
>        s++;  /* skip quote */
>        pCSV->aCols[nCol] = s; /* save pointer for this col */
> -      /* TBD: handle escaped quotes "" */
>        /* find closing quote */
> -      s = strchr(s, '\"');
> -      if( !s ){
> -        /* no closing quote */
> -        pCSV->eof = -1;
> -        return SQLITE_ERROR;
> +      while( 1 ){
> +        s = strchr(s, '\"');
> +        if( !s ){
> +          /* no closing quote */
> +          pCSV->eof = -1;
> +          return SQLITE_ERROR;
> +        }else if ( *(s+1)=='\"' ){
> +          /* TBD: replace all escaped quotes by a single one */
> +          s+=2;
> +        }else{
> +          break;
> +        }
>        }
>        *s = '\0'; /* null terminate this col */
>        /* fall through and look for following ",\n" */
>        s++;
>      }else{
> @@ -608,11 +628,11 @@
>          *pzErr = sqlite3_mprintf("%s", aErrMsg[4]);
>          sqlite3_free(zSql);
>          csvRelease( pCSV );
>          return SQLITE_ERROR;
>        }
> -      zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail);
> +      zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail);
>      }else{
>        zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail);
>      }
>      sqlite3_free(zTmp);
>    }
>
>
> On Sat, May 8, 2010 at 3:45 PM, gwenn <[email protected]> wrote:
>
>> While looking in csv1.test, I found a solution to the case when header row
>> contains spaces: just wrap the column name with double quotes.
>>
>> Index: ext/csv/csv.c
>> ===================================================================
>> --- ext/csv/csv.c
>> +++ ext/csv/csv.c
>> @@ -158,11 +158,11 @@
>>        pCSV->zRow[n] = '\0';
>>        bEol = -1;
>>      }
>>    }
>>    if( bShrink ){
>> -    pCSV->zRow = realloc( pCSV->zRow, n+1 );
>> +    pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
>>      pCSV->maxRow = n+1;
>>    }
>>    return bEol ? pCSV->zRow : 0;
>>  }
>>
>> @@ -608,11 +608,11 @@
>>          *pzErr = sqlite3_mprintf("%s", aErrMsg[4]);
>>          sqlite3_free(zSql);
>>          csvRelease( pCSV );
>>          return SQLITE_ERROR;
>>        }
>> -      zSql = sqlite3_mprintf("%s%s%s", zTmp, zCol, zTail);
>> +      zSql = sqlite3_mprintf("%s\"%s\"%s", zTmp, zCol, zTail);
>>      }else{
>>        zSql = sqlite3_mprintf("%scol%d%s", zTmp, i+1, zTail);
>>      }
>>      sqlite3_free(zTmp);
>>    }
>>
>> Index: ext/csv/csv1.test
>> ===================================================================
>> --- ext/csv/csv1.test
>> +++ ext/csv/csv1.test
>> @@ -23,10 +23,11 @@
>>  #
>>  #   csv-1.*: Creating/destroying csv tables.
>>  #   csv-2.*: Linear scans of csv data.
>>  #   csv-3.*: Test renaming an csv table.
>>  #   csv-4.*: CREATE errors
>> +#   csv-5.*: Dirty header and long line.
>>  #
>>
>>  ifcapable !csv {
>>    finish_test
>>    return
>> @@ -36,10 +37,12 @@
>>  set test1csv [file join [file dirname [info script]] test1.csv]
>>  # This file is delimited by '|' and has quoted fields.
>>  set test2csv [file join [file dirname [info script]] test2.csv]
>>  # This file is delimited by '|'.  It does NOT have quoted fields.
>>  set test3csv [file join [file dirname [info script]] test3.csv]
>> +# This file contains a dirty header and one long line.
>> +set test4csv [file join [file dirname [info script]] test4.csv]
>>
>>
>>  
>> #----------------------------------------------------------------------------
>>  # Test cases csv-1.* test CREATE and DROP table statements.
>>  #
>>
>> @@ -249,5 +252,14 @@
>>    catchsql " CREATE VIRTUAL TABLE t1 USING csv('foo') "
>>  } {1 {Error opening CSV file: 'foo'}}
>>  do_test csv-4.1.3 {
>>    catchsql " CREATE VIRTUAL TABLE t1 USING csv(foo foo) "
>>  } {1 {Error opening CSV file: 'foo foo'}}
>> +
>>
>> +#----------------------------------------------------------------------------
>> +# Test cases csv-5.* test file with dirty header and long line.
>> +#
>> +
>> +do_test csv-5.1.1 {
>> +  execsql " CREATE VIRTUAL TABLE t1 USING csv('$test4csv') "
>> +  execsql " CREATE VIRTUAL TABLE t2 USING csv('$test4csv', ',',
>> USE_HEADER_ROW) "
>> +} {}
>>
>> ADDED    ext/csv/test4.csv
>> col 1,col.2,col-3,col!4,c...@5,col;6,col%7,col*8,col=9,col'10
>>
>> 123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789,123456789
>>
>> I tried to handle double-quoted fields with embedded line breaks without
>> success but I am not stuck yet.
>> Regards.
>>
>> On Wed, Apr 21, 2010 at 8:22 PM, gwenn <[email protected]> wrote:
>>
>>> Thanks for this great extension.
>>> It works smoothly with 500Mo files.
>>>
>>> And it's a workaround to some shortcomings of the '.import' command:
>>>  - no need to create a table before,
>>>  - no need to delete the header row before/after,
>>>  - no error if the number of columns is not homogeneous,
>>>  - ...
>>> It's a nightmare to work with the CSV format but I have to.
>>>
>>> I made a quick and dirty fix to the USE_HEADER_ROW mode to replace
>>> whitespaces, slashes or hyphens by underscores.
>>> But I look for a better solution. Is there any way to make sure a string
>>> is a valid column name?
>>>
>>> Regards
>>>
>>> On Mon Apr 19 17:32:13 GMT 2010, Shane Harrelson <shane at sqlite.org>
>>> wrote:
>>> > Thanks for the report.  The extension is still very a much a
>>> > work-in-progress and any feedback is greatly appreciated.
>>> >
>>> > -Shane
>>>
>>> >
>>> >
>>> > On Sun, Apr 18, 2010 at 12:51 PM, gwenn <gwenn.kahz at gmail.com>
>>> wrote:
>>> > > Hello,
>>> > > There is a little bug/typo in the csv extension when lines exceed 100
>>> > > characters:
>>> > > *** glibc detected *** sqlite3: realloc(): invalid pointer:
>>> > > 0x0000000000ad1a78 ***
>>> > > ======= Backtrace: =========
>>> > > /lib/libc.so.6[0x7f6dab009d16]
>>> > > /lib/libc.so.6(realloc+0x321)[0x7f6dab00fda1]
>>> > > ./libSqliteCsv.so[0x7f6da9ef9dbf]
>>> > >
>>> > > A possible patch is:
>>> > > --- /tmp/SQLite-d474195a997b9d94/ext/csv/csv.c 2009-11-05
>>> 05:14:30.000000000
>>> > > +0100
>>> > > +++ csv.c 2010-04-18 18:48:04.000000000 +0200
>>> > > @@ -160,7 +160,7 @@
>>> > >     }
>>> > >   }
>>> > >   if( bShrink ){
>>> > > -    pCSV->zRow = realloc( pCSV->zRow, n+1 );
>>> > > +    pCSV->zRow = sqlite3_realloc( pCSV->zRow, n+1 );
>>> > >     pCSV->maxRow = n+1;
>>> > >   }
>>> > >   return bEol ? pCSV->zRow : 0;
>>> > >
>>> > > Regards.
>>> > > _______________________________________________
>>> > > sqlite-users mailing list
>>> > > sqlite-users at sqlite.org
>>> > > http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users
>>> > >
>>>
>>
>>
>
_______________________________________________
sqlite-users mailing list
[email protected]
http://sqlite.org:8080/cgi-bin/mailman/listinfo/sqlite-users

Re: [sqlite] [csv extension] Error while reading long lines

Reply via email to