Changeset: ab50b8ebf77a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/ab50b8ebf77a
Added Files:
        sql/backends/monet5/vaults/csv/CMakeLists.txt
        sql/backends/monet5/vaults/csv/csv.c
Branch: default
Log Message:

add new csv loader


diffs (truncated from 555 to 300 lines):

diff --git a/sql/backends/monet5/vaults/csv/CMakeLists.txt b/sql/backends/monet5/vaults/csv/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/vaults/csv/CMakeLists.txt
@@ -0,0 +1,35 @@
+
+# Build the CSV loader as a MODULE library from csv.c.
+add_library(csv MODULE)
+target_sources(csv PRIVATE csv.c)
+
+# Header search paths: borrowed from the MAL and SQL targets, plus this
+# directory itself.
+target_include_directories(csv
+    PRIVATE
+    $<TARGET_PROPERTY:mal,INTERFACE_INCLUDE_DIRECTORIES>
+    $<TARGET_PROPERTY:malmodules,INTERFACE_INCLUDE_DIRECTORIES>
+    $<TARGET_PROPERTY:atoms,INTERFACE_INCLUDE_DIRECTORIES>
+    $<TARGET_PROPERTY:sql,INTERFACE_INCLUDE_DIRECTORIES>
+    $<TARGET_PROPERTY:sqlcommon,INTERFACE_INCLUDE_DIRECTORIES>
+    $<TARGET_PROPERTY:sqlserver,INTERFACE_INCLUDE_DIRECTORIES>
+    $<TARGET_PROPERTY:sqlstorage,INTERFACE_INCLUDE_DIRECTORIES>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:${INCLUDEDIR}/monetdb>)
+
+# Libraries the module links against.
+target_link_libraries(csv
+    PRIVATE
+    monetdb_config_header
+    monetdb5
+    bat
+    sqlinclude)
+
+# The produced file is named after "_csv" instead of the target name.
+set_target_properties(csv PROPERTIES OUTPUT_NAME _csv)
+
+# Installed into the monetdb5 library directory as part of the server
+# component.
+install(TARGETS csv
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/monetdb5
+    COMPONENT server)
diff --git a/sql/backends/monet5/vaults/csv/csv.c b/sql/backends/monet5/vaults/csv/csv.c
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/vaults/csv/csv.c
@@ -0,0 +1,510 @@
+
+#include "monetdb_config.h"
+#include "rel_file_loader.h"
+#include "rel_exp.h"
+
+#include "mal_instruction.h"
+#include "mal_interpreter.h"
+#include "mal_parser.h"
+#include "mal_builder.h"
+#include "mal_namespace.h"
+#include "mal_exception.h"
+#include "mal_linker.h"
+#include "mal_backend.h"
+#include "sql_types.h"
+#include "rel_bin.h"
+
+#include <unistd.h>
+
+/*
+ * Open the file named `filename` for reading ("r" mode).
+ * Returns the stdio stream, or NULL when fopen() fails.
+ */
+static FILE *
+csv_open_file(char* filename)
+{
+       FILE *stream = fopen(filename, "r");
+
+       return stream;
+}
+
+/* todo handle escapes */
+/*
+ * Locate the end of the field that starts at s within the line [s, e).
+ *
+ * A `quote` character toggles the "inside quotes" state; a `delim`
+ * character only terminates the field while outside quotes.
+ *
+ * Returns a pointer to the terminating delimiter, or e when the field
+ * runs up to the end of the line (this includes an unterminated quoted
+ * field).  Returns NULL only for an invalid range (NULL pointer or s > e).
+ */
+static const char *
+next_delim(const char *s, const char *e, char delim, char quote)
+{
+       bool inquote = false;
+
+       if (s == NULL || e == NULL || s > e)
+               return NULL;
+       for (; s < e; s++) {
+               if (*s == quote)
+                       inquote = !inquote;
+               else if (!inquote && *s == delim)
+                       return s;
+       }
+       /* no delimiter left: the field ends where the line ends */
+       return s;
+}
+
+/* todo detect escapes */
+/*
+ * Guess the quote character used in the NUL-terminated sample `buf`.
+ *
+ * Both '"' and '\'' start out as candidates.  For each newline-terminated
+ * line a candidate is dropped when its next occurrence is missing or lies
+ * beyond that line's newline.  Scanning stops once both candidates are
+ * dropped or no further newline is found.
+ *
+ * Returns the single surviving candidate, or '\0' when none or both
+ * survive.
+ */
+static char
+detect_quote(const char *buf)
+{
+       bool dq_alive = true, sq_alive = true;
+       const char *line = buf;
+
+       for (;;) {
+               if (!dq_alive && !sq_alive)
+                       break;
+
+               const char *eol = strchr(line, '\n');
+               if (eol == NULL)
+                       break;
+
+               if (dq_alive) {
+                       const char *hit = strchr(line, '"');
+                       if (hit == NULL || hit > eol)
+                               dq_alive = false; /* not used on this line */
+               }
+               if (sq_alive) {
+                       const char *hit = strchr(line, '\'');
+                       if (hit == NULL || hit > eol)
+                               sq_alive = false; /* not used on this line */
+               }
+               line = eol + 1;
+       }
+
+       /* exactly one survivor decides; anything else means "no quote" */
+       if (dq_alive != sq_alive)
+               return dq_alive ? '"' : '\'';
+       return '\0';
+}
+
+#define DLEN 4
+/*
+ * Guess the field delimiter from the first two newline-terminated lines
+ * of the NUL-terminated sample `buf`, honouring quote character `q`.
+ *
+ * For each candidate (',', '|', ';', tab) the number of fields on each
+ * of the two lines is counted; a delimiter directly followed by the end
+ * of the line does not open a new field.  The candidate whose counts
+ * agree on both lines and are highest wins (earlier candidates win ties).
+ *
+ * On success stores the field count in *nr_fields and returns the
+ * delimiter.  Returns ' ' and leaves *nr_fields untouched when nothing
+ * is detected, which includes samples with fewer than two complete lines.
+ */
+static char
+detect_delimiter(const char *buf, char q, int *nr_fields)
+{
+       const char *delimiter = ",|;\t";
+       int cnts[DLEN][2] = { 0 };
+       int lines_seen = 0;
+       const char *line = buf;
+
+       while (lines_seen < 2) { /* start with 2 lines only */
+               const char *eol = strchr(line, '\n');
+               if (eol == NULL)
+                       break;
+               for (int k = 0; delimiter[k] != '\0'; k++) {
+                       int fields = 1;
+                       const char *p = next_delim(line, eol, delimiter[k], q);
+
+                       while (p != NULL && p < eol) {
+                               if (p + 1 < eol)
+                                       fields++;
+                               p = next_delim(p + 1, eol, delimiter[k], q);
+                       }
+                       cnts[k][lines_seen] = fields;
+               }
+               line = eol + 1;
+               lines_seen++;
+       }
+
+       if (lines_seen == 0)
+               return ' '; /* nothing detected */
+
+       int best = -1, best_cnt = 0;
+       for (int k = 0; k < DLEN; k++) {
+               if (cnts[k][0] != cnts[k][1])
+                       continue;
+               if (cnts[k][0] > best_cnt) {
+                       best_cnt = cnts[k][0];
+                       best = k;
+               }
+       }
+       if (best < 0)
+               return ' '; /* nothing detected */
+
+       *nr_fields = best_cnt;
+       return delimiter[best];
+}
+
+/* Column types that can be assigned to a CSV field. */
+typedef enum csv {
+       CSV_BOOLEAN = 0,
+       CSV_BIGINT = 1,
+       CSV_DECIMAL = 2,
+       CSV_DOUBLE = 3,
+       CSV_TIME = 4,
+       CSV_DATE = 5,
+       CSV_TIMESTAMP = 6,
+       CSV_STRING = 7
+       /* later: UUID, INET, JSON etc */
+} csv_types_t;
+
+/* Type assigned to one field, plus the scale reported for decimals. */
+typedef struct csv_type {
+       csv_types_t type;       /* detected type of the field */
+       int scale;              /* scale as reported by detect_decimal, else 0 */
+} csv_type;
+
+/*
+ * Return true when the field [s, e) is a boolean literal: a single
+ * 'T', 't', 'F' or 'f', or one of TRUE, true, FALSE, false.  The word
+ * NULL is accepted as well.
+ *
+ * The field is not NUL-terminated at e, so every comparison is bounded
+ * by the field length instead of running on into the rest of the buffer.
+ */
+static bool
+detect_bool(const char *s, const char *e)
+{
+       static const char *const words[] = {
+               "TRUE", "true", "FALSE", "false", "NULL"
+       };
+
+       if (s == NULL || e == NULL || e < s)
+               return false;
+
+       size_t len = (size_t) (e - s);
+
+       if (len == 1 && (*s == 'T' || *s == 't' || *s == 'F' || *s == 'f'))
+               return true;
+       for (size_t i = 0; i < sizeof(words) / sizeof(words[0]); i++) {
+               if (strlen(words[i]) == len && memcmp(s, words[i], len) == 0)
+                       return true;
+       }
+       return false;
+}
+
+/*
+ * Return true when the field [s, e) is an optional sign ('+' or '-')
+ * followed by decimal digits only.  A sign on its own is rejected.
+ * An empty field returns true, because no non-digit character is found.
+ */
+static bool
+detect_bigint(const char *s, const char *e)
+{
+       if (s < e && (*s == '-' || *s == '+')) {
+               s++;
+               if (s == e)
+                       return false;   /* lone sign */
+       }
+       /* isdigit() is only defined for unsigned char values and EOF */
+       while (s < e && isdigit((unsigned char) *s))
+               s++;
+       return s == e;
+}
+
+/*
+ * Return true when the field [s, e) is a decimal number: an optional
+ * sign, digits, exactly one '.', and at least one digit after the dot.
+ * On success *scale receives the number of digits after the dot; on
+ * failure *scale is left untouched.
+ */
+static bool
+detect_decimal(const char *s, const char *e, int *scale)
+{
+       bool dotseen = false;
+       int fraction = 0;       /* digits seen after the dot */
+
+       if (s < e && (*s == '-' || *s == '+'))
+               s++;
+       for (; s < e; s++) {
+               if (*s == '.' && !dotseen) {
+                       dotseen = true;
+               } else if (isdigit((unsigned char) *s)) {
+                       /* isdigit() needs an unsigned char value */
+                       if (dotseen)
+                               fraction++;
+               } else {
+                       return false;   /* second dot or non-digit */
+               }
+       }
+       if (!dotseen || fraction == 0)
+               return false;
+       *scale = fraction;
+       return true;
+}
+
+/*
+ * Return true when the field [s, e) is a time of the form HH:MM,
+ * with HH in 00..23 and MM in 00..59.
+ */
+static bool
+detect_time(const char *s, const char *e)
+{
+       /* TODO detect time with timezone */
+       if ((e - s) != 5 || s[2] != ':')
+               return false;
+
+       bool hour_ok;
+       switch (s[0]) {
+       case '0':
+       case '1':
+               hour_ok = (s[1] >= '0' && s[1] <= '9');        /* 00 - 19 */
+               break;
+       case '2':
+               hour_ok = (s[1] >= '0' && s[1] <= '3');        /* 20 - 23 */
+               break;
+       default:
+               hour_ok = false;
+               break;
+       }
+
+       bool minute_ok = (s[3] >= '0' && s[3] <= '5') &&
+                        (s[4] >= '0' && s[4] <= '9');
+
+       return hour_ok && minute_ok;
+}
+
+/*
+ * Return true when the field [s, e) is a date of the form YYYY-MM-DD
+ * with four year digits, a month in 01..12 and a day in 01..31.
+ * The day is not checked against the length of the month.
+ */
+static bool
+detect_date(const char *s, const char *e)
+{
+       if ((e - s) != 10)
+               return false;
+       if (s[4] != '-' || s[7] != '-')
+               return false;
+       /* every remaining position has to be a digit */
+       for (int i = 0; i < 10; i++) {
+               if (i == 4 || i == 7)
+                       continue;
+               if (s[i] < '0' || s[i] > '9')
+                       return false;
+       }
+
+       int month = (s[5] - '0') * 10 + (s[6] - '0');
+       int day = (s[8] - '0') * 10 + (s[9] - '0');
+
+       return month >= 1 && month <= 12 && day >= 1 && day <= 31;
+}
+
+/*
+ * Return true when the field [s, e) is a timestamp of the form
+ * "YYYY-MM-DD HH:MM": a 10-character date, one space, and a 5-character
+ * time, 16 characters in total.  Other separators (such as 'T') are not
+ * accepted.
+ */
+static bool
+detect_timestamp(const char *s, const char *e)
+{
+       if ((e - s) != 16)
+               return false;
+       /* DATE TIME: date in [0,10), separator at 10, time in [11,16) */
+       if (s[10] != ' ')
+               return false;
+       return detect_date(s, s + 10) && detect_time(s + 11, e);
+}
+
+/* per row */
+/*
+ * Infer a type for each of the nr_fields fields on the row [s, e),
+ * splitting on `delim` outside of `quote` pairs.
+ *
+ * Each field is tried as boolean, bigint, decimal, time, date and
+ * timestamp, in that order; a field that matches none of them is
+ * CSV_STRING.  scale is the decimal scale for CSV_DECIMAL and 0
+ * otherwise.  The last field ends at e.  If the row holds fewer than
+ * nr_fields fields, the missing ones are reported as CSV_STRING.
+ *
+ * Returns an array of nr_fields entries allocated with GDKmalloc, or
+ * NULL when nr_fields is not positive or the allocation fails.
+ */
+static  csv_type *
+detect_types_row(const char *s, const char *e, char delim, char quote, int nr_fields)
+{
+       if (nr_fields <= 0)
+               return NULL;
+
+       csv_type *types = (csv_type*)GDKmalloc(sizeof(csv_type)*nr_fields);
+       if (!types)
+               return NULL;
+
+       bool exhausted = false;         /* set once the end of the row is reached */
+
+       for(int i = 0; i< nr_fields; i++) {
+               types[i].type = CSV_STRING;
+               types[i].scale = 0;
+               if (exhausted)
+                       continue;
+
+               const char *n = next_delim(s, e, delim, quote);
+               int scale = 0;
+
+               /* no further delimiter: this field runs up to the end of the row */
+               if (n == NULL || n >= e) {
+                       n = e;
+                       exhausted = true;
+               }
+
+               if (detect_bool(s,n))
+                       types[i].type = CSV_BOOLEAN;
+               else if (detect_bigint(s, n))
+                       types[i].type = CSV_BIGINT;
+               else if (detect_decimal(s, n, &scale))
+                       types[i].type = CSV_DECIMAL;
+               else if (detect_time(s, n))
+                       types[i].type = CSV_TIME;
+               else if (detect_date(s, n))
+                       types[i].type = CSV_DATE;
+               else if (detect_timestamp(s, n))
+                       types[i].type = CSV_TIMESTAMP;
+               types[i].scale = scale;
+
+               /* only step over a delimiter that was actually found */
+               if (!exhausted)
+                       s = n+1;
+       }
+       return types;
+}
+
+static csv_type *
+detect_types(const char *buf, char delim, char quote, int nr_fields, bool 
*has_header)
+{
+       const char *cur = buf;
+       csv_type *types = NULL;
+       int nr_lines = 0;
+
+       while ( true ) {
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to