Changeset: ab50b8ebf77a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/ab50b8ebf77a
Added Files:
sql/backends/monet5/vaults/csv/CMakeLists.txt
sql/backends/monet5/vaults/csv/csv.c
Branch: default
Log Message:
add new csv loader
diffs (truncated from 555 to 300 lines):
diff --git a/sql/backends/monet5/vaults/csv/CMakeLists.txt
b/sql/backends/monet5/vaults/csv/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/vaults/csv/CMakeLists.txt
@@ -0,0 +1,35 @@
+
+# Build the CSV vault as a MODULE library, i.e. a plugin that is loaded at
+# run time rather than something other targets link against.
+add_library(csv MODULE)
+target_sources(csv PRIVATE csv.c)
+
+# Header search paths: the interface include directories of the MAL and SQL
+# targets listed below, plus this source directory.
+target_include_directories(csv PRIVATE
+    $<TARGET_PROPERTY:mal,INTERFACE_INCLUDE_DIRECTORIES>
+    $<TARGET_PROPERTY:malmodules,INTERFACE_INCLUDE_DIRECTORIES>
+    $<TARGET_PROPERTY:atoms,INTERFACE_INCLUDE_DIRECTORIES>
+    $<TARGET_PROPERTY:sql,INTERFACE_INCLUDE_DIRECTORIES>
+    $<TARGET_PROPERTY:sqlcommon,INTERFACE_INCLUDE_DIRECTORIES>
+    $<TARGET_PROPERTY:sqlserver,INTERFACE_INCLUDE_DIRECTORIES>
+    $<TARGET_PROPERTY:sqlstorage,INTERFACE_INCLUDE_DIRECTORIES>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:${INCLUDEDIR}/monetdb>)
+
+target_link_libraries(csv PRIVATE
+    monetdb_config_header
+    monetdb5
+    bat
+    sqlinclude)
+
+# The produced module file is named after "_csv" instead of the target name.
+set_target_properties(csv PROPERTIES OUTPUT_NAME _csv)
+
+# Installed as part of the "server" component into <libdir>/monetdb5.
+install(TARGETS csv
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/monetdb5
+    COMPONENT server)
diff --git a/sql/backends/monet5/vaults/csv/csv.c
b/sql/backends/monet5/vaults/csv/csv.c
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/vaults/csv/csv.c
@@ -0,0 +1,510 @@
+
+#include "monetdb_config.h"
+#include "rel_file_loader.h"
+#include "rel_exp.h"
+
+#include "mal_instruction.h"
+#include "mal_interpreter.h"
+#include "mal_parser.h"
+#include "mal_builder.h"
+#include "mal_namespace.h"
+#include "mal_exception.h"
+#include "mal_linker.h"
+#include "mal_backend.h"
+#include "sql_types.h"
+#include "rel_bin.h"
+
+#include <unistd.h>
+
+/*
+ * Open the CSV file `filename` for reading in text mode ("r").
+ * Returns the stream handed back by fopen(), still open, or NULL when the
+ * file cannot be opened.
+ */
+static FILE *
+csv_open_file(char* filename)
+{
+    FILE *stream = fopen(filename, "r");
+
+    return stream;
+}
+
+/*
+ * Locate the end of the field that starts at `s` inside the row [s, e).
+ *
+ * s, e   start and one-past-the-end of the text to scan
+ * delim  field separator
+ * quote  quote character; a `delim` between a pair of quotes is skipped
+ *
+ * Returns a pointer to the first unquoted `delim`, or `e` when the field
+ * runs up to the end of the row (this also covers an unterminated quote).
+ * Returns NULL only when `s` already lies beyond `e`.
+ *
+ * todo handle escapes
+ */
+static const char *
+next_delim(const char *s, const char *e, char delim, char quote)
+{
+    bool inquote = false;
+
+    for (; s < e; s++) {
+        if (*s == quote)
+            inquote = !inquote;
+        else if (!inquote && *s == delim)
+            return s;
+    }
+    /* The loop only stops with s >= e, so testing `s < e` here could never
+     * succeed and the last field of every row was reported as NULL.
+     * Reaching exactly `e` means "field ends at end of row". */
+    if (s == e)
+        return s;
+    return NULL;
+}
+
+/*
+ * Guess the quote character used in the NUL-terminated sample `buf`.
+ *
+ * A candidate (double quote or single quote) stays alive only while every
+ * newline-terminated line seen so far contains it at least once; text after
+ * the last '\n' is not inspected.  Scanning stops as soon as both
+ * candidates are ruled out.
+ *
+ * Returns '"' or '\'' when exactly one candidate survives, otherwise '\0'
+ * (no quoting detected, or both candidates still possible).
+ *
+ * todo detect escapes
+ */
+static char
+detect_quote(const char *buf)
+{
+    bool dq_alive = true, sq_alive = true;
+    const char *line = buf;
+
+    while (dq_alive || sq_alive) {
+        const char *nl = strchr(line, '\n');
+
+        if (nl == NULL)
+            break;
+
+        size_t len = (size_t)(nl - line);
+
+        /* a line without the candidate disqualifies it */
+        if (dq_alive && memchr(line, '"', len) == NULL)
+            dq_alive = false;
+        if (sq_alive && memchr(line, '\'', len) == NULL)
+            sq_alive = false;
+        line = nl + 1;
+    }
+    if (dq_alive != sq_alive)
+        return dq_alive ? '"' : '\'';
+    /* no quote */
+    return '\0';
+}
+
+#define DLEN 4
+/*
+ * Guess the field delimiter of the NUL-terminated sample `buf`.
+ *
+ * buf        sample text; only its first two newline-terminated lines are used
+ * q          quote character passed on to next_delim()
+ * nr_fields  out: number of fields per line, written only on success
+ *
+ * For each candidate (',', '|', ';', TAB) the fields on each of the two
+ * lines are counted; a delimiter directly in front of the newline does not
+ * open another field.  Among the candidates whose counts agree on both
+ * lines, the one with the highest count wins (the earliest candidate on a
+ * tie).  With fewer than two lines no candidate can agree.
+ *
+ * Returns the winning delimiter, or ' ' when nothing was detected.
+ */
+static char
+detect_delimiter(const char *buf, char q, int *nr_fields)
+{
+    static const char candidates[DLEN + 1] = ",|;\t";
+    int counts[DLEN][2] = { 0 };
+    const char *line = buf;
+    int lines_seen = 0;
+
+    while (lines_seen < 2) { /* start with 2 lines only */
+        const char *eol = strchr(line, '\n');
+
+        if (eol == NULL)
+            break;
+        for (int c = 0; candidates[c] != '\0'; c++) {
+            const char *p = line;
+            int fields = 1;
+
+            for (;;) {
+                p = next_delim(p, eol, candidates[c], q);
+                if (p == NULL || p >= eol)
+                    break;
+                if (p + 1 < eol)
+                    fields++;
+                p++;
+            }
+            counts[c][lines_seen] = fields;
+        }
+        line = eol + 1;
+        lines_seen++;
+    }
+
+    if (lines_seen > 0) {
+        int best = -1, best_count = 0;
+
+        for (int c = 0; c < DLEN; c++) {
+            if (counts[c][0] != counts[c][1])
+                continue;
+            if (counts[c][0] > best_count) {
+                best_count = counts[c][0];
+                best = c;
+            }
+        }
+        if (best >= 0) {
+            *nr_fields = best_count;
+            return candidates[best];
+        }
+    }
+    /* nothing detected */
+    return ' ';
+}
+
+/*
+ * Column types the CSV type detection can assign to a field.
+ * NOTE(review): the numeric order looks like "most specific first, string
+ * last"; confirm against detect_types() before relying on comparisons.
+ */
+typedef enum csv {
+    CSV_BOOLEAN   = 0,
+    CSV_BIGINT    = 1,
+    CSV_DECIMAL   = 2,
+    CSV_DOUBLE    = 3,
+    CSV_TIME      = 4,
+    CSV_DATE      = 5,
+    CSV_TIMESTAMP = 6,
+    CSV_STRING    = 7,
+    /* later: UUID, INET, JSON etc */
+} csv_types_t;
+
+/* Detected type of one field. */
+typedef struct csv_type {
+    csv_types_t type;   /* which of the csv_types_t kinds was recognised */
+    int scale;          /* scale reported by detect_decimal() (digits after the dot) */
+} csv_type;
+
+/* True when the span [s, e) is exactly the NUL-terminated `word`. */
+static bool
+csv_span_is(const char *s, const char *e, const char *word)
+{
+    size_t len = strlen(word);
+
+    return e >= s && (size_t)(e - s) == len && memcmp(s, word, len) == 0;
+}
+
+/*
+ * Does the field [s, e) look like a boolean?
+ *
+ * Accepts a single T/t/F/f, the words TRUE/true/FALSE/false, and NULL
+ * (boolean is the first type probed, so NULL is accepted here).
+ *
+ * The field ends at `e` (a delimiter or end of row), not at a NUL byte, so
+ * the words are compared by length; strcmp() on `s` only ever matched when
+ * the field happened to be the very end of the buffer.
+ */
+static bool
+detect_bool(const char *s, const char *e)
+{
+    static const char *const words[] = { "TRUE", "true", "FALSE", "false", "NULL" };
+
+    if ((e - s) == 1 && (*s == 'T' || *s == 't' || *s == 'F' || *s == 'f'))
+        return true;
+    for (size_t i = 0; i < sizeof(words) / sizeof(words[0]); i++) {
+        if (csv_span_is(s, e, words[i]))
+            return true;
+    }
+    return false;
+}
+
+/*
+ * Does the field [s, e) look like an integer?
+ *
+ * Accepts an optional leading '+' or '-' followed by decimal digits only.
+ * A sign without any digit is rejected.  An empty field is still accepted,
+ * as before.
+ * NOTE(review): empty-means-integer is kept for compatibility; presumably an
+ * empty field is loaded as NULL, confirm with the loader.
+ */
+static bool
+detect_bigint(const char *s, const char *e)
+{
+    if (s < e && (*s == '-' || *s == '+')) {
+        s++;
+        if (s == e)     /* lone sign */
+            return false;
+    }
+    /* isdigit() is only defined for unsigned char values and EOF */
+    while (s < e && isdigit((unsigned char)*s))
+        s++;
+    return s == e;
+}
+
+/*
+ * Does the field [s, e) look like a fixed-point decimal?
+ *
+ * Accepts an optional leading '+' or '-', then digits with exactly one '.'
+ * that is followed by at least one digit (so "12." and "12" are rejected).
+ *
+ * scale  out: number of digits after the '.', written only on success
+ */
+static bool
+detect_decimal(const char *s, const char *e, int *scale)
+{
+    /* doubles as "dot seen" flag and as the scale; a trailing dot gives 0 */
+    int dotseen = 0;
+
+    if (s < e && (*s == '-' || *s == '+'))
+        s++;
+    while (s < e) {
+        if (!dotseen && *s == '.')
+            dotseen = (int)(e - (s + 1));
+        else if (!isdigit((unsigned char)*s)) /* isdigit() needs an unsigned char value */
+            break;
+        s++;
+    }
+    if (s == e && dotseen) {
+        *scale = dotseen;
+        return true;
+    }
+    return false;
+}
+
+/*
+ * Does the field [s, e) look like a time of day written as HH:MM?
+ * The field must be exactly five characters, 00:00 up to 23:59.
+ */
+static bool
+detect_time(const char *s, const char *e)
+{
+    bool hour_ok, minute_ok;
+
+    /* TODO detect time with timezone */
+    if ((e - s) != 5 || s[2] != ':')
+        return false;
+
+    switch (s[0]) {
+    case '0':
+    case '1':
+        /* 00 - 19 */
+        hour_ok = (s[1] >= '0' && s[1] <= '9');
+        break;
+    case '2':
+        /* 20 - 23 */
+        hour_ok = (s[1] >= '0' && s[1] <= '3');
+        break;
+    default:
+        hour_ok = false;
+        break;
+    }
+    /* 00 - 59 */
+    minute_ok = (s[3] >= '0' && s[3] <= '5') && (s[4] >= '0' && s[4] <= '9');
+
+    return hour_ok && minute_ok;
+}
+
+/*
+ * Does the field [s, e) look like a date written as YYYY-MM-DD?
+ *
+ * The field must be exactly ten characters: four year digits, '-', a month
+ * 01..12, '-', and a day 01..31.  Day-of-month is not checked against the
+ * month's length.  Unlike before, the year must consist of digits and
+ * month 00 and days 00 / 32..39 are rejected.
+ */
+static bool
+detect_date(const char *s, const char *e)
+{
+    /* positions of the eight digits in YYYY-MM-DD */
+    static const int digit_pos[] = { 0, 1, 2, 3, 5, 6, 8, 9 };
+
+    if ((e - s) != 10)
+        return false;
+    if (s[4] != '-' || s[7] != '-')
+        return false;
+    for (size_t i = 0; i < sizeof(digit_pos) / sizeof(digit_pos[0]); i++) {
+        char c = s[digit_pos[i]];
+
+        if (c < '0' || c > '9')
+            return false;
+    }
+
+    int month = (s[5] - '0') * 10 + (s[6] - '0');
+    int day = (s[8] - '0') * 10 + (s[9] - '0');
+
+    return month >= 1 && month <= 12 && day >= 1 && day <= 31;
+}
+
+/*
+ * Does the field [s, e) look like a timestamp "YYYY-MM-DD HH:MM"?
+ *
+ * The field must be exactly 16 characters: a 10-character date accepted by
+ * detect_date(), one separator, and a 5-character time accepted by
+ * detect_time().
+ *
+ * The date part previously was handed over as [s, s+5), which detect_date()
+ * always rejects because it requires 10 characters, so no timestamp could
+ * ever be detected.
+ */
+static bool
+detect_timestamp(const char *s, const char *e)
+{
+    if ((e - s) != 16)
+        return false;
+    /* DATE TIME */
+    /* NOTE(review): 'T' (ISO 8601) is accepted next to the blank; confirm
+     * that the timestamp parser used for loading accepts it as well. */
+    if (s[10] != ' ' && s[10] != 'T')
+        return false;
+    return detect_date(s, s + 10) && detect_time(s + 11, e);
+}
+
+/* per row */
+/*
+ * Detect the type of each of the `nr_fields` fields in the row [s, e).
+ *
+ * s, e       start and one-past-the-end of the row (newline excluded)
+ * delim      field delimiter
+ * quote      quote character, passed on to next_delim()
+ * nr_fields  number of entries to produce
+ *
+ * Each field is probed as boolean, integer, decimal, time, date and
+ * timestamp, in that order; anything else is CSV_STRING.  When the row holds
+ * fewer fields than `nr_fields`, the missing ones are reported as
+ * CSV_STRING with scale 0.
+ *
+ * Returns an array of `nr_fields` entries allocated with GDKmalloc(), which
+ * this function does not free, or NULL when the allocation fails.
+ */
+static csv_type *
+detect_types_row(const char *s, const char *e, char delim, char quote, int nr_fields)
+{
+    csv_type *types = (csv_type*)GDKmalloc(sizeof(*types) * (size_t)nr_fields);
+
+    if (!types)
+        return NULL;
+    for (int i = 0; i < nr_fields; i++) {
+        int scale = 0;
+
+        /* defaults, so that no entry is ever left uninitialised */
+        types[i].type = CSV_STRING;
+        types[i].scale = 0;
+        if (s == NULL || s > e)
+            continue;   /* row exhausted */
+
+        const char *n = next_delim(s, e, delim, quote);
+
+        /* no further delimiter: the field runs up to the end of the row
+         * (previously it was skipped and `n + 1` was computed from NULL) */
+        if (n == NULL)
+            n = e;
+
+        if (detect_bool(s, n))
+            types[i].type = CSV_BOOLEAN;
+        else if (detect_bigint(s, n))
+            types[i].type = CSV_BIGINT;
+        else if (detect_decimal(s, n, &scale))
+            types[i].type = CSV_DECIMAL;
+        else if (detect_time(s, n))
+            types[i].type = CSV_TIME;
+        else if (detect_date(s, n))
+            types[i].type = CSV_DATE;
+        else if (detect_timestamp(s, n))
+            types[i].type = CSV_TIMESTAMP;
+        types[i].scale = scale;
+
+        /* step over the delimiter; after the last field there is no more */
+        s = (n < e) ? n + 1 : NULL;
+    }
+    return types;
+}
+
+static csv_type *
+detect_types(const char *buf, char delim, char quote, int nr_fields, bool
*has_header)
+{
+ const char *cur = buf;
+ csv_type *types = NULL;
+ int nr_lines = 0;
+
+ while ( true ) {
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]