Changeset: 8368330949de for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=8368330949de
Added Files:
sql/backends/monet5/bam/bam_lib.c
sql/backends/monet5/bam/bam_lib.h
sql/backends/monet5/bam/bam_loader.c
sql/backends/monet5/bam/bam_loader.h
sql/backends/monet5/bam/bam_loader.mal
Removed Files:
sql/backends/monet5/bam/bamloader.c
sql/backends/monet5/bam/bamloader.h
Modified Files:
sql/backends/monet5/bam/85_bam.sql
sql/backends/monet5/bam/Makefile.ag
sql/backends/monet5/bam/bam.mal
sql/backends/monet5/bam/bam_schema_1.sql
Branch: DVframework_bam
Log Message:
Finished bam loader, did some renaming and created UDF library for SQL
interaction with BAM files.
diffs (truncated from 2317 to 300 lines):
diff --git a/sql/backends/monet5/bam/85_bam.sql
b/sql/backends/monet5/bam/85_bam.sql
--- a/sql/backends/monet5/bam/85_bam.sql
+++ b/sql/backends/monet5/bam/85_bam.sql
@@ -1,2 +1,12 @@
-CREATE PROCEDURE bamloader(repo string, mode int, num_threads int)
-external name bam.bamloader;
\ No newline at end of file
+-- Entry point of the BAM loader; implemented by the MAL pattern bam.bam_loader.
+CREATE PROCEDURE bam_loader(repo STRING, dbschema INT, num_threads INT)
+EXTERNAL NAME bam.bam_loader;
+
+
+-- Tests a single named bit of a BAM flag value. The MAL command
+-- bam.bam_flag returns a bit, so the SQL result type must be BOOLEAN
+-- (it was declared as STRING, which does not match the MAL signature).
+CREATE FUNCTION bam_flag(flag INT, name STRING)
+RETURNS BOOLEAN EXTERNAL NAME bam.bam_flag;
+
+-- String-to-string helper implemented by the MAL command bam.reverse_seq.
+CREATE FUNCTION reverse_seq(seq STRING)
+RETURNS STRING EXTERNAL NAME bam.reverse_seq;
+
+-- String-to-string helper implemented by the MAL command bam.reverse_qual.
+CREATE FUNCTION reverse_qual(qual STRING)
+RETURNS STRING EXTERNAL NAME bam.reverse_qual;
diff --git a/sql/backends/monet5/bam/Makefile.ag
b/sql/backends/monet5/bam/Makefile.ag
--- a/sql/backends/monet5/bam/Makefile.ag
+++ b/sql/backends/monet5/bam/Makefile.ag
@@ -34,7 +34,7 @@ INCLUDES = .. \
lib__bam = {
MODULE
DIR = libdir/monetdb5
- SOURCES = bamloader.c bamloader.h
+ SOURCES = bam_loader.c bam_loader.h bam_lib.h bam_lib.c
LIBS = ../../../../monetdb5/tools/libmonetdb5 \
../../../../gdk/libbat \
$(SAMTOOLS_LIBS)
diff --git a/sql/backends/monet5/bam/bam.mal b/sql/backends/monet5/bam/bam.mal
--- a/sql/backends/monet5/bam/bam.mal
+++ b/sql/backends/monet5/bam/bam.mal
@@ -1,9 +1,22 @@
module bam;
-pattern bamloader(entry:str, mode:int, num_threads:int):void
-address bamloader
+pattern bam_loader(entry:str, dbschema:int, num_threads:int):void
+address bam_loader
comment "Read the files in the BAM repository, fill and return a
temp_container accordingly.";
pattern register_table(ticket:lng, table_idx:int)(:any...)
address register_table
comment "fill the specified table from the temp_container."
+
+
+# MAL binding for the C function bam_flag() (bam_lib.c): takes a flag value
+# and a flag name, yields a bit.
+command bam_flag(flag:int, name:str):bit
+address bam_flag
+comment "Get bam flag by name."
+
+# MAL binding for the C function reverse_seq() (bam_lib.c): str in, str out.
+command reverse_seq(seq:str):str
+address reverse_seq
+comment "Reverse DNA sequence."
+
+# MAL binding for the C function reverse_qual() (bam_lib.c): str in, str out.
+command reverse_qual(qual:str):str
+address reverse_qual
+comment "Reverse DNA quality string."
\ No newline at end of file
diff --git a/sql/backends/monet5/bam/bam_lib.c
b/sql/backends/monet5/bam/bam_lib.c
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/bam_lib.c
@@ -0,0 +1,54 @@
+#include "monetdb_config.h"
+#include "bam_lib.h"
+
+/*
+ * Non-zero iff bit number k (0 = least significant) is set in flag.
+ * Both arguments are parenthesised so that compound expressions such as
+ * kth_bit(a | b, k) expand as intended.
+ */
+#define kth_bit(flag, k) (((flag) & (1 << (k))) == (1 << (k)))
+
+/*
+ * bam_flag: test one bit of a BAM flag value, selected by name.
+ *
+ * ret:  out; set to whether the selected bit is set in *flag.
+ * flag: in; the flag value to inspect.
+ * name: in; one of the names listed in flag_names below.
+ *
+ * Returns MAL_SUCCEED, or a MAL exception (leaving *ret untouched) when
+ * *name is not a known flag name.
+ */
+str
+bam_flag(bit *ret, int *flag, str *name)
+{
+    /* The index of a name in this table is the bit position it selects. */
+    static const char *flag_names[] = {
+        "mult_segm", /* bit 0 */
+        "prop_alig", /* bit 1 */
+        "segm_unma", /* bit 2 */
+        "next_unma", /* bit 3 */
+        "segm_reve", /* bit 4 */
+        "next_reve", /* bit 5 */
+        "firs_segm", /* bit 6 */
+        "last_segm", /* bit 7 */
+        "seco_alig", /* bit 8 */
+        "qual_cont", /* bit 9 */
+        "opti_dupl"  /* bit 10 */
+    };
+    const int num_names = (int) (sizeof(flag_names) / sizeof(flag_names[0]));
+    int k;
+
+    for (k = 0; k < num_names; k++) {
+        if (strcmp(*name, flag_names[k]) == 0) {
+            *ret = kth_bit(*flag, k);
+            return MAL_SUCCEED;
+        }
+    }
+    throw(MAL, "bam_flag", "Unknown flag name given: %s\n", *name);
+}
+
+/*
+ * reverse_seq: build the reverse complement of a DNA sequence.
+ *
+ * ret: out; on success receives a newly GDKmalloc'ed, NUL-terminated string
+ *      holding the complemented bases of *seq in reverse order. On failure
+ *      it is set to NULL.
+ * seq: in; NUL-terminated string that may only contain 'A', 'C', 'G', 'T'.
+ *
+ * Returns MAL_SUCCEED, or a MAL exception when the allocation fails or when
+ * *seq contains any other character. No memory stays allocated on failure.
+ *
+ * NOTE(review): the result buffer is assumed to be released by the MAL
+ * caller - confirm (this was an open TODO in the first version).
+ */
+str
+reverse_seq(str *ret, str *seq)
+{
+    const char *src = *seq;
+    size_t len = strlen(src);
+    size_t i;
+    str result = GDKmalloc(len + 1);
+
+    *ret = NULL;
+    if (result == NULL)
+        throw(MAL, "reverse_seq", MAL_MALLOC_FAIL);
+
+    for (i = 0; i < len; i++) {
+        char comp;
+
+        switch (src[i]) {
+        case 'A': comp = 'T'; break;
+        case 'T': comp = 'A'; break;
+        case 'C': comp = 'G'; break;
+        case 'G': comp = 'C'; break;
+        default:
+            /* do not leak the partially filled buffer */
+            GDKfree(result);
+            throw(MAL, "reverse_seq",
+                  "Invalid character found in sequence: '%c'\n", src[i]);
+        }
+        /* write back to front so the result is actually reversed */
+        result[len - 1 - i] = comp;
+    }
+    result[len] = '\0';
+    *ret = result;
+    return MAL_SUCCEED;
+}
+
+/*
+ * reverse_qual: reverse a quality string.
+ *
+ * ret:  out; on success receives a newly GDKmalloc'ed, NUL-terminated copy
+ *       of *qual with its characters in reverse order. On failure it is
+ *       set to NULL.
+ * qual: in; NUL-terminated quality string. It is never modified and never
+ *       handed back as the result, so input and result do not alias.
+ *
+ * Returns MAL_SUCCEED, or a MAL exception when the allocation fails.
+ */
+str
+reverse_qual(str *ret, str *qual)
+{
+    const char *src = *qual;
+    size_t len = strlen(src);
+    size_t i;
+    str result = GDKmalloc(len + 1);
+
+    *ret = NULL;
+    if (result == NULL)
+        throw(MAL, "reverse_qual", MAL_MALLOC_FAIL);
+
+    for (i = 0; i < len; i++)
+        result[i] = src[len - 1 - i];
+    result[len] = '\0';
+    *ret = result;
+    return MAL_SUCCEED;
+}
\ No newline at end of file
diff --git a/sql/backends/monet5/bam/bam_lib.h
b/sql/backends/monet5/bam/bam_lib.h
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/bam_lib.h
@@ -0,0 +1,20 @@
+#ifndef _BAM_LIB_H
+#define _BAM_LIB_H
+
+#include "mal_exception.h"
+
+/*
+ * bam_export marks the symbols of the bam module: on WIN32 it selects
+ * dllexport when LIBBAM is defined and dllimport otherwise; elsewhere it
+ * is a plain extern.
+ */
+#ifdef WIN32
+#ifndef LIBBAM
+#define bam_export extern __declspec(dllimport)
+#else
+#define bam_export extern __declspec(dllexport)
+#endif
+#else
+#define bam_export extern
+#endif
+
+/*
+ * C implementations of the MAL commands bam.bam_flag, bam.reverse_seq and
+ * bam.reverse_qual. Each one delivers its result through *ret and returns
+ * MAL_SUCCEED or a MAL exception string.
+ */
+
+/* Sets *ret to whether the flag bit called *name is set in *flag. */
+bam_export str bam_flag(bit *ret, int *flag, str *name);
+/* Derives the result string *ret from the DNA sequence *seq. */
+bam_export str reverse_seq(str *ret, str *seq);
+/* Derives the result string *ret from the quality string *qual. */
+bam_export str reverse_qual(str *ret, str *qual);
+
+#endif
diff --git a/sql/backends/monet5/bam/bam_loader.c
b/sql/backends/monet5/bam/bam_loader.c
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/bam_loader.c
@@ -0,0 +1,1056 @@
+#include "monetdb_config.h"
+#include "bam_loader.h"
+
+/*
+ * _append_option_to_bat_cond_str: conditional append of a string-valued
+ * header option.
+ *
+ * Evaluates to non-zero only when opt->tag equals cmp (strcmp) AND the
+ * _append_option_to_bat() call on opt->value returns non-zero. Because of
+ * the && short-circuit, _append_option_to_bat() is not called at all when
+ * the tag does not match.
+ *
+ * This used to be a static function with the signature
+ *   static int _append_option_to_bat_cond_str(_temp_container *ret_tc,
+ *       _bam_header_option *opt, str cmp, int table, int col,
+ *       int *appendErr, int *flag);
+ * and was turned into a macro; the macro body is the old function body.
+ */
+ #define _append_option_to_bat_cond_str(ret_tc, opt, cmp, table, col,
appendErr, flag) \
+ (strcmp((opt)->tag, cmp) == 0 && _append_option_to_bat(ret_tc,
(ptr)((opt)->value), table, col, appendErr, flag))
+
+
+
+/*
+ * Keeps the BATs and the names of the columns of one table.
+ *
+ * NOTE: Copied directly from miniseed/registrar.c
+ * TODO: Make both miniseed/registrar.c and this file include some generic
+ *       library for this kind of structure.
+ */
+typedef struct {
+ bat *column_bats; /* BATs of the columns ("lower array") */
+ str *column_names; /* names of the columns in column_bats; the original note said "higher array" - presumably the lower one is meant, TODO confirm */
+} _temp_subcontainer;
+
+/*
+ * Keeps (some) tables of a schema: one _temp_subcontainer per table (the
+ * "higher array"), plus per-table names and column counts.
+ *
+ * NOTE: Copied directly from miniseed/registrar.c
+ * TODO: Make both miniseed/registrar.c and this file include some generic
+ *       library for this kind of structure.
+ */
+typedef struct {
+ str schema_name; /* schema or vault name */
+ _temp_subcontainer *tables_columns; /* the tables themselves: higher array */
+ str *table_names; /* names of the tables kept in the higher array */
+ int *num_columns; /* number of columns of each table in the higher array */
+ int num_tables; /* presumably the length of the three arrays above - TODO confirm */
+} _temp_container;
+
+/*
+ * File format specific structures
+ */
+
+/* One tag/value option of a BAM header line. */
+typedef struct {
+ char tag[3]; /* option tag; presumably two characters plus NUL - TODO confirm */
+ str value; /* option value as a string */
+} _bam_header_option;
+
+/* One BAM header line: its header tag and the options found on it. */
+typedef struct {
+ char header_tag[3]; /* header tag; presumably two characters plus NUL - TODO confirm */
+ _bam_header_option *options; /* array of options */
+ int num_options; /* presumably the number of entries in options - TODO confirm */
+} _bam_header_line;
+
+
+/* Global vars */
+
+/*
+ * Log file handle, kept open in this global while the bam code runs:
+ * opening and closing the file every time something has to be written
+ * turned out to be very slow.
+ */
+FILE *_logfile = NULL;
+
+/*
+ * SQL schema details that are common to every schema.
+ *
+ * For every table X there is a triple:
+ *   _num_col_X - number of columns,
+ *   _coln_X[]  - column names,
+ *   _colt_X[]  - column types (TYPE_* constants),
+ * where _coln_X[i] and _colt_X[i] describe the same column and both arrays
+ * have _num_col_X entries. The counts are maintained by hand, so they must
+ * be updated together with the arrays.
+ */
+int _num_col_files = 5;
+int _num_col_sq = 7;
+int _num_col_rg = 13;
+int _num_col_pg = 6;
+int _num_col_alignments_extra = 5;
+
+/* table "files" */
+str _coln_files[] = {"file_id", "file_location", "format_version",
"sorting_order", "comments"};
+int _colt_files[] = {TYPE_sht , TYPE_str , TYPE_flt , TYPE_str
, TYPE_str };
+
+/*
+ * table "sq"
+ * NOTE(review): "as" is typed TYPE_int here; if it mirrors the SAM @SQ AS
+ * tag (an assembly identifier) a string type may be intended - confirm
+ * against the SQL schema.
+ */
+str _coln_sq[] = {"sn" , "file_id", "ln" , "as" , "m5" , "sp"
, "ur" };
+int _colt_sq[] = {TYPE_str, TYPE_sht , TYPE_int, TYPE_int, TYPE_str,
TYPE_str, TYPE_str};
+
+/*
+ * table "rg"
+ * NOTE(review): "dt" is typed TYPE_int here; if it mirrors the SAM @RG DT
+ * tag (a date) another type may be intended - confirm against the SQL
+ * schema.
+ */
+str _coln_rg[] = {"id" , "file_id", "cn" , "ds" , "dt" , "fo"
, "ks" , "lb" , "pg" , "pi" , "pl" , "pu" , "sm" };
+int _colt_rg[] = {TYPE_str, TYPE_sht , TYPE_str, TYPE_str, TYPE_int,
TYPE_str, TYPE_str, TYPE_str, TYPE_str, TYPE_int, TYPE_str, TYPE_str, TYPE_str};
+
+/* table "pg" */
+str _coln_pg[] = {"id" , "file_id", "pn" , "cl" , "pp" , "vn"
};
+int _colt_pg[] = {TYPE_str, TYPE_sht , TYPE_str, TYPE_str, TYPE_str, TYPE_str
};
+
+/* table "alignments_extra" */
+str _coln_alignments_extra[] = {"tag" , "virtual_offset", "file_id", "type"
, "value" };
+int _colt_alignments_extra[] = {TYPE_str, TYPE_lng , TYPE_sht ,
TYPE_str, TYPE_str};
+
+
+
+
+/* File format specific functions */
+static str _next_file_id(Client cntxt, MalBlkPtr mb, sht *next_file_id);
+static str _init_temp_container(_temp_container *ret_tc, int dbschema);
+static str _init_temp_container_simple(_temp_container *ret_tc);
+static str _loadfile(str filepath, _temp_container *ret_tc, int dbschema, int
file_id); /* load file and add contents to ret_tc */
+static str _process_bam_header(int file_id, str header, _temp_container
*ret_tc);
+static int _append_option_to_bat_cond_lng(_temp_container *ret_tc,
_bam_header_option *opt, str cmp, int table, int col, int *appendErr, int
*flag);
+static int _append_option_to_bat_cond_flt(_temp_container *ret_tc,
_bam_header_option *opt, str cmp, int table, int col, int *appendErr, int
*flag);
+static int _append_option_to_bat(_temp_container *ret_tc, ptr value, int
table, int col, int *appendErr, int *flag);
+static str _read_bam_header_line(str *header, _bam_header_line *ret_hl, int
*eof);
+static void _free_bam_header_line(_bam_header_line *hl);
+static str _process_bam_alignment(int file_id, lng virtual_offset,
bam_header_t *header, bam1_t *alignment, _temp_container *ret_tc, int schema);
+static int _parse_alignment_str(str *sam_alig, str *dest);
+static int _parse_alignment_lng(str *sam_alig, lng *dest);
+
+/* Generic functions */
+static str _init_temp_subcontainer(_temp_subcontainer *ret_tsc, str
*col_names, int *col_types, int num_cols);
+static str _append_to_bat(bat cb, ptr val);
+static str _insert_into_vault(Client cntxt, _temp_container* tc);
+static int _read_string_until_delim(str *src, str *ret, char *delims, int
num_delims);
+static int _parse_lng(str *src, lng *i);
+static void _append_to_log(str mssg);
+static void _free_temp_container(_temp_container* tc);
+
+
+
+
+/* File format specific functions */
+
+/*
+ * bam_loader: MAL entry point behind the SQL procedure bam_loader().
+ *
+ * MAL arguments (read from stk/pci, starting at pci->retc):
+ *   1. repo_path   - str; used as the path of a single file right now
+ *   2. dbschema    - int; schema to use, see bam_loader.h for the options
+ *   3. num_threads - int; 1: no threads, >1: multi-threaded (not used yet)
+ *
+ * Steps: obtain the next file_id, set up a _temp_container, load the file
+ * into it and insert the container's contents into the database.
+ *
+ * Returns MAL_SUCCEED, or a MAL exception that names the failing step and
+ * embeds the message of the underlying exception.
+ *
+ * TODO: This function is now called for BAM files; eventually there will
+ * have to be one generic function that e.g. receives the file format for
+ * which a DV should be initialized as an argument.
+ */
+str
+bam_loader(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
+    str *repo_path = (str *) getArgReference(stk, pci, pci->retc);
+    int dbschema = *(int *) getArgReference(stk, pci, pci->retc + 1);
+    int num_threads = *(int *) getArgReference(stk, pci, pci->retc + 2);
+
+    sht next_file_id = 0;
+    str cause = MAL_SUCCEED;  /* exception of the step that failed */
+    const char *fmt = NULL;   /* message template for that step */
+    str msg;
+    _temp_container *tc;
+
+    (void) num_threads;
+
+    tc = (_temp_container *) GDKmalloc(sizeof(_temp_container));
+    if (tc == NULL)
+        throw(MAL, "bam_loader", MAL_MALLOC_FAIL);
+
+    /*
+     * Start from an all-zero container so that the cleanup below never
+     * looks at uninitialised pointers or counts when an early step fails.
+     * NOTE(review): assumes _free_temp_container() copes with a zero-filled
+     * container - confirm.
+     */
+    memset(tc, 0, sizeof(_temp_container));
+
+    if ((cause = _next_file_id(cntxt, mb, &next_file_id)) != MAL_SUCCEED)
+        fmt = "Error while retrieving next file_id: %s\n";
+    else if ((cause = _init_temp_container(tc, dbschema)) != MAL_SUCCEED)
+        fmt = "Error while creating _temp_container: %s\n";
+    else if ((cause = _loadfile(*repo_path, tc, dbschema, next_file_id)) != MAL_SUCCEED)
+        fmt = "Error while loading BAM file: %s\n";
+    else if ((cause = _insert_into_vault(cntxt, tc)) != MAL_SUCCEED)
+        fmt = "Error inserting data into database: %s\n";
+
+    _free_temp_container(tc);
+
+    if (fmt == NULL)
+        return MAL_SUCCEED;
+
+    /*
+     * Wrap the inner exception in a more descriptive one, then release the
+     * inner one: its text has been copied into msg.
+     */
+    msg = createException(MAL, "bam_loader", fmt, cause);
+    GDKfree(cause);
+    return msg;
+}
+
+/* This method retrieves the number of records in the files table and returns
this number + 1 as the new file_id
+* It would be prettier to use a sequence for this purpose. However, this was
more complicated to achieve, so therefore
+* the method is implemented like this. Furthermore, after insertions, we would
need to retrieve these newly inserted
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list