Changeset: 8368330949de for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=8368330949de
Added Files:
        sql/backends/monet5/bam/bam_lib.c
        sql/backends/monet5/bam/bam_lib.h
        sql/backends/monet5/bam/bam_loader.c
        sql/backends/monet5/bam/bam_loader.h
        sql/backends/monet5/bam/bam_loader.mal
Removed Files:
        sql/backends/monet5/bam/bamloader.c
        sql/backends/monet5/bam/bamloader.h
Modified Files:
        sql/backends/monet5/bam/85_bam.sql
        sql/backends/monet5/bam/Makefile.ag
        sql/backends/monet5/bam/bam.mal
        sql/backends/monet5/bam/bam_schema_1.sql
Branch: DVframework_bam
Log Message:

Finished bam loader, did some renaming and created UDF library for SQL 
interaction with BAM files.


diffs (truncated from 2317 to 300 lines):

diff --git a/sql/backends/monet5/bam/85_bam.sql 
b/sql/backends/monet5/bam/85_bam.sql
--- a/sql/backends/monet5/bam/85_bam.sql
+++ b/sql/backends/monet5/bam/85_bam.sql
@@ -1,2 +1,12 @@
-CREATE PROCEDURE bamloader(repo string, mode int, num_threads int)
-external name bam.bamloader;
\ No newline at end of file
+-- Loads BAM data into the database. `repo` is handed to bam.bam_loader as
+-- its first argument, `dbschema` selects the schema variant and
+-- `num_threads` is passed through to the loader.
+CREATE PROCEDURE bam_loader(repo STRING, dbschema INT, num_threads INT)
+EXTERNAL NAME bam.bam_loader;
+
+
+-- bam.bam_flag is declared with a :bit result in bam.mal and its C
+-- implementation writes a bit, so the SQL level result type is BOOLEAN.
+CREATE FUNCTION bam_flag(flag INT, name STRING)
+RETURNS BOOLEAN EXTERNAL NAME bam.bam_flag;
+
+CREATE FUNCTION reverse_seq(seq STRING)
+RETURNS STRING EXTERNAL NAME bam.reverse_seq;
+
+CREATE FUNCTION reverse_qual(qual STRING)
+RETURNS STRING EXTERNAL NAME bam.reverse_qual;
diff --git a/sql/backends/monet5/bam/Makefile.ag 
b/sql/backends/monet5/bam/Makefile.ag
--- a/sql/backends/monet5/bam/Makefile.ag
+++ b/sql/backends/monet5/bam/Makefile.ag
@@ -34,7 +34,7 @@ INCLUDES = .. \
 lib__bam = {
        MODULE
        DIR = libdir/monetdb5
-       SOURCES = bamloader.c bamloader.h
+       SOURCES = bam_loader.c bam_loader.h bam_lib.h bam_lib.c
        LIBS = ../../../../monetdb5/tools/libmonetdb5 \
                   ../../../../gdk/libbat \
        $(SAMTOOLS_LIBS)
diff --git a/sql/backends/monet5/bam/bam.mal b/sql/backends/monet5/bam/bam.mal
--- a/sql/backends/monet5/bam/bam.mal
+++ b/sql/backends/monet5/bam/bam.mal
@@ -1,9 +1,22 @@
 module bam;
 
-pattern bamloader(entry:str, mode:int, num_threads:int):void
-address bamloader
+# Loader entry point, bound to the C function bam_loader. Arguments: the
+# repository entry, the database schema selector and the thread count.
+pattern bam_loader(entry:str, dbschema:int, num_threads:int):void
+address bam_loader
 comment "Read the files in the BAM repository, fill and return a 
temp_container accordingly.";
 
 pattern register_table(ticket:lng, table_idx:int)(:any...)
 address register_table
 comment "fill the specified table from the temp_container."
+
+
+# Scalar helpers backing the SQL functions bam_flag, reverse_seq and
+# reverse_qual. Every definition is terminated by ';', like the bam_loader
+# pattern in this module.
+command bam_flag(flag:int, name:str):bit
+address bam_flag
+comment "Get bam flag by name.";
+
+command reverse_seq(seq:str):str
+address reverse_seq
+comment "Reverse DNA sequence.";
+
+command reverse_qual(qual:str):str
+address reverse_qual
+comment "Reverse DNA quality string.";
\ No newline at end of file
diff --git a/sql/backends/monet5/bam/bam_lib.c 
b/sql/backends/monet5/bam/bam_lib.c
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/bam_lib.c
@@ -0,0 +1,54 @@
+#include "monetdb_config.h"
+#include "bam_lib.h"
+
+/*
+ * Evaluates to 1 when bit number k (0 = least significant) of flag is set
+ * and to 0 otherwise. Both arguments are parenthesized so that compound
+ * expressions can be passed safely; k is evaluated twice, so it must not
+ * have side effects.
+ */
+#define kth_bit(flag, k) (((flag) & (1 << (k))) == (1 << (k)))
+
+/*
+ * bam_flag: look up a single bit of a BAM/SAM alignment flag by name.
+ *
+ * ret   out: set to 1 if the named bit is set in *flag, 0 otherwise
+ * flag  in:  the flag value to inspect
+ * name  in:  one of the short names listed in flag_names below
+ *
+ * Returns MAL_SUCCEED on success, or a MAL exception if *name is not a known
+ * flag name (in which case *ret is left untouched).
+ */
+str
+bam_flag(bit *ret, int *flag, str *name)
+{
+    /* Ordered by bit position: the name at index k describes bit k. */
+    static const char *const flag_names[] = {
+        "mult_segm", /* bit 0 */
+        "prop_alig", /* bit 1 */
+        "segm_unma", /* bit 2 */
+        "next_unma", /* bit 3 */
+        "segm_reve", /* bit 4 */
+        "next_reve", /* bit 5 */
+        "firs_segm", /* bit 6 */
+        "last_segm", /* bit 7 */
+        "seco_alig", /* bit 8 */
+        "qual_cont", /* bit 9 */
+        "opti_dupl"  /* bit 10 */
+    };
+    const int num_flags = (int)(sizeof(flag_names) / sizeof(flag_names[0]));
+    int k;
+
+    for(k = 0; k < num_flags; k++)
+    {
+        if(strcmp(*name, flag_names[k]) == 0)
+        {
+            *ret = kth_bit(*flag, k);
+            return MAL_SUCCEED;
+        }
+    }
+    throw(MAL, "bam_flag", "Unknown flag name given: %s\n", *name);
+}
+
+/*
+ * reverse_seq: build the sequence of the opposite strand of *seq, i.e. its
+ * reverse complement (A<->T, C<->G, written back to front).
+ *
+ * ret  out: newly GDKmalloc'ed, NUL-terminated result; NULL on failure.
+ *           NOTE(review): ownership is presumably taken over by the MAL
+ *           interpreter, as for other str results - confirm.
+ * seq  in:  sequence consisting only of the characters A, C, G and T
+ *
+ * Returns MAL_SUCCEED, or a MAL exception when allocation fails or when
+ * *seq contains any other character. The buffer is released before an
+ * exception is raised, so nothing leaks on the error path.
+ */
+str
+reverse_seq(str *ret, str *seq)
+{
+    size_t i;
+    size_t len = strlen(*seq);
+    str result = GDKmalloc(len + 1);
+
+    *ret = NULL;
+    if(result == NULL)
+        throw(MAL, "reverse_seq", MAL_MALLOC_FAIL);
+    for(i = 0; i < len; i++)
+    {
+        char complement;
+
+        switch((*seq)[i])
+        {
+            case 'A': complement = 'T'; break;
+            case 'T': complement = 'A'; break;
+            case 'C': complement = 'G'; break;
+            case 'G': complement = 'C'; break;
+            default :
+                GDKfree(result);
+                throw(MAL, "reverse_seq", "Invalid character found in sequence: '%c'\n", (*seq)[i]);
+        }
+        /* Input position i ends up at the mirrored output position. */
+        result[len - 1 - i] = complement;
+    }
+    result[len] = '\0';
+    *ret = result;
+    return MAL_SUCCEED;
+}
+
+/*
+ * reverse_qual: return the quality string *qual in reverse order.
+ *
+ * ret   out: newly GDKmalloc'ed, NUL-terminated copy of *qual with its
+ *            characters reversed; NULL on failure. The result never aliases
+ *            the input, so releasing one cannot invalidate the other.
+ * qual  in:  quality string to reverse
+ *
+ * Returns MAL_SUCCEED, or a MAL exception when allocation fails.
+ */
+str
+reverse_qual(str *ret, str *qual)
+{
+    size_t i;
+    size_t len = strlen(*qual);
+    str result = GDKmalloc(len + 1);
+
+    *ret = NULL;
+    if(result == NULL)
+        throw(MAL, "reverse_qual", MAL_MALLOC_FAIL);
+    for(i = 0; i < len; i++)
+        result[i] = (*qual)[len - 1 - i];
+    result[len] = '\0';
+    *ret = result;
+    return MAL_SUCCEED;
+}
\ No newline at end of file
diff --git a/sql/backends/monet5/bam/bam_lib.h 
b/sql/backends/monet5/bam/bam_lib.h
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/bam_lib.h
@@ -0,0 +1,20 @@
+#ifndef _BAM_LIB_H
+#define _BAM_LIB_H
+
+#include "mal_exception.h"
+
+#ifdef WIN32
+#ifndef LIBBAM
+#define bam_export extern __declspec(dllimport)
+#else
+#define bam_export extern __declspec(dllexport)
+#endif
+#else
+#define bam_export extern
+#endif
+
+/*
+ * MAL command implementations of the bam module (bound in bam.mal).
+ * Each function writes its result through `ret` and returns MAL_SUCCEED on
+ * success or a MAL exception message on failure.
+ */
+
+/* Tests the bit of *flag that is identified by the flag name *name. */
+bam_export str bam_flag(bit *ret, int *flag, str *name);
+/* Produces a transformed copy of the DNA sequence *seq in *ret. */
+bam_export str reverse_seq(str *ret, str *seq);
+/* Produces the result for the quality string *qual in *ret. */
+bam_export str reverse_qual(str *ret, str *qual);
+
+#endif
diff --git a/sql/backends/monet5/bam/bam_loader.c 
b/sql/backends/monet5/bam/bam_loader.c
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/bam_loader.c
@@ -0,0 +1,1056 @@
+#include "monetdb_config.h"
+#include "bam_loader.h"
+ 
+/*
+ * Conditionally append the string value of a header option to a BAT.
+ *
+ * Evaluates to nonzero only when the tag of `opt` equals `cmp` and the call
+ * to _append_option_to_bat() for (table, col) returns nonzero. Because of
+ * the short-circuiting &&, nothing is appended when the tag does not match.
+ *
+ * This is a macro rather than a static function (it replaced a function of
+ * the same name); `opt` is evaluated twice, so do not pass an expression
+ * with side effects. All arguments are parenthesized in the expansion.
+ */
+#define _append_option_to_bat_cond_str(ret_tc, opt, cmp, table, col, appendErr, flag) \
+    (strcmp((opt)->tag, (cmp)) == 0 && \
+     _append_option_to_bat((ret_tc), (ptr)((opt)->value), (table), (col), (appendErr), (flag)))
+
+
+
+/*
+ * NOTE: Copied directly from miniseed/registrar.c
+ * TODO: Make both miniseed/registrar.c and this file include some generic
+ * library for these kind of structures
+ *
+ * Keeps the BATs and the names of the columns of a single table. This is
+ * the "lower array" level; _temp_container holds one of these per table.
+ */
+typedef struct {
+       /* keeps bats of the columns: lower array */
+       bat *column_bats;
+       /* names of the columns (presumably parallel to column_bats - TODO confirm) */
+       str *column_names;
+} _temp_subcontainer;
+
+/*
+ * NOTE: Copied directly from miniseed/registrar.c
+ * TODO: Make both miniseed/registrar.c and this file include some generic
+ * library for these kind of structures
+ *
+ * Keeps (some) tables of a schema: one _temp_subcontainer per table (the
+ * "higher array") plus the per-table metadata needed to address it.
+ */
+typedef struct {
+       /* schema or vault name */
+       str schema_name;
+       /* keeps tables: higher array */
+       _temp_subcontainer *tables_columns;
+       /* names of tables that are kept in the higher array */
+       str *table_names;
+       /* number of columns in each table in the higher array */
+       int *num_columns;
+       /* presumably the length of the three arrays above - TODO confirm */
+       int num_tables;
+} _temp_container;
+
+/*
+* File format specific structures
+*/
+
+/*
+ * A single option of a BAM header line: a short tag and its value.
+ * The tag is compared with strcmp() elsewhere in this file, so it has to be
+ * NUL-terminated; char[3] leaves room for two characters plus terminator.
+ */
+typedef struct {
+    char tag[3];
+    str value;
+} _bam_header_option;
+
+/*
+ * One line of a BAM header: the tag of the line and the options on it.
+ * NOTE(review): num_options is presumably the number of entries in
+ * `options`, and header_tag presumably two characters plus NUL - confirm.
+ */
+typedef struct {
+    char header_tag[3];
+    _bam_header_option *options;
+    int num_options;
+} _bam_header_line;
+
+
+/* Global vars */
+
+/*
+ * Log file handle. It is kept open in this global while the bam code runs,
+ * since opening and closing the file every time something has to be written
+ * turned out to be very slow.
+ */
+FILE *_logfile = NULL;
+
+/* SQL schema details that are common to every schema */
+/*
+ * For every table X below there are three pieces of data:
+ *   _num_col_X  number of columns,
+ *   _coln_X[]   column names,
+ *   _colt_X[]   column types (TYPE_* constants), in the same order.
+ * Each _num_col_X equals the number of entries in the matching arrays and
+ * has to be kept in sync by hand when a column is added or removed.
+ */
+int _num_col_files               = 5;
+int _num_col_sq                  = 7;
+int _num_col_rg                  = 13;
+int _num_col_pg                  = 6;
+int _num_col_alignments_extra    = 5;
+
+/* table "files" */
+str _coln_files[]  = {"file_id", "file_location", "format_version", 
"sorting_order", "comments"};
+int _colt_files[]  = {TYPE_sht , TYPE_str       , TYPE_flt        , TYPE_str   
    , TYPE_str  };
+
+/* table "sq" */
+str _coln_sq[]  = {"sn"    , "file_id", "ln"    , "as"    , "m5"    , "sp"    
, "ur"    };
+int _colt_sq[]  = {TYPE_str, TYPE_sht , TYPE_int, TYPE_int, TYPE_str, 
TYPE_str, TYPE_str};
+
+/* table "rg" */
+str _coln_rg[]  = {"id"    , "file_id", "cn"    , "ds"    , "dt"    , "fo"    
, "ks"    , "lb"    , "pg"    , "pi"    , "pl"    , "pu"    , "sm"    };
+int _colt_rg[]  = {TYPE_str, TYPE_sht , TYPE_str, TYPE_str, TYPE_int, 
TYPE_str, TYPE_str, TYPE_str, TYPE_str, TYPE_int, TYPE_str, TYPE_str, TYPE_str};
+
+/* table "pg" */
+str _coln_pg[]  = {"id"    , "file_id", "pn"    , "cl"    , "pp"    , "vn"     
 };
+int _colt_pg[]  = {TYPE_str, TYPE_sht , TYPE_str, TYPE_str, TYPE_str, TYPE_str 
 };
+
+/* table "alignments_extra" */
+str _coln_alignments_extra[]  = {"tag"   , "virtual_offset", "file_id", "type" 
 , "value" };
+int _colt_alignments_extra[]  = {TYPE_str, TYPE_lng        , TYPE_sht , 
TYPE_str, TYPE_str};
+
+
+
+
+/*
+ * Forward declarations of the static helpers of this file.
+ * bam_loader() treats the str results of _next_file_id, _init_temp_container,
+ * _loadfile and _insert_into_vault as MAL_SUCCEED on success and as an error
+ * message otherwise.
+ */
+
+/* File format specific functions */
+static str _next_file_id(Client cntxt, MalBlkPtr mb, sht *next_file_id);
+static str _init_temp_container(_temp_container *ret_tc, int dbschema);
+static str _init_temp_container_simple(_temp_container *ret_tc);
+static str _loadfile(str filepath, _temp_container *ret_tc, int dbschema, int 
file_id); /* load file and add contents to ret_tc */
+static str _process_bam_header(int file_id, str header, _temp_container 
*ret_tc);
+static int _append_option_to_bat_cond_lng(_temp_container *ret_tc, 
_bam_header_option *opt, str cmp, int table, int col, int *appendErr, int 
*flag);
+static int _append_option_to_bat_cond_flt(_temp_container *ret_tc, 
_bam_header_option *opt, str cmp, int table, int col, int *appendErr, int 
*flag);
+static int _append_option_to_bat(_temp_container *ret_tc, ptr value, int 
table, int col, int *appendErr, int *flag);
+static str _read_bam_header_line(str *header, _bam_header_line *ret_hl, int 
*eof);
+static void _free_bam_header_line(_bam_header_line *hl);
+static str _process_bam_alignment(int file_id, lng virtual_offset, 
bam_header_t *header, bam1_t *alignment, _temp_container *ret_tc, int schema);
+static int _parse_alignment_str(str *sam_alig, str *dest);
+static int _parse_alignment_lng(str *sam_alig, lng *dest);
+
+/* Generic functions (not specific to the BAM file format) */
+static str _init_temp_subcontainer(_temp_subcontainer *ret_tsc, str 
*col_names, int *col_types, int num_cols);
+static str _append_to_bat(bat cb, ptr val);
+static str _insert_into_vault(Client cntxt, _temp_container* tc);
+static int _read_string_until_delim(str *src, str *ret, char *delims, int 
num_delims);
+static int _parse_lng(str *src, lng *i);
+static void _append_to_log(str mssg);
+static void _free_temp_container(_temp_container* tc);
+
+
+
+
+/* File format specific functions */
+
+/*
+ * bam_loader: MAL pattern behind the SQL procedure bam_loader.
+ *
+ * MAL arguments (read from the stack, starting at pci->retc):
+ *   1. repo_path    str  used as the path of the file to load right now
+ *   2. dbschema     int  schema to use, see bam_loader.h for possible options
+ *   3. num_threads  int  1: no threads, >1: multi-threaded (currently unused)
+ *
+ * The steps are: determine the next file_id, set up a temporary container,
+ * load the file into it and insert its contents into the database. The
+ * first failing step stops the chain; the container is released in every
+ * case.
+ *
+ * Returns MAL_SUCCEED, or a MAL exception that names the failed step and
+ * embeds the message of the underlying error.
+ */
+str
+bam_loader(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
+    /* TODO: This function is now called for BAM files, eventually there will
+     * have to be one generic function that e.g. receives the file format for
+     * which a DV should be initialized as an argument */
+
+    str *repo_path = (str*) getArgReference(stk,pci,pci->retc);
+    int dbschema = *(int*) getArgReference(stk,pci,pci->retc+1);
+    int num_threads = *(int*) getArgReference(stk,pci,pci->retc+2);
+
+    sht next_file_id = 0;
+
+    str cause = NULL;   /* message returned by the failing step */
+    str fmt = NULL;     /* format describing which step failed */
+    _temp_container *tc = (_temp_container *)GDKmalloc(sizeof(_temp_container));
+
+    (void)num_threads;
+
+    if(tc == NULL)
+        throw(MAL, "bam_loader", MAL_MALLOC_FAIL);
+
+    /* Start from an all-zero container, so that _free_temp_container() never
+     * reads indeterminate pointers or counts when a step fails before
+     * _init_temp_container() ran.
+     * NOTE(review): confirm that _free_temp_container() accepts a zeroed
+     * container. */
+    memset(tc, 0, sizeof(_temp_container));
+
+    if((cause = _next_file_id(cntxt, mb, &next_file_id)) != MAL_SUCCEED)
+        fmt = "Error while retrieving next file_id: %s\n";
+    else if((cause = _init_temp_container(tc, dbschema)) != MAL_SUCCEED)
+        fmt = "Error while creating _temp_container: %s\n";
+    else if((cause = _loadfile(*repo_path, tc, dbschema, next_file_id)) != MAL_SUCCEED)
+        fmt = "Error while loading BAM file: %s\n";
+    else if((cause = _insert_into_vault(cntxt, tc)) != MAL_SUCCEED)
+        fmt = "Error inserting data into database: %s\n";
+
+    _free_temp_container(tc);
+
+    /* NOTE(review): `cause` is embedded in the new exception but not released
+     * here; check who owns the message returned by the helpers. */
+    if(fmt != NULL)
+        throw(MAL, "bam_loader", fmt, cause);
+
+    return MAL_SUCCEED;
+}
+
+/* This method retrieves the number of records in the files table and returns 
this number + 1 as the new file_id
+* It would be prettier to use a sequence for this purpose. However, this was 
more complicated to achieve, so therefore
+* the method is implemented like this. Furthermore, after insertions, we would 
need to retrieve these newly inserted 
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to