Changeset: ac06d0a20b67 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=ac06d0a20b67
Added Files:
        sql/backends/monet5/bam/bam_clear_0.sql
        sql/backends/monet5/bam/bam_schema_0.sql
Modified Files:
        sql/backends/monet5/bam/Tests/benchmark2.sql
        sql/backends/monet5/bam/bam_clear_1.sql
        sql/backends/monet5/bam/bam_loader.c
        sql/backends/monet5/bam/bam_loader.h
        sql/backends/monet5/bam/bam_schema_1.sql
Branch: DVframework_bam
Log Message:

Extended bam_loader with some convenient ways to deal with multiple 
schemas. Also implemented a new split schema, including the functionality 
that bam_loader requires to actually work with this schema. This schema will 
be used for testing the DVF.

Furthermore, started to adapt the dvf and dvf_opt to work with bam files 
instead of mseed files. When this is finished, generic methods will be generated.

Another big update is the benchmark2.sql file. It has been completely revised 
and extended following a long discussion with Tobias.


diffs (truncated from 922 to 300 lines):

diff --git a/sql/backends/monet5/bam/Tests/benchmark2.sql 
b/sql/backends/monet5/bam/Tests/benchmark2.sql
--- a/sql/backends/monet5/bam/Tests/benchmark2.sql
+++ b/sql/backends/monet5/bam/Tests/benchmark2.sql
@@ -21,6 +21,8 @@ FROM (
         CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_qual(qual) ELSE 
qual END AS qual
     FROM bam.alignments
     WHERE file_id = 1
+      AND seq  <> '*'
+      AND qual <> '*'
       AND  bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
       AND (bam_flag(flag, 'seco_alig') = False
       OR   bam_flag(flag, 'segm_unma') = True)
@@ -35,6 +37,8 @@ FROM (
         CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_qual(qual) ELSE 
qual END AS qual
     FROM bam.alignments
     WHERE file_id = 1
+      AND seq  <> '*'
+      AND qual <> '*'
       AND  bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
       AND (bam_flag(flag, 'seco_alig') = False
       OR   bam_flag(flag, 'segm_unma') = True)
@@ -47,14 +51,13 @@ FROM (
     ON a1.qname = a2.qname
 ORDER BY qname;
 
-
-
 -- Description:
 -- This query selects fields required by the FASTQ file format (qname, 
seq/seq-reverse, qual/qual-reverse).
 -- It performs a join resulting in a wide table starting with a qname, 
followed by the seq/seq-reverse
 -- and the qual/qual-reverse for both reads of this qname. I.e., every tuple 
in the result contains a read pair. 
 -- The outer query joins two subresults together. Both subresults only contain 
primary and unmapped alignments 
--- from file with file_id=1.
+-- from file with file_id=1. Also, alignments that have not stored their seq 
or qual value are filtered out in the
+-- subresults.
 -- The subresults contain alignments with their 'firs_segm' and 'last_segm' 
flag set respectively. Furthermore,
 -- since we don't want alignments showing up in the result more than once, we 
have to make sure that for every
 -- qname, exactly one left and exactly one right alignment remains in the 
subresults. Therefore, qnames that
@@ -78,6 +81,8 @@ WITH alig AS (
     SELECT qname, flag, seq, qual
     FROM bam.alignments
     WHERE file_id = 1
+      AND seq  <> '*'
+      AND qual <> '*'
       AND  bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
       AND (bam_flag(flag, 'seco_alig') = False
       OR   bam_flag(flag, 'segm_unma') = True)
@@ -404,7 +409,7 @@ FROM (
 
 
 
--------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------- Query 6a 
----------------------------------------------------------------
+------------------------------------------------------------- Query 6 
----------------------------------------------------------------
 
--------------------------------------------------------------------------------------------------------------------------------------
 
 SELECT qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual
@@ -489,7 +494,7 @@ FROM (
     ON  a1.qname = a2.qname
     AND bam_flag(a1.flag, 'firs_segm') = bam_flag(a2.flag, 'firs_segm')
     AND bam_flag(a1.flag, 'last_segm') = bam_flag(a2.flag, 'last_segm')
-    AND (a1.rname <> a2.rname OR a1.pos <> a2.pos);
+    AND (a1.rname <> a2.rname OR a1.pos <> a2.pos)
 ORDER BY qname;
 
 -- Description:
@@ -805,7 +810,7 @@ FROM (
 ) AS alig
 WHERE distance > 0
   AND distance < 100000
-GROUP BY rname;
+ORDER BY rname;
 
 -- Description:
 -- Searches for potential secondary alignment pairs. The subquery joins two 
subresults together. Both subresults must
@@ -866,4 +871,4 @@ ORDER BY rname;
 
 -- Description:
 -- Optimized version of query 12a, by doing some filtering in the WITH clause.
--- Tests will have to prove which of the two queries runs faster.
\ No newline at end of file
+-- Tests will have to prove which of the two queries runs faster.
diff --git a/sql/backends/monet5/bam/bam_clear_0.sql 
b/sql/backends/monet5/bam/bam_clear_0.sql
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/bam_clear_0.sql
@@ -0,0 +1,14 @@
+DROP FUNCTION bam.bam_files_reg;
+DROP FUNCTION bam.bam_sq_reg;
+DROP FUNCTION bam.bam_rg_reg;
+DROP FUNCTION bam.bam_pg_reg;
+DROP FUNCTION bam.bam_alignments_reg;
+DROP FUNCTION bam.bam_alignments_extra_reg;
+
+DROP TABLE bam.alignments_extra;
+DROP TABLE bam.alignments;
+DROP TABLE bam.pg;
+DROP TABLE bam.rg;
+DROP TABLE bam.sq;
+DROP TABLE bam.files;
+DROP SCHEMA bam;
diff --git a/sql/backends/monet5/bam/bam_clear_1.sql 
b/sql/backends/monet5/bam/bam_clear_1.sql
--- a/sql/backends/monet5/bam/bam_clear_1.sql
+++ b/sql/backends/monet5/bam/bam_clear_1.sql
@@ -5,8 +5,11 @@ DROP FUNCTION bam.bam_pg_reg;
 DROP FUNCTION bam.bam_alignments_reg;
 DROP FUNCTION bam.bam_alignments_extra_reg;
 
+DROP VIEW bam.alignments;
+
 DROP TABLE bam.alignments_extra;
-DROP TABLE bam.alignments;
+DROP TABLE bam.alignments_data;
+DROP TABLE bam.alignments_meta;
 DROP TABLE bam.pg;
 DROP TABLE bam.rg;
 DROP TABLE bam.sq;
diff --git a/sql/backends/monet5/bam/bam_loader.c 
b/sql/backends/monet5/bam/bam_loader.c
--- a/sql/backends/monet5/bam/bam_loader.c
+++ b/sql/backends/monet5/bam/bam_loader.c
@@ -31,7 +31,6 @@
         *flag = TRUE;
 
     
-    
 /*
  * NOTE: Copied directly from miniseed/registrar.c
  * TODO: Make both miniseed/registrar.c and this file include some generic 
library for these kind of structures
@@ -70,6 +69,21 @@ typedef struct {
     sht num_options;
 } _bam_header_line;
 
+typedef struct {
+    lng virtual_offset;
+    sht file_id;
+    str qname;
+    sht flag;
+    str rname;
+    int pos;
+    sht mapq;
+    str cigar;
+    str rnext;
+    int pnext;
+    int tlen;
+    str seq;
+    str qual;
+} _alignment;
 
 /* Global vars */
 
@@ -77,11 +91,11 @@ FILE *_logfile = NULL; /* keep _logfile 
                         /* since opening and closing every time something has 
to be written turned out to be very slow */
 
 /* SQL schema details that are common to every schema */
-sht _num_col_files               = 5;
-sht _num_col_sq                  = 7;
-sht _num_col_rg                  = 13;
-sht _num_col_pg                  = 6;
-sht _num_col_alignments_extra    = 5;
+sht _nr_col_files               = 5;
+sht _nr_col_sq                  = 7;
+sht _nr_col_rg                  = 13;
+sht _nr_col_pg                  = 6;
+sht _nr_col_alignments_extra    = 5;
 
 str _coln_files[]  = {"file_id", "file_location", "format_version", 
"sorting_order", "comments"};
 int _colt_files[]  = {TYPE_sht , TYPE_str       , TYPE_str        , TYPE_str   
    , TYPE_str  };
@@ -102,16 +116,18 @@ int _colt_alignments_extra[]  = {TYPE_st
 
 
 /* File format specific functions */
-static lng _get_line_num(str filename);
+static lng _get_line_nr(str filename);
 static lng _get_file_paths(str repo_path, str** ret_file_paths);
 static str _next_file_id(Client cntxt, MalBlkPtr mb, sht *next_file_id);
-static str _init_temp_container(_temp_container *ret_tc, sht dbschema);
-static str _init_temp_container_simple(_temp_container *ret_tc);
+static str _init_tc_full_load(_temp_container *ret_tc);
+static str _init_tc_simple_split(_temp_container *ret_tc);
 static str _loadfile(str filepath, _temp_container *ret_tc, sht dbschema, sht 
file_id); /* load file and add contents to ret_tc */
 static str _parse_bam_header(sht file_id, str header, _temp_container *ret_tc);
 static str _parse_bam_header_line(str *header, _bam_header_line *ret_hl, bit 
*eof);
 static void _free_bam_header_line(_bam_header_line *hl);
-static str _process_bam_alignment(sht file_id, lng virtual_offset, 
bam_header_t *header, bam1_t *alignment, _temp_container *ret_tc, sht schema);
+static str _process_alignment(sht file_id, lng virtual_offset, bam_header_t 
*header, bam1_t *alignment, _temp_container *ret_tc, sht schema);
+static str _store_alignment_full_load(_alignment *a, _temp_container *ret_tc);
+static str _store_alignment_simple_split(_alignment *a, _temp_container 
*ret_tc);
 
 /* Generic functions */
 static str _init_temp_subcontainer(_temp_subcontainer *ret_tsc, str 
*col_names, int *col_types, sht num_cols);
@@ -121,6 +137,18 @@ static void _append_to_log(str mssg);
 static void _free_temp_container(_temp_container* tc);
 
 
+
+/* Schema dependent functionality */
+str (*_tc_initializers[])(_temp_container *) = {&_init_tc_full_load, 
&_init_tc_simple_split};
+str (*_alignment_storers[])(_alignment *, _temp_container *) = 
{&_store_alignment_full_load, &_store_alignment_simple_split};
+
+#define _init_tc(ret_tc, dbschema) (_tc_initializers[(dbschema)]((ret_tc))) 
+#define _store_alignment(a, ret_tc, dbschema) 
(_alignment_storers[(dbschema)]((a), (ret_tc)))
+
+sht nr_alignment_tables[] = {1, 2};
+
+
+
 /* File format specific functions */
 
 str 
@@ -136,23 +164,33 @@ bam_loader(Client cntxt, MalBlkPtr mb, M
     str *file_paths;
     int nr_files, k;
     
+    
+    if(dbschema < 0 || dbschema > 1)
+        throw(MAL, "bam_loader", "Invalid argument received for dbschema: 
%d\n", dbschema);
+        
     /* Get the file list */
     if((nr_files = _get_file_paths(*repo_path, &file_paths)) < 0)
         throw(MAL, "bam_loader", "Couldn't retrieve file list from file 
'%s'\n", *repo_path);
     
     for(k=0; k<nr_files; k++)
     {
+        int bufsize = 255;
+        str buf = GDKmalloc(bufsize * sizeof(char));
+        
+        int start = GDKms();
+        float duration_sec;
+        
         str file_path = file_paths[k];
         sht next_file_id = 0;
         str err1 = NULL, err2 = NULL;
         _temp_container *tc = (_temp_container 
*)GDKmalloc(sizeof(_temp_container));
     
-        if(tc == NULL)
+        if(buf == NULL || tc == NULL)
             throw(MAL, "bam_loader", MAL_MALLOC_FAIL);
         
         if((err1 = _next_file_id(cntxt, mb, &next_file_id)) != MAL_SUCCEED)
             err2 = "Error while retrieving next file_id: %s\n";
-        else if((err1 = _init_temp_container(tc, dbschema)) != MAL_SUCCEED)
+        else if((err1 = _init_tc(tc, dbschema)) != MAL_SUCCEED)
             err2 = "Error while creating _temp_container: %s\n";
         else if((err1 = _loadfile(file_path, tc, dbschema, next_file_id)) != 
MAL_SUCCEED)
         {
@@ -171,8 +209,14 @@ bam_loader(Client cntxt, MalBlkPtr mb, M
             throw(MAL, "bam_loader", err2, err1);
         }
         _free_temp_container(tc);
+        
+        duration_sec = (GDKms() - start) / 1000.0f;
+        snprintf(buf, bufsize, "Processed file '%s' in %f seconds = %f minutes 
= %f hours", file_path, duration_sec, duration_sec/60, duration_sec/3600);
+        _append_to_log(buf);
+        GDKfree(buf);
     }
 
+    fclose(_logfile);
     GDKfree(file_paths);
         
     (void)nr_threads;
@@ -190,7 +234,7 @@ bam_loader(Client cntxt, MalBlkPtr mb, M
  * not have a newline at the end.
  */
 static lng 
-_get_line_num(str filename)
+_get_line_nr(str filename)
 {
        FILE *f;
        char c;
@@ -244,7 +288,7 @@ static lng
                        /* each line is a file_path */
 
                        FILE *file;
-                       num_file_paths = _get_line_num(repo_path);
+                       num_file_paths = _get_line_nr(repo_path);
                        printf("num_file_paths: "LLFMT"\n", num_file_paths);
 
                        *ret_file_paths = file_paths = (str*) 
GDKmalloc(num_file_paths * sizeof(str));
@@ -324,22 +368,12 @@ static str
     return MAL_SUCCEED;
 }
 
+
 /*
-* The dbschema argument will determine how the _temp_container structure will 
be filled
+* Fill the _temp_container with the fully loaded schema
 */
 static str 
-_init_temp_container(_temp_container *ret_tc, sht dbschema)
-{
-    if(dbschema == DBSCHEMA_SIMPLE)
-        return _init_temp_container_simple(ret_tc);
-    throw(MAL, "_init_temp_container", "No temp container initialization 
method exists for dbschema %d", dbschema);
-}
-
-/*
-* Fill the _temp_container with the simple schema
-*/
-static str 
-_init_temp_container_simple(_temp_container *ret_tc)
+_init_tc_full_load(_temp_container *ret_tc)
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to