Changeset: 872efa4d660f for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=872efa4d660f
Added Files:
sql/backends/monet5/bam/Tests/benchmark.sql
Modified Files:
sql/backends/monet5/bam/85_bam.sql
sql/backends/monet5/bam/bam.mal
sql/backends/monet5/bam/bam_lib.c
sql/backends/monet5/bam/bam_lib.h
sql/backends/monet5/bam/bam_loader.c
sql/backends/monet5/bam/bam_schema_1.sql
Branch: DVframework_bam
Log Message:
Fixed a segmentation fault that occurred when calculating new file_id when
database is empty.
Extended and corrected bam_lib.
Added benchmarks. SQL queries make use of the UDFs in bam_lib. Extensive
explanation of the queries is also added. The queries will be tested soon. Some
of them are known to crash mclient.
H: segmentation fault that occurred when calculating new file_id when database
is empty. Extended and corrected bam_lib.
Enter commit message. Lines beginning with 'HG:' are removed.
diffs (truncated from 473 to 300 lines):
diff --git a/sql/backends/monet5/bam/85_bam.sql
b/sql/backends/monet5/bam/85_bam.sql
--- a/sql/backends/monet5/bam/85_bam.sql
+++ b/sql/backends/monet5/bam/85_bam.sql
@@ -2,11 +2,14 @@ CREATE PROCEDURE bam_loader(repo STRING,
EXTERNAL NAME bam.bam_loader;
-CREATE FUNCTION bam_flag(flag INT, name STRING)
-RETURNS STRING EXTERNAL NAME bam.bam_flag;
+CREATE FUNCTION bam_flag(flag SMALLINT, name STRING)
+RETURNS BOOLEAN EXTERNAL NAME bam.bam_flag;
CREATE FUNCTION reverse_seq(seq STRING)
RETURNS STRING EXTERNAL NAME bam.reverse_seq;
CREATE FUNCTION reverse_qual(qual STRING)
RETURNS STRING EXTERNAL NAME bam.reverse_qual;
+
+CREATE FUNCTION seq_length(cigar STRING)
+RETURNS INT EXTERNAL NAME bam.seq_length;
\ No newline at end of file
diff --git a/sql/backends/monet5/bam/Tests/benchmark.sql
b/sql/backends/monet5/bam/Tests/benchmark.sql
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/Tests/benchmark.sql
@@ -0,0 +1,306 @@
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 1
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT qname,
+ CASE WHEN bam_flag("flag", 'segm_reve') THEN reverse_seq("seq") ELSE
"seq" END,
+ CASE WHEN bam_flag("flag", 'segm_reve') THEN reverse_qual("qual") ELSE
"qual" END
+FROM bam.alignments
+WHERE file_id = 2
+ AND bam_flag("flag", 'seco_alig') = FALSE
+ AND bam_flag("flag", 'segm_unma') = TRUE
+ AND qname IN (
+ SELECT qname
+ FROM bam.alignments
+ WHERE bam_flag("flag", 'seco_alig') = FALSE
+ GROUP BY qname
+ HAVING COUNT(*) = 2
+ AND SUM(bam_flag("flag", 'firs_segm')) = 1
+ AND SUM(bam_flag("flag", 'last_segm')) = 1
+ )
+ORDER BY qname, bam_flag("flag", 'last_segm');
+
+-- Description:
+-- This query selects fields required by the FASTQ file format (qname,
seq/seq-reverse, qual/qual-reverse).
+-- The result of this query has the following properties:
+-- * It is calculated over a single BAM file
+-- * It only considers primary alignments
+-- * It only considers unmapped alignments
+-- * The subquery selects only those qnames that have exactly two primary
alignments: one left and one right alignment
+-- * It is ordered by qname and then by the last_segm flag. The subquery
imposes the invariant on the outer query that
+-- only qnames are selected that have exactly one left primary read and one
right primary read. The result of this
+-- order is then that alignments of the same read are stored beneath
eachother; the left segment first, and then the
+-- right segment.
+
+
+
+
+
+
+
+
+
+
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 2
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT a2.pos - (a1.pos + seq_length(a1.cigar)) AS distance, COUNT(*) AS
nr_alignments
+FROM (
+ SELECT *
+ FROM bam.alignments
+ WHERE file_id = 1
+ AND bam_flag(flag, 'seco_alig') = False
+ AND bam_flag(flag, 'firs_segm') = True
+) AS a1 JOIN (
+ SELECT *
+ FROM bam.alignments
+ WHERE file_id = 1
+ AND bam_flag(flag, 'seco_alig') = False
+ AND bam_flag(flag, 'last_segm') = True
+) AS a2 ON a1.qname = a2.qname
+GROUP BY distance;
+
+-- Description:
+-- This query calculates a histogram that displays the number of read pairs
with a certain distance.
+-- The outer query joins two subresults together. These subresults contain all
primary alignments for
+-- the first segment and the last segment respectively. If the query is
consistent according to the
+-- consistency check in query 4, there is a unique primary alignment for both
sides (firs_segm and last_segm).
+-- In the case there is a primary alignment in just one of the two subresults,
the join will drop this
+-- since the join predicate won't be fulfilled.
+-- The distance of each record in this joined table can now be calculated.
Furthermore, grouping can be
+-- done on this distance to create the histogram.
+
+
+
+
+
+
+
+
+
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 3
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT *
+FROM bam.alignments
+WHERE qname IN (
+ SELECT qname
+ FROM bam.alignments
+ WHERE file_id = 1
+ GROUP BY qname
+ HAVING SUM(bam_flag(flag, 'firs_segm')) = 0
+ OR SUM(bam_flag(flag, 'last_segm')) = 0
+)
+ORDER BY qname;
+
+-- Description:
+-- This query checks a consistency aspect of the BAM file with file_id=1. It
returns all alignments
+-- for qnames that have either no alignment flagged as first segment or no
alignment flagged as last segment.
+-- The inner query groups by qname and selects the inconsistent ones. The
outer query then selects all attributes
+-- from all alignments for these qnames.
+-- To provide the user with a convenient overview of the inconsistencies, the
result is ordered by qname.
+
+
+
+
+
+
+
+
+
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 4
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT *
+FROM bam.alignments
+WHERE qname IN (
+ SELECT qname,
+ SUM(bam_flag(flag, 'firs_segm')) AS sum_firs_segm,
+ SUM(bam_flag(flag, 'last_segm')) AS sum_last_segm,
+ COUNT(*) - SUM(bam_flag(flag, 'seco_alig')) AS sum_prim_alig
+ FROM bam.alignments
+ WHERE file_id=1
+ AND bam_flag(flag, 'segm_unma') = False
+ GROUP BY qname
+ HAVING (sum_firs_segm > 0 AND sum_last_segm > 0 AND sum_prim_alig <> 2)
+ OR (sum_firs_segm > 0 AND sum_prim_alig <> 1)
+ OR (sum_last_segm > 0 AND sum_prim_alig <> 1)
+)
+ORDER BY qname;
+
+-- Description:
+-- This query also checks a consistency aspect of the BAM file with file_id=1.
Every qname consists of two reads.
+-- For both reads it must hold that all its alignments must either be unmapped
or there must be exactly one primary
+-- alignment.
+-- In the inner query, the alignments with the unmapped flag set are thrown
away directly. The alignments that
+-- then remain are grouped by qname. A qname is then considered to be
inconsistent if it fulfills one of these cases:
+-- * There exists a left alignment and a right alignment in the group and the
number of primary alignments <> 2
+-- * There exists a left alignment and no right alignment in the group and the
number of primary alignments <> 1
+-- * There exists no left alignment and a right alignment in the group and the
number of primary alignments <> 1
+-- Exactly these three cases can be seen in the HAVING clause of the inner
query.
+-- The query assumes that there won't exist alignments with flags firs_segm =
last_segm = 0.
+-- The outer query again selects all attributes of all alignments in the
inconsistent qnames.
+-- The result is again ordered by qname for convenient displaying.
+
+
+
+
+
+
+
+
+
+
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 5
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT *
+FROM alignments
+WHERE qname IN (
+ SELECT qname
+ FROM bam.alignments
+ WHERE file_id = 1
+ INTERSECT
+ SELECT qname
+ FROM bam.alignments
+ WHERE file_id = 2
+);
+
+-- Description:
+-- Does a set intersection on BAM files with file_id=1 and file_id=2, based on
qname.
+
+
+
+
+
+
+
+
+
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 6
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT *
+FROM alignments
+WHERE qname IN (
+ SELECT qname
+ FROM bam.alignments
+ WHERE file_id = 1
+ EXCEPT
+ SELECT qname
+ FROM bam.alignments
+ WHERE file_id = 2
+);
+
+-- Description:
+-- Does a set minus on BAM files with file_id=1 and file_id=2, based on qname.
+
+
+
+
+
+
+
+
+
+
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 7
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT *
+FROM (
+ SELECT *
+ FROM
+ bam.alignments
+ WHERE file_id = 1
+ AND bam_flag(flag, 'seco_alig') = FALSE
+) AS a1 JOIN (
+ SELECT *
+ FROM
+ bam.alignments
+ WHERE file_id = 2
+ AND bam_flag(flag, 'seco_alig') = FALSE
+) AS a2
+ ON a1.qname = a2.qname
+ AND bam_flag(a1.flag, 'firs_segm') = bam_flag(a2.flag, 'firs_segm')
+ AND bam_flag(a1.flag, 'last_segm') = bam_flag(a2.flag, 'last_segm')
+ AND a1.pos <> a2.pos
+
+-- Description:
+-- Joins primary alignments from two files (file_id=1 and file_id=2) together
if these alignments have the same qname but
+-- are mapped to different positions.
+-- The first subquery selects all primary alignments from file with file_id=1
+-- The second subquery selects all primary alignments from file with file_id=2
+-- The outer query then joins two alignments from these two subresults
together under the following conditions:
+-- * The qnames of the alignments are equal
+-- * The alignments are both either left or right alignments
+-- * The positions of the alignments aren't equal
+
+
+
+
+
+
+
+
+
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 8
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT *
+FROM alignments
+WHERE file_id = 1
+ AND rname = 'chr22'
+ AND 10 >= pos
+ AND 10 < pos + seq_length(cigar);
+
+-- Description:
+-- Selects all alignments with file_id=1 that overlap position 10 in
chromosome "chr22"
+
+
+
+
+
+
+
+
+
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list