Changeset: 56d6b80b515b for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=56d6b80b515b
Modified Files:
        sql/backends/monet5/bam/Tests/benchmark2.sql
        sql/backends/monet5/bam/bam_lib.c
Branch: DVframework_bam
Log Message:

* Updated seq reverse function in bam library
* Revised and extended benchmark2.sql


diffs (truncated from 993 to 300 lines):

diff --git a/sql/backends/monet5/bam/Tests/benchmark2.sql 
b/sql/backends/monet5/bam/Tests/benchmark2.sql
--- a/sql/backends/monet5/bam/Tests/benchmark2.sql
+++ b/sql/backends/monet5/bam/Tests/benchmark2.sql
@@ -2,35 +2,69 @@
 ------------------------------------------------------------- Query 1a 
---------------------------------------------------------------
 
--------------------------------------------------------------------------------------------------------------------------------------
 
-SELECT qname,
-    CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_seq(seq)   ELSE seq  
END AS seq,
-    CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_qual(qual) ELSE qual 
END AS qual
-FROM bam.alignments
-WHERE   file_id = 1
-    AND bam_flag(flag, 'seco_alig') = False
-    AND qname IN (
-        SELECT qname
-        FROM bam.alignments
-        WHERE file_id = 1
-          AND bam_flag(flag, 'seco_alig') = False
-        GROUP BY qname
-        HAVING COUNT(*) = 2
-           AND SUM(bam_flag(flag, 'firs_segm')) = 1
-           AND SUM(bam_flag(flag, 'last_segm')) = 1
-    )
-ORDER BY qname, bam_flag(flag, 'last_segm');
+WITH qnames AS (
+    SELECT qname
+    FROM bam.alignments
+    WHERE file_id = 1
+      AND  bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
+      AND (bam_flag(flag, 'seco_alig') = False
+      OR   bam_flag(flag, 'segm_unma') = True)
+    GROUP BY qname
+    HAVING COUNT(*) = 2
+       AND SUM(bam_flag(flag, 'firs_segm')) = 1
+       AND SUM(bam_flag(flag, 'last_segm')) = 1
+)
+SELECT a1.qname AS qname, a1.seq AS seq1, a1.qual AS qual1, a2.seq AS seq2, 
a2.qual AS qual2
+FROM (
+    SELECT qname,
+        CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_seq(seq)   ELSE seq 
 END AS seq,
+        CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_qual(qual) ELSE 
qual END AS qual
+    FROM bam.alignments
+    WHERE file_id = 1
+      AND  bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
+      AND (bam_flag(flag, 'seco_alig') = False
+      OR   bam_flag(flag, 'segm_unma') = True)
+      AND qname IN (
+            SELECT *
+            FROM qnames
+      )
+      AND  bam_flag(flag, 'firs_segm') = True
+) AS a1 JOIN (
+    SELECT qname,
+        CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_seq(seq)   ELSE seq 
 END AS seq,
+        CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_qual(qual) ELSE 
qual END AS qual
+    FROM bam.alignments
+    WHERE file_id = 1
+      AND  bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
+      AND (bam_flag(flag, 'seco_alig') = False
+      OR   bam_flag(flag, 'segm_unma') = True)
+      AND qname IN (
+            SELECT *
+            FROM qnames
+      )
+      AND  bam_flag(flag, 'last_segm') = True
+) AS a2 
+    ON a1.qname = a2.qname
+ORDER BY qname;
+
+
 
 -- Description:
 -- This query selects fields required by the FASTQ file format (qname, 
seq/seq-reverse, qual/qual-reverse).
--- The result of this query has the following properties:
--- * It is calculated over a single BAM file
--- * It only considers primary alignments
--- * The subquery selects only those qnames from file with file_id=1 that have 
exactly two primary alignments: 
---   one left and one right alignment
--- * It is ordered by qname and then by the last_segm flag. The subquery 
imposes the invariant on the outer query that 
---   only qnames are selected that have exactly one left primary read and one 
right primary read. The result of this
---   ordering is then that alignments of the same read are stored beneath 
eachother; the left segment first, and then the
---   right segment.
+-- It performs a join resulting in a wide table starting with a qname, 
followed by the seq/seq-reverse
+-- and the qual/qual-reverse for both reads of this qname. I.e., every tuple 
in the result contains a read pair. 
+-- The outer query joins two subresults together. Both subresults only contain 
primary and unmapped alignments 
+-- from file with file_id=1.
+-- The subresults contain alignments with their 'firs_segm' and 'last_segm' 
flag set respectively. Furthermore,
+-- since we don't want alignments showing up in the result more than once, we 
have to make sure that for every
+-- qname, exactly one left and exactly one right alignment remains in the 
subresults. Therefore, qnames that
+-- do not fulfill the conditions in the WITH clause are excluded from the 
subresults. 
+-- The WITH clause first filters alignments that are not primary and not 
unmapped and also alignments that have
+-- their 'firs_segm' flag equal to their 'last_segm' flag. It then only 
selects those qnames for which 
+-- sum(nr_primary) + sum(nr_unmapped) = 2. Furthermore, from these 2 
alignments, one has to be flagged as 
+-- 'firs_segm' and one has to be flagged as'last_segm'. This makes sure that 
only qnames will result that
+-- have exactly one matching alignment for both the left side and the right 
side of the result.
+-- The result is ordered by qname.
 
 
 
@@ -40,28 +74,45 @@ ORDER BY qname, bam_flag(flag, 'last_seg
 ------------------------------------------------------------- Query 1b 
---------------------------------------------------------------
 
--------------------------------------------------------------------------------------------------------------------------------------
 
-WITH prim_reve AS (
-    SELECT qname, flag,
-        CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_seq(seq)   ELSE seq 
 END AS seq,
-        CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_qual(qual) ELSE 
qual END AS qual
+WITH alig AS (
+    SELECT qname, flag, seq, qual
     FROM bam.alignments
     WHERE file_id = 1
-      AND bam_flag(flag, 'seco_alig') = False
-)
-SELECT qname, seq, qual
-FROM prim_reve
-WHERE qname IN (
+      AND  bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
+      AND (bam_flag(flag, 'seco_alig') = False
+      OR   bam_flag(flag, 'segm_unma') = True)
+), qnames AS (
     SELECT qname
-    FROM prim_reve
+    FROM alig
     GROUP BY qname
     HAVING COUNT(*) = 2
        AND SUM(bam_flag(flag, 'firs_segm')) = 1
        AND SUM(bam_flag(flag, 'last_segm')) = 1
+), alig_proj AS (
+    SELECT qname, flag,
+        CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_seq(seq)   ELSE seq 
 END AS seq,
+        CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_qual(qual) ELSE 
qual END AS qual
+    FROM alig
+    WHERE qname IN (
+        SELECT *
+        FROM qnames
+    )
 )
-ORDER BY qname, bam_flag(flag, 'last_segm');
+SELECT a1.qname AS qname, a1.seq AS seq1, a1.qual AS qual1, a2.seq AS seq2, 
a2.qual AS qual2
+FROM (
+    SELECT *
+    FROM alig_proj
+    WHERE bam_flag(flag, 'firs_segm') = True
+) AS a1 JOIN (
+    SELECT *
+    FROM alig_proj
+    WHERE bam_flag(flag, 'last_segm') = True
+) AS a2 
+    ON a1.qname = a2.qname
+ORDER BY qname;
 
 -- Description:
--- Optimized version of query 1a, by doing the filtering of primary alignments 
for file with file_id=1 just once.
+-- Optimized version of query 1a, by doing the filtering and the projection 
just once in the WITH clauses.
 -- Tests will have to prove which of the two queries runs faster.
 
 
@@ -76,32 +127,47 @@ ORDER BY qname, bam_flag(flag, 'last_seg
 
--------------------------------------------------------------------------------------------------------------------------------------
 ------------------------------------------------------------- Query 2a 
---------------------------------------------------------------
 
--------------------------------------------------------------------------------------------------------------------------------------
-SELECT a2.pos - (a1.pos + seq_length(a1.cigar)) AS distance, COUNT(*) AS 
nr_alignments
+
+SELECT 
+    CASE WHEN a1.pos < a2.pos 
+         THEN a2.pos - (a1.pos + seq_length(a1.cigar))
+         ELSE a1.pos - (a2.pos + seq_length(a2.cigar))
+    END AS distance, 
+    COUNT(*) AS nr_alignments
 FROM (
-    SELECT qname, pos, cigar
+    SELECT qname, rname, pos, cigar
     FROM bam.alignments
     WHERE file_id = 1
+      AND bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
       AND bam_flag(flag, 'seco_alig') = False
       AND bam_flag(flag, 'firs_segm') = True
 ) AS a1 JOIN (
-    SELECT qname, pos
+    SELECT qname, rname, pos, cigar
     FROM bam.alignments
     WHERE file_id = 1
+      AND bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
       AND bam_flag(flag, 'seco_alig') = False
       AND bam_flag(flag, 'last_segm') = True
-) AS a2 ON a1.qname = a2.qname
+) AS a2 
+    ON a1.qname = a2.qname
+   AND a1.rname = a2.rname
 GROUP BY distance
 ORDER BY nr_alignments DESC;
 
 -- Description:
 -- This query calculates a histogram that displays the number of read pairs 
with a certain distance.
 -- The outer query joins two subresults together. These subresults contain all 
primary alignments for
--- the first segment and the last segment respectively. If the query is 
consistent according to the 
--- consistency check in query 4, there is a unique primary alignment for both 
sides (firs_segm and last_segm).
--- In the case there is a primary alignment in just one of the two subresults, 
the join will drop this
--- since the join predicate won't be fulfilled. 
--- The distance of each record in this joined table can now be calculated. 
Furthermore, grouping can be 
--- done on this distance to create the histogram.
+-- the first segment and the last segment respectively. Note that only 
alignments are considered that have
+-- exactly one flag of the flags 'firs_segm' and 'last_segm' set.
+-- If the query is consistent according to the consistency check in query 4, 
there is a unique primary 
+-- alignment for both sides (firs_segm and last_segm). In the case there is a 
primary alignment in just 
+-- one of the two subresults, the join will drop this since the join predicate 
won't be fulfilled. 
+-- Furtermore, only alignments are joined that are in the same chromosome, 
since the distance between two
+-- alignments is not defined if they are not in the same chromosome.
+-- The distance of each record in this joined table can now be calculated. A 
CASE statement is used since
+-- there is no guarantee on whether or not the alignments flagged as 
first_segm (last_segm) are the left (right)
+-- alignments. 
+-- Furthermore, grouping is done on this distance to create the histogram.
 -- The result is ordered by the nr of alignments in a descending fashion, such 
that the most occurring
 -- distances will be presented first.
 
@@ -113,22 +179,30 @@ ORDER BY nr_alignments DESC;
 ------------------------------------------------------------- Query 2b 
----------------------------------------------------------------
 
--------------------------------------------------------------------------------------------------------------------------------------
 
-WITH prim AS (
-    SELECT *
+WITH alig AS (
+    SELECT qname, flag, rname, pos, cigar
     FROM bam.alignments
     WHERE file_id = 1
+      AND bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
       AND bam_flag(flag, 'seco_alig') = False
 )
-SELECT a2.pos - (a1.pos + seq_length(a1.cigar)) AS distance, COUNT(*) AS 
nr_alignments
+SELECT 
+    CASE WHEN a1.pos < a2.pos 
+         THEN a2.pos - (a1.pos + seq_length(a1.cigar))
+         ELSE a1.pos - (a2.pos + seq_length(a2.cigar))
+    END AS distance, 
+    COUNT(*) AS nr_alignments
 FROM (
-    SELECT qname, pos, cigar
-    FROM prim
+    SELECT qname, rname, pos, cigar
+    FROM alig
     WHERE bam_flag(flag, 'firs_segm') = True
 ) AS a1 JOIN (
-    SELECT qname, pos
-    FROM prim
+    SELECT qname, rname, pos, cigar
+    FROM alig
     WHERE bam_flag(flag, 'last_segm') = True
-) AS a2 ON a1.qname = a2.qname
+) AS a2 
+    ON a1.qname = a2.qname
+   AND a1.rname = a2.rname
 GROUP BY distance
 ORDER BY nr_alignments DESC;
 
@@ -149,14 +223,14 @@ ORDER BY nr_alignments DESC;
 ------------------------------------------------------------- Query 3 
----------------------------------------------------------------
 
--------------------------------------------------------------------------------------------------------------------------------------
 
-SELECT *
+SELECT qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual
 FROM bam.alignments 
 WHERE qname IN (
     SELECT qname
     FROM bam.alignments
     WHERE file_id = 1
     GROUP BY qname
-    HAVING SUM(bam_flag(flag, 'firs_segm')) = 0 
+    HAVING SUM(bam_flag(flag, 'firs_segm')) = 0
         OR SUM(bam_flag(flag, 'last_segm')) = 0
 )
 ORDER BY qname;
@@ -164,7 +238,7 @@ ORDER BY qname;
 -- Description:
 -- This query checks a consistency aspect of the BAM file with file_id=1. It 
returns all alignments
 -- for qnames that have either no alignment flagged as first segment or no 
alignment flagged as last segment.
--- The inner query groups by qname and selects the inconsistent ones. The 
outer query then selects all attributes
+-- The inner query groups by qname and selects the inconsistent ones. The 
outer query then selects all relevant attributes
 -- from all alignments for these qnames.
 -- To provide the user with a convenient overview of the inconsistencies, the 
result is ordered by qname.
 
@@ -181,17 +255,20 @@ ORDER BY qname;
 ------------------------------------------------------------- Query 4 
----------------------------------------------------------------
 
--------------------------------------------------------------------------------------------------------------------------------------
 
-SELECT *
+SELECT qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual
 FROM bam.alignments
 WHERE qname IN (
     SELECT qname
-    FROM bam.alignments
-    WHERE file_id=1
-      AND bam_flag(flag, 'segm_unma') = False
-    GROUP BY qname
-    HAVING CAST((SUM(bam_flag(flag, 'firs_segm')) > 0) AS SMALLINT) + 
-           CAST((SUM(bam_flag(flag, 'last_segm')) > 0) AS SMALLINT) <> 
-           (COUNT(*) - SUM(bam_flag(flag, 'seco_alig')))
+    FROM (
+        SELECT qname, bam_flag(flag, 'seco_alig') AS seco_alig, bam_flag(flag, 
'segm_unma') AS segm_unma, 
+            bam_flag(flag, 'firs_segm') AS firs_segm
+        FROM bam.alignments
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to