Changeset: 56d6b80b515b for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=56d6b80b515b
Modified Files:
sql/backends/monet5/bam/Tests/benchmark2.sql
sql/backends/monet5/bam/bam_lib.c
Branch: DVframework_bam
Log Message:
* Updated seq reverse function in bam library
* Revised and extended benchmark2.sql
diffs (truncated from 993 to 300 lines):
diff --git a/sql/backends/monet5/bam/Tests/benchmark2.sql
b/sql/backends/monet5/bam/Tests/benchmark2.sql
--- a/sql/backends/monet5/bam/Tests/benchmark2.sql
+++ b/sql/backends/monet5/bam/Tests/benchmark2.sql
@@ -2,35 +2,69 @@
------------------------------------------------------------- Query 1a
---------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------
-SELECT qname,
- CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_seq(seq) ELSE seq
END AS seq,
- CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_qual(qual) ELSE qual
END AS qual
-FROM bam.alignments
-WHERE file_id = 1
- AND bam_flag(flag, 'seco_alig') = False
- AND qname IN (
- SELECT qname
- FROM bam.alignments
- WHERE file_id = 1
- AND bam_flag(flag, 'seco_alig') = False
- GROUP BY qname
- HAVING COUNT(*) = 2
- AND SUM(bam_flag(flag, 'firs_segm')) = 1
- AND SUM(bam_flag(flag, 'last_segm')) = 1
- )
-ORDER BY qname, bam_flag(flag, 'last_segm');
+WITH qnames AS (
+ SELECT qname
+ FROM bam.alignments
+ WHERE file_id = 1
+ AND bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
+ AND (bam_flag(flag, 'seco_alig') = False
+ OR bam_flag(flag, 'segm_unma') = True)
+ GROUP BY qname
+ HAVING COUNT(*) = 2
+ AND SUM(bam_flag(flag, 'firs_segm')) = 1
+ AND SUM(bam_flag(flag, 'last_segm')) = 1
+)
+SELECT a1.qname AS qname, a1.seq AS seq1, a1.qual AS qual1, a2.seq AS seq2,
a2.qual AS qual2
+FROM (
+ SELECT qname,
+ CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_seq(seq) ELSE seq
END AS seq,
+ CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_qual(qual) ELSE
qual END AS qual
+ FROM bam.alignments
+ WHERE file_id = 1
+ AND bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
+ AND (bam_flag(flag, 'seco_alig') = False
+ OR bam_flag(flag, 'segm_unma') = True)
+ AND qname IN (
+ SELECT *
+ FROM qnames
+ )
+ AND bam_flag(flag, 'firs_segm') = True
+) AS a1 JOIN (
+ SELECT qname,
+ CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_seq(seq) ELSE seq
END AS seq,
+ CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_qual(qual) ELSE
qual END AS qual
+ FROM bam.alignments
+ WHERE file_id = 1
+ AND bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
+ AND (bam_flag(flag, 'seco_alig') = False
+ OR bam_flag(flag, 'segm_unma') = True)
+ AND qname IN (
+ SELECT *
+ FROM qnames
+ )
+ AND bam_flag(flag, 'last_segm') = True
+) AS a2
+ ON a1.qname = a2.qname
+ORDER BY qname;
+
+
-- Description:
-- This query selects fields required by the FASTQ file format (qname,
seq/seq-reverse, qual/qual-reverse).
--- The result of this query has the following properties:
--- * It is calculated over a single BAM file
--- * It only considers primary alignments
--- * The subquery selects only those qnames from file with file_id=1 that have
exactly two primary alignments:
--- one left and one right alignment
--- * It is ordered by qname and then by the last_segm flag. The subquery
imposes the invariant on the outer query that
--- only qnames are selected that have exactly one left primary read and one
right primary read. The result of this
--- ordering is then that alignments of the same read are stored beneath
eachother; the left segment first, and then the
--- right segment.
+-- It performs a join resulting in a wide table starting with a qname,
followed by the seq/seq-reverse
+-- and the qual/qual-reverse for both reads of this qname. I.e., every tuple
in the result contains a read pair.
+-- The outer query joins two subresults together. Both subresults only contain
primary and unmapped alignments
+-- from file with file_id=1.
+-- The subresults contain alignments with their 'firs_segm' and 'last_segm'
flag set respectively. Furthermore,
+-- since we don't want alignments showing up in the result more than once, we
have to make sure that for every
+-- qname, exactly one left and exactly one right alignment remains in the
subresults. Therefore, qnames that
+-- do not fulfill the conditions in the WITH clause are excluded from the
subresults.
+-- The WITH clause first filters alignments that are not primary and not
unmapped and also alignments that have
+-- their 'firs_segm' flag equal to their 'last_segm' flag. It then only
selects those qnames for which
+-- sum(nr_primary) + sum(nr_unmapped) = 2. Furthermore, from these 2
alignments, one has to be flagged as
+-- 'firs_segm' and one has to be flagged as'last_segm'. This makes sure that
only qnames will result that
+-- have exactly one matching alignment for both the left side and the right
side of the result.
+-- The result is ordered by qname.
@@ -40,28 +74,45 @@ ORDER BY qname, bam_flag(flag, 'last_seg
------------------------------------------------------------- Query 1b
---------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------
-WITH prim_reve AS (
- SELECT qname, flag,
- CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_seq(seq) ELSE seq
END AS seq,
- CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_qual(qual) ELSE
qual END AS qual
+WITH alig AS (
+ SELECT qname, flag, seq, qual
FROM bam.alignments
WHERE file_id = 1
- AND bam_flag(flag, 'seco_alig') = False
-)
-SELECT qname, seq, qual
-FROM prim_reve
-WHERE qname IN (
+ AND bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
+ AND (bam_flag(flag, 'seco_alig') = False
+ OR bam_flag(flag, 'segm_unma') = True)
+), qnames AS (
SELECT qname
- FROM prim_reve
+ FROM alig
GROUP BY qname
HAVING COUNT(*) = 2
AND SUM(bam_flag(flag, 'firs_segm')) = 1
AND SUM(bam_flag(flag, 'last_segm')) = 1
+), alig_proj AS (
+ SELECT qname, flag,
+ CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_seq(seq) ELSE seq
END AS seq,
+ CASE WHEN bam_flag(flag, 'segm_reve') THEN reverse_qual(qual) ELSE
qual END AS qual
+ FROM alig
+ WHERE qname IN (
+ SELECT *
+ FROM qnames
+ )
)
-ORDER BY qname, bam_flag(flag, 'last_segm');
+SELECT a1.qname AS qname, a1.seq AS seq1, a1.qual AS qual1, a2.seq AS seq2,
a2.qual AS qual2
+FROM (
+ SELECT *
+ FROM alig_proj
+ WHERE bam_flag(flag, 'firs_segm') = True
+) AS a1 JOIN (
+ SELECT *
+ FROM alig_proj
+ WHERE bam_flag(flag, 'last_segm') = True
+) AS a2
+ ON a1.qname = a2.qname
+ORDER BY qname;
-- Description:
--- Optimized version of query 1a, by doing the filtering of primary alignments
for file with file_id=1 just once.
+-- Optimized version of query 1a, by doing the filtering and the projection
just once in the WITH clauses.
-- Tests will have to prove which of the two queries runs faster.
@@ -76,32 +127,47 @@ ORDER BY qname, bam_flag(flag, 'last_seg
--------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------- Query 2a
---------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------
-SELECT a2.pos - (a1.pos + seq_length(a1.cigar)) AS distance, COUNT(*) AS
nr_alignments
+
+SELECT
+ CASE WHEN a1.pos < a2.pos
+ THEN a2.pos - (a1.pos + seq_length(a1.cigar))
+ ELSE a1.pos - (a2.pos + seq_length(a2.cigar))
+ END AS distance,
+ COUNT(*) AS nr_alignments
FROM (
- SELECT qname, pos, cigar
+ SELECT qname, rname, pos, cigar
FROM bam.alignments
WHERE file_id = 1
+ AND bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
AND bam_flag(flag, 'seco_alig') = False
AND bam_flag(flag, 'firs_segm') = True
) AS a1 JOIN (
- SELECT qname, pos
+ SELECT qname, rname, pos, cigar
FROM bam.alignments
WHERE file_id = 1
+ AND bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
AND bam_flag(flag, 'seco_alig') = False
AND bam_flag(flag, 'last_segm') = True
-) AS a2 ON a1.qname = a2.qname
+) AS a2
+ ON a1.qname = a2.qname
+ AND a1.rname = a2.rname
GROUP BY distance
ORDER BY nr_alignments DESC;
-- Description:
-- This query calculates a histogram that displays the number of read pairs
with a certain distance.
-- The outer query joins two subresults together. These subresults contain all
primary alignments for
--- the first segment and the last segment respectively. If the query is
consistent according to the
--- consistency check in query 4, there is a unique primary alignment for both
sides (firs_segm and last_segm).
--- In the case there is a primary alignment in just one of the two subresults,
the join will drop this
--- since the join predicate won't be fulfilled.
--- The distance of each record in this joined table can now be calculated.
Furthermore, grouping can be
--- done on this distance to create the histogram.
+-- the first segment and the last segment respectively. Note that only
alignments are considered that have
+-- exactly one flag of the flags 'firs_segm' and 'last_segm' set.
+-- If the query is consistent according to the consistency check in query 4,
there is a unique primary
+-- alignment for both sides (firs_segm and last_segm). In the case there is a
primary alignment in just
+-- one of the two subresults, the join will drop this since the join predicate
won't be fulfilled.
+-- Furtermore, only alignments are joined that are in the same chromosome,
since the distance between two
+-- alignments is not defined if they are not in the same chromosome.
+-- The distance of each record in this joined table can now be calculated. A
CASE statement is used since
+-- there is no guarantee on whether or not the alignments flagged as
first_segm (last_segm) are the left (right)
+-- alignments.
+-- Furthermore, grouping is done on this distance to create the histogram.
-- The result is ordered by the nr of alignments in a descending fashion, such
that the most occurring
-- distances will be presented first.
@@ -113,22 +179,30 @@ ORDER BY nr_alignments DESC;
------------------------------------------------------------- Query 2b
----------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------
-WITH prim AS (
- SELECT *
+WITH alig AS (
+ SELECT qname, flag, rname, pos, cigar
FROM bam.alignments
WHERE file_id = 1
+ AND bam_flag(flag, 'firs_segm') <> bam_flag(flag, 'last_segm')
AND bam_flag(flag, 'seco_alig') = False
)
-SELECT a2.pos - (a1.pos + seq_length(a1.cigar)) AS distance, COUNT(*) AS
nr_alignments
+SELECT
+ CASE WHEN a1.pos < a2.pos
+ THEN a2.pos - (a1.pos + seq_length(a1.cigar))
+ ELSE a1.pos - (a2.pos + seq_length(a2.cigar))
+ END AS distance,
+ COUNT(*) AS nr_alignments
FROM (
- SELECT qname, pos, cigar
- FROM prim
+ SELECT qname, rname, pos, cigar
+ FROM alig
WHERE bam_flag(flag, 'firs_segm') = True
) AS a1 JOIN (
- SELECT qname, pos
- FROM prim
+ SELECT qname, rname, pos, cigar
+ FROM alig
WHERE bam_flag(flag, 'last_segm') = True
-) AS a2 ON a1.qname = a2.qname
+) AS a2
+ ON a1.qname = a2.qname
+ AND a1.rname = a2.rname
GROUP BY distance
ORDER BY nr_alignments DESC;
@@ -149,14 +223,14 @@ ORDER BY nr_alignments DESC;
------------------------------------------------------------- Query 3
----------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------
-SELECT *
+SELECT qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual
FROM bam.alignments
WHERE qname IN (
SELECT qname
FROM bam.alignments
WHERE file_id = 1
GROUP BY qname
- HAVING SUM(bam_flag(flag, 'firs_segm')) = 0
+ HAVING SUM(bam_flag(flag, 'firs_segm')) = 0
OR SUM(bam_flag(flag, 'last_segm')) = 0
)
ORDER BY qname;
@@ -164,7 +238,7 @@ ORDER BY qname;
-- Description:
-- This query checks a consistency aspect of the BAM file with file_id=1. It
returns all alignments
-- for qnames that have either no alignment flagged as first segment or no
alignment flagged as last segment.
--- The inner query groups by qname and selects the inconsistent ones. The
outer query then selects all attributes
+-- The inner query groups by qname and selects the inconsistent ones. The
outer query then selects all relevant attributes
-- from all alignments for these qnames.
-- To provide the user with a convenient overview of the inconsistencies, the
result is ordered by qname.
@@ -181,17 +255,20 @@ ORDER BY qname;
------------------------------------------------------------- Query 4
----------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------
-SELECT *
+SELECT qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual
FROM bam.alignments
WHERE qname IN (
SELECT qname
- FROM bam.alignments
- WHERE file_id=1
- AND bam_flag(flag, 'segm_unma') = False
- GROUP BY qname
- HAVING CAST((SUM(bam_flag(flag, 'firs_segm')) > 0) AS SMALLINT) +
- CAST((SUM(bam_flag(flag, 'last_segm')) > 0) AS SMALLINT) <>
- (COUNT(*) - SUM(bam_flag(flag, 'seco_alig')))
+ FROM (
+ SELECT qname, bam_flag(flag, 'seco_alig') AS seco_alig, bam_flag(flag,
'segm_unma') AS segm_unma,
+ bam_flag(flag, 'firs_segm') AS firs_segm
+ FROM bam.alignments
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list