Changeset: 97d38cacec07 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=97d38cacec07
Modified Files:
monetdb5/optimizer/opt_support.c
sql/backends/monet5/bam/bam_db_interface.c
sql/backends/monet5/bam/bam_export.c
sql/backends/monet5/bam/bam_wrapper.c
Branch: bamloader
Log Message:
Implemented bam_export function and made earlier written functionality for SAM
export more generic. Only problem left is that the SAM header does not get
parsed properly for some reason. Therefore, no dictionary is built containing
the different chromosomes. This will need some extra time to tackle.
diffs (truncated from 644 to 300 lines):
diff --git a/monetdb5/optimizer/opt_support.c b/monetdb5/optimizer/opt_support.c
--- a/monetdb5/optimizer/opt_support.c
+++ b/monetdb5/optimizer/opt_support.c
@@ -840,13 +840,9 @@ int isAllScalar(MalBlkPtr mb, InstrPtr p
int isMapOp(InstrPtr p){
return getModuleId(p) &&
((getModuleId(p) == malRef && getFunctionId(p) == multiplexRef)
||
- (getModuleId(p)== batcalcRef && getFunctionId(p) != mark_grpRef
&& getFunctionId(p) != rank_grpRef) ||
- (getModuleId(p)== batmtimeRef) ||
- (getModuleId(p)== batstrRef) ||
- (getModuleId(p)== batmmathRef) ||
- (getModuleId(p)== batxmlRef) ||
- (strcmp(getModuleId(p),"batsql") == 0) ||
- (getModuleId(p)== mkeyRef));
+ (getModuleId(p) == batcalcRef && getFunctionId(p) !=
mark_grpRef && getFunctionId(p) != rank_grpRef) ||
+ (getModuleId(p) != batcalcRef && strncmp(getModuleId(p), "bat",
3) == 0) ||
+ (getModuleId(p) == mkeyRef));
}
int isLikeOp(InstrPtr p){
diff --git a/sql/backends/monet5/bam/bam_db_interface.c
b/sql/backends/monet5/bam/bam_db_interface.c
--- a/sql/backends/monet5/bam/bam_db_interface.c
+++ b/sql/backends/monet5/bam/bam_db_interface.c
@@ -299,6 +299,8 @@ next_file_id(mvc * m, sql_table * files,
BATiter li;
BUN p = 0, q = 0;
lng max_file_id = 0;
+
+ sht i;
assert(m != NULL);
assert(files != NULL);
@@ -309,15 +311,17 @@ next_file_id(mvc * m, sql_table * files,
"Could not retrieve the next file id: Error binding
file_id column of 'files' table");
}
- /* Loop through BAT for this column and find the maximum file_id */
- b = store_funcs.bind_col(m->session->tr, c, RD_INS);
+ /* Loop through BATs for this column and find the maximum file_id */
+ for(i=0; i<3; ++i) {
+ b = store_funcs.bind_col(m->session->tr, c, i);
- li = bat_iterator(b);
- BATloop(b, p, q) {
- lng t = *(lng *) BUNtail(li, p);
- max_file_id = MAX(max_file_id, t);
+ li = bat_iterator(b);
+ BATloop(b, p, q) {
+ lng t = *(lng *) BUNtail(li, p);
+ max_file_id = MAX(max_file_id, t);
+ }
+ BBPreleaseref(b->batCacheid);
}
- BBPreleaseref(b->batCacheid);
*next_file_id = max_file_id + 1;
return MAL_SUCCEED;
diff --git a/sql/backends/monet5/bam/bam_export.c
b/sql/backends/monet5/bam/bam_export.c
--- a/sql/backends/monet5/bam/bam_export.c
+++ b/sql/backends/monet5/bam/bam_export.c
@@ -27,6 +27,8 @@
#include "monetdb_config.h"
+#include <samtools/bam.h>
+
#include "bam_globals.h"
#include "bam_export.h"
@@ -40,46 +42,210 @@ typedef struct bam_field {
} bam_field;
-#define CUR_STR(field, i) ((str) BUNtail(field.iter, (field.cur+i)))
-#define CUR_SHT(field, i) (*(sht *) BUNtail(field.iter, (field.cur+i)))
-#define CUR_INT(field, i) (*(int *) BUNtail(field.iter, (field.cur+i)))
+/**
+ * Copied directly from bam.h/bam_import.c for use by fill_bam_alig
+ * Can not change the call to realloc to GDKrealloc, since
+ * bam_destroy1 does not use GDKfree...
+ */
+#ifndef kroundup32
+/*! @function
+ @abstract Round an integer to the next closest power-2 integer.
+ @param x integer to be rounded (in place)
+ @discussion x will be modified.
+ */
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4,
(x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+static inline uint8_t *
+alloc_data(bam1_t *b, int size)
+{
+ if (b->m_data < size) {
+ b->m_data = size;
+ kroundup32(b->m_data);
+ b->data = (uint8_t*)realloc(b->data, b->m_data);
+ }
+ return b->data;
+}
-str
-sam_export(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+/* Error macro */
+#define FILL_BAM_ALIG_ERR "Error processing alignment '%d': "
+
+/**
+ * I had to write this function, which contains much low level
+ * function calls to the Samtools library, since the sam_read1 function
+ * contained in bam.h/bam_import.c only works on files. This function is
+ * an adjusted variant of the sam_read1 function that gets its fields
+ * directly instead of reads it from a tamFile.
+ * I wrote this function since I did not feel like first writing everything
+ * to a SAM file and then applying sam_read1 since that is just a waste of I/O.
+ * Note that I did do a write to/read from SAM for the BAM header, but headers
are
+ * insignificant compared to alignment data.
+ *
+ * I also added realloc checking, since this was not done in the Samtools code
+ * and i properly throw a MAL_MALLOC_FAIL whenever applicable.
+ */
+static str
+fill_bam_alig(str qname, sht flag, str rname, int pos,
+ sht mapq, str cigar, str rnext, int pnext,
+ int tlen, str seq, str qual,
+ bam_header_t *header, bam1_t *b, int alignment_nr)
{
- /* arg 1: path to desired output file */
- str output_path = *(str *) getArgReference(stk, pci, pci->retc);
+ int doff = 0;
+ bam1_core_t *c = &b->core;
+
+ /* Note: in sam_read1, str->s is often used. This string is not NULL
terminated! */
+ { // qname
+ c->l_qname = strlen(qname) + 1;
+ if(alloc_data(b, doff + c->l_qname) == NULL) {
+ throw(MAL, "fill_bam_alig",
+ FILL_BAM_ALIG_ERR MAL_MALLOC_FAIL, alignment_nr);
+ }
+ memcpy(b->data + doff, qname, c->l_qname);
+ doff += c->l_qname;
+ }
+ { // flag
+ c->flag = flag;
+ }
+ { // rname, pos, mapq
+ c->tid = bam_get_tid(header, rname);
+ if (c->tid < 0 && strcmp(rname, "*") != 0) {
+ /* Should not happen, since we built the header structure
ourselves */
+ throw(MAL, "fill_bam_alig",
+ FILL_BAM_ALIG_ERR "SQ entry '%s' not found in header table",
+ alignment_nr, rname);
+ }
+ c->pos = pos - 1;
+ c->qual = mapq;
+ }
+ { // cigar
+ str s, t;
+ int i, op;
+ long x;
+ c->n_cigar = 0;
+ if (cigar[0] != '*') {
+ uint32_t *cigar_enc;
+ for (s = cigar; *s != '\0'; ++s) {
+ if ((isalpha(*s)) || (*s=='=')) {
+ ++c->n_cigar;
+ }
+ else if (!isdigit(*s)) {
+ throw(MAL, "fill_bam_alig",
+ FILL_BAM_ALIG_ERR "Parse error while parsing CIGAR
string '%s'",
+ alignment_nr, cigar);
+ }
+ }
+ if (alloc_data(b, doff + c->n_cigar * 4) == NULL) {
+ throw(MAL, "fill_bam_alig",
+ FILL_BAM_ALIG_ERR MAL_MALLOC_FAIL, alignment_nr);
+ }
+ cigar_enc = bam1_cigar(b);
+ for (i = 0, s = cigar; i != c->n_cigar; ++i) {
+ x = strtol(s, &t, 10);
+ op = toupper(*t);
+ if (op == 'M') op = BAM_CMATCH;
+ else if (op == 'I') op = BAM_CINS;
+ else if (op == 'D') op = BAM_CDEL;
+ else if (op == 'N') op = BAM_CREF_SKIP;
+ else if (op == 'S') op = BAM_CSOFT_CLIP;
+ else if (op == 'H') op = BAM_CHARD_CLIP;
+ else if (op == 'P') op = BAM_CPAD;
+ else if (op == '=') op = BAM_CEQUAL;
+ else if (op == 'X') op = BAM_CDIFF;
+ else if (op == 'B') op = BAM_CBACK;
+ else throw(MAL, "fill_bam_alig",
+ FILL_BAM_ALIG_ERR "invalid CIGAR operation found in CIGAR
string '%s': '%c'",
+ alignment_nr, cigar, op);
+ s = t + 1;
+ cigar_enc[i] = bam_cigar_gen(x, op);
+ }
+ if (*s) throw(MAL, "fill_bam_alig",
+ FILL_BAM_ALIG_ERR "Unmatched CIGAR operation in CIGAR string
'%s'",
+ alignment_nr, cigar);
+ c->bin = bam_reg2bin(c->pos, bam_calend(c, cigar_enc));
+ doff += c->n_cigar * 4;
+ } else {
+ if (!(c->flag&BAM_FUNMAP)) {
+ c->flag |= BAM_FUNMAP;
+ }
+ c->bin = bam_reg2bin(c->pos, c->pos + 1);
+ }
+ }
+ { // rnext, pnext, tlen
+ c->mtid = strcmp(rnext, "=") != 0 ? bam_get_tid(header, rnext)
: c->tid;
+ c->mpos = pnext - 1;
+ c->isize = tlen;
+ }
+ { // seq and qual
+ int i;
+ uint8_t *p = 0;
+ if (strcmp(seq, "*") != 0) {
+ c->l_qseq = strlen(seq);
+ if (c->n_cigar && c->l_qseq !=
(int32_t)bam_cigar2qlen(c, bam1_cigar(b))) {
+ throw(MAL, "fill_bam_alig",
+ FILL_BAM_ALIG_ERR "CIGAR and sequence length
inconsistency: %d (SEQ) vs %d (CIGAR)",
+ alignment_nr, c->l_qseq, (int32_t)bam_cigar2qlen(c,
bam1_cigar(b)));
+ }
+ p = (uint8_t*)alloc_data(b, doff + c->l_qseq +
(c->l_qseq+1)/2) + doff;
+ if(p == NULL) {
+ throw(MAL, "fill_bam_alig",
+ FILL_BAM_ALIG_ERR MAL_MALLOC_FAIL, alignment_nr);
+ }
+ memset(p, 0, (c->l_qseq+1)/2);
+ for (i = 0; i < c->l_qseq; ++i)
+ p[i/2] |= bam_nt16_table[(int)seq[i]] <<
4*(1-i%2);
+ } else {
+ c->l_qseq = 0;
+ }
+ if (strcmp(qual, "*") != 0 && (size_t)c->l_qseq !=
strlen(qual)) {
+ throw(MAL, "fill_bam_alig",
+ FILL_BAM_ALIG_ERR "SEQ and QUAL are inconsistent",
+ alignment_nr);
+ }
+ p += (c->l_qseq+1)/2;
+ if (strcmp(qual, "*") == 0) {
+ for (i = 0; i < c->l_qseq; ++i) {
+ p[i] = 0xff;
+ }
+ } else {
+ for (i = 0; i < c->l_qseq; ++i) {
+ p[i] = qual[i] - 33;
+ }
+ }
+ doff += c->l_qseq + (c->l_qseq+1)/2;
+ }
+ b->data_len = doff;
+ if (bam_no_B) {
+ bam_remove_B(b);
+ }
+
+ return MAL_SUCCEED;
+}
+
+
+
+
+
+static str
+bind_export_result(Client cntxt, MalBlkPtr mb, bam_field fields[11], int
*tuple_count)
+{
mvc *m;
sql_schema *s_bam;
sql_table *t_export;
- bam_field fields[11];
-
- stream *output = NULL;
int cnt, i;
- str msg = MAL_SUCCEED;
-
- memset(fields, 0, 11 * sizeof(bam_field));
-
- if ((output = bsopen(output_path)) == NULL) {
- msg = createException(MAL, "sam_export", "Could not open output file
'%s' for writing", output_path);
- goto cleanup;
- }
+ str msg;
if ((msg = getSQLContext(cntxt, mb, &m, NULL)) != MAL_SUCCEED) {
- REUSE_EXCEPTION(msg, MAL, "sam_export", "Could not retrieve SQL
context: %s", msg);
- goto cleanup;
+ REUSE_EXCEPTION(msg, MAL, "bind_export_result", "Could not retrieve
SQL context: %s", msg);
+ return msg;
}
if ((s_bam = mvc_bind_schema(m, "bam")) == NULL) {
- msg = createException(MAL, "sam_export", "Could not find bam schema");
- goto cleanup;
+ throw(MAL, "bind_export_result", "Could not find bam schema");
}
if ((t_export = mvc_bind_table(m, s_bam, "export")) == NULL) {
- msg = createException(MAL, "sam_export", "Could not find bam.export
table");
- goto cleanup;
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list