Changeset: a8bb9d5909ae for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=a8bb9d5909ae
Modified Files:
sql/backends/monet5/bam/bam_globals.h
sql/backends/monet5/bam/bam_loader.c
sql/backends/monet5/bam/bam_wrapper.c
sql/backends/monet5/bam/bam_wrapper.h
Branch: bamloader
Log Message:
Started adding support for loading SAM files. Not finished yet due to an
annoying problem with Samtools. Will look further into it after implementing
SAM/BAM export functionality.
diffs (truncated from 1284 to 300 lines):
diff --git a/sql/backends/monet5/bam/bam_globals.h
b/sql/backends/monet5/bam/bam_globals.h
--- a/sql/backends/monet5/bam/bam_globals.h
+++ b/sql/backends/monet5/bam/bam_globals.h
@@ -38,7 +38,6 @@
#endif
-
#ifdef BAM_DEBUG
/**
diff --git a/sql/backends/monet5/bam/bam_loader.c
b/sql/backends/monet5/bam/bam_loader.c
--- a/sql/backends/monet5/bam/bam_loader.c
+++ b/sql/backends/monet5/bam/bam_loader.c
@@ -33,6 +33,22 @@
#include "bam_loader.h"
+/* Macro that checks whether or not a filename ends with either .sam or .bam
+ * (case insensitive); names shorter than 4 characters never match */
+#define IS_SAMORBAM(filename, len) \
+ ((len) >= 4 && (filename)[(len)-4] == '.' && \
+ ((filename)[(len)-3] == 'b' || (filename)[(len)-3] == 'B' || \
+ (filename)[(len)-3] == 's' || (filename)[(len)-3] == 'S') && \
+ ((filename)[(len)-2] == 'a' || (filename)[(len)-2] == 'A') && \
+ ((filename)[(len)-1] == 'm' || (filename)[(len)-1] == 'M'))
+
+/* Given a filename that ends on either .sam or .bam, this macro checks if
+ * it ends on .bam (only the first extension character is inspected) */
+#define IS_BAM(samorbam_filename, len) \
+ ((samorbam_filename)[(len)-3] == 'b' || (samorbam_filename)[(len)-3] == 'B')
+
+
+
typedef struct reader_thread_data {
sht thread_id;
/* BAM wrappers of all BAM files that have to be processed */
@@ -126,7 +142,7 @@ run_process_bam_alignments(void *d)
bw->file_id);
if ((data->msg =
- process_bam_alignments(bw, data->failure)) != MAL_SUCCEED) {
+ process_alignments(bw, data->failure)) != MAL_SUCCEED) {
TO_LOG("<Thread %d> Error while processing alignments of file
'%s' (file id " LLFMT ") (%s)\n", data->thread_id, bw->file_location,
bw->file_id, data->msg);
REUSE_EXCEPTION(data->msg, MAL, "run_process_bam_alignments",
"Error while processing alignments of file '%s'
(file id "
@@ -250,10 +266,11 @@ bam_loader(Client cntxt, MalBlkPtr mb, s
memset(bws, 0, nr_files * sizeof(bam_wrapper));
for (i = 0; i < nr_files; ++i) {
+ int fln = strlen(filenames[i]);
TO_LOG("<bam_loader> Initializing BAM wrapper for file
'%s'...\n", filenames[i]);
if ((msg =
- init_bam_wrapper(bws + i, filenames[i], cur_file_id++,
- dbschema)) != MAL_SUCCEED) {
+ init_bam_wrapper(bws + i, (IS_BAM(filenames[i], fln) ? BAM
: SAM),
+ filenames[i], cur_file_id++, dbschema)) !=
MAL_SUCCEED) {
goto cleanup;
}
}
@@ -262,7 +279,7 @@ bam_loader(Client cntxt, MalBlkPtr mb, s
for (i = 0; i < nr_files; ++i) {
TO_LOG("<bam_loader> Parsing BAM header for file '%s'...\n",
filenames[i]);
- if ((msg = process_bam_header(bws + i)) != MAL_SUCCEED) {
+ if ((msg = process_header(bws + i)) != MAL_SUCCEED) {
goto cleanup;
}
}
@@ -408,14 +425,6 @@ bam_loader(Client cntxt, MalBlkPtr mb, s
}
-/* Macro that checks whether or not a filename ends with .bam (case
- * insensitive) */
-#define IS_BAM(filename, len) \
- (filename[len-4] == '.') && \
- (filename[len-3] == 'b' || filename[len-3] == 'B') && \
- (filename[len-2] == 'a' || filename[len-2] == 'A') && \
- (filename[len-1] == 'm' || filename[len-1] == 'M') \
-
/**
* Gathers all BAM files in the given repository and calls bam_loader for
these files
*/
@@ -452,17 +461,17 @@ bam_loader_repos(Client cntxt, MalBlkPtr
goto cleanup;
}
- /* First, count number of BAM files */
+ /* First, count number of SAM/BAM files */
while ((direntry = readdir(d)) != NULL) {
int len = strlen(direntry->d_name);
- if (IS_BAM(direntry->d_name, len))
+ if (IS_SAMORBAM(direntry->d_name, len))
++filecount;
}
if (filecount == 0) {
msg = createException(MAL, "bam_loader_repos",
- "No BAM files found in directory '%s'",
+ "No SAM or BAM files found in directory
'%s'",
bam_repos);
goto cleanup;
}
@@ -484,18 +493,17 @@ bam_loader_repos(Client cntxt, MalBlkPtr
* slashes in a path, but as far as I know, this is no problem
* on all OS's */
while ((direntry = readdir(d)) != NULL) {
- /* Check if d_name has the .bam extension (case
+ /* Check if d_name has the .sam or .bam extension (case
* insensitive) */
int len = strlen(direntry->d_name);
-
- if (IS_BAM(direntry->d_name, len)) {
- /* This is a BAM file, construct its path and
+ if (IS_SAMORBAM(direntry->d_name, len)) {
+ /* This is a SAM or BAM file, construct its path and
* add that to the files array */
if (snprintf
(path, 4096, "%s/%s", bam_repos,
direntry->d_name) < 0) {
msg = createException(MAL, "bam_loader_repos",
- "Could not construct
filepath for BAM file '%s'",
+ "Could not construct
filepath for SAM/BAM file '%s'",
direntry->d_name);
goto cleanup;
}
diff --git a/sql/backends/monet5/bam/bam_wrapper.c
b/sql/backends/monet5/bam/bam_wrapper.c
--- a/sql/backends/monet5/bam/bam_wrapper.c
+++ b/sql/backends/monet5/bam/bam_wrapper.c
@@ -24,8 +24,9 @@
#include "monetdb_config.h"
#include "mal_exception.h"
#include "stream.h"
+#include "bam_globals.h"
-#include <samtools/bam.h>
+#include <samtools/sam.h>
#ifdef HAVE_SAMTOOLS_KSTRING_H
#include <samtools/kstring.h>
#else
@@ -33,7 +34,6 @@
* version */
#include "mykstring.h"
#endif
-#include "bam_globals.h"
#include "bam_wrapper.h"
str
@@ -63,7 +63,6 @@ get_ordering(str ord)
return ORDERING_UNKNOWN;
}
-
static stream *
bsopen(str filepath)
{
@@ -96,8 +95,8 @@ bsopen(str filepath)
* working directory.
*/
str
-init_bam_wrapper(bam_wrapper * bw, str file_location, lng file_id,
- sht dbschema)
+init_bam_wrapper(bam_wrapper * bw, filetype type, str file_location,
+ lng file_id, sht dbschema)
{
unsigned int i;
char flushdir[128];
@@ -120,22 +119,35 @@ init_bam_wrapper(bam_wrapper * bw, str f
file_location, flushdir, strerror(errno));
}
- /* Open BAM file */
- if ((bw->input = bam_open(file_location, "r")) == NULL) {
- throw(MAL, "init_bam_wrapper",
- ERR_INIT_BAM_WRAPPER "BAM file could not be opened",
- file_location);
- }
-
- /* Get BAM header */
- if ((bw->header = bam_header_read(bw->input)) == NULL) {
- throw(MAL, "init_bam_wrapper",
- ERR_INIT_BAM_WRAPPER "Unable to read header from file",
- file_location);
- }
-
+ if (type == BAM) {
+ /* Open BAM file and read its header */
+ if ((bw->bam.input = bam_open(file_location, "r")) == NULL) {
+ throw(MAL, "init_bam_wrapper",
+ ERR_INIT_BAM_WRAPPER "BAM file could not be opened",
+ file_location);
+ }
+ if ((bw->header = bam_header_read(bw->bam.input)) == NULL) {
+ throw(MAL, "init_bam_wrapper",
+ ERR_INIT_BAM_WRAPPER "Unable to read header from file",
+ file_location);
+ }
+ } else {
+ /* Open SAM file and read its header */
+ if ((bw->sam.input = samopen(file_location, "r", NULL)) == NULL) {
+ throw(MAL, "init_bam_wrapper",
+ ERR_INIT_BAM_WRAPPER "SAM file could not be opened",
+ file_location);
+ }
+ if ((bw->header = bw->sam.input->header) == NULL) {
+ throw(MAL, "init_bam_wrapper",
+ ERR_INIT_BAM_WRAPPER "Unable to read header from file",
+ file_location);
+ }
+ }
+
/* Set ordering to unknown, since we don't know until we have
* processed the header */
+ bw->type = type;
bw->ord = ORDERING_UNKNOWN;
bw->file_id = file_id;
@@ -247,10 +259,10 @@ init_bam_wrapper(bam_wrapper * bw, str f
}
static void
-close_streams(bam_wrapper * bw, bit unlink_files)
+close_write_streams(bam_wrapper * bw, bit unlink_files)
{
int i;
-
+
for (i = 0; i < 6; ++i) {
if (bw->files[i]) {
close_stream(bw->files[i]);
@@ -324,7 +336,7 @@ close_streams(bam_wrapper * bw, bit unli
void
prepare_for_copy(bam_wrapper * bw)
{
- close_streams(bw, FALSE);
+ close_write_streams(bw, FALSE);
}
void
@@ -332,16 +344,22 @@ clear_bam_wrapper(bam_wrapper * bw)
{
char flushdir[128];
- /* Clear fields */
- if (bw->header) {
- bam_header_destroy(bw->header);
- }
- if (bw->input) {
- bam_close(bw->input);
- }
+ /* Clear bam/sam specific fields */
+ if (bw->type == BAM) {
+ if (bw->header) {
+ bam_header_destroy(bw->header);
+ }
+ if (bw->bam.input) {
+ bam_close(bw->bam.input);
+ }
+ } else {
+ if (bw->sam.input) {
+ samclose(bw->sam.input);
+ }
+ }
/* Close file streams and remove files */
- close_streams(bw, TRUE);
+ close_write_streams(bw, TRUE);
/* Finally, attempt to remove flush directory */
snprintf(flushdir, 128, DIR_BINARIES "/" LLFMT, bw->file_id);
@@ -422,7 +440,7 @@ typedef struct bam_header_line {
} bam_header_line;
-#define ERR_PROCESS_BAM_HEADER_LINE "Could not parse a header line in BAM file
'%s': "
+#define ERR_PROCESS_HEADER_LINE "Could not parse a header line in BAM file
'%s': "
/**
* Parses the next BAM header line from the given header.
@@ -430,13 +448,13 @@ typedef struct bam_header_line {
* and attempts to parse it into the provided bam_header_line
* structure. In case the function fails, the calling function must
* call clear_bam_header_line to free possible resources that are
- * malloced by process_bam_header_line. The *eof flag will be set to
+ * malloced by process_header_line. The *eof flag will be set to
* TRUE if the input doesn't contain a header line anymore. The
* function needs the file_location in order to generate decent error
* messages
*/
static str
-process_bam_header_line(str * header, bam_header_line * ret_hl, bit * eof,
+process_header_line(str * header, bam_header_line * ret_hl, bit * eof,
str file_location)
{
bam_header_option *opt = NULL;
@@ -457,8 +475,8 @@ process_bam_header_line(str * header, ba
if (**header != '@') {
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list