[galaxy-dev] Proposed patch to blastxml_to_tabular.py

Peter van Heusden Tue, 29 Nov 2011 23:38:09 -0800

Hi there. When trying to convert some BLAST XML to tabular format, from
an old version of BLAST (2.2.15), I had to make some changes to
blastxml_to_tabular.py to get correct results. A patch is enclosed in
this message (and attached). I don't think they should negatively affect
anyone, but the semantics are as follows:


1) If you find an Iteration with no Iteration_query-ID tag, skip it; it
has no hits.
2) If the Iteration_query-ID tag matches regexp lcl\|\d+_\d+ (e.g.
lcl|1_0) the real accession is in the first word of Iteration_query-def.
3) If the Hit_id matches regexp gnl\|BL_ORD_ID\|\d+ (e.g.
gnl|BL_ORD_ID|278) and the Hit_accession tag contains the text of that
\d+ (e.g. 278) then the real accession is in the first word of Hit_def.

I'm basing these semantics on the BLAST reports I've got here, so if
anyone can see an error in them please speak up. Otherwise, here is the patch:

--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py    Tue Nov 29
17:35:14 2011 -0500
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py    Wed Nov 30
09:31:11 2011 +0200
@@ -103,7 +103,7 @@
     stop_err( "Invalid data format." )
 
 
-re_default_query_id = re.compile("^Query_\d+$")
+re_default_query_id = re.compile("^(lcl\|\d+_\d+|Query_\d+)$")
 assert re_default_query_id.match("Query_101")
 assert not re_default_query_id.match("Query_101a")
 assert not re_default_query_id.match("MyQuery_101")
@@ -112,6 +112,7 @@
 assert not re_default_subject_id.match("Subject_")
 assert not re_default_subject_id.match("Subject_12a")
 assert not re_default_subject_id.match("TheSubject_1")
+re_default_subject_id2 = re.compile("^gnl\|BL_ORD_ID\|(\d+)$")
 
 
 outfile = open(out_file, 'w')
@@ -133,6 +134,9 @@
         # <Iteration_query-len>516</Iteration_query-len>
         # <Iteration_hits>...
         qseqid = elem.findtext("Iteration_query-ID")
+        if qseqid == None:
+            # no query ID - this happens when there are no hits, so
skip this Iteration
+            continue
         if re_default_query_id.match(qseqid):
             #Place holder ID, take the first word of the query definition
             qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
@@ -152,8 +156,10 @@
             #apparently depending on the parse_deflines switch
             sseqid = hit.findtext("Hit_id").split(None,1)[0]
             hit_def = sseqid + " " + hit.findtext("Hit_def")
-            if re_default_subject_id.match(sseqid) \
-            and sseqid == hit.findtext("Hit_accession"):
+            match2 = re_default_subject_id2.match(sseqid)
+            if (re_default_subject_id.match(sseqid) \
+            and sseqid == hit.findtext("Hit_accession")) \
+            or (match2 and match2.group(1) ==
hit.findtext("Hit_accession")):
                 #Place holder ID, take the first word of the subject
definition
                 hit_def = hit.findtext("Hit_def")
                 sseqid = hit_def.split(None,1)[0]

--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py      Tue Nov 29 17:35:14 
2011 -0500
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py      Wed Nov 30 09:31:11 
2011 +0200
@@ -103,7 +103,7 @@
     stop_err( "Invalid data format." )
 
 
-re_default_query_id = re.compile("^Query_\d+$")
+re_default_query_id = re.compile("^(lcl\|\d+_\d+|Query_\d+)$")
 assert re_default_query_id.match("Query_101")
 assert not re_default_query_id.match("Query_101a")
 assert not re_default_query_id.match("MyQuery_101")
@@ -112,6 +112,7 @@
 assert not re_default_subject_id.match("Subject_")
 assert not re_default_subject_id.match("Subject_12a")
 assert not re_default_subject_id.match("TheSubject_1")
+re_default_subject_id2 = re.compile("^gnl\|BL_ORD_ID\|(\d+)$")
 
 
 outfile = open(out_file, 'w')
@@ -133,6 +134,9 @@
         # <Iteration_query-len>516</Iteration_query-len>
         # <Iteration_hits>...
         qseqid = elem.findtext("Iteration_query-ID")
+        if qseqid == None:
+            # no query ID - this happens when there are no hits, so skip this 
Iteration
+            continue
         if re_default_query_id.match(qseqid):
             #Place holder ID, take the first word of the query definition
             qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
@@ -152,8 +156,10 @@
             #apparently depending on the parse_deflines switch
             sseqid = hit.findtext("Hit_id").split(None,1)[0]
             hit_def = sseqid + " " + hit.findtext("Hit_def")
-            if re_default_subject_id.match(sseqid) \
-            and sseqid == hit.findtext("Hit_accession"):
+            match2 = re_default_subject_id2.match(sseqid)
+            if (re_default_subject_id.match(sseqid) \
+            and sseqid == hit.findtext("Hit_accession")) \
+            or (match2 and match2.group(1) == hit.findtext("Hit_accession")):
                 #Place holder ID, take the first word of the subject definition
                 hit_def = hit.findtext("Hit_def")
                 sseqid = hit_def.split(None,1)[0]

___________________________________________________________
Please keep all replies on the list by using "reply all"
in your mail client.  To manage your subscriptions to this
and other Galaxy lists, please use the interface at:

  http://lists.bx.psu.edu/

[galaxy-dev] Proposed patch to blastxml_to_tabular.py

Reply via email to