--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py Tue Nov 29 17:35:14 2011 -0500
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py Wed Nov 30 09:31:11 2011 +0200
@@ -103,7 +103,7 @@
stop_err( "Invalid data format." )
-re_default_query_id = re.compile("^Query_\d+$")
+re_default_query_id = re.compile(r"^(lcl\|\d+_\d+|Query_\d+)$")
assert re_default_query_id.match("Query_101")
assert not re_default_query_id.match("Query_101a")
assert not re_default_query_id.match("MyQuery_101")
@@ -112,6 +112,7 @@
assert not re_default_subject_id.match("Subject_")
assert not re_default_subject_id.match("Subject_12a")
assert not re_default_subject_id.match("TheSubject_1")
+re_default_subject_id2 = re.compile(r"^gnl\|BL_ORD_ID\|(\d+)$")
outfile = open(out_file, 'w')
@@ -133,6 +134,9 @@
# 516
# ...
qseqid = elem.findtext("Iteration_query-ID")
+ if qseqid is None:
+ # No query ID element - this happens when there are no hits, so skip this Iteration
+ continue
if re_default_query_id.match(qseqid):
#Place holder ID, take the first word of the query definition
qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
@@ -152,8 +156,10 @@
#apparently depending on the parse_deflines switch
sseqid = hit.findtext("Hit_id").split(None,1)[0]
hit_def = sseqid + " " + hit.findtext("Hit_def")
- if re_default_subject_id.match(sseqid) \
- and sseqid == hit.findtext("Hit_accession"):
+ ord_id_match = re_default_subject_id2.match(sseqid)
+ if ((re_default_subject_id.match(sseqid)
+ and sseqid == hit.findtext("Hit_accession"))
+ or (ord_id_match and ord_id_match.group(1) == hit.findtext("Hit_accession"))):
#Place holder ID, take the first word of the subject definition
hit_def = hit.findtext("Hit_def")
sseqid = hit_def.split(None,1)[0]