Recognise more BLAST+ placeholder identifiers in blastxml_to_tabular.py.

Query IDs of the form lcl|N_N are now treated as placeholders, like Query_N.
Hit IDs of the form gnl|BL_ORD_ID|N are treated as placeholders when N equals
the Hit_accession text. An Iteration without an Iteration_query-ID is skipped.

NOTE(review): line breaks, blank context lines and indentation were rebuilt
from a whitespace-collapsed copy of this patch (line counts follow the hunk
headers); confirm them against the target file before applying.

--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py	Tue Nov 29 17:35:14 2011 -0500
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py	Wed Nov 30 09:31:11 2011 +0200
@@ -103,7 +103,7 @@
     stop_err( "Invalid data format." )
 
 
-re_default_query_id = re.compile("^Query_\d+$")
+re_default_query_id = re.compile(r"^(lcl\|\d+_\d+|Query_\d+)$")
 assert re_default_query_id.match("Query_101")
 assert not re_default_query_id.match("Query_101a")
 assert not re_default_query_id.match("MyQuery_101")
@@ -112,6 +112,7 @@
 assert not re_default_subject_id.match("Subject_")
 assert not re_default_subject_id.match("Subject_12a")
 assert not re_default_subject_id.match("TheSubject_1")
+re_default_subject_id2 = re.compile(r"^gnl\|BL_ORD_ID\|(\d+)$")
 
 
 outfile = open(out_file, 'w')
@@ -133,6 +134,9 @@
         # 516
         # ...
         qseqid = elem.findtext("Iteration_query-ID")
+        if qseqid is None:
+            # no query ID - this happens when there are no hits, so skip this Iteration
+            continue
         if re_default_query_id.match(qseqid):
             #Place holder ID, take the first word of the query definition
             qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
@@ -152,8 +156,10 @@
             #apparently depending on the parse_deflines switch
             sseqid = hit.findtext("Hit_id").split(None,1)[0]
             hit_def = sseqid + " " + hit.findtext("Hit_def")
-            if re_default_subject_id.match(sseqid) \
-            and sseqid == hit.findtext("Hit_accession"):
+            hit_acc = hit.findtext("Hit_accession")
+            match2 = re_default_subject_id2.match(sseqid)
+            if ((re_default_subject_id.match(sseqid) and sseqid == hit_acc)
+                    or (match2 and match2.group(1) == hit_acc)):
                 #Place holder ID, take the first word of the subject definition
                 hit_def = hit.findtext("Hit_def")
                 sseqid = hit_def.split(None,1)[0]