# HG changeset patch -- Bitbucket.org # Project galaxy-dist # URL http://bitbucket.org/galaxy/galaxy-dist/overview # User peterjc <p.j.a.cock@googlemail.com> # Date 1286551276 -3600 # Node ID 8979b429d7691f4121afb368a05e6b9d0ea40706 # Parent b2906cd2d484a25f80134f432b7c36ed18747bb5 Adding a Python script for converting BLAST XML to 12 column tabular --- /dev/null +++ b/tools/ncbi_blast_plus/blastxml_to_tabular.xml @@ -0,0 +1,41 @@ +<tool id="blastxml_to_tabular" name="BLAST XML to tabular" version="0.0.1"> + <description>Convert BLAST XML output to tabular</description> + <command interpreter="python"> + blastxml_to_tabular.py $blastxml_file $tabular_file + </command> + <inputs> + <param name="blastxml_file" type="data" format="blastxml" label="BLAST results as XML"/> + </inputs> + <outputs> + <data name="tabular_file" format="tabular" label="BLAST results as tabular" /> + </outputs> + <requirements> + </requirements> + <tests> + </tests> + <help> + +**What it does** + +NCBI BLAST+ (and the older NCBI 'legacy' BLAST) can output in a range of formats including +tabular and a more detailed XML format. A complex workflow may need both the XML and the +tabular output. + +This tool takes the BLAST XML output and converts it into the 12 column tabular equivalent: + +1. Id of your sequence +2. GI of the database hit +3. % identity +4. Alignment length +5. # mismatches +6. # gaps +7. Start position in your sequence +8. End position in your sequence +9. Start position in database hit +10. End position in database hit +11. E-value +12. Bit score + + + </help> +</tool> --- /dev/null +++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +"""Convert a BLAST XML file to 12 column tabular output + +Takes two command line options, input BLAST XML filename and output tabular +BLAST filename. + +The 12 colums output are 'qseqid sseqid pident length mismatch gapopen qstart +qend sstart send evalue bitscore' which mean: + + * qseqid - Query Seq-id + * sseqid - Subject Seq-id + * pident - Percentage of identical matches + * length - Alignment length + * mismatch - Number of mismatches + * gapopen - Number of gap openings + * qstart - Start of alignment in query + * qend - End of alignment in query + * sstart - Start of alignment in subject + * send - End of alignment in subject + * evalue - Expect value + * bitscore - Bit score + +Most of these fields are given explicitly in the XML file, others some like +the percentage identity and the number of gap openings must be calculated. + +This script attempts to produce idential output to what BLAST+ would have done. +However, check this with "diff -b ..." since BLAST+ sometimes includes an extra +space character (probably a bug). +""" +import sys + +#Parse Command Line +in_file, out_file = sys.argv[1:] + +assert sys.version_info[:2] >= ( 2, 4 ) +if sys.version_info[:2] >= ( 2, 5 ): + import xml.etree.cElementTree as cElementTree +else: + import cElementTree + +def stop_err( msg ): + sys.stderr.write("%s\n" % msg) + sys.exit(1) + +tags = ["Hsp_identity", + "Hsp_align-len", + "Hsp_gaps", + "Hsp_query-from", + "Hsp_query-to", + "Hsp_hit-from", + "Hsp_hit-to", + "Hsp_evalue", + "Hsp_bit-score"] + +# get an iterable +try: + context = cElementTree.iterparse(in_file, events=("start", "end")) +except: + stop_err("Invalid data format.") +# turn it into an iterator +context = iter(context) +# get the root element +try: + event, root = context.next() +except: + stop_err( "Invalid data format." ) + +outfile = open(out_file, 'w') +for event, elem in context: + # for every <Iteration> tag + if event == "end" and elem.tag == "Iteration": + qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] + # for every <Hit> within <Iteration> + for hit in elem.findall("Iteration_hits/Hit/"): + sseqid = hit.findtext("Hit_id").split(None,1)[0] + # for every <Hsp> within <Hit> + for hsp in hit.findall("Hit_hsps/Hsp"): + identity = hsp.findtext("Hsp_identity") + length = hsp.findtext("Hsp_align-len") + pident = "%0.2f" % (100*float(identity)/float(length)) + + q_seq = hsp.findtext("Hsp_qseq") + h_seq = hsp.findtext("Hsp_hseq") + assert len(q_seq) == len(h_seq) == int(length) + gapopen = str(len(q_seq.replace('-', ' ').split())-1 + \ + len(h_seq.replace('-', ' ').split())-1) + mismatch = str(len(q_seq) - sum(1 for q,h in zip(q_seq, h_seq) \ + if q == h or q == "-" or h == "-")) + assert int(identity) == sum(1 for q,h in zip(q_seq, h_seq) if q == h) + + evalue = hsp.findtext("Hsp_evalue") + if evalue == "0": + evalue = "0.0" + else: + evalue = "%0.0e" % float(evalue) + + bitscore = float(hsp.findtext("Hsp_bit-score")) + if bitscore < 100: + #Seems to show one decimal place for lower scores + bitscore = "%0.1f" % bitscore + else: + #Note BLAST does not round to nearest int, it truncates + bitscore = "%i" % bitscore + + values = [qseqid, + sseqid, + pident, + length, #hsp.findtext("Hsp_align-len") + mismatch, + gapopen, + hsp.findtext("Hsp_query-from"), #qstart, + hsp.findtext("Hsp_query-to"), #qend, + hsp.findtext("Hsp_hit-from"), #sstart, + hsp.findtext("Hsp_hit-to"), #send, + evalue, #hsp.findtext("Hsp_evalue") in scientific notation + bitscore, #hsp.findtext("Hsp_bit-score") rounded + ] + #print "\t".join(values) + outfile.write("\t".join(values) + "\n") + # prevents ElementTree from growing large datastructure + root.clear() + elem.clear() +outfile.close()