[galaxy-commits] galaxy-dist commit 8979b429d769: Adding a Python script for converting BLAST XML to 12 column tabular

20 Nov 2010

# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User peterjc <p.j.a.cock@googlemail.com>
# Date 1286551276 -3600
# Node ID 8979b429d7691f4121afb368a05e6b9d0ea40706
# Parent  b2906cd2d484a25f80134f432b7c36ed18747bb5
Adding a Python script for converting BLAST XML to 12 column tabular

--- /dev/null
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.xml
@@ -0,0 +1,41 @@
+<tool id="blastxml_to_tabular" name="BLAST XML to tabular" version="0.0.1">
+    <description>Convert BLAST XML output to tabular</description>
+    <command interpreter="python">
+      blastxml_to_tabular.py $blastxml_file $tabular_file
+    </command>
+    <inputs>
+        <param name="blastxml_file" type="data" format="blastxml" label="BLAST results as XML"/> 
+    </inputs>
+    <outputs>
+        <data name="tabular_file" format="tabular" label="BLAST results as tabular" />
+    </outputs>
+    <requirements>
+    </requirements>
+    <tests>
+    </tests>
+    <help>
+    
+**What it does**
+
+NCBI BLAST+ (and the older NCBI 'legacy' BLAST) can output in a range of formats including
+tabular and a more detailed XML format. A complex workflow may need both the XML and the
+tabular output.
+
+This tool takes the BLAST XML output and converts it into the 12 column tabular equivalent:
+
+1. Id of your sequence 
+2. GI of the database hit 
+3. % identity
+4. Alignment length
+5. # mismatches
+6. # gaps
+7. Start position in your sequence
+8. End position in your sequence
+9. Start position in database hit
+10. End position in database hit
+11. E-value
+12. Bit score
+
+
+    </help>
+</tool>

--- /dev/null
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+"""Convert a BLAST XML file to 12 column tabular output
+
+Takes two command line options, input BLAST XML filename and output tabular
+BLAST filename.
+
+The 12 colums output are 'qseqid sseqid pident length mismatch gapopen qstart
+qend sstart send evalue bitscore' which mean:
+   
+ * qseqid   - Query Seq-id
+ * sseqid   - Subject Seq-id
+ * pident   - Percentage of identical matches
+ * length   - Alignment length
+ * mismatch - Number of mismatches
+ * gapopen  - Number of gap openings
+ * qstart   - Start of alignment in query
+ * qend     - End of alignment in query
+ * sstart   - Start of alignment in subject
+ * send     - End of alignment in subject
+ * evalue   - Expect value
+ * bitscore - Bit score
+
+Most of these fields are given explicitly in the XML file, others some like
+the percentage identity and the number of gap openings must be calculated.
+
+This script attempts to produce idential output to what BLAST+ would have done.
+However, check this with "diff -b ..." since BLAST+ sometimes includes an extra
+space character (probably a bug).
+"""
+import sys
+
+#Parse Command Line
+in_file, out_file = sys.argv[1:]
+
+assert sys.version_info[:2] >= ( 2, 4 )
+if sys.version_info[:2] >= ( 2, 5 ):
+    import xml.etree.cElementTree as cElementTree
+else:
+    import cElementTree 
+
+def stop_err( msg ):
+    sys.stderr.write("%s\n" % msg)
+    sys.exit(1)
+
+tags = ["Hsp_identity",
+        "Hsp_align-len",
+        "Hsp_gaps",
+        "Hsp_query-from",
+        "Hsp_query-to",
+        "Hsp_hit-from",
+        "Hsp_hit-to",
+        "Hsp_evalue",
+        "Hsp_bit-score"]
+
+# get an iterable
+try: 
+    context = cElementTree.iterparse(in_file, events=("start", "end"))
+except:
+    stop_err("Invalid data format.")
+# turn it into an iterator
+context = iter(context)
+# get the root element
+try:
+    event, root = context.next()
+except:
+    stop_err( "Invalid data format." )
+
+outfile = open(out_file, 'w')
+for event, elem in context:
+    # for every <Iteration> tag
+    if event == "end" and elem.tag == "Iteration":
+        qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
+        # for every <Hit> within <Iteration>
+        for hit in elem.findall("Iteration_hits/Hit/"):
+            sseqid = hit.findtext("Hit_id").split(None,1)[0]
+            # for every <Hsp> within <Hit>
+            for hsp in hit.findall("Hit_hsps/Hsp"):
+                identity = hsp.findtext("Hsp_identity")
+                length = hsp.findtext("Hsp_align-len")
+                pident = "%0.2f" % (100*float(identity)/float(length))
+
+                q_seq = hsp.findtext("Hsp_qseq")
+                h_seq = hsp.findtext("Hsp_hseq")
+                assert len(q_seq) == len(h_seq) == int(length)
+                gapopen = str(len(q_seq.replace('-', ' ').split())-1  + \
+                              len(h_seq.replace('-', ' ').split())-1)
+                mismatch = str(len(q_seq) - sum(1 for q,h in zip(q_seq, h_seq) \
+                                                if q == h or q == "-" or h == "-"))
+                assert int(identity) == sum(1 for q,h in zip(q_seq, h_seq) if q == h)
+
+                evalue = hsp.findtext("Hsp_evalue")
+                if evalue == "0":
+                    evalue = "0.0"
+                else:
+                    evalue = "%0.0e" % float(evalue)
+                
+                bitscore = float(hsp.findtext("Hsp_bit-score"))
+                if bitscore < 100:
+                    #Seems to show one decimal place for lower scores
+                    bitscore = "%0.1f" % bitscore
+                else:
+                    #Note BLAST does not round to nearest int, it truncates
+                    bitscore = "%i" % bitscore
+
+                values = [qseqid,
+                          sseqid,
+                          pident,
+                          length, #hsp.findtext("Hsp_align-len")
+                          mismatch,
+                          gapopen,
+                          hsp.findtext("Hsp_query-from"), #qstart,
+                          hsp.findtext("Hsp_query-to"), #qend,
+                          hsp.findtext("Hsp_hit-from"), #sstart,
+                          hsp.findtext("Hsp_hit-to"), #send,
+                          evalue, #hsp.findtext("Hsp_evalue") in scientific notation
+                          bitscore, #hsp.findtext("Hsp_bit-score") rounded
+                          ]
+                #print "\t".join(values) 
+                outfile.write("\t".join(values) + "\n")
+        # prevents ElementTree from growing large datastructure
+        root.clear()
+        elem.clear()
+outfile.close()

    

[galaxy-commits] galaxy-dist commit 8979b429d769: Adding a Python script for converting BLAST XML to 12 column tabular

commits-noreply＠bitbucket.org