galaxy-dist commit 5498ff5b31ac: Allow filter to pick ID column

20 Nov 2010

# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User peterjc <p.j.a.cock@googlemail.com>
# Date 1285873215 -3600
# Node ID 5498ff5b31ac871c7d22df60146fd96c61b74aab
# Parent  ad83deba645811d8b9509a6e3c58e986ffa89e3b
Allow filter to pick ID column

--- a/tools/ncbi_blast_plus/blast_filter_fasta.py
+++ b/tools/ncbi_blast_plus/blast_filter_fasta.py
@@ -1,22 +1,26 @@
 #!/usr/bin/env python
-"""Filter a FASTA file using tabular BLAST output.
+"""Filter a FASTA file using tabular output, e.g. from BLAST.
 
-Takes four command line options, tabular BLAST filename, input FASTA filename,
-and two output FASTA filenames (for records with and without any BLAST hits).
+Takes five command line options: tabular BLAST filename, ID column number
+(using one-based counting), input FASTA filename, and two output FASTA
+filenames (for records with and without any BLAST hits).
+
+In the default NCBI BLAST+ tabular output, the query sequence ID is in column
+one, and the ID of the match from the database is in column two.
 """
-#TODO - Option to define which column to use for ID?
-
 import sys
 from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
 
 #Parse Command Line
-blast_file, in_file, out_positive_file, out_negative_file = sys.argv[1:]
+blast_file, blast_col, in_file, out_positive_file, out_negative_file = sys.argv[1:]
+blast_col = int(blast_col)-1
+assert blast_col >= 0
 
 #Read tabular BLAST file and record all queries with hit(s)
 ids = set()
 blast_handle = open(blast_file, "rU")  
 for line in blast_handle:
-    ids.add(line.split("\t")[0])
+    ids.add(line.split("\t")[blast_col])
 blast_handle.close()
 
 #Write filtered FASTA file based on IDs from BLAST file

--- a/tools/ncbi_blast_plus/blast_filter_fasta.xml
+++ b/tools/ncbi_blast_plus/blast_filter_fasta.xml
@@ -1,10 +1,13 @@
 <tool id="blast_filter_fasta" name="BLAST filter FASTA file" version="0.0.1"><description>Divide a FASTA file based on BLAST hits</description><command interpreter="python">
-      blast_filter_fasta.py $blast_file $in_file $out_positive_file $out_negative_file
+      blast_filter_fasta.py $blast_file $blast_col $in_file $out_positive_file $out_negative_file
     </command><inputs><param name="blast_file" type="data" format="tabular" label="Tabular BLAST output"/> 
+        <param name="blast_col" type="integer" value="1" label="Column containing FASTA identifiers">
+            <validator type="in_range" min="1" />
+        </param><param name="in_file" type="data" format="fasta" label="BLAST query FASTA file"/></inputs><outputs>
@@ -19,8 +22,12 @@
     
 **What it does**
 
-Takes a multi-sequence FASTA and the tabular output of running BLAST on it, and
-divides the FASTA file in two: those sequence with a BLAST hit, and those without.
+Typical use would be to take a multi-sequence FASTA and the tabular output of
+running BLAST on it, and divide the FASTA file in two: those sequences with a
+BLAST hit, and those without.
+
+In the default NCBI BLAST+ tabular output, the query sequence ID is in column
+one, and the ID of the match from the database is in column two.
 
     </help></tool>

    

commits-noreply＠bitbucket.org

tags

participants (1)