# HG changeset patch -- Bitbucket.org # Project galaxy-dist # URL http://bitbucket.org/galaxy/galaxy-dist/overview # User peterjc <p.j.a.cock@googlemail.com> # Date 1285873215 -3600 # Node ID 5498ff5b31ac871c7d22df60146fd96c61b74aab # Parent ad83deba645811d8b9509a6e3c58e986ffa89e3b Allow filter to pick ID column --- a/tools/ncbi_blast_plus/blast_filter_fasta.py +++ b/tools/ncbi_blast_plus/blast_filter_fasta.py @@ -1,22 +1,26 @@ #!/usr/bin/env python -"""Filter a FASTA file using tabular BLAST output. +"""Filter a FASTA file using tabular output, e.g. from BLAST. -Takes four command line options, tabular BLAST filename, input FASTA filename, -and two output FASTA filenames (for records with and without any BLAST hits). +Takes five command line options: tabular BLAST filename, ID column number +(using one-based counting), input FASTA filename, and two output FASTA +filenames (for records with and without any BLAST hits). + +In the default NCBI BLAST+ tabular output, the query sequence ID is in column +one, and the ID of the match from the database is in column two. """ -#TODO - Option to define which column to use for ID? 
- import sys from galaxy_utils.sequence.fasta import fastaReader, fastaWriter #Parse Command Line -blast_file, in_file, out_positive_file, out_negative_file = sys.argv[1:] +blast_file, blast_col, in_file, out_positive_file, out_negative_file = sys.argv[1:] +blast_col = int(blast_col)-1 +assert blast_col >= 0 #Read tabular BLAST file and record all queries with hit(s) ids = set() blast_handle = open(blast_file, "rU") for line in blast_handle: - ids.add(line.split("\t")[0]) + ids.add(line.split("\t")[blast_col]) blast_handle.close() #Write filtered FASTA file based on IDs from BLAST file --- a/tools/ncbi_blast_plus/blast_filter_fasta.xml +++ b/tools/ncbi_blast_plus/blast_filter_fasta.xml @@ -1,10 +1,13 @@ <tool id="blast_filter_fasta" name="BLAST filter FASTA file" version="0.0.1"><description>Divide a FASTA file based on BLAST hits</description><command interpreter="python"> - blast_filter_fasta.py $blast_file $in_file $out_positive_file $out_negative_file + blast_filter_fasta.py $blast_file $blast_col $in_file $out_positive_file $out_negative_file </command><inputs><param name="blast_file" type="data" format="tabular" label="Tabular BLAST output"/> + <param name="blast_col" type="integer" value="1" label="Column containing FASTA identifiers"> + <validator type="in_range" min="1" /> + </param><param name="in_file" type="data" format="fasta" label="BLAST query FASTA file"/></inputs><outputs> @@ -19,8 +22,12 @@ **What it does** -Takes a multi-sequence FASTA and the tabular output of running BLAST on it, and -divides the FASTA file in two: those sequence with a BLAST hit, and those without. +Typical use would be to take a multi-sequence FASTA and the tabular output of +running BLAST on it, and divide the FASTA file in two: those sequences with a +BLAST hit, and those without. + +In the default NCBI BLAST+ tabular output, the query sequence ID is in column +one, and the ID of the match from the database is in column two. </help></tool>