# HG changeset patch -- Bitbucket.org # Project galaxy-dist # URL http://bitbucket.org/galaxy/galaxy-dist/overview # User peterjc <p.j.a.cock@googlemail.com> # Date 1287998919 -3600 # Node ID d8184d91928c83f0269bd0267be949bc5d676e8b # Parent 5c212dfc6189bb41d334b0519411ca4f04fde9ec Remove FASTA filter script from BLAST+ tools (I want to generalise it) --- a/tool_conf.xml.sample +++ b/tool_conf.xml.sample @@ -269,7 +269,6 @@ <tool file="ncbi_blast_plus/ncbi_tblastn_wrapper.xml" /><tool file="ncbi_blast_plus/ncbi_tblastx_wrapper.xml" /><tool file="ncbi_blast_plus/blastxml_to_tabular.xml" /> - <tool file="ncbi_blast_plus/blast_filter_fasta.xml" /></section><section name="NGS: Mapping" id="solexa_tools"><tool file="sr_mapping/lastz_wrapper.xml" /> --- a/tools/ncbi_blast_plus/blast_filter_fasta.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python -"""Filter a FASTA file using tabular output, e.g. from BLAST. - -Takes five command line options, tabular BLAST filename, ID column number -(using one based counting), input FASTA filename, and two output FASTA -filenames (for records with and without any BLAST hits). - -In the default NCBI BLAST+ tabular output, the query sequence ID is in column -one, and the ID of the match from the database is in column two. -""" -import sys -from galaxy_utils.sequence.fasta import fastaReader, fastaWriter - -#Parse Command Line -blast_file, blast_col, in_file, out_positive_file, out_negative_file = sys.argv[1:] -blast_col = int(blast_col)-1 -assert blast_col >= 0 - -#Read tabular BLAST file and record all queries with hit(s) -ids = set() -blast_handle = open(blast_file, "rU") -for line in blast_handle: - ids.add(line.split("\t")[blast_col]) -blast_handle.close() - -#Write filtered FASTA file based on IDs from BLAST file -reader = fastaReader(open(in_file, "rU")) -positive_writer = fastaWriter(open(out_positive_file, "w")) -negative_writer = fastaWriter(open(out_negative_file, "w")) -for record in reader: - #The [1:] is because the fastaReader leaves the > on the identifer. - if record.identifier and record.identifier.split()[0][1:] in ids: - positive_writer.write(record) - else: - negative_writer.write(record) -positive_writer.close() -negative_writer.close() -reader.close() --- a/tools/ncbi_blast_plus/blast_filter_fasta.xml +++ /dev/null @@ -1,37 +0,0 @@ -<tool id="blast_filter_fasta" name="Filter FASTA using BLAST output" version="0.0.1"> - <description>Divide a FASTA file based on BLAST hits</description> - <command interpreter="python"> - blast_filter_fasta.py $blast_file $blast_col $in_file $out_positive_file $out_negative_file - </command> - <inputs> - <param name="in_file" type="data" format="fasta" label="FASTA file to filter"/> - <param name="blast_file" type="data" format="tabular" label="Tabular BLAST output"/> - <param name="blast_col" type="select" label="Column containing FASTA identifiers"> - <option value="1">Column 1 - BLAST query ID</option> - <option value="2">Column 2 - BLAST match ID</option> - </param> - </inputs> - <outputs> - <data name="out_positive_file" format="fasta" label="Sequences with BLAST hits" /> - <data name="out_negative_file" format="fasta" label="Sequences without BLAST hits" /> - </outputs> - <requirements> - </requirements> - <tests> - </tests> - <help> - -**What it does** - -Typical use would be to take a multi-sequence FASTA and the tabular output of -running BLAST on it, and divide the FASTA file in two: those sequence with a -BLAST hit, and those without. - -In the default NCBI BLAST+ tabular output, the query sequence ID is in column -one, and the ID of the match from the database is in column two. - -This allows you to filter the FASTA file for the subjects in the BLAST search, -rather than filtering the FASTA file for the queries in the BLAST search. - - </help> -</tool>