# HG changeset patch -- Bitbucket.org # Project galaxy-dist # URL http://bitbucket.org/galaxy/galaxy-dist/overview # User peterjc <p.j.a.cock@googlemail.com> # Date 1285681430 -3600 # Node ID 34794995b12f700d50c12bb013431c882dca9c10 # Parent 5244ad54bebadf57e8bede36643afa49c58ac579 Adding simple python script to split a FASTA file into those sequences with/without BLAST hits using tabular output --- a/tool_conf.xml.sample +++ b/tool_conf.xml.sample @@ -265,6 +265,7 @@ <section name="NCBI BLAST+" id="ncbi_blast_plus_tools"><tool file="ncbi_blast_plus/ncbi_blastn_wrapper.xml" /><tool file="ncbi_blast_plus/ncbi_blastp_wrapper.xml" /> + <tool file="ncbi_blast_plus/blast_filter_fasta.xml" /></section><section name="NGS: Mapping" id="solexa_tools"><tool file="sr_mapping/lastz_wrapper.xml" /> --- /dev/null +++ b/tools/ncbi_blast_plus/blast_filter_fasta.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python +"""Filter a FASTA file using tabular BLAST output. + +Takes four command line options, tabular BLAST filename, input FASTA filename, +and two output FASTA filenames (for records with and without any BLAST hits). +""" +#TODO - Option to define which column to use for ID? + +import sys +from galaxy_utils.sequence.fasta import fastaReader, fastaWriter + +#Parse Command Line +blast_file, in_file, out_positive_file, out_negative_file = sys.argv[1:] + +#Read tabular BLAST file and record all queries with hit(s) +ids = set() +blast_handle = open(blast_file, "rU") +for line in blast_handle: + ids.add(line.split("\t")[0]) +blast_handle.close() + +#Write filtered FASTA file based on IDs from BLAST file +reader = fastaReader(open(in_file, "rU")) +positive_writer = fastaWriter(open(out_positive_file, "w")) +negative_writer = fastaWriter(open(out_negative_file, "w")) +for record in reader: + #The [1:] is because the fastaReader leaves the > on the identifier. 
+ if record.identifier and record.identifier.split()[0][1:] in ids: + positive_writer.write(record) + else: + negative_writer.write(record) +positive_writer.close() +negative_writer.close() +reader.close() --- /dev/null +++ b/tools/ncbi_blast_plus/blast_filter_fasta.xml @@ -0,0 +1,26 @@ +<tool id="blast_filter_fasta" name="BLAST filter FASTA file" version="0.0.1"> + <description>Divide a FASTA file based on BLAST hits</description> + <command interpreter="python"> + blast_filter_fasta.py $blast_file $in_file $out_positive_file $out_negative_file + </command> + <inputs> + <param name="blast_file" type="data" format="tabular" label="Tabular BLAST output"/> + <param name="in_file" type="data" format="fasta" label="BLAST query FASTA file"/> + </inputs> + <outputs> + <data name="out_positive_file" format="fasta" label="Sequences with BLAST hits" /> + <data name="out_negative_file" format="fasta" label="Sequences without BLAST hits" /> + </outputs> + <requirements> + </requirements> + <tests> + </tests> + <help> + +**What it does** + +Takes a multi-sequence FASTA and the tabular output of running BLAST on it, and +divides the FASTA file in two: those sequences with a BLAST hit, and those without. + + </help> +</tool>