commit/galaxy-central: jmchilton: Merged in BjoernGruening/galaxy-central-1/snpeff_datatype (pull request #640)
1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/482d85dda5d3/ Changeset: 482d85dda5d3 User: jmchilton Date: 2015-01-23 17:12:43+00:00 Summary: Merged in BjoernGruening/galaxy-central-1/snpeff_datatype (pull request #640) Add SnpEff datatypes Affected #: 3 files diff -r 2fd8714c4faeb580f3ccf1591003f10e821397a1 -r 482d85dda5d3546e1d6e9cca6d712d7af8c772ba config/datatypes_conf.xml.sample --- a/config/datatypes_conf.xml.sample +++ b/config/datatypes_conf.xml.sample @@ -113,7 +113,7 @@ <converter file="interval_to_bed12_converter.xml" target_datatype="bed12"/><converter file="interval_to_bgzip_converter.xml" target_datatype="bgzip"/><converter file="interval_to_tabix_converter.xml" target_datatype="tabix" depends_on="bgzip"/> - <converter file="interval_to_bigwig_converter.xml" target_datatype="bigwig"/> + <converter file="interval_to_bigwig_converter.xml" target_datatype="bigwig"/><!-- <display file="ucsc/interval_as_bed.xml" inherit="True" /> --><display file="ensembl/ensembl_interval_as_bed.xml" inherit="True"/><display file="gbrowse/gbrowse_interval_as_bed.xml" inherit="True"/> @@ -152,11 +152,11 @@ <datatype extension="mafcustomtrack" type="galaxy.datatypes.sequence:MafCustomTrack"><display file="ucsc/maf_customtrack.xml" /></datatype> - <datatype extension="encodepeak" type="galaxy.datatypes.interval:ENCODEPeak" display_in_upload="True"> - <converter file="encodepeak_to_tabix_converter.xml" target_datatype="tabix" depends_on="bgzip"/> - <converter file="encodepeak_to_bgzip_converter.xml" target_datatype="bgzip"/> - <converter file="bed_gff_or_vcf_to_bigwig_converter.xml" target_datatype="bigwig"/> - </datatype> + <datatype extension="encodepeak" type="galaxy.datatypes.interval:ENCODEPeak" display_in_upload="True"> + <converter file="encodepeak_to_tabix_converter.xml" target_datatype="tabix" depends_on="bgzip"/> + <converter file="encodepeak_to_bgzip_converter.xml" target_datatype="bgzip"/> + <converter file="bed_gff_or_vcf_to_bigwig_converter.xml" target_datatype="bigwig"/> + </datatype><datatype extension="pdf" type="galaxy.datatypes.images:Pdf" mimetype="application/pdf"/><datatype extension="pileup" type="galaxy.datatypes.tabular:Pileup" display_in_upload="true"><converter file="interval_to_bgzip_converter.xml" target_datatype="bgzip"/> @@ -172,11 +172,16 @@ <datatype extension="qual454" type="galaxy.datatypes.qualityscore:QualityScore454" display_in_upload="true"/><datatype extension="Roadmaps" type="galaxy.datatypes.assembly:Roadmaps" display_in_upload="false"/><datatype extension="sam" type="galaxy.datatypes.tabular:Sam" display_in_upload="true"> - <converter file="sam_to_bam.xml" target_datatype="bam"/> - <converter file="sam_to_bigwig_converter.xml" target_datatype="bigwig"/> - </datatype> + <converter file="sam_to_bam.xml" target_datatype="bam"/> + <converter file="sam_to_bigwig_converter.xml" target_datatype="bigwig"/> + </datatype><datatype extension="scf" type="galaxy.datatypes.binary:Scf" mimetype="application/octet-stream" display_in_upload="true" description="A binary sequence file in 'scf' format with a '.scf' file extension. You must manually select this 'File Format' when uploading the file." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Scf"/><datatype extension="Sequences" type="galaxy.datatypes.assembly:Sequences" display_in_upload="false"/> + <datatype extension="snpeffdb" type="galaxy.datatypes.text:SnpEffDb" display_in_upload="True"/> + <datatype extension="snpsiftdbnsfp" type="galaxy.datatypes.txt:SnpSiftDbNSFP" display_in_upload="True"/> + <datatype extension="dbnsfp.tabular" type="galaxy.datatypes.tabular:Tabular" subclass="True" display_in_upload="True"> + <converter file="tabular_to_dbnsfp.xml" target_datatype="snpsiftdbnsfp"/> + </datatype><datatype extension="sff" type="galaxy.datatypes.binary:Sff" mimetype="application/octet-stream" display_in_upload="true" description="A binary file in 'Standard Flowgram Format' with a '.sff' file extension." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Sff"/><datatype extension="svg" type="galaxy.datatypes.images:Image" mimetype="image/svg+xml"/><datatype extension="taxonomy" type="galaxy.datatypes.tabular:Taxonomy" display_in_upload="true"/> diff -r 2fd8714c4faeb580f3ccf1591003f10e821397a1 -r 482d85dda5d3546e1d6e9cca6d712d7af8c772ba lib/galaxy/datatypes/converters/tabular_to_dbnsfp.xml --- /dev/null +++ b/lib/galaxy/datatypes/converters/tabular_to_dbnsfp.xml @@ -0,0 +1,12 @@ +<tool id="tabular_to_dbnsfp" name="Convert tabular to dbnsfp" version="1.0.0"> + <description></description> + <command interpreter="python">tabular_to_dbnsfp.py $input $dbnsfp.extra_files_path/dbNSFP.gz</command> + <inputs> + <param format="tabular" name="input" type="data" label="Choose a dbnsfp tabular file"/> + </inputs> + <outputs> + <data format="snpsiftdbnsfp" name="dbnsfp"/> + </outputs> + <help> + </help> +</tool> diff -r 2fd8714c4faeb580f3ccf1591003f10e821397a1 -r 482d85dda5d3546e1d6e9cca6d712d7af8c772ba lib/galaxy/datatypes/text.py --- a/lib/galaxy/datatypes/text.py +++ b/lib/galaxy/datatypes/text.py @@ -2,7 +2,6 @@ """ Clearing house for generic text datatypes that are not XML or tabular. """ - from galaxy.datatypes.data import Text from galaxy.datatypes.data import get_file_peek from galaxy.datatypes.data import nice_size @@ -12,13 +11,13 @@ import tempfile import subprocess import json +import gzip import os import re import logging log = logging.getLogger(__name__) - class Json( Text ): file_ext = "json" @@ -259,3 +258,121 @@ dataset.metadata.comment_lines = comment_lines dataset.metadata.columns = column_count + +class SnpEffDb( Text ): + """Class describing a SnpEff genome build""" + file_ext = "snpeffdb" + MetadataElement( name="genome_version", default=None, desc="Genome Version", readonly=True, visible=True, no_value=None ) + MetadataElement( name="regulation", default=[], desc="Regulation Names", readonly=True, visible=True, no_value=[], optional=True) + MetadataElement( name="annotation", default=[], desc="Annotation Names", readonly=True, visible=True, no_value=[], optional=True) + + def __init__( self, **kwd ): + Text.__init__( self, **kwd ) + + def set_meta( self, dataset, **kwd ): + Text.set_meta(self, dataset, **kwd ) + data_dir = dataset.extra_files_path + ## search data_dir/genome_version for files + regulation_pattern = 'regulation_(.+).bin' + # annotation files that are included in snpEff by a flag + annotations_dict = {'nextProt.bin' : '-nextprot','motif.bin': '-motif'} + regulations = [] + annotations = [] + if data_dir and os.path.isdir(data_dir): + for root, dirs, files in os.walk(data_dir): + for fname in files: + if fname.startswith('snpEffectPredictor'): + # if snpEffectPredictor.bin download succeeded + genome_version = os.path.basename(root) + dataset.metadata.genome_version = genome_version + else: + m = re.match(regulation_pattern,fname) + if m: + name = m.groups()[0] + regulations.append(name) + elif fname in annotations_dict: + value = annotations_dict[fname] + name = value.lstrip('-') + annotations.append(name) + dataset.metadata.regulation = regulations + dataset.metadata.annotation = annotations + try: + fh = file(dataset.file_name,'w') + fh.write("%s\n" % genome_version) + if annotations: + fh.write("annotations: %s\n" % ','.join(annotations)) + if regulations: + fh.write("regulations: %s\n" % ','.join(regulations)) + fh.close() + except: + pass + + +class SnpSiftDbNSFP( Text ): + """Class describing a dbNSFP database prepared fpr use by SnpSift dbnsfp """ + MetadataElement( name='reference_name', default='dbSNFP' , desc='Reference Name', readonly=True, visible=True, set_in_upload=True, no_value='dbSNFP' ) + MetadataElement( name="bgzip", default=None, desc="dbNSFP bgzip", readonly=True, visible=True, no_value=None ) + MetadataElement( name="index", default=None, desc="Tabix Index File", readonly=True, visible=True, no_value=None) + MetadataElement( name="annotation", default=[], desc="Annotation Names", readonly=True, visible=True, no_value=[] ) + file_ext = "snpsiftdbnsfp" + composite_type = 'auto_primary_file' + allow_datatype_change = False + """ + ## The dbNSFP file is a tabular file with 1 header line + ## The first 4 columns are required to be: chrom pos ref alt + ## These match columns 1,2,4,5 of the VCF file + ## SnpSift requires the file to be block-gzipped and the indexed with samtools tabix + ## Example: + ## Compress using block-gzip algorithm + bgzip dbNSFP2.3.txt + ## Create tabix index + tabix -s 1 -b 2 -e 2 dbNSFP2.3.txt.gz + """ + def __init__( self, **kwd ): + Text.__init__( self, **kwd ) + self.add_composite_file( '%s.grp', description = 'Group File', substitute_name_with_metadata = 'reference_name', is_binary = False ) + self.add_composite_file( '%s.ti', description = '', substitute_name_with_metadata = 'reference_name', is_binary = False ) + def init_meta( self, dataset, copy_from=None ): + Text.init_meta( self, dataset, copy_from=copy_from ) + def generate_primary_file( self, dataset = None ): + """ + This is called only at upload to write the html file + cannot rename the datasets here - they come with the default unfortunately + """ + regenerate_primary_file( self, dataset) + def regenerate_primary_file(self,dataset): + """ + cannot do this until we are setting metadata + """ + annotations = "dbNSFP Annotations: %s\n" % ','.join(dataset.metadata.annotation) + f = open(dataset.file_name,'a') + if dataset.metadata.bgzip: + bn = dataset.metadata.bgzip + f.write(bn) + f.write('\n') + f.write(annotations) + f.close() + def set_meta( self, dataset, overwrite=True, **kwd ): + try: + efp = dataset.extra_files_path + if os.path.exists(efp): + flist = os.listdir(efp) + for i,fname in enumerate(flist): + if fname.endswith('.gz'): + dataset.metadata.bgzip = fname + try: + fh = gzip.open(os.path.join(efp,fname),'r') + buf = fh.read(5000) + lines = buf.splitlines() + headers = lines[0].split('\t') + dataset.metadata.annotation = headers[4:] + except Exception,e: + log.warn("set_meta fname: %s %s" % (fname,str(e))) + finally: + fh.close() + if fname.endswith('.tbi'): + dataset.metadata.index = fname + self.regenerate_primary_file(dataset) + except Exception,e: + log.warn("set_meta fname: %s %s" % (dataset.file_name if dataset and dataset.file_name else 'Unkwown',str(e))) + Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
commits-noreply@bitbucket.org