3 new commits in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/4ae7e5848f7f/ Changeset: 4ae7e5848f7f User: nsoranzo Date: 2014-03-27 17:03:16 Summary: Add BCF datatype sniffing, so BCF files are not uncompressed during upload. Affected #: 1 file diff -r b6d04f39a37fd73abd9c0c5e9695d74330786f47 -r 4ae7e5848f7f555e24540518ba95582f6af1cd58 lib/galaxy/datatypes/binary.py --- a/lib/galaxy/datatypes/binary.py +++ b/lib/galaxy/datatypes/binary.py @@ -69,6 +69,7 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def get_mime( self ): """Returns the mime type of the datatype""" return 'application/octet-stream' @@ -96,6 +97,7 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def display_peek( self, dataset ): try: return dataset.peek @@ -133,12 +135,14 @@ version = line.split()[1] break return version + def _is_coordinate_sorted( self, file_name ): """See if the input BAM file is sorted from the header information.""" params = [ "samtools", "view", "-H", file_name ] output = subprocess.Popen( params, stderr=subprocess.PIPE, stdout=subprocess.PIPE ).communicate()[0] # find returns -1 if string is not found return output.find( "SO:coordinate" ) != -1 or output.find( "SO:sorted" ) != -1 + def dataset_content_needs_grooming( self, file_name ): """See if file_name is a sorted BAM file""" version = self._get_samtools_version() @@ -186,6 +190,7 @@ except OSError: pass return False + def groom_dataset_content( self, file_name ): """ Ensures that the Bam file contents are sorted. This function is called @@ -221,8 +226,10 @@ # Remove temp file and empty temporary directory os.unlink( stderr_name ) os.rmdir( tmp_dir ) + def init_meta( self, dataset, copy_from=None ): Binary.init_meta( self, dataset, copy_from=copy_from ) + def set_meta( self, dataset, overwrite = True, **kwd ): """ Creates the index for the BAM file. """ # These metadata values are not accessible by users, always overwrite @@ -247,6 +254,7 @@ dataset.metadata.bam_index = index_file # Remove temp file os.unlink( stderr_name ) + def sniff( self, filename ): # BAM is compressed in the BGZF format, and must not be uncompressed in Galaxy. # The first 4 bytes of any bam file is 'BAM\1', and the file is binary. @@ -257,6 +265,7 @@ return False except: return False + def set_peek( self, dataset, is_multi_byte=False ): if not dataset.dataset.purged: dataset.peek = "Binary bam alignments file" @@ -264,6 +273,7 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def display_peek( self, dataset ): try: return dataset.peek @@ -355,6 +365,24 @@ Binary.register_sniffable_binary_format("bam", "bam", Bam) +class Bcf( Binary): + """Class describing a BCF file""" + file_ext = "bcf" + + def sniff( self, filename ): + # BCF is compressed in the BGZF format, and must not be uncompressed in Galaxy. + # The first 3 bytes of any bcf file is 'BCF', and the file is binary. + try: + header = gzip.open( filename ).read(3) + if binascii.b2a_hex( header ) == binascii.hexlify( 'BCF' ): + return True + return False + except: + return False + +Binary.register_sniffable_binary_format("bcf", "bcf", Bcf) + + class H5( Binary ): """Class describing an HDF5 file""" file_ext = "h5" @@ -366,6 +394,7 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def display_peek( self, dataset ): try: return dataset.peek @@ -386,6 +415,7 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def display_peek( self, dataset ): try: return dataset.peek @@ -401,6 +431,7 @@ def __init__( self, **kwd ): Binary.__init__( self, **kwd ) + def sniff( self, filename ): # The first 4 bytes of any sff file is '.sff', and the file is binary. For details # about the format, see http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=format @@ -411,6 +442,7 @@ return False except: return False + def set_peek( self, dataset, is_multi_byte=False ): if not dataset.dataset.purged: dataset.peek = "Binary sff file" @@ -418,6 +450,7 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def display_peek( self, dataset ): try: return dataset.peek @@ -440,14 +473,17 @@ Binary.__init__( self, **kwd ) self._magic = 0x888FFC26 self._name = "BigWig" + def _unpack( self, pattern, handle ): return struct.unpack( pattern, handle.read( struct.calcsize( pattern ) ) ) + def sniff( self, filename ): try: magic = self._unpack( "I", open( filename ) ) return magic[0] == self._magic except: return False + def set_peek( self, dataset, is_multi_byte=False ): if not dataset.dataset.purged: dataset.peek = "Binary UCSC %s file" % self._name @@ -455,6 +491,7 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def display_peek( self, dataset ): try: return dataset.peek @@ -493,12 +530,14 @@ return True except IOError: return False + def set_peek(self, dataset, is_multi_byte=False): if not dataset.dataset.purged: dataset.peek = "Binary TwoBit format nucleotide file" dataset.blurb = data.nice_size(dataset.get_size()) else: return super(TwoBit, self).set_peek(dataset, is_multi_byte) + def display_peek(self, dataset): try: return dataset.peek https://bitbucket.org/galaxy/galaxy-central/commits/6f9d3980f9d8/ Changeset: 6f9d3980f9d8 User: nsoranzo Date: 2014-03-27 17:19:39 Summary: Use new Bcf class. Affected #: 1 file diff -r 4ae7e5848f7f555e24540518ba95582f6af1cd58 -r 6f9d3980f9d87af158e11cabad5df892efbb69ad datatypes_conf.xml.sample --- a/datatypes_conf.xml.sample +++ b/datatypes_conf.xml.sample @@ -189,7 +189,7 @@ <display file="igv/vcf.xml" /><display file="rviewer/vcf.xml" inherit="True"/></datatype> - <datatype extension="bcf" type="galaxy.datatypes.binary:Binary" subclass="True" display_in_upload="True"/> + <datatype extension="bcf" type="galaxy.datatypes.binary:Bcf" mimetype="application/octet-stream" display_in_upload="True"/><datatype extension="velvet" type="galaxy.datatypes.assembly:Velvet" display_in_upload="false"/><datatype extension="wig" type="galaxy.datatypes.interval:Wiggle" display_in_upload="true" description="The wiggle format is line-oriented. Wiggle data is preceded by a track definition line, which adds a number of options for controlling the default display of this track." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Wig"><converter file="wig_to_bigwig_converter.xml" target_datatype="bigwig"/> https://bitbucket.org/galaxy/galaxy-central/commits/4d99cae3baac/ Changeset: 4d99cae3baac User: dannon Date: 2014-03-27 17:37:21 Summary: Merged in nsoranzo/galaxy-central (pull request #353) Add BCF datatype sniffing, so BCF files are not uncompressed during upload. Affected #: 2 files diff -r c4f468e43b93406804bd7665a2755b16f8bac463 -r 4d99cae3baac3d5c136541aaed5aa640881e6a5b datatypes_conf.xml.sample --- a/datatypes_conf.xml.sample +++ b/datatypes_conf.xml.sample @@ -189,7 +189,7 @@ <display file="igv/vcf.xml" /><display file="rviewer/vcf.xml" inherit="True"/></datatype> - <datatype extension="bcf" type="galaxy.datatypes.binary:Binary" subclass="True" display_in_upload="True"/> + <datatype extension="bcf" type="galaxy.datatypes.binary:Bcf" mimetype="application/octet-stream" display_in_upload="True"/><datatype extension="velvet" type="galaxy.datatypes.assembly:Velvet" display_in_upload="false"/><datatype extension="wig" type="galaxy.datatypes.interval:Wiggle" display_in_upload="true" description="The wiggle format is line-oriented. Wiggle data is preceded by a track definition line, which adds a number of options for controlling the default display of this track." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Wig"><converter file="wig_to_bigwig_converter.xml" target_datatype="bigwig"/> diff -r c4f468e43b93406804bd7665a2755b16f8bac463 -r 4d99cae3baac3d5c136541aaed5aa640881e6a5b lib/galaxy/datatypes/binary.py --- a/lib/galaxy/datatypes/binary.py +++ b/lib/galaxy/datatypes/binary.py @@ -69,6 +69,7 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def get_mime( self ): """Returns the mime type of the datatype""" return 'application/octet-stream' @@ -96,6 +97,7 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def display_peek( self, dataset ): try: return dataset.peek @@ -133,12 +135,14 @@ version = line.split()[1] break return version + def _is_coordinate_sorted( self, file_name ): """See if the input BAM file is sorted from the header information.""" params = [ "samtools", "view", "-H", file_name ] output = subprocess.Popen( params, stderr=subprocess.PIPE, stdout=subprocess.PIPE ).communicate()[0] # find returns -1 if string is not found return output.find( "SO:coordinate" ) != -1 or output.find( "SO:sorted" ) != -1 + def dataset_content_needs_grooming( self, file_name ): """See if file_name is a sorted BAM file""" version = self._get_samtools_version() @@ -186,6 +190,7 @@ except OSError: pass return False + def groom_dataset_content( self, file_name ): """ Ensures that the Bam file contents are sorted. This function is called @@ -221,8 +226,10 @@ # Remove temp file and empty temporary directory os.unlink( stderr_name ) os.rmdir( tmp_dir ) + def init_meta( self, dataset, copy_from=None ): Binary.init_meta( self, dataset, copy_from=copy_from ) + def set_meta( self, dataset, overwrite = True, **kwd ): """ Creates the index for the BAM file. """ # These metadata values are not accessible by users, always overwrite @@ -247,6 +254,7 @@ dataset.metadata.bam_index = index_file # Remove temp file os.unlink( stderr_name ) + def sniff( self, filename ): # BAM is compressed in the BGZF format, and must not be uncompressed in Galaxy. # The first 4 bytes of any bam file is 'BAM\1', and the file is binary. @@ -257,6 +265,7 @@ return False except: return False + def set_peek( self, dataset, is_multi_byte=False ): if not dataset.dataset.purged: dataset.peek = "Binary bam alignments file" @@ -264,6 +273,7 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def display_peek( self, dataset ): try: return dataset.peek @@ -355,6 +365,24 @@ Binary.register_sniffable_binary_format("bam", "bam", Bam) +class Bcf( Binary): + """Class describing a BCF file""" + file_ext = "bcf" + + def sniff( self, filename ): + # BCF is compressed in the BGZF format, and must not be uncompressed in Galaxy. + # The first 3 bytes of any bcf file is 'BCF', and the file is binary. + try: + header = gzip.open( filename ).read(3) + if binascii.b2a_hex( header ) == binascii.hexlify( 'BCF' ): + return True + return False + except: + return False + +Binary.register_sniffable_binary_format("bcf", "bcf", Bcf) + + class H5( Binary ): """Class describing an HDF5 file""" file_ext = "h5" @@ -366,6 +394,7 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def display_peek( self, dataset ): try: return dataset.peek @@ -386,6 +415,7 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def display_peek( self, dataset ): try: return dataset.peek @@ -401,6 +431,7 @@ def __init__( self, **kwd ): Binary.__init__( self, **kwd ) + def sniff( self, filename ): # The first 4 bytes of any sff file is '.sff', and the file is binary. For details # about the format, see http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=format @@ -411,6 +442,7 @@ return False except: return False + def set_peek( self, dataset, is_multi_byte=False ): if not dataset.dataset.purged: dataset.peek = "Binary sff file" @@ -418,6 +450,7 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def display_peek( self, dataset ): try: return dataset.peek @@ -440,14 +473,17 @@ Binary.__init__( self, **kwd ) self._magic = 0x888FFC26 self._name = "BigWig" + def _unpack( self, pattern, handle ): return struct.unpack( pattern, handle.read( struct.calcsize( pattern ) ) ) + def sniff( self, filename ): try: magic = self._unpack( "I", open( filename ) ) return magic[0] == self._magic except: return False + def set_peek( self, dataset, is_multi_byte=False ): if not dataset.dataset.purged: dataset.peek = "Binary UCSC %s file" % self._name @@ -455,6 +491,7 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def display_peek( self, dataset ): try: return dataset.peek @@ -493,12 +530,14 @@ return True except IOError: return False + def set_peek(self, dataset, is_multi_byte=False): if not dataset.dataset.purged: dataset.peek = "Binary TwoBit format nucleotide file" dataset.blurb = data.nice_size(dataset.get_size()) else: return super(TwoBit, self).set_peek(dataset, is_multi_byte) + def display_peek(self, dataset): try: return dataset.peek Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.