[hg] galaxy 3141: Add support for uploading BAM files.
details: http://www.bx.psu.edu/hg/galaxy/rev/5e30e1ea6bbb changeset: 3141:5e30e1ea6bbb user: Greg Von Kuster <greg@bx.psu.edu> date: Wed Dec 02 20:02:05 2009 -0500 description: Add support for uploading BAM files. diffstat: datatypes_conf.xml.sample | 3 +- lib/galaxy/datatypes/binary.py | 55 ++++++++++++++++----------- lib/galaxy/datatypes/registry.py | 1 + lib/galaxy/datatypes/sniff.py | 3 + lib/galaxy/datatypes/test/1.bam | test-data/1.bam | test/functional/test_get_data.py | 18 ++++++++- tools/data_source/upload.xml | 6 +++ 8 files changed, 62 insertions(+), 24 deletions(-) diffs (183 lines): diff -r f4e6ed22a39e -r 5e30e1ea6bbb datatypes_conf.xml.sample --- a/datatypes_conf.xml.sample Wed Dec 02 15:23:15 2009 -0500 +++ b/datatypes_conf.xml.sample Wed Dec 02 20:02:05 2009 -0500 @@ -3,7 +3,7 @@ <registration converters_path="lib/galaxy/datatypes/converters"> <datatype extension="ab1" type="galaxy.datatypes.binary:Ab1" mimetype="application/octet-stream" display_in_upload="true"/> <datatype extension="axt" type="galaxy.datatypes.sequence:Axt" display_in_upload="true"/> - <datatype extension="bam" type="galaxy.datatypes.binary:Bam" mimetype="application/octet-stream"/> + <datatype extension="bam" type="galaxy.datatypes.binary:Bam" mimetype="application/octet-stream" display_in_upload="true"/> <datatype extension="bed" type="galaxy.datatypes.interval:Bed" display_in_upload="true"> <converter file="bed_to_gff_converter.xml" target_datatype="gff"/> <converter file="interval_to_coverage.xml" target_datatype="coverage"/> @@ -203,6 +203,7 @@ defined format first, followed by next-most rigidly defined, and so on. --> + <sniffer type="galaxy.datatypes.binary:Bam"/> <sniffer type="galaxy.datatypes.binary:Sff"/> <sniffer type="galaxy.datatypes.xml:BlastXml"/> <sniffer type="galaxy.datatypes.sequence:Maf"/> diff -r f4e6ed22a39e -r 5e30e1ea6bbb lib/galaxy/datatypes/binary.py --- a/lib/galaxy/datatypes/binary.py Wed Dec 02 15:23:15 2009 -0500 +++ b/lib/galaxy/datatypes/binary.py Wed Dec 02 20:02:05 2009 -0500 @@ -12,7 +12,7 @@ log = logging.getLogger(__name__) -sniffable_binary_formats = [ 'sff' ] +sniffable_binary_formats = [ 'sff', 'bam' ] # Currently these supported binary data types must be manually set on upload unsniffable_binary_formats = [ 'ab1', 'scf' ] @@ -26,6 +26,9 @@ else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' + def get_mime( self ): + """Returns the mime type of the datatype""" + return 'application/octet-stream' class Ab1( Binary ): """Class describing an ab1 binary sequence file""" @@ -48,29 +51,40 @@ """Class describing a BAM binary file""" file_ext = "bam" MetadataElement( name="bam_index", desc="BAM Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True ) + def init_meta( self, dataset, copy_from=None ): Binary.init_meta( self, dataset, copy_from=copy_from ) - def set_meta( self, dataset, overwrite = True, **kwd ): """ - Sets index for BAM file. + GVK 12/2/09: just noticed this - not good and doesn't work, so commenting out for now. + def set_meta( self, dataset, overwrite = True, **kwd ): + # Sets index for BAM file. + index_file = dataset.metadata.bam_index + if not index_file: + index_file = dataset.metadata.spec['bam_index'].param.new_file( dataset = dataset ) + tmp_dir = tempfile.gettempdir() + tmpf1 = tempfile.NamedTemporaryFile( dir=tmp_dir ) + tmpf1bai = '%s.bai' % tmpf1.name + try: + os.system( 'cd %s' % tmp_dir ) + os.system( 'cp %s %s' % ( dataset.file_name, tmpf1.name ) ) + os.system( 'samtools index %s' % tmpf1.name ) + os.system( 'cp %s %s' % ( tmpf1bai, index_file.file_name ) ) + except Exception, ex: + sys.stderr.write( 'There was a problem creating the index for the BAM file\n%s\n' + str( ex ) ) + tmpf1.close() + if os.path.exists( tmpf1bai ): + os.remove( tmpf1bai ) + dataset.metadata.bam_index = index_file """ - index_file = dataset.metadata.bam_index - if not index_file: - index_file = dataset.metadata.spec['bam_index'].param.new_file( dataset = dataset ) - tmp_dir = tempfile.gettempdir() - tmpf1 = tempfile.NamedTemporaryFile( dir=tmp_dir ) - tmpf1bai = '%s.bai' % tmpf1.name + def sniff( self, filename ): + # The first 4 bytes of any bam file is 'BAM\1', and the file is binary. try: - os.system( 'cd %s' % tmp_dir ) - os.system( 'cp %s %s' % ( dataset.file_name, tmpf1.name ) ) - os.system( 'samtools index %s' % tmpf1.name ) - os.system( 'cp %s %s' % ( tmpf1bai, index_file.file_name ) ) - except Exception, ex: - sys.stderr.write( 'There was a problem creating the index for the BAM file\n%s\n' + str( ex ) ) - tmpf1.close() - if os.path.exists( tmpf1bai ): - os.remove( tmpf1bai ) - dataset.metadata.bam_index = index_file + header = open( filename ).read(4) + if binascii.b2a_hex( header ) == binascii.hexlify( 'BAM\1' ): + return True + return False + except: + return False def set_peek( self, dataset, is_multi_byte=False ): if not dataset.dataset.purged: export_url = "/history_add_to?" + urlencode( {'history_id':dataset.history_id,'ext':'bam','name':'bam alignments','info':'Alignments file','dbkey':dataset.dbkey} ) @@ -84,9 +98,6 @@ return dataset.peek except: return "Binary bam alignments file (%s)" % ( data.nice_size( dataset.get_size() ) ) - def get_mime( self ): - """Returns the mime type of the datatype""" - return 'application/octet-stream' class Binseq( Binary ): """Class describing a zip archive of binary sequence files""" diff -r f4e6ed22a39e -r 5e30e1ea6bbb lib/galaxy/datatypes/registry.py --- a/lib/galaxy/datatypes/registry.py Wed Dec 02 15:23:15 2009 -0500 +++ b/lib/galaxy/datatypes/registry.py Wed Dec 02 20:02:05 2009 -0500 @@ -174,6 +174,7 @@ # because some formats are much more flexibly defined than others. if len(self.sniff_order) < 1: self.sniff_order = [ + binary.Bam(), binary.Sff(), xml.BlastXml(), sequence.Maf(), diff -r f4e6ed22a39e -r 5e30e1ea6bbb lib/galaxy/datatypes/sniff.py --- a/lib/galaxy/datatypes/sniff.py Wed Dec 02 15:23:15 2009 -0500 +++ b/lib/galaxy/datatypes/sniff.py Wed Dec 02 20:02:05 2009 -0500 @@ -252,6 +252,9 @@ >>> fname = get_test_fname('1.sff') >>> guess_ext(fname) 'sff' + >>> fname = get_test_fname('1.bam') + >>> guess_ext(fname) + 'bam' """ if sniff_order is None: datatypes_registry = registry.Registry() diff -r f4e6ed22a39e -r 5e30e1ea6bbb lib/galaxy/datatypes/test/1.bam Binary file lib/galaxy/datatypes/test/1.bam has changed diff -r f4e6ed22a39e -r 5e30e1ea6bbb test-data/1.bam Binary file test-data/1.bam has changed diff -r f4e6ed22a39e -r 5e30e1ea6bbb test/functional/test_get_data.py --- a/test/functional/test_get_data.py Wed Dec 02 15:23:15 2009 -0500 +++ b/test/functional/test_get_data.py Wed Dec 02 20:02:05 2009 -0500 @@ -520,7 +520,23 @@ self.check_history_for_string( '1.axt format: <span class="axt">axt</span>, database: \? Info: uploaded file' ) self.check_metadata_for_string( 'value="1.axt" value="\?" Change data type selected value="axt" selected="yes"' ) self.delete_history( id=self.security.encode_id( history.id ) ) - def test_0150_url_paste( self ): + def test_0150_upload_file( self ): + """Test uploading 1.bam, NOT setting the file format""" + self.check_history_for_string( 'Your history is empty' ) + history = sa_session.query( galaxy.model.History ) \ + .filter( and_( galaxy.model.History.table.c.deleted==False, + galaxy.model.History.table.c.user_id==admin_user.id ) ) \ + .order_by( desc( galaxy.model.History.table.c.create_time ) ) \ + .first() + self.upload_file( '1.bam' ) + hda = sa_session.query( galaxy.model.HistoryDatasetAssociation ) \ + .order_by( desc( galaxy.model.HistoryDatasetAssociation.table.c.create_time ) ) \ + .first() + assert hda is not None, "Problem retrieving hda from database" + self.verify_dataset_correctness( '1.bam', hid=str( hda.hid ) ) + self.check_history_for_string( '<span class="bam">bam</span>' ) + self.delete_history( id=self.security.encode_id( history.id ) ) + def test_0155_url_paste( self ): """Test url paste behavior""" # Logged in as admin_user # Deleting the current history should have created a new history diff -r f4e6ed22a39e -r 5e30e1ea6bbb tools/data_source/upload.xml --- a/tools/data_source/upload.xml Wed Dec 02 15:23:15 2009 -0500 +++ b/tools/data_source/upload.xml Wed Dec 02 20:02:05 2009 -0500 @@ -57,6 +57,12 @@ ----- +**Bam** + +A binary file compressed in the BGZF format with a '.bam' file extension. + +----- + **Binseq.zip** A zipped archive consisting of binary sequence files in either 'ab1' or 'scf' format. All files in this archive must have the same file extension which is one of '.ab1' or '.scf'. You must manually select this 'File Format' when uploading the file.
participants (1)
-
Greg Von Kuster