[hg] galaxy 3155: Fix for uploading a Bam file that has not yet ...
details:   http://www.bx.psu.edu/hg/galaxy/rev/8feff3bc14bc
changeset: 3155:8feff3bc14bc
user:      Greg Von Kuster <greg@bx.psu.edu>
date:      Mon Dec 07 16:04:33 2009 -0500
description:
Fix for uploading a Bam file that has not yet been sorted: the call to samtools sort has been moved from the sam_to_bam tool to the Bam().set_meta() method to ensure all Bam datasets are sorted prior to indexing. Added new functional tests to cover uploading unsorted Bam files. Also cleaned up code in the upload tool for uploading various binary data formats.

diffstat:

 lib/galaxy/datatypes/binary.py   |   92 ++++++++++++++++++----
 lib/galaxy/datatypes/sniff.py    |    3 +
 lib/galaxy/datatypes/test/3.bam  |
 test-data/3.bam                  |
 test/functional/test_get_data.py |   26 ++++++-
 tools/data_source/upload.py      |  138 +++++++++++++++------------------
 tools/samtools/sam_to_bam.py     |   21 +----
 tools/samtools/sam_to_bam.xml    |    4 -
 8 files changed, 164 insertions(+), 120 deletions(-)

diffs (440 lines):

diff -r 119315b57656 -r 8feff3bc14bc lib/galaxy/datatypes/binary.py
--- a/lib/galaxy/datatypes/binary.py	Mon Dec 07 15:57:06 2009 -0500
+++ b/lib/galaxy/datatypes/binary.py	Mon Dec 07 16:04:33 2009 -0500
@@ -12,7 +12,6 @@
 
 log = logging.getLogger(__name__)
 
-sniffable_binary_formats = [ 'sff', 'bam' ]
 # Currently these supported binary data types must be manually set on upload
 unsniffable_binary_formats = [ 'ab1', 'scf' ]
 
@@ -55,29 +54,84 @@
     def init_meta( self, dataset, copy_from=None ):
         Binary.init_meta( self, dataset, copy_from=copy_from )
     def set_meta( self, dataset, overwrite = True, **kwd ):
-        """ Sets index for BAM file. """
+        """ Ensures that the Bam file contents are sorted and creates the index for the BAM file. """
+        errors = False
         # These metadata values are not accessible by users, always overwrite
         index_file = dataset.metadata.bam_index
-        if not index_file:
+        if index_file:
+            # If an index file already exists on disk, then the data must have previously been sorted
+            # since samtools requires a sorted Bam file in order to create an index.
+            sorted = os.path.exists( index_file.file_name )
+        else:
             index_file = dataset.metadata.spec['bam_index'].param.new_file( dataset = dataset )
+            sorted = False
+        tmp_dir = tempfile.gettempdir()
         try:
-            # Using a symlink from ~/database/files/dataset_XX.dat, create a temporary file
-            # to store the indexex generated from samtools, something like ~/tmp/dataset_XX.dat.bai
-            tmp_dir = tempfile.gettempdir()
-            tmp_file_path = os.path.join( tmp_dir, os.path.basename( dataset.file_name ) )
-            # Here tmp_file_path looks something like /tmp/dataset_XX.dat
-            os.symlink( dataset.file_name, tmp_file_path )
-            command = 'samtools index %s' % tmp_file_path
-            proc = subprocess.Popen( args=command, shell=True )
-            proc.wait()
-        except:
-            err_msg = 'Error creating index file (%s) for BAM file (%s)' % ( str( tmp_file_path ), str( dataset.file_name ) )
+            # Create a symlink from the temporary directory to the dataset file so that samtools can mess with it.
+            tmp_dataset_file_name = os.path.join( tmp_dir, os.path.basename( dataset.file_name ) )
+            # Here tmp_dataset_file_name looks something like /tmp/dataset_XX.dat
+            os.symlink( dataset.file_name, tmp_dataset_file_name )
+        except Exception, e:
+            errors = True
+            err_msg = 'Error creating tmp symlink to file (%s). ' % str( dataset.file_name )
             log.exception( err_msg )
-            sys.stderr.write( err_msg )
-        # Move the temporary index file ~/tmp/dataset_XX.dat.bai to be ~/database/files/_metadata_files/dataset_XX.dat
-        shutil.move( '%s.bai' % ( tmp_file_path ), index_file.file_name )
-        os.unlink( tmp_file_path )
-        dataset.metadata.bam_index = index_file
+            sys.stderr.write( err_msg + str( e ) )
+        if not errors and not sorted:
+            try:
+                # Sort alignments by leftmost coordinates. File <out.prefix>.bam will be created.
+                # TODO: This command may also create temporary files <out.prefix>.%d.bam when the
+                # whole alignment cannot be fitted into memory ( controlled by option -m ).  We're
+                # not handling this case here.
+                tmp_sorted_dataset_file = tempfile.NamedTemporaryFile( prefix=tmp_dataset_file_name )
+                tmp_sorted_dataset_file_name = tmp_sorted_dataset_file.name
+                tmp_sorted_dataset_file.close()
+                command = "samtools sort %s %s 2>/dev/null" % ( tmp_dataset_file_name, tmp_sorted_dataset_file_name )
+                proc = subprocess.Popen( args=command, shell=True )
+                proc.wait()
+            except Exception, e:
+                errors = True
+                err_msg = 'Error sorting alignments from (%s). ' % tmp_dataset_file_name
+                log.exception( err_msg )
+                sys.stderr.write( err_msg + str( e ) )
+        if not errors:
+            if sorted:
+                try:
+                    # Create the Bam index
+                    command = 'samtools index %s' % tmp_dataset_file_name
+                    proc = subprocess.Popen( args=command, shell=True )
+                    proc.wait()
+                except Exception, e:
+                    errors = True
+                    err_msg = 'Error creating index for BAM file (%s)' % str( tmp_dataset_file_name )
+                    log.exception( err_msg )
+                    sys.stderr.write( err_msg + str( e ) )
+            else:
+                tmp_sorted_bam_file_name = '%s.bam' % tmp_sorted_dataset_file_name
+                try:
+                    # Create the Bam index
+                    command = 'samtools index %s' % tmp_sorted_bam_file_name
+                    proc = subprocess.Popen( args=command, shell=True )
+                    proc.wait()
+                except Exception, e:
+                    errors = True
+                    err_msg = 'Error creating index for BAM file (%s)' % str( tmp_sorted_dataset_file_name )
+                    log.exception( err_msg )
+                    sys.stderr.write( err_msg + str( e ) )
+        if not errors:
+            if sorted:
+                # Move the temporary index file ~/tmp/dataset_XX.dat.bai to our metadata file
+                # storage location ~/database/files/_metadata_files/dataset_XX.dat
+                shutil.move( '%s.bai' % ( tmp_dataset_file_name ), index_file.file_name )
+            else:
+                # Move tmp_sorted_bam_file_name to our output dataset location
+                shutil.move( tmp_sorted_bam_file_name, dataset.file_name )
+                # Move the temporary sorted index file ~/tmp/dataset_XX.dat.bai to our metadata file
+                # storage location ~/database/files/_metadata_files/dataset_XX.dat
+                shutil.move( '%s.bai' % ( tmp_sorted_bam_file_name ), index_file.file_name )
+            # Remove all remaining temporary files
+            os.unlink( tmp_dataset_file_name )
+        # Set the metadata
+        dataset.metadata.bam_index = index_file
     def sniff( self, filename ):
         # BAM is compressed in the BGZF format, and must not be uncompressed in Galaxy.
         # The first 4 bytes of any bam file is 'BAM\1', and the file is binary.
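
For readers following along, the new set_meta() logic boils down to: sort unless an existing index proves the data is already sorted, then index. A minimal standalone sketch of that flow, assuming a samtools 0.1.x binary on the PATH (the helper name and error handling here are illustrative, not Galaxy's actual code):

import subprocess
import tempfile

def sort_and_index_bam( bam_path ):
    # samtools 0.1.x 'sort' takes an output *prefix* and writes <prefix>.bam
    tmp = tempfile.NamedTemporaryFile()
    prefix = tmp.name
    tmp.close()  # as in the patch, keep only the unique name (small race here)
    subprocess.check_call( 'samtools sort %s %s' % ( bam_path, prefix ), shell=True )
    sorted_bam = '%s.bam' % prefix
    # 'samtools index <file>.bam' writes the index to <file>.bam.bai
    subprocess.check_call( 'samtools index %s' % sorted_bam, shell=True )
    return sorted_bam, '%s.bai' % sorted_bam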
diff -r 119315b57656 -r 8feff3bc14bc lib/galaxy/datatypes/sniff.py
--- a/lib/galaxy/datatypes/sniff.py	Mon Dec 07 15:57:06 2009 -0500
+++ b/lib/galaxy/datatypes/sniff.py	Mon Dec 07 16:04:33 2009 -0500
@@ -255,6 +255,9 @@
     >>> fname = get_test_fname('1.bam')
     >>> guess_ext(fname)
     'bam'
+    >>> fname = get_test_fname('3.bam')
+    >>> guess_ext(fname)
+    'bam'
     """
     if sniff_order is None:
         datatypes_registry = registry.Registry()

diff -r 119315b57656 -r 8feff3bc14bc lib/galaxy/datatypes/test/3.bam
Binary file lib/galaxy/datatypes/test/3.bam has changed

diff -r 119315b57656 -r 8feff3bc14bc test-data/3.bam
Binary file test-data/3.bam has changed
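
The new guess_ext() doctest passes without special-casing because Bam().sniff() only needs the magic bytes described in the binary.py hunk: BGZF is gzip-compatible framing, so the first four decompressed bytes of any BAM file are 'BAM\1'. Roughly, as a Python 2 sketch mirroring those quoted checks (not the exact Galaxy implementation):

import binascii
import gzip

def looks_like_bam( file_name ):
    try:
        header = gzip.open( file_name ).read( 4 )
    except IOError:
        return False  # not gzip/BGZF data at all
    return binascii.b2a_hex( header ) == binascii.hexlify( 'BAM\1' )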
diff -r 119315b57656 -r 8feff3bc14bc test/functional/test_get_data.py
--- a/test/functional/test_get_data.py	Mon Dec 07 15:57:06 2009 -0500
+++ b/test/functional/test_get_data.py	Mon Dec 07 16:04:33 2009 -0500
@@ -521,7 +521,7 @@
         self.check_metadata_for_string( 'value="1.axt" value="\?" Change data type selected value="axt" selected="yes"' )
         self.delete_history( id=self.security.encode_id( history.id ) )
     def test_0150_upload_file( self ):
-        """Test uploading 1.bam, NOT setting the file format"""
+        """Test uploading 1.bam, which is a sorted Bam file creaed by the Galaxy sam_to_bam tool, NOT setting the file format"""
         self.check_history_for_string( 'Your history is empty' )
         history = sa_session.query( galaxy.model.History ) \
             .filter( and_( galaxy.model.History.table.c.deleted==False,
@@ -535,8 +535,30 @@
         assert hda is not None, "Problem retrieving hda from database"
         self.verify_dataset_correctness( '1.bam', hid=str( hda.hid ) )
         self.check_history_for_string( '<span class="bam">bam</span>' )
+        # Make sure the Bam index was created
+        assert hda.metadata.bam_index is not None, "Bam index was not correctly created for 1.bam"
         self.delete_history( id=self.security.encode_id( history.id ) )
-    def test_0155_url_paste( self ):
+    def test_0155_upload_file( self ):
+        """Test uploading 3.bam, which is an unsorted Bam file, NOT setting the file format"""
+        self.check_history_for_string( 'Your history is empty' )
+        history = sa_session.query( galaxy.model.History ) \
+            .filter( and_( galaxy.model.History.table.c.deleted==False,
+                           galaxy.model.History.table.c.user_id==admin_user.id ) ) \
+            .order_by( desc( galaxy.model.History.table.c.create_time ) ) \
+            .first()
+        self.upload_file( '3.bam' )
+        hda = sa_session.query( galaxy.model.HistoryDatasetAssociation ) \
+            .order_by( desc( galaxy.model.HistoryDatasetAssociation.table.c.create_time ) ) \
+            .first()
+        assert hda is not None, "Problem retrieving hda from database"
+        # Since 3.bam is not sorted, we cannot verify dataset correctness since the uploaded
+        # dataset will be sorted.  However, the check below to see if the index was created is
+        # sufficient.
+        self.check_history_for_string( '<span class="bam">bam</span>' )
+        # Make sure the Bam index was created
+        assert hda.metadata.bam_index is not None, "Bam index was not correctly created for 3.bam"
+        self.delete_history( id=self.security.encode_id( history.id ) )
+    def test_0160_url_paste( self ):
         """Test url paste behavior"""
         # Logged in as admin_user
         # Deleting the current history should have created a new history
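
The changeset doesn't say how 3.bam was generated, but an unsorted Bam file like it can be produced by converting a SAM file whose records are not in coordinate order, since samtools view converts without sorting. A hypothetical recipe, with 'ref.fai' and 'unsorted.sam' as stand-in file names:

import subprocess

# Placeholder names; any SAM whose reads are out of coordinate order will do.
subprocess.check_call( 'samtools view -bt ref.fai -o 3.bam unsorted.sam', shell=True )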
diff -r 119315b57656 -r 8feff3bc14bc tools/data_source/upload.py
--- a/tools/data_source/upload.py	Mon Dec 07 15:57:06 2009 -0500
+++ b/tools/data_source/upload.py	Mon Dec 07 16:04:33 2009 -0500
@@ -9,7 +9,7 @@
 # need to import model before sniff to resolve a circular import dependency
 import galaxy.model
 from galaxy.datatypes import sniff
-from galaxy.datatypes.binary import sniffable_binary_formats, unsniffable_binary_formats
+from galaxy.datatypes.binary import *
 from galaxy import util
 from galaxy.util.json import *
 
@@ -61,62 +61,54 @@
     if chunk is None:
         temp.close()
     return False
-def check_binary( temp_name, chunk=None ):
-    if chunk is None:
+def check_binary( temp_name ):
+    is_binary = False
+    temp = open( temp_name, "U" )
+    chars_read = 0
+    for chars in temp:
+        for char in chars:
+            chars_read += 1
+            if ord( char ) > 128:
+                is_binary = True
+                break
+            if chars_read > 100:
+                break
+        if chars_read > 100:
+            break
+    temp.close()
+    return is_binary
+def check_bam( temp_name ):
+    return Bam().sniff( temp_name )
+def check_sff( temp_name ):
+    return Sff().sniff( temp_name )
+def check_gzip( temp_name ):
+    # This method returns a tuple of booleans representing ( is_gzipped, is_valid )
+    # Make sure we have a gzipped file
+    try:
         temp = open( temp_name, "U" )
-    else:
-        temp = chunk
-    lineno = 0
-    for line in temp:
-        lineno += 1
-        line = line.strip()
-        if line:
-            for char in line:
-                if ord( char ) > 128:
-                    if chunk is None:
-                        temp.close()
-                    return True
-        if lineno > 10:
-            break
-    if chunk is None:
+        magic_check = temp.read( 2 )
         temp.close()
-    return False
-def check_gzip( temp_name ):
-    # This is sort of hacky.  BAM is compressed in the BGZF format, and must
-    # not be uncompressed in upon upload ( it will be detected as gzipped ).
-    # The tuple we're returning from here contains boolean values for
-    # ( is_compressed, is_valid, is_bam ).
-    temp = open( temp_name, "U" )
-    magic_check = temp.read( 2 )
-    temp.close()
-    if magic_check != util.gzip_magic:
-        return ( False, False, False )
+        if magic_check != util.gzip_magic:
+            return ( False, False )
+    except:
+        return ( False, False )
+    # We support some binary data types, so check if the compressed binary file is valid
+    # If the file is Bam, it should already have been detected as such, so we'll just check
+    # for sff format.
+    try:
+        header = gzip.open( temp_name ).read(4)
+        if binascii.b2a_hex( header ) == binascii.hexlify( '.sff' ):
+            return ( True, True )
+    except:
+        return( False, False )
     CHUNK_SIZE = 2**15 # 32Kb
-    gzipped_file = gzip.GzipFile( temp_name )
+    gzipped_file = gzip.GzipFile( temp_name, mode='rb' )
     chunk = gzipped_file.read( CHUNK_SIZE )
     gzipped_file.close()
+    # See if we have a compressed HTML file
     if check_html( temp_name, chunk=chunk ):
-        return ( True, False, False )
-    if check_binary( temp_name, chunk=chunk ):
-        # We do support some binary data types, so check if the compressed binary file is valid
-        # We currently only check for [ 'sff', 'bam' ]
-        # TODO: this should be fixed to more easily support future-supported binary data types.
-        # This is currently just copied from the sniff methods.
-        # The first 4 bytes of any bam file is 'BAM\1', and the file is binary.
-        try:
-            header = gzip.open( temp_name ).read(4)
-            if binascii.b2a_hex( header ) == binascii.hexlify( 'BAM\1' ):
-                return ( True, True, True )
-        except:
-            pass
-        try:
-            header = gzip.open( temp_name ).read(4)
-            if binascii.b2a_hex( header ) == binascii.hexlify( '.sff' ):
-                return ( True, True, False )
-        except:
-            pass
-        return ( True, False, False )
-    return ( True, True, False )
 def check_zip( temp_name ):
     if not zipfile.is_zipfile( temp_name ):
         return ( False, False, None )
@@ -126,7 +118,7 @@
     # 2. All file extensions within an archive must be the same
     name = zip_file.namelist()[0]
     test_ext = name.split( "." )[1].strip().lower()
-    if not ( test_ext == 'scf' or test_ext == 'ab1' or test_ext == 'txt' ):
+    if not ( test_ext in unsniffable_binary_formats or test_ext == 'txt' ):
         return ( True, False, test_ext )
     for name in zip_file.namelist():
         ext = name.split( "." )[1].strip().lower()
@@ -163,21 +155,25 @@
         dataset.is_multi_byte = util.is_multi_byte( codecs.open( dataset.path, 'r', 'utf-8' ).read( 100 ) )
     except UnicodeDecodeError, e:
         dataset.is_multi_byte = False
+    # Is dataset content multi-byte?
     if dataset.is_multi_byte:
         data_type = 'multi-byte char'
         ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
+    # Is dataset content supported sniffable binary?
+    elif check_bam( dataset.path ):
+        ext = 'bam'
+        data_type = 'bam'
+    elif check_sff( dataset.path ):
+        ext = 'sff'
+        data_type = 'sff'
     else:
         # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
-        is_gzipped, is_valid, is_bam = check_gzip( dataset.path )
+        is_gzipped, is_valid = check_gzip( dataset.path )
         if is_gzipped and not is_valid:
             file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
             return
-        elif is_gzipped and is_valid and is_bam:
-            ext = 'bam'
-            data_type = 'bam'
-        elif is_gzipped and is_valid and not is_bam:
-            # We need to uncompress the temp_name file, but BAM files must remain compressed
-            # in order for samtools to function on them
+        elif is_gzipped and is_valid:
+            # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
             CHUNK_SIZE = 2**20 # 1Mb
             fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ), text=False )
             gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
@@ -207,7 +203,7 @@
         elif is_zipped and is_valid:
             # Currently, we force specific tools to handle this case.  We also require the user
             # to manually set the incoming file_type
-            if ( test_ext == 'ab1' or test_ext == 'scf' ) and dataset.file_type != 'binseq.zip':
+            if ( test_ext in unsniffable_binary_formats ) and dataset.file_type != 'binseq.zip':
                 file_err( "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'", dataset, json_file )
                 return
             elif test_ext == 'txt' and dataset.file_type != 'txtseq.zip':
@@ -220,35 +216,25 @@
             ext = dataset.file_type
     if not data_type:
         if check_binary( dataset.path ):
+            # We have a binary dataset, but it is not Bam or Sff
             data_type = 'binary'
-            binary_ok = False
+            #binary_ok = False
             parts = dataset.name.split( "." )
             if len( parts ) > 1:
                 ext = parts[1].strip().lower()
-                if ext in unsniffable_binary_formats and dataset.file_type == ext:
-                    binary_ok = True
+                if ext not in unsniffable_binary_formats:
+                    file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
+                    return
                 elif ext in unsniffable_binary_formats and dataset.file_type != ext:
                     err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
                     file_err( err_msg, dataset, json_file )
                     return
-                if not binary_ok and ext in sniffable_binary_formats:
-                    # Sniff the file to confirm it's data type
-                    tmp_ext = sniff.guess_ext( dataset.path )
-                    if tmp_ext == ext:
-                        binary_ok = True
-                    else:
-                        err_msg = "The content of the file does not match its type (%s)." % ext.capitalize()
-                        file_err( err_msg, dataset, json_file )
-                        return
-                if not binary_ok:
-                    file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
-                    return
     if not data_type:
         # We must have a text file
         if check_html( dataset.path ):
            file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
            return
-    if data_type != 'bam' and data_type != 'binary' and data_type != 'zip':
+    if data_type != 'binary' and data_type != 'zip':
         if dataset.space_to_tab:
             line_count = sniff.convert_newlines_sep2tabs( dataset.path )
         else:
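
Note how the old three-boolean ( is_compressed, is_valid, is_bam ) contract collapses to ( is_gzipped, is_valid ): Bam is now sniffed by check_bam() before check_gzip() is ever called, so only the compressed-sff and compressed-HTML cases remain. A toy Python 2 version of the new contract (the HTML test below is a crude stand-in for check_html(), which really uses regexes):

import gzip

GZIP_MAGIC = '\x1f\x8b'  # the standard two-byte gzip magic number

def classify_gzip( file_name ):
    # Returns ( is_gzipped, is_valid ), like the new check_gzip()
    if open( file_name, 'rb' ).read( 2 ) != GZIP_MAGIC:
        return ( False, False )
    try:
        if gzip.open( file_name ).read( 4 ) == '.sff':
            return ( True, True )  # compressed sff is valid as-is
        chunk = gzip.open( file_name ).read( 2**15 )
    except IOError:
        return ( False, False )
    if '<html' in chunk.lower():
        return ( True, False )  # don't accept compressed HTML
    return ( True, True )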
diff -r 119315b57656 -r 8feff3bc14bc tools/samtools/sam_to_bam.py
--- a/tools/samtools/sam_to_bam.py	Mon Dec 07 15:57:06 2009 -0500
+++ b/tools/samtools/sam_to_bam.py	Mon Dec 07 16:04:33 2009 -0500
@@ -79,35 +79,18 @@
         tmp_aligns_file = tempfile.NamedTemporaryFile()
         tmp_aligns_file_name = tmp_aligns_file.name
         tmp_aligns_file.close()
-        # IMPORTANT NOTE: for some reason the samtools view command gzips the resulting bam file without warning,
-        # and the docs do not currently state that this occurs ( very bad ).
         command = "samtools view -bt %s -o %s %s 2>/dev/null" % ( fai_index_file_path, tmp_aligns_file_name, options.input1 )
         proc = subprocess.Popen( args=command, shell=True )
         proc.wait()
+        shutil.move( tmp_aligns_file_name, options.output1 )
     except Exception, e:
         stop_err( 'Error extracting alignments from (%s), %s' % ( options.input1, str( e ) ) )
-    try:
-        # Sort alignments by leftmost coordinates. File <out.prefix>.bam will be created. This command
-        # may also create temporary files <out.prefix>.%d.bam when the whole alignment cannot be fitted
-        # into memory ( controlled by option -m ).
-        tmp_sorted_aligns_file = tempfile.NamedTemporaryFile()
-        tmp_sorted_aligns_file_name = tmp_sorted_aligns_file.name
-        tmp_sorted_aligns_file.close()
-        command = "samtools sort %s %s 2>/dev/null" % ( tmp_aligns_file_name, tmp_sorted_aligns_file_name )
-        proc = subprocess.Popen( args=command, shell=True )
-        proc.wait()
-    except Exception, e:
-        stop_err( 'Error sorting alignments from (%s), %s' % ( tmp_aligns_file_name, str( e ) ) )
-    # Move tmp_aligns_file_name to our output dataset location
-    sorted_bam_file = '%s.bam' % tmp_sorted_aligns_file_name
-    shutil.move( sorted_bam_file, options.output1 )
+    # NOTE: samtools requires the Bam file to be sorted, but this occurs in Bam().set_meta() to ensure that uploaded Bam files are sorted as well.
     if options.ref_file != "None":
         # Remove the symlink from /tmp/dataset_13.dat to ~/database/files/000/dataset_13.dat
         os.unlink( fai_index_file_path )
         # Remove the index file
         index_file_name = '%s.fai' % fai_index_file_path
         os.unlink( index_file_name )
-    # Remove the tmp_aligns_file_name
-    os.unlink( tmp_aligns_file_name )
 
 if __name__=="__main__": __main__()
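
With sorting and indexing removed, the tool is reduced to a single conversion step; everything else now happens when Galaxy sets metadata on the new Bam dataset via Bam().set_meta(). In outline (a sketch of the slimmed-down flow, not the tool's actual code; option parsing and the fai index setup are omitted):

import shutil
import subprocess
import tempfile

def sam_to_bam( sam_path, fai_index_file_path, output_path ):
    tmp = tempfile.NamedTemporaryFile()
    tmp_name = tmp.name
    tmp.close()
    # Convert SAM to BAM; no sort, no index
    command = 'samtools view -bt %s -o %s %s' % ( fai_index_file_path, tmp_name, sam_path )
    subprocess.check_call( command, shell=True )
    shutil.move( tmp_name, output_path )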
diff -r 119315b57656 -r 8feff3bc14bc tools/samtools/sam_to_bam.xml
--- a/tools/samtools/sam_to_bam.xml	Mon Dec 07 15:57:06 2009 -0500
+++ b/tools/samtools/sam_to_bam.xml	Mon Dec 07 16:04:33 2009 -0500
@@ -31,10 +31,6 @@
         <data name="output1" format="bam"/>
     </outputs>
     <tests>
-        <!--
-            # IMPORTANT NOTE: for some reason the samtools view command gzips the resulting bam file without warning,
-            # and the docs do not currently state that this occurs ( very bad ).
-        -->
         <test>
             <param name="index_source" value="history" />
             <param name="input1" value="3.sam" ftype="sam" />