commit/galaxy-central: greg: Improved checking to determine if a bam file is sorted. If the version of samtools being used is 0.1.13 or newer, determination of bam sorting is now much more robust. Add a new functional test to the test_library_features script that tests uploading a library_dataset (where the file is an unsorted bam file) using a combination of 'upload_paths' and 'link_to_files'. Rename the test-data file previously named 3.bam to now be named 3unsorted.bam to clarify that it is an unsorted bam file.
1 new changeset in galaxy-central: http://bitbucket.org/galaxy/galaxy-central/changeset/4acde9321b63/ changeset: r5256:4acde9321b63 user: greg date: 2011-03-23 17:18:57 summary: Improved checking to determine if a bam file is sorted. If the version of samtools being used is 0.1.13 or newer, determination of bam sorting is now much more robust. Add a new functional test to the test_library_features script that tests uploading a library_dataset (where the file is an unsorted bam file) using a combination of 'upload_paths' and 'link_to_files'. Rename the test-data file previously named 3.bam to now be named 3unsorted.bam to clarify that it is an unsorted bam file. affected #: 10 files (5.6 KB) --- a/lib/galaxy/datatypes/binary.py Wed Mar 23 11:46:10 2011 -0400 +++ b/lib/galaxy/datatypes/binary.py Wed Mar 23 12:18:57 2011 -0400 @@ -51,16 +51,72 @@ """Class describing a BAM binary file""" file_ext = "bam" MetadataElement( name="bam_index", desc="BAM Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True ) - - def _is_coordinate_sorted(self, filename): - """Check if the input BAM file is sorted from the header information. - """ - params = ["samtools", "view", "-H", filename] - output = subprocess.Popen(params, stderr=subprocess.PIPE, stdout=subprocess.PIPE).communicate()[0] + + def _get_samtools_version( self ): + # Determine the version of samtools being used. Wouldn't it be nice if + # samtools provided a version flag to make this much simpler? 
+ version = '0.0.0' + output = subprocess.Popen( [ 'samtools' ], stderr=subprocess.PIPE, stdout=subprocess.PIPE ).communicate()[1] + lines = output.split( '\n' ) + for line in lines: + if line.lower().startswith( 'version' ): + # Assuming line looks something like: version: 0.1.12a (r862) + version = line.split()[1] + break + return version + def _is_coordinate_sorted( self, file_name ): + """See if the input BAM file is sorted from the header information.""" + params = [ "samtools", "view", "-H", file_name ] + output = subprocess.Popen( params, stderr=subprocess.PIPE, stdout=subprocess.PIPE ).communicate()[0] # find returns -1 if string is not found - return output.find("SO:coordinate") != -1 or output.find("SO:sorted") != -1 + return output.find( "SO:coordinate" ) != -1 or output.find( "SO:sorted" ) != -1 def dataset_content_needs_grooming( self, file_name ): - return not self._is_coordinate_sorted( file_name ) + """See if file_name is a sorted BAM file""" + version = self._get_samtools_version() + if version < '0.1.13': + return not self._is_coordinate_sorted( file_name ) + else: + # Samtools version 0.1.13 or newer produces an error condition when attempting to index an + # unsorted bam file - see http://biostar.stackexchange.com/questions/5273/is-my-bam-file-sorted. + # So when using a newer version of samtools, we'll first check if the input BAM file is sorted + # from the header information. If the header is present and sorted, we do nothing by returning False. + # If it's present and unsorted or if it's missing, we'll index the bam file to see if it produces the + # error. If it does, sorting is needed so we return True (otherwise False). + # + # TODO: we're creating an index file here and throwing it away. We then create it again when + # the set_meta() method below is called later in the job process. We need to enhance this overall + # process so we don't create an index twice. 
In order to make it worth the time to implement the + # upload tool / framework to allow setting metadata from directly within the tool itself, it should be + # done generically so that all tools will have the ability. In testing, a 6.6 gb BAM file took 128 + # seconds to index with samtools, and 45 minutes to sort, so indexing is relatively inexpensive. + if self._is_coordinate_sorted( file_name ): + return False + index_name = tempfile.NamedTemporaryFile( prefix = "bam_index" ).name + stderr_name = tempfile.NamedTemporaryFile( prefix = "bam_index_stderr" ).name + command = 'samtools index %s %s' % ( file_name, index_name ) + proc = subprocess.Popen( args=command, shell=True, stderr=open( stderr_name, 'wb' ) ) + exit_code = proc.wait() + stderr = open( stderr_name ).read().strip() + if stderr: + try: + os.unlink( index_name ) + except OSError: + pass + try: + os.unlink( stderr_name ) + except OSError: + pass + # Return True if unsorted error condition is found (find returns -1 if string is not found). + return stderr.find( "[bam_index_core] the alignment is not sorted" ) != -1 + try: + os.unlink( index_name ) + except OSError: + pass + try: + os.unlink( stderr_name ) + except OSError: + pass + return False def groom_dataset_content( self, file_name ): """ Ensures that the Bam file contents are sorted. 
This function is called @@ -104,7 +160,6 @@ index_file = dataset.metadata.bam_index if not index_file: index_file = dataset.metadata.spec['bam_index'].param.new_file( dataset = dataset ) - # Create the Bam index ##$ samtools index ##Usage: samtools index <in.bam> [<out.index>] --- a/lib/galaxy/datatypes/sniff.py Wed Mar 23 11:46:10 2011 -0400 +++ b/lib/galaxy/datatypes/sniff.py Wed Mar 23 12:18:57 2011 -0400 @@ -267,7 +267,7 @@ >>> fname = get_test_fname('1.bam') >>> guess_ext(fname) 'bam' - >>> fname = get_test_fname('3.bam') + >>> fname = get_test_fname('3unsorted.bam') >>> guess_ext(fname) 'bam' """ --- a/scripts/functional_tests.py Wed Mar 23 11:46:10 2011 -0400 +++ b/scripts/functional_tests.py Wed Mar 23 12:18:57 2011 -0400 @@ -93,6 +93,7 @@ nginx_x_accel_redirect_base = '/_x_accel_redirect', nginx_upload_store = nginx_upload_store, nginx_upload_path = '/_upload', + allow_library_path_paste = 'True', cluster_files_directory = cluster_files_directory, job_working_directory = job_working_directory, outputs_to_working_directory = 'True', @@ -102,7 +103,7 @@ track_jobs_in_database = 'True', job_scheduler_policy = 'FIFO', start_job_runners = 'pbs', - default_cluster_job_runner = default_cluster_job_runner, ) + default_cluster_job_runner = default_cluster_job_runner ) psu_production = True else: if 'GALAXY_TEST_DBPATH' in os.environ: @@ -161,6 +162,7 @@ allow_user_creation = True, allow_user_deletion = True, admin_users = 'test@bx.psu.edu', + allow_library_path_paste = True, library_import_dir = galaxy_test_file_dir, user_library_import_dir = os.path.join( galaxy_test_file_dir, 'users' ), global_conf = global_conf, Binary file test-data/3.bam has changed --- a/test/base/twilltestcase.py Wed Mar 23 11:46:10 2011 -0400 +++ b/test/base/twilltestcase.py Wed Mar 23 12:18:57 2011 -0400 @@ -2007,9 +2007,9 @@ # Library dataset stuff def upload_library_dataset( self, cntrller, library_id, folder_id, filename='', server_dir='', replace_id='', upload_option='upload_file', 
file_type='auto', dbkey='hg18', space_to_tab='', - link_data_only='copy_files', preserve_dirs='Yes', roles=[], ldda_message='', hda_ids='', - template_refresh_field_name='1_field_name', template_refresh_field_contents='', template_fields=[], - show_deleted='False', strings_displayed=[] ): + link_data_only='copy_files', preserve_dirs='Yes', filesystem_paths='', roles=[], + ldda_message='', hda_ids='', template_refresh_field_name='1_field_name', + template_refresh_field_contents='', template_fields=[], show_deleted='False', strings_displayed=[] ): """Add datasets to library using any upload_option""" # NOTE: due to the library_wait() method call at the end of this method, no tests should be done # for strings_displayed_after_submit. @@ -2050,10 +2050,10 @@ tc.fv( "add_history_datasets_to_library", "hda_ids", '1' ) tc.submit( 'add_history_datasets_to_library_button' ) else: - if upload_option == 'filesystem_paths' or upload_option == 'upload_directory': + if upload_option in [ 'upload_paths', 'upload_directory' ]: tc.fv( "1", "link_data_only", link_data_only ) - if upload_option == 'filesystem_paths' and preserve_dirs == 'Yes': - tc.fv( "1", "preserve_dirs", preserve_dirs ) + if upload_option == 'upload_paths': + tc.fv( "1", "filesystem_paths", filesystem_paths ) if upload_option == 'upload_directory' and server_dir: tc.fv( "1", "server_dir", server_dir ) if upload_option == 'upload_file': @@ -2085,6 +2085,19 @@ for check_str in strings_displayed: self.check_page_for_string( check_str ) self.home() + def ldda_info( self, cntrller, library_id, folder_id, ldda_id, strings_displayed=[], strings_not_displayed=[] ): + """View library_dataset_dataset_association information""" + self.visit_url( "%s/library_common/ldda_info?cntrller=%s&library_id=%s&folder_id=%s&id=%s" % \ + ( self.url, cntrller, library_id, folder_id, ldda_id ) ) + for check_str in strings_displayed: + self.check_page_for_string( check_str ) + for check_str in strings_not_displayed: + try: + 
self.check_page_for_string( check_str ) + raise AssertionError, "String (%s) should not have been displayed on ldda info page." % check_str + except: + pass + self.home() def ldda_edit_info( self, cntrller, library_id, folder_id, ldda_id, ldda_name, new_ldda_name='', template_refresh_field_name='1_field_name', template_refresh_field_contents='', template_fields=[], strings_displayed=[], strings_not_displayed=[] ): """Edit library_dataset_dataset_association information, optionally template element information""" --- a/test/functional/test_get_data.py Wed Mar 23 11:46:10 2011 -0400 +++ b/test/functional/test_get_data.py Wed Mar 23 12:18:57 2011 -0400 @@ -395,18 +395,18 @@ assert hda.metadata.bam_index is not None, "Bam index was not correctly created for 1.bam" self.delete_history( id=self.security.encode_id( history.id ) ) def test_0155_upload_file( self ): - """Test uploading 3.bam, which is an unsorted Bam file, NOT setting the file format""" + """Test uploading 3unsorted.bam, which is an unsorted Bam file, NOT setting the file format""" self.check_history_for_string( 'Your history is empty' ) history = get_latest_history_for_user( admin_user ) - self.upload_file( '3.bam' ) + self.upload_file( '3unsorted.bam' ) hda = get_latest_hda() assert hda is not None, "Problem retrieving hda from database" - # Since 3.bam is not sorted, we cannot verify dataset correctness since the uploaded + # Since 3unsorted.bam is not sorted, we cannot verify dataset correctness since the uploaded # dataset will be sorted. However, the check below to see if the index was created is # sufficient. 
self.check_history_for_string( '<span class="bam">bam</span>' ) # Make sure the Bam index was created - assert hda.metadata.bam_index is not None, "Bam index was not correctly created for 3.bam" + assert hda.metadata.bam_index is not None, "Bam index was not correctly created for 3unsorted.bam" self.delete_history( id=self.security.encode_id( history.id ) ) def test_0160_url_paste( self ): """Test url paste behavior""" --- a/test/functional/test_library_features.py Wed Mar 23 11:46:10 2011 -0400 +++ b/test/functional/test_library_features.py Wed Mar 23 12:18:57 2011 -0400 @@ -534,6 +534,25 @@ self.browse_library( cntrller='library_admin', library_id=self.security.encode_id( library3.id ), strings_displayed=[ folder5.name, folder5.description, ldda7.name ] ) + def test_135_upload_unsorted_bam_to_library_using_file_path_with_link_to_file( self ): + """Test uploading 3unsorted.bam, using filesystem_paths option in combination with link_to_files""" + filename = '3unsorted.bam' + self.upload_library_dataset( cntrller='library_admin', + library_id=self.security.encode_id( library2.id ), + folder_id=self.security.encode_id( library2.root_folder.id ), + upload_option='upload_paths', + link_data_only='link_to_files', + filesystem_paths='test-data/3unsorted.bam' ) + global ldda8 + ldda8 = get_latest_ldda_by_name( filename ) + assert ldda8 is not None, 'Problem retrieving LibraryDatasetDatasetAssociation ldda8 from the database' + # The upload above should produce an error condition in the uploaded library dataset since + # the uploaded bam file is not sorted, and we are linking to the file. 
+ self.ldda_info( cntrller='library_admin', + library_id=self.security.encode_id( library2.id ), + folder_id=self.security.encode_id( library2.root_folder.id ), + ldda_id=self.security.encode_id( ldda8.id ), + strings_displayed=[ 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be' ] ) def test_999_reset_data_for_later_test_runs( self ): """Reseting data to enable later test runs to pass""" # Logged in as admin_user --- a/tools/samtools/bam_to_sam.xml Wed Mar 23 11:46:10 2011 -0400 +++ b/tools/samtools/bam_to_sam.xml Wed Mar 23 12:18:57 2011 -0400 @@ -18,7 +18,7 @@ <test><!-- Bam-to-Sam command: - samtools view -o bam_to_sam_out1.sam test-data/3.bam + samtools view -o bam_to_sam_out1.sam test-data/3unsorted.bam --><param name="input1" value="1.bam" ftype="bam" /><output name="output1" file="bam_to_sam_out1.sam" sorted="True" /> @@ -28,7 +28,7 @@ Bam-to-Sam command: samtools view -o bam_to_sam_out2.sam test-data/1.bam --> - <param name="input1" value="3.bam" ftype="bam" /> + <param name="input1" value="3unsorted.bam" ftype="bam" /><output name="output1" file="bam_to_sam_out2.sam" sorted="True" /></test></tests> --- a/tools/samtools/samtools_flagstat.xml Wed Mar 23 11:46:10 2011 -0400 +++ b/tools/samtools/samtools_flagstat.xml Wed Mar 23 12:18:57 2011 -0400 @@ -13,7 +13,7 @@ </outputs><tests><test> - <param name="input1" value="3.bam" ftype="bam" /> + <param name="input1" value="3unsorted.bam" ftype="bam" /><output name="output1" file="samtools_flagstat_out1.txt" /></test></tests> Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
Bitbucket