commit/galaxy-central: 2 new changesets
2 new commits in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/becbbe264e9d/ Changeset: becbbe264e9d User: roalva1 Date: 2015-02-25 22:25:32+00:00 Summary: Restore function necessary for splitting. Affected #: 1 file diff -r df564c852b3c1b462d095d2a0249ce5b84e1cd9f -r becbbe264e9d2780949dd5b2df27809db02c7fb7 lib/galaxy/datatypes/sequence.py --- a/lib/galaxy/datatypes/sequence.py +++ b/lib/galaxy/datatypes/sequence.py @@ -203,7 +203,26 @@ return None raise NotImplementedError("Can't split generic sequence files") + def get_split_commands_sequential(is_compressed, input_name, output_name, start_sequence, sequence_count): + """ + Does a brain-dead sequential scan & extract of certain sequences + >>> Sequence.get_split_commands_sequential(True, './input.gz', './output.gz', start_sequence=0, sequence_count=10) + ['zcat "./input.gz" | ( tail -n +1 2> /dev/null) | head -40 | gzip -c > "./output.gz"'] + >>> Sequence.get_split_commands_sequential(False, './input.fastq', './output.fastq', start_sequence=10, sequence_count=10) + ['tail -n +41 "./input.fastq" 2> /dev/null | head -40 > "./output.fastq"'] + """ + start_line = start_sequence * 4 + line_count = sequence_count * 4 + # TODO: verify that tail can handle 64-bit numbers + if is_compressed: + cmd = 'zcat "%s" | ( tail -n +%s 2> /dev/null) | head -%s | gzip -c' % (input_name, start_line+1, line_count) + else: + cmd = 'tail -n +%s "%s" 2> /dev/null | head -%s' % (start_line+1, input_name, line_count) + cmd += ' > "%s"' % output_name + return [cmd] + get_split_commands_sequential = staticmethod(get_split_commands_sequential) + class Alignment( data.Text ): """Class describing an alignment""" https://bitbucket.org/galaxy/galaxy-central/commits/8b70692766ec/ Changeset: 8b70692766ec User: roalva1 Date: 2015-02-26 15:00:12+00:00 Summary: added get_split_commands_with_toc method Affected #: 1 file diff -r becbbe264e9d2780949dd5b2df27809db02c7fb7 -r 8b70692766ec172eb4ac141a5d7266b88526f852 
lib/galaxy/datatypes/sequence.py --- a/lib/galaxy/datatypes/sequence.py +++ b/lib/galaxy/datatypes/sequence.py @@ -203,6 +203,75 @@ return None raise NotImplementedError("Can't split generic sequence files") + def get_split_commands_with_toc(input_name, output_name, toc_file, start_sequence, sequence_count): + """ + Uses a Table of Contents dict, parsed from an FQTOC file, to come up with a set of + shell commands that will extract the parts necessary + >>> three_sections=[dict(start=0, end=74, sequences=10), dict(start=74, end=148, sequences=10), dict(start=148, end=148+76, sequences=10)] + >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=0, sequence_count=10) + ['dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null >> ./output.gz'] + >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=1, sequence_count=5) + ['(dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +5 2> /dev/null) | head -20 | gzip -c >> ./output.gz'] + >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=0, sequence_count=20) + ['dd bs=1 skip=0 count=148 if=./input.gz 2> /dev/null >> ./output.gz'] + >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=5, sequence_count=10) + ['(dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +21 2> /dev/null) | head -20 | gzip -c >> ./output.gz', '(dd bs=1 skip=74 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +1 2> /dev/null) | head -20 | gzip -c >> ./output.gz'] + >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=10, sequence_count=10) + ['dd bs=1 skip=74 count=74 if=./input.gz 2> /dev/null >> ./output.gz'] + >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', 
dict(sections=three_sections), start_sequence=5, sequence_count=20) + ['(dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +21 2> /dev/null) | head -20 | gzip -c >> ./output.gz', 'dd bs=1 skip=74 count=74 if=./input.gz 2> /dev/null >> ./output.gz', '(dd bs=1 skip=148 count=76 if=./input.gz 2> /dev/null )| zcat | ( tail -n +1 2> /dev/null) | head -20 | gzip -c >> ./output.gz'] + """ + sections = toc_file['sections'] + result = [] + + current_sequence = long(0) + i=0 + # skip to the section that contains my starting sequence + while i < len(sections) and start_sequence >= current_sequence + long(sections[i]['sequences']): + current_sequence += long(sections[i]['sequences']) + i += 1 + if i == len(sections): # bad input data! + raise Exception('No FQTOC section contains starting sequence %s' % start_sequence) + + # These two variables act as an accumulator for consecutive entire blocks that + # can be copied verbatim (without decompressing) + start_chunk = long(-1) + end_chunk = long(-1) + copy_chunk_cmd = 'dd bs=1 skip=%s count=%s if=%s 2> /dev/null >> %s' + + while sequence_count > 0 and i < len(sections): + # we need to extract partial data. 
So, find the byte offsets of the chunks that contain the data we need + # use a combination of dd (to pull just the right sections out) tail (to skip lines) and head (to get the + # right number of lines + sequences = long(sections[i]['sequences']) + skip_sequences = start_sequence-current_sequence + sequences_to_extract = min(sequence_count, sequences-skip_sequences) + start_copy = long(sections[i]['start']) + end_copy = long(sections[i]['end']) + if sequences_to_extract < sequences: + if start_chunk > -1: + result.append(copy_chunk_cmd % (start_chunk, end_chunk-start_chunk, input_name, output_name)) + start_chunk = -1 + # extract, unzip, trim, recompress + result.append('(dd bs=1 skip=%s count=%s if=%s 2> /dev/null )| zcat | ( tail -n +%s 2> /dev/null) | head -%s | gzip -c >> %s' % + (start_copy, end_copy-start_copy, input_name, skip_sequences*4+1, sequences_to_extract*4, output_name)) + else: # whole section - add it to the start_chunk/end_chunk accumulator + if start_chunk == -1: + start_chunk = start_copy + end_chunk = end_copy + sequence_count -= sequences_to_extract + start_sequence += sequences_to_extract + current_sequence += sequences + i += 1 + if start_chunk > -1: + result.append(copy_chunk_cmd % (start_chunk, end_chunk-start_chunk, input_name, output_name)) + + if sequence_count > 0: + raise Exception('%s sequences not found in file' % sequence_count) + + return result + get_split_commands_with_toc = staticmethod(get_split_commands_with_toc) + def get_split_commands_sequential(is_compressed, input_name, output_name, start_sequence, sequence_count): """ Does a brain-dead sequential scan & extract of certain sequences @@ -222,7 +291,7 @@ return [cmd] get_split_commands_sequential = staticmethod(get_split_commands_sequential) - + class Alignment( data.Text ): """Class describing an alignment""" Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. 
You are receiving this email because the commit notification service is enabled with your address as the recipient.
participants (1)
-
commits-noreply@bitbucket.org