commit/galaxy-central: 2 new changesets

27 Feb 2015

2 new commits in galaxy-central:

https://bitbucket.org/galaxy/galaxy-central/commits/becbbe264e9d/
Changeset:   becbbe264e9d
User:        roalva1
Date:        2015-02-25 22:25:32+00:00
Summary:     Restore function necessary for splitting.
Affected #:  1 file

diff -r df564c852b3c1b462d095d2a0249ce5b84e1cd9f -r becbbe264e9d2780949dd5b2df27809db02c7fb7 lib/galaxy/datatypes/sequence.py

--- a/lib/galaxy/datatypes/sequence.py
+++ b/lib/galaxy/datatypes/sequence.py
@@ -203,7 +203,26 @@
             return None
         raise NotImplementedError("Can't split generic sequence files")
 
+    def get_split_commands_sequential(is_compressed, input_name, output_name, start_sequence, sequence_count):
+        """
+        Does a brain-dead sequential scan & extract of certain sequences
+        >>> Sequence.get_split_commands_sequential(True, './input.gz', './output.gz', start_sequence=0, sequence_count=10)
+        ['zcat "./input.gz" | ( tail -n +1 2> /dev/null) | head -40 | gzip -c > "./output.gz"']
+        >>> Sequence.get_split_commands_sequential(False, './input.fastq', './output.fastq', start_sequence=10, sequence_count=10)
+        ['tail -n +41 "./input.fastq" 2> /dev/null | head -40 > "./output.fastq"']
+        """
+        start_line = start_sequence * 4
+        line_count = sequence_count * 4
+        # TODO: verify that tail can handle 64-bit numbers
+        if is_compressed:
+            cmd = 'zcat "%s" | ( tail -n +%s 2> /dev/null) | head -%s | gzip -c' % (input_name, start_line+1, line_count)
+        else:
+            cmd = 'tail -n +%s "%s" 2> /dev/null | head -%s'  % (start_line+1, input_name, line_count)
+        cmd += ' > "%s"' % output_name
 
+        return [cmd]
+    get_split_commands_sequential = staticmethod(get_split_commands_sequential)
+    
 class Alignment( data.Text ):
     """Class describing an alignment"""
 


https://bitbucket.org/galaxy/galaxy-central/commits/8b70692766ec/
Changeset:   8b70692766ec
User:        roalva1
Date:        2015-02-26 15:00:12+00:00
Summary:     added get_split_commands_with_toc method
Affected #:  1 file

diff -r becbbe264e9d2780949dd5b2df27809db02c7fb7 -r 8b70692766ec172eb4ac141a5d7266b88526f852 lib/galaxy/datatypes/sequence.py
--- a/lib/galaxy/datatypes/sequence.py
+++ b/lib/galaxy/datatypes/sequence.py
@@ -203,6 +203,75 @@
             return None
         raise NotImplementedError("Can't split generic sequence files")
 
+    def get_split_commands_with_toc(input_name, output_name, toc_file, start_sequence, sequence_count):
+        """
+        Uses a Table of Contents dict, parsed from an FQTOC file, to come up with a set of
+        shell commands that will extract the parts necessary
+        >>> three_sections=[dict(start=0, end=74, sequences=10), dict(start=74, end=148, sequences=10), dict(start=148, end=148+76, sequences=10)]
+        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=0, sequence_count=10)
+        ['dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null >> ./output.gz']
+        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=1, sequence_count=5)
+        ['(dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +5 2> /dev/null) | head -20 | gzip -c >> ./output.gz']
+        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=0, sequence_count=20)
+        ['dd bs=1 skip=0 count=148 if=./input.gz 2> /dev/null >> ./output.gz']
+        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=5, sequence_count=10)
+        ['(dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +21 2> /dev/null) | head -20 | gzip -c >> ./output.gz', '(dd bs=1 skip=74 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +1 2> /dev/null) | head -20 | gzip -c >> ./output.gz']
+        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=10, sequence_count=10)
+        ['dd bs=1 skip=74 count=74 if=./input.gz 2> /dev/null >> ./output.gz']
+        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=5, sequence_count=20)
+        ['(dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +21 2> /dev/null) | head -20 | gzip -c >> ./output.gz', 'dd bs=1 skip=74 count=74 if=./input.gz 2> /dev/null >> ./output.gz', '(dd bs=1 skip=148 count=76 if=./input.gz 2> /dev/null )| zcat | ( tail -n +1 2> /dev/null) | head -20 | gzip -c >> ./output.gz']
+        """
+        sections = toc_file['sections']
+        result = []
+
+        current_sequence = long(0)
+        i=0
+        # skip to the section that contains my starting sequence
+        while i < len(sections) and start_sequence >= current_sequence + long(sections[i]['sequences']):
+            current_sequence += long(sections[i]['sequences'])
+            i += 1
+        if i == len(sections): # bad input data!
+            raise Exception('No FQTOC section contains starting sequence %s' % start_sequence)
+
+        # These two variables act as an accumulator for consecutive entire blocks that
+        # can be copied verbatim (without decompressing)
+        start_chunk = long(-1)
+        end_chunk = long(-1)
+        copy_chunk_cmd = 'dd bs=1 skip=%s count=%s if=%s 2> /dev/null >> %s'
+
+        while sequence_count > 0 and i < len(sections):
+            # we need to extract partial data. So, find the byte offsets of the chunks that contain the data we need
+            # use a combination of dd (to pull just the right sections out) tail (to skip lines) and head (to get the
+            # right number of lines
+            sequences = long(sections[i]['sequences'])
+            skip_sequences = start_sequence-current_sequence
+            sequences_to_extract = min(sequence_count, sequences-skip_sequences)
+            start_copy = long(sections[i]['start'])
+            end_copy = long(sections[i]['end'])
+            if sequences_to_extract < sequences:
+                if start_chunk > -1:
+                    result.append(copy_chunk_cmd % (start_chunk, end_chunk-start_chunk, input_name, output_name))
+                    start_chunk = -1
+                # extract, unzip, trim, recompress
+                result.append('(dd bs=1 skip=%s count=%s if=%s 2> /dev/null )| zcat | ( tail -n +%s 2> /dev/null) | head -%s | gzip -c >> %s' %
+                              (start_copy, end_copy-start_copy, input_name, skip_sequences*4+1, sequences_to_extract*4, output_name))
+            else: # whole section - add it to the start_chunk/end_chunk accumulator
+                if start_chunk == -1:
+                    start_chunk = start_copy
+                end_chunk = end_copy
+            sequence_count -= sequences_to_extract
+            start_sequence += sequences_to_extract
+            current_sequence += sequences
+            i += 1
+        if start_chunk > -1:
+            result.append(copy_chunk_cmd % (start_chunk, end_chunk-start_chunk, input_name, output_name))
+
+        if sequence_count > 0:
+            raise Exception('%s sequences not found in file' % sequence_count)
+
+        return result
+    get_split_commands_with_toc = staticmethod(get_split_commands_with_toc)
+
     def get_split_commands_sequential(is_compressed, input_name, output_name, start_sequence, sequence_count):
         """
         Does a brain-dead sequential scan & extract of certain sequences
@@ -222,7 +291,7 @@
 
         return [cmd]
     get_split_commands_sequential = staticmethod(get_split_commands_sequential)
-    
+
 class Alignment( data.Text ):
     """Class describing an alignment"""

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving
this because you have the commit notification service enabled and this
address is set as its recipient.

    

commits-noreply＠bitbucket.org

tags

participants (1)