[hg] galaxy 3146: Fixes for Bam data type's set_meta() and sam_t...
details: http://www.bx.psu.edu/hg/galaxy/rev/495dda4c0693 changeset: 3146:495dda4c0693 user: Greg Von Kuster <greg@bx.psu.edu> date: Fri Dec 04 19:51:46 2009 -0500 description: Fixes for Bam data type's set_meta() and sam_to_bam tool, indexes will now be properly created for bam datasets. Fixes also for uploading Bam files, they will no longer be uncompressed on upload. diffstat: lib/galaxy/datatypes/binary.py | 49 ++-- lib/galaxy/tools/util/maf_utilities.py | 11 +- test-data/1.bam | test-data/2.sam | 10 + test-data/3.sam | 10 + test-data/chrM.fa | 335 ------------------------------------- test-data/chr_m.fasta | 335 +++++++++++++++++++++++++++++++++++++ test-data/sam_to_bam_in1.sam | 10 - test-data/sam_to_bam_in2.sam | 10 - test-data/sam_to_bam_out1.bam | test-data/sam_to_bam_out2.bam | tools/data_source/data_source.py | 2 + tools/data_source/upload.py | 57 ++++-- tools/samtools/sam_to_bam.py | 134 +++++++++----- tools/samtools/sam_to_bam.xml | 43 ++-- tools/sr_mapping/bowtie_wrapper.xml | 4 +- tools/sr_mapping/lastz_wrapper.xml | 31 +- 17 files changed, 545 insertions(+), 496 deletions(-) diffs (1323 lines): diff -r 93474ea5c366 -r 495dda4c0693 lib/galaxy/datatypes/binary.py --- a/lib/galaxy/datatypes/binary.py Thu Dec 03 16:16:12 2009 -0500 +++ b/lib/galaxy/datatypes/binary.py Fri Dec 04 19:51:46 2009 -0500 @@ -7,7 +7,7 @@ from galaxy.datatypes import metadata from galaxy.datatypes.sniff import * from urllib import urlencode, quote_plus -import zipfile +import zipfile, gzip import os, subprocess, tempfile log = logging.getLogger(__name__) @@ -54,32 +54,35 @@ def init_meta( self, dataset, copy_from=None ): Binary.init_meta( self, dataset, copy_from=copy_from ) - """ - GVK 12/2/09: just noticed this - not good and doesn't work, so commenting out for now. - def set_meta( self, dataset, overwrite = True, **kwd ): - # Sets index for BAM file. - index_file = dataset.metadata.bam_index - if not index_file: - index_file = dataset.metadata.spec['bam_index'].param.new_file( dataset = dataset ) + def set_meta( self, dataset, overwrite = True, **kwd ): + """ Sets index for BAM file. """ + # These metadata values are not accessible by users, always overwrite + index_file = dataset.metadata.bam_index + if not index_file: + index_file = dataset.metadata.spec['bam_index'].param.new_file( dataset = dataset ) + try: + # Using a symlink from ~/database/files/dataset_XX.dat, create a temporary file + # to store the indexex generated from samtools, something like ~/tmp/dataset_XX.dat.bai tmp_dir = tempfile.gettempdir() - tmpf1 = tempfile.NamedTemporaryFile( dir=tmp_dir ) - tmpf1bai = '%s.bai' % tmpf1.name - try: - os.system( 'cd %s' % tmp_dir ) - os.system( 'cp %s %s' % ( dataset.file_name, tmpf1.name ) ) - os.system( 'samtools index %s' % tmpf1.name ) - os.system( 'cp %s %s' % ( tmpf1bai, index_file.file_name ) ) - except Exception, ex: - sys.stderr.write( 'There was a problem creating the index for the BAM file\n%s\n' + str( ex ) ) - tmpf1.close() - if os.path.exists( tmpf1bai ): - os.remove( tmpf1bai ) - dataset.metadata.bam_index = index_file - """ + tmp_file_path = os.path.join( tmp_dir, os.path.basename( dataset.file_name ) ) + # Here tmp_file_path looks something like /tmp/dataset_XX.dat + os.symlink( dataset.file_name, tmp_file_path ) + command = 'samtools index %s' % tmp_file_path + proc = subprocess.Popen( args=command, shell=True ) + proc.wait() + except: + err_msg = 'Error creating index file (%s) for BAM file (%s)' % ( str( tmp_file_path ), str( dataset.file_name ) ) + log.exception( err_msg ) + sys.stderr.write( err_msg ) + # Move the temporary index file ~/tmp/dataset_XX.dat.bai to be ~/database/files/_metadata_files/dataset_XX.dat + shutil.move( '%s.bai' % ( tmp_file_path ), index_file.file_name ) + os.unlink( tmp_file_path ) + dataset.metadata.bam_index = index_file def sniff( self, filename ): + # BAM is compressed in the BGZF format, and must not be uncompressed in Galaxy. # The first 4 bytes of any bam file is 'BAM\1', and the file is binary. try: - header = open( filename ).read(4) + header = gzip.open( filename ).read(4) if binascii.b2a_hex( header ) == binascii.hexlify( 'BAM\1' ): return True return False diff -r 93474ea5c366 -r 495dda4c0693 lib/galaxy/tools/util/maf_utilities.py --- a/lib/galaxy/tools/util/maf_utilities.py Thu Dec 03 16:16:12 2009 -0500 +++ b/lib/galaxy/tools/util/maf_utilities.py Fri Dec 04 19:51:46 2009 -0500 @@ -191,13 +191,16 @@ def build_maf_index_species_chromosomes( filename, index_species = None ): species = [] species_chromosomes = {} - indexes = bx.interval_index_file.Indexes() + indexes = bx.interval_index_file.Indexes() + blocks = 0 try: maf_reader = bx.align.maf.Reader( open( filename ) ) while True: pos = maf_reader.file.tell() block = maf_reader.next() - if block is None: break + if block is None: + break + blocks += 1 for c in block.components: spec = c.src chrom = None @@ -225,11 +228,11 @@ #most likely a bad MAF log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) ) return ( None, [], {} ) - return ( indexes, species, species_chromosomes ) + return ( indexes, species, species_chromosomes, blocks ) #builds and returns ( index, index_filename ) for specified maf_file def build_maf_index( maf_file, species = None ): - indexes, found_species, species_chromosomes = build_maf_index_species_chromosomes( maf_file, species ) + indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes( maf_file, species ) if indexes is not None: fd, index_filename = tempfile.mkstemp() out = os.fdopen( fd, 'w' ) diff -r 93474ea5c366 -r 495dda4c0693 test-data/1.bam Binary file test-data/1.bam has changed diff -r 93474ea5c366 -r 495dda4c0693 test-data/2.sam --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/2.sam Fri Dec 04 19:51:46 2009 -0500 @@ -0,0 +1,10 @@ +HWI-EAS91_1_30788AAXX:1:1:1095:605 0 chrM 23 25 36M * 0 0 AAGCAAGNNACTGAAAATGCCTAGATGAGTATTCTT IIIIIII""IIIIIIIIIIIIIIIEIIIIIIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1650:1185 0 chrM 14956 25 36M * 0 0 ACCCCAGNNAACCCTCTCAGCACTCCCCCTCATATT IIIIIII""IIIIIIIIIIII6IIIIIIIII5I-II NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:799:192 16 chrM 8421 25 36M * 0 0 CCTGTAGCCCTAGCCGTGCGGCTAACCNNTAACATT II%::I<IIIIIEIII8IIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1082:719 16 chrM 7191 25 36M * 0 0 TAAATTAACCCATACCAGCACCATAGANNCTCAAGA <III0EII3+3I29I>III8AIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1746:1180 16 chrM 12013 25 36M * 0 0 CCTAAGCTTCAAACTAGATTACTTCTCNNTAATTTT IIIIIIIIFIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:606:460 0 chrM 4552 25 36M * 0 0 TTAATTTNNATTATAATAACACTCACAATATTCATA IIIIIII""IIIIIIIIIIIIIIIIII?I6IIIII6 NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1059:362 16 chrM 7348 25 36M * 0 0 GGCCACCAATGATACTGAAGCTACGAGNNTACCGAT II/<)2IIIIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1483:1161 16 chrM 15080 25 36M * 0 0 TCCTGATCCTAGCACTCATCCCCACCCNNCACATAT HIIIIIFIIAIHIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1273:600 16 chrM 13855 25 36M * 0 0 GTATTAGACACCCATACCTCAGGATACNNCTCAGTA IIIIIIIIIIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1190:1283 16 chrM 15338 25 36M * 0 0 TATATCGCACATTACCCTGGTCTTGTANNCCAGAAA EIII?-IIIIIAIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 diff -r 93474ea5c366 -r 495dda4c0693 test-data/3.sam --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/3.sam Fri Dec 04 19:51:46 2009 -0500 @@ -0,0 +1,10 @@ +HWI-EAS91_1_30788AAXX:1:1:1513:715 16 chrM 9563 25 36M * 0 0 CTGACTACCACAACTAAACATCTATGCNNAAAAAAC I+-II?IDIIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1698:516 16 chrM 2735 25 36M * 0 0 TTTACACTCAGAGGTTCAACTCCTCTCNNTAACAAC I9IIIII5IIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1491:637 16 chrM 10864 25 36M * 0 0 TGTAGAAGCCCCAATTGCCGGATCCATNNTGCTAGC DBAIIIIIIIIIIIFIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1711:249 16 chrM 10617 25 36M * 0 0 ACCAAACAGAACGCCTGAACGCAGGCCNNTACTTCC IIIIIIIIIIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1634:211 0 chrM 9350 25 36M * 0 0 GAAGCAGNNGCTTGATACTGACACTTCGTCGACGTA IIIIIII""IIIIIIIIIIIIIIIIIIIIII9IIDF NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1218:141 16 chrM 14062 25 36M * 0 0 ACAAAACTAACAACAAAAATAACACTCNNAATAAAC I+IIII1IIIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1398:854 16 chrM 3921 25 36M * 0 0 CACCCTTCCCGTACTAATAAATCCCCTNNTCTTCAC IIIII=AIIIIIIIIIIIIIIBIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1310:991 16 chrM 10002 25 36M * 0 0 CTCCTATGCCTAGAAGGAATAATACTANNACTATTC I:2IEI:IIDIIIIII4IIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1716:413 0 chrM 6040 25 36M * 0 0 GATCCAANNCTTTATCAACACCTATTCTGATTCTTC IIIIIII""IIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 +HWI-EAS91_1_30788AAXX:1:1:1630:59 16 chrM 12387 25 36M * 0 0 TCATACTCGACCCCAACCTTACCAACCNNCCGCTCC FIIHII;IIIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 diff -r 93474ea5c366 -r 495dda4c0693 test-data/chrM.fa --- a/test-data/chrM.fa Thu Dec 03 16:16:12 2009 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,335 +0,0 @@ ->chrM -GTTAATGTAGCTTAATAATATAAAGCAAGGCACTGAAAATGCCTAGATGA -GTATTCTTACTCCATAAACACATAGGCTTGGTCCTAGCCTTTTTATTAGT -TATTAATAGAATTACACATGCAAGTATCCGCACCCCAGTGAGAATGCCCT -CTAAATCACGTCTCTACGATTAAAAGGAGCAGGTATCAAGCACACTAGAA -AGTAGCTCATAACACCTTGCTCAGCCACACCCCCACGGGACACAGCAGTG -ATAAAAATTAAGCTATGAACGAAAGTTCGACTAAGTCATATTAAATAAGG -GTTGGTAAATTTCGTGCCAGCCACCGCGGTCATACGATTAACCCAAATTA -ATAAATCTCCGGCGTAAAGCGTGTCAAAGACTAATACCAAAATAAAGTTA -AAACCCAGTTAAGCCGTAAAAAGCTACAACCAAAGTAAAATAGACTACGA -AAGTGACTTTAATACCTCTGACTACACGATAGCTAAGACCCAAACTGGGA -TTAGATACCCCACTATGCTTAGCCCTAAACTAAAATAGCTTACCACAACA -AAGCTATTCGCCAGAGTACTACTAGCAACAGCCTAAAACTCAAAGGACTT -GGCGGTGCTTTACATCCCTCTAGAGGAGCCTGTTCCATAATCGATAAACC -CCGATAAACCCCACCATCCCTTGCTAATTCAGCCTATATACCGCCATCTT -CAGCAAACCCTAAACAAGGTACCGAAGTAAGCACAAATATCCAACATAAA -AACGTTAGGTCAAGGTGTAGCCCATGGGATGGAGAGAAATGGGCTACATT -TTCTACCCTAAGAACAAGAACTTTAACCCGGACGAAAGTCTCCATGAAAC -TGGAGACTAAAGGAGGATTTAGCAGTAAATTAAGAATAGAGAGCTTAATT -GAATCAGGCCATGAAGCGCGCACACACCGCCCGTCACCCTCCTTAAATAT -CACAAATCATAACATAACATAAAACCGTGACCCAAACATATGAAAGGAGA -CAAGTCGTAACAAGGTAAGTATACCGGAAGGTGTACTTGGATAACCAAAG -TGTAGCTTAAACAAAGCATCCAGCTTACACCTAGAAGATTTCACTCAAAA -TGAACACTTTGAACTAAAGCTAGCCCAAACAATACCTAATTCAATTACCC -TTAGTCACTTAACTAAAACATTCACCAAACCATTAAAGTATAGGAGATAG -AAATTTTAACTTGGCGCTATAGAGAAAGTACCGTAAGGGAACGATGAAAG -ATGCATTAAAAGTACTAAACAGCAAAGCTTACCCCTTTTACCTTTTGCAT -AATGATTTAACTAGAATAAACTTAGCAAAGAGAACTTAAGCTAAGCACCC -CGAAACCAGACGAGCTACCTATGAACAGTTACAAATGAACCAACTCATCT -ATGTCGCAAAATAGTGAGAAGATTCGTAGGTAGAGGTGAAAAGCCCAACG -AGCCTGGTGATAGCTGGTTGTCCAGAAACAGAATTTCAGTTCAAATTTAA -ATTTACCTAAAAACTACTCAATTCTAATGTAAATTTAAATTATAGTCTAA -AAAGGTACAGCTTTTTAGATACAGGTTACAACCTTCATTAGAGAGTAAGA -ACAAGATAAACCCATAGTTGGCTTAAAAGCAGCCATCAATTAAGAAAGCG -TTCAAGCTCAACGACACATCTATCTTAATCCCAACAATCAACCCAAACTA -ACTCCTAATCTCATACTGGACTATTCTATCAACACATAGAAGCAATAATG -TTAATATGAGTAACAAGAATTATTTCTCCTTGCATAAGCTTATATCAGAA -CGAATACTCACTGATAGTTAACAACAAGATAGGGATAATCCAAAAACTAA -TCATCTATTTAAACCATTGTTAACCCAACACAGGCATGCATCTATAAGGA -AAGATTAAAAGAAGTAAAAGGAACTCGGCAAACACAAACCCCGCCTGTTT -ACCAAAAACATCACCTCTAGCATTTCCAGTATTAGAGGCACTGCCTGCCC -AGTGACATCTGTTtaaacggccgcggtatcctaaccgtgcaaaggtagca -taatcacttgttccctaaatagggacttgtatgaatggccacacgagggt -tttactgtctcttacttccaatcagtgaaattgaccttcccgtgaagagg -cgggaatgactaaataagacgagaagaccctatggagcttTAATTAACTG -ATTCACAAAAAACAACACACAAACCTTAACCTTCAGGGACAACAAAACTT -TTGATTGAATCAGCAATTTCGGTTGGGGTGACCTCGGAGAACAAAACAAC -CTCCGAGTGATTTAAATCCAGACTAACCAGTCAAAATATATAATCACTTA -TTGATCCAAACCATTGATCAACGGAACAAGTTACCCTAGGGATAACAGCG -CAATCCTATTCCAGAGTCCATATCGACAATTAGGGTTTACGACCTCGATG -TTGGATCAAGACATCCTAATGGTGCAACCGCTATTAAGGGTTCGTTTGTT -CAACGATTAAAGTCTTACGTGATCTGAGTTCAGACCGGAGTAATCCAGGT -CGGTTTCTATCTATTCTATACTTTTCCCAGTACGAAAGGACAAGAAAAGT -AGGGCCCACTTTACAAGAAGCGCCCTCAAACTAATAGATGACATAATCTA -AATCTAACTAATTTATAACTTCTACCGCCCTAGAACAGGGCTCgttaggg -tggcagagcccggaaattgcataaaacttaaacctttacactcagaggtt -caactcctctccctaacaacaTGTTCATAATTAACGTCCTCCTCCTAATT -GTCCCAATCTTGCTCGCCGTAGCATTCCTCACACTAGTTGAACGAAAAGT -CTTAGGCTATATGCAACTTCGCAAAGGACCCAACATCGTAGGCCCCTATG -GCCTACTACAACCTATTGCCGATGCCCTCAAACTATTTATCAAAGAGCCA -CTACAACCACTAACATCATCGACATCCATATTCATCATCGCACCAATCCT -AGCCCTAACCCTGGCCTTAACCATATGAATCCCTCTGCCCATACCATACC -CACTAATCAACATAAACCTAGGAATTCTATTCATACTAGCCATGTCCAGC -CTAGCTGTCTACTCAATCCTTTGATCAGGATGGGCCTCAAACTCAAAATA -CGCCCTAATTGGAGCTCTACGAGCAGTAGCACAAACCATCTCATACGAAG -TAACTCTAGCAATCATCCTACTCTCAGTCCTCCTAATAAGCGGATCATTC -ACATTATCAACACTTATTATTACCCAAGAATACCTCTGATTAATCTTCCC -ATCATGACCCTTAGCCATAATGTGATTCATCTCAACATTAGCCGAAACCA -ACCGAGCTCCATTTGACCTAACAGAAGGAGAATCAGAACTCGTCTCTGGA -TTCAACGTTGAATACGCAGCCGGCCCATTTGCTCTATTCTTCCTAGCAGA -ATACGCAAACATCATCATGATAAACATCTTCACAACAACCCTATTTCTAG -GAGCATTTCACAACCCCTACCTGCCAGAACTCTACTCAATTAATTTCACC -ATTAAAGCTCTCCTTCTAACATGTTCCTTCCTATGAATCCGAGCATCCTA -CCCACGATTCCGATATGACCAACTTATACACCTCCTATGAAAGAACTTCC -TACCACTCACACTAGCCCTCTGCATATGACACGTCTCACTTCCAATCATA -CTATCCAGCATCCCACCACAAACATAGGAAATATGTCTGACAAAAGAGTT -ACTTTGATAGAGTAAAACATAGAGGCTCAAACCCTCTTATTTctagaact -acaggaattgaacctgctcctgagaattcaaaatcctccgtgctaccgaa -ttacaccatgtcctaCAAGTAAGGTCAGCTAAATAAGCTATCGGGCCCAT -ACCCCGAAAATGTTGGATTACACCCTTCCCGTACTAATAAATCCCCTTAT -CTTCACAACTATTCTAATAACAGTTCTTCTAGGAACTATAATCGTTATAA -TAAGCTCACACTGACTAATAATCTGAATCGGATTTGAAATAAATCTACTA -GCCATTATCCCTATCCTAATAAAAAAGTACAATCCCCGAACCATAGAAGC -CTCCACCAAATATTTTCTAACCCAAGCCACCGCATCAATACTCCTCATAA -TAGCGATCATCATTAACCTCATACACTCAGGCCAATGAACAATCACAAAA -GTCTTCAACCCCACAGCGTCCATCATTATAACTTCAGCTCTCGCCATAAA -ACTTGGACTCACACCATTCCACTTCTGAGTACCCGAAGTCACACAGGGCA -TCTCATTAACATCAGGTCTCATCCTACTTACATGACAAAAACTAGCCCCA -ATATCAATCCTATATCAAATCTCACCCTCAATTAACCTAAATATCTTATT -AACTATAGCCGTACTGTCAATCCTAGTAGGAGGCTGAGGCGGTCTCAACC -AAACCCAACTACGAAAAATCATAGCATACTCGTCAATCGCGCATATAGGA -TGAATAACAGCTGTCCTAGTATATAACCCAACACTAACAATACTAAACAT -ATTAATTTACATTATAATAACACTCACAATATTCATACTATTTATCCACA -GCTCCTCTACTACAACACTATCACTCTCCCACACATGAAACAAAATACCT -CTAACCACTACACTAATCTTAATTACCTTACTATCCATAGGAGGCCTCCC -CCCACTATCAGGATTCATACCCAAATGAATAATCATTCAAGAGCTCACCA -AAAATAGCAGCATCATCCTCCCCACACTAATAGCCATTATAGCACTACTC -AACCTCTACTTCTACATACGACTAACCTATTCCACCTCACTGACCATATT -CCCATCCACAAACAACATAAAAATAAAATGACAATTCGAAACCAAACGAA -TTACTCTCTTACCCCCGTTAATTGTTATATCCTCCCTACTCCTCCCCCTA -ACCCCCATACTATCAATTTTGGACTAGGAATTTAGGTTAACATCCCAGAC -CAAGAGCCTTCAAAGCTCTAAGCAAGTGAATCCACTTAATTCCTGCATAC -TAAGGACTGCGAGACTCTATCTCACATCAATTGAACGCAAATCAAACTCT -TTTATTAAGCTAAGCCCTTACTAGATTGGTGGGCTACCATCCCACGAAAT -TTTAGTTAACAGCTAAATACCCTAATCAACTGGCTTCAATCTACTTCTCC -CGCCGCCTAGAAAAAAAGGCGGGAGAAGCCCCGGCAGAAATTGAAGCTGC -TCCTTTGAATTTGCAATTCAATGTGAAAATTCACCACGGGACTTGATAAG -AAGAGGATTCCAACCCCTGTCTTTAGATTTACAGTCTAATGCTTACTCAG -CCATCTTACCTATGTTCATCAACCGCTGACTATTTTCAACTAACCACAAA -GACATCGGCACTCTGTACCTCCTATTCGGCGCTTGAGCTGGAATAGTAGG -AACTGCCCTAAGCCTCCTAATCCGTGCTGAATTAGGCCAACCTGGGACCC -TACTAGGAGATGATCAGATCTACAATGTCATTGTAACCGCCCATGCATTC -GTAATAATTTTCTTTATGGTCATACCCATTATAATCGGAGGATTCGGAAA -CTGATTAGTCCCCCTGATAATTGGAGCACCTGATATAGCTTTCCCCCGAA -TAAACAACATAAGCTTCTGATTACTTCCCCCATCATTCCTACTTCTTCTC -GCTTCCTCAATAATTGAAGCAGGTGCCGGAACAGGCTGAACCGTATATCC -TCCTCTAGCTGGAAATCTGGCGCATGCAGGAGCCTCTGTTGACTTAACCA -TTTTCTCTCTCCACCTAGCTGGGGTGTCCTCGATTTTAGGTGCCATCAAC -TTTATTACCACAATCATTAACATAAAACCACCAGCCCTATCCCAATATCA -AACCCCCCTATTCGTTTGATCTGTCCTTATTACGGCAGTACTCCTTCTCC -TAGCCCTCCCGGTCCTAGCAGCAGGCATTACCATGCTTCTCACAGACCGT -AACCTGAACACTACTTTCTTCGACCCCGCAGGAGGAGGGGATCCAATCCT -TTATCAACACCTATTCTGATTCTTCGGACACCCCGAAGTCTATATTCTTA -TCCTACCAGGCTTCGGTATAATCTCACACATCGTCACATACTACTCAGGT -AAAAAGGAACCTTTTGGCTACATGGGTATAGTGTGAGCTATAATATCCAT -TGGCTTTCTAGGCTTCATCGTATGGGCTCACCACATGTTTACAGTAGGGA -TAGACGTTGACACACGAGCATACTTCACATCAGCTACCATAATCATCGCT -ATCCCTACTGGTGTAAAAGTATTCAGCTGACTAGCCACCCTGCACGGAGG -AAATATCAAATGATCTCCAGCTATACTCTGAGCTCTAGGCTTCATCTTCT -TATTCACAGTAGGAGGTCTAACAGGAATCGTCCTAGCTAACTCATCCCTA -GATATTGTTCTCCACGATACTTATTATGTAGTAGCACATTTCCATTATGT -CCTGTCTATAGGAGCAGTCTTCGCCATTATGGGGGGATTTGTACACTGAT -TCCCTCTATTCTCAGGATACACACTCAACCAAACCTGAGCAAAAATCCAC -TTTACAATTATATTCGTAGGGGTAAATATAACCTTCTTCCCACAACATTT -CCTTGGCCTCTCAGGAATGCCACGACGCTATTCTGATTATCCAGACGCAT -ATACAACATGAAATACCATCTCATCCATAGGATCTTTTATCTCACTTACA -GCAGTGATACTAATAATTTTCATAATTTGAGAAGCGTTCGCATCCAAACG -AGAAGTGTCTACAGTAGAATTAACCTCAACTAATCTGGAATGACTACACG -GATGCCCCCCACCATACCACACATTTGAAGAACCCACCTACGTAAACCTA -AAAtaagaaaggaaggaatcgaaccccctctaactggtttcaagccaata -tcataaccactatgtctttctcCATCAATTGAGGTATTAGTAAAAATTAC -ATGACTTTGTCAAAGTTAAATTATAGGTTAAACCCCTATATACCTCTATG -GCCTACCCCTTCCAACTAGGATTCCAAGACGCAACATCCCCTATTATAGA -AGAACTCCTACACTTCCACGACCACACACTAATAATCGTATTCCTAATTA -GCTCTCTAGTATTATATATTATCTCATCAATACTAACAACTAAATTAACC -CATACCAGCACCATAGATGCTCAAGAAGTAGAGACAATTTGAACGATTTT -ACCAGCCATCATCCTTATTCTAATCGCCCTCCCATCCCTACGAATTCTAT -ATATAATAGATGAAATCAATAATCCGTCCCTCACAGTCAAAACAATAGGC -CACCAATGATACTGAAGCTACGAGTATACCGATTACGAAGACTTGACCTT -TGACTCCTACATGATCCCCACATCAGACCTAAAACCAGGAGAATTACGTC -TTCTAGAAGTCGACAATCGAGTGGTTCTCCCCATAGAAATAACCATCCGA -ATGCTAATTTCATCCGAAGACGTCCTACACTCATGAGCTGTGCCCTCCCT -AGGCCTAAAAACAGACGCTATCCCTGGGCGCCTAAATCAGACAACTCTCG -TGGCCTCTCGACCAGGACTTTACTACGGTCAATGCTCAGAGATCTGCGGA -TCAAACCACAGCTTTATACCAATTGTCCTTGAACTAGTTCCACTGAAACA -CTTCGAAGAATGATCTGCATCAATATTATAAAGTCACTAAGAAGCTATTA -TAGCATTAACCTTTTAAGTTAAAGATTGAGGGTTCAACCCCCTCCCTAGT -GATATGCCACAGTTGGATACATCAACATGATTTATTAATATCGTCTCAAT -AATCCTAACTCTATTTATTGTATTTCAACTAAAAATCTCAAAGCACTCCT -ATCCGACACACCCAGAAGTAAAGACAACCAAAATAACAAAACACTCTGCC -CCTTGAGAATCAAAATGAACGAAAATCTATTCGCCTCTTTCGCTACCCCA -ACAATAGTAGGCCTCCCTATTGTAATTCTGATCATCATATTTCCCAGCAT -CCTATTCCCCTCACCCAACCGACTAATCAACAATCGCCTAATCTCAATTC -AACAATGGCTAGTCCAACTTACATCAAAACAAATAATAGCTATCCATAAC -AGCAAAGGACAAACCTGAACTCTTATACTCATATCACTGATCCTATTCAT -TGGCTCAACAAACTTATTAGGCCTACTACCTCACTCATTTACACCAACAA -CACAACTATCAATAAACCTAGGCATAGCTATTCCCCTATGGGCAGGGACA -GTATTCATAGGCTTTCGTCACAAAACAAAAGCAGCCCTAGCCCACTTTCT -ACCTCAAGGGACGCCCATTTTCCTCATCCCCATACTAGTAATTATCGAGA -CTATCAGCCTATTTATTCAACCTGTAGCCCTAGCCGTGCGGCTAACCGCT -AACATTACCGCCGGACACCTCCTAATACACCTCATCGGAGGGGCAACACT -AGCCCTCATAAGCATCAGCCCCTCAACAGCCCTTATTACGTTTATCATCC -TAATTCTACTAACTATCCTCGAATTCGCAGTAGCTATAATCCAAGCCTAC -GTATTCACTCTCCTGGTAAGCCTTTACTTACACGACAACACCTAATGACC -CACCAAACCCACGCTTACCACATAGTAAACCCCAGCCCATGACCACTTAC -AGGAGCCCTATCAGCCCTCCTGATAACATCAGGACTAGCCATGTGATTTC -ACTTTAACTCAACCTTACTTCTAGCTATAGGGCTATTAACTAACATCCTT -ACCATATATCAATGATGACGAGACATCATCCGAGAAAGCACATTCCAAGG -CCATCACACATCAATCGTTCAAAAGGGACTCCGATATGGCATAATCCTTT -TTATTATCTCAGAAGTCTTCTTCTTCTCTGGCTTCTTCTGAGCCTTTTAC -CACTCAAGCCTAGCCCCCACACCCGAACTAGGCGGCTGCTGACCACCCAC -AGGTATCCACCCCTTAAACCCCCTAGAAGTCCCCTTACTCAACACCTCAG -TGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCACCATAGCCTAATA -GAAGGAAACCGTAAAAATATGCTCCAAGGCCTATTCATCACAATTTCACT -AGGCGTATACTTCACCCTTCTCCAAGCCTCAGAATACTATGAAGCCTCAT -TTACTATTTCAGATGGAGTATACGGATCAACATTTTTCGTAGCAACAGGG -TTCCACGGACTACACGTAATTATCGGATCTACCTTCCTCATTGTATGTTT -CCTACGCCAACTAAAATTCCACTTTACATCCAGCCACCACTTCGGATTCG -AAGCAGCCGCTTGATACTGACACTTCGTCGACGTAGTCTGACTATTCTTG -TACGTCTCTATTTATTGATGAGGATCCTATTCTTTTAGTATTGACCAGTA -CAATTGACTTCCAATCAATCAGCTTCGGTATAACCCGAAAAAGAATAATA -AACCTCATACTGACACTCCTCACTAACACATTACTAGCCTCGCTACTCGT -ACTCATCGCATTCTGACTACCACAACTAAACATCTATGCAGAAAAAACCA -GCCCATATGAATGCGGATTTGACCCTATAGGGTCAGCACGCCTCCCCTTC -TCAATAAAATTTTTCTTAGTGGCCATTACATTTCTGCTATTCGACTTAGA -AATTGCCCTCCTATTACCCCTTCCATGAGCATCCCAAACAACTAACCTAA -ACACTATACTTATCATAGCACTAGTCCTAATCTCTCTTCTAGCCATCAGC -CTAGCCTACGAATGAACCCAAAAAGGACTAGAATGAACTGAGTATGGTAA -TTAGTTTAAACCAAAACAAATGATTTCGACTCATTAAACTATGATTAACT -TCATAATTACCAACATGTCACTAGTCCATATTAATATCTTCCTAGCATTC -ACAGTATCCCTCGTAGGCCTACTAATGTACCGATCCCACCTAATATCCTC -ACTCCTATGCCTAGAAGGAATAATACTATCACTATTCGTCATAGCAACCA -TAATAGTCCTAAACACCCACTTCACACTAGCTAGTATAATACCTATCATC -TTACTAGTATTTGCTGCCTGCGAACGAGCTCTAGGATTATCCCTACTAGT -CATAGTCTCCAATACTTATGGAGTAGACCACGTACAAAACCTTAACCTCC -TCCAATGCTAAAAATTATCATTCCCACAATCATACTTATGCCCCTTACAT -GACTATCAAAAAAGAATATAATCTGAATCAACACTACAACCTATAGTCTA -TTAATCAGCCTTATCAGCCTATCCCTCCTAAACCAACCTAGCAACAATAG -CCTAAACTTCTCACTAATATTCTTCTCCGATCCCCTATCAGCCCCACTTC -TGGTGTTGACAACATGACTACTGCCACTAATACTCATAGCCAGCCAACAC -CATCTATCTAAGGAACCACTAATCCGAAAAAAACTCTACATCACCATGCT -AACCATACTTCAAACTTTCCTAATCATGACTTTTACCGCCACAGAACTAA -TCTCCTTCTACATCCTATTTGAAGCCACATTAGTTCCAACACTAATTATC -ATCACCCGCTGAGGCAACCAAACAGAACGCCTGAACGCAGGCCTCTACTT -CCTATTCTACACACTAATAGGTTCCCTCCCACTCTTAGTTGCACTAATCT -CTATCCAAAACCTAACAGGCTCACTAAACTTCCTATTAATTCAATACTGA -AACCAAGCACTACCCGACTCTTGATCCAATATTTTCCTATGACTAGCATG -TATAATAGCATTCATAGTCAAAATACCGGTATATGGTCTTCACCTCTGAC -TCCCAAAAGCCCATGTAGAAGCCCCAATTGCCGGATCCATAGTGCTAGCA -GCCATTCTACTAAAACTAGGAGGCTACGGAATACTACGAATTACAACAAT -ACTAAACCCCCAAACTAGCTTTATAGCCTACCCCTTCCTCATACTATCCC -TGTGAGGAATAATCATAACTAGTTCCATCTGCTTGCGACAAACCGATCTA -AAATCACTTATTGCATACTCCTCTGTCAGCCACATAGCCCTAGTAATCGT -AGCCGTCCTCATCCAAACACCATGAAGTTATATAGGAGCTACAGCCCTAA -TAATCGCTCACGGCCTTACATCATCAATACTATTCTGCCTGGCAAACTCA -AATTACGAACGTACCCATAGCCGAACTATAATCCTAGCCCGCGGGCTTCA -AACACTTCTTCCCCTTATAGCAGCCTGATGACTATTAGCCAGCCTAACCA -ACCTGGCCCTCCCTCCCAGCATTAACCTAATTGGAGAGCTATTCGTAGTA -ATATCATCATTCTCATGATCAAATATTACCATTATCCTAATAGGAGCCAA -TATCACCATCACCGCCCTCTACTCCCTATACATACTAATCACAACACAAC -GAGGGAAATACACACACCATATCAACAGCATTAAACCTTCATTTACACGA -GAAAACGCACTCATGGCCCTCCACATGACTCCCCTACTACTCCTATCACT -TAACCCTAAAATTATCCTAGGCTTTACGTACTGTAAATATAGTTTAACAA -AAACACTAGATTGTGGATCTAGAAACAGAAACTTAATATTTCTTATTTAC -CGAGAAAGTATGCAAGAACTGCTAATTCATGCCCCCATGTCCAACAAACA -TGGCTCTCTCAAACTTTTAAAGGATAGGAGCTATCCGTTGGTCTTAGGAA -CCAAAAAATTGGTGCAACTCCAAATAAAAGTAATCAACATGTTCTCCTCC -CTCATACTAGTTTCACTATTAGTACTAACCCTCCCAATCATATTATCAAT -CTTCAATACCTACAAAAACAGCACGTTCCCGCATCATGTAAAAAACACTA -TCTCATATGCCTTCATTACTAGCCTAATTCCCACTATAATATTTATTCAC -TCTGGACAAGAAACAATTATCTCAAACTGACACTGAATAACCATACAAAC -CCTCAAACTATCCCTAAGCTTCAAACTAGATTACTTCTCAATAATTTTCG -TACCAGTAGCCCTATTCGTAACATGATCTATTATGGAATTCTCCCTATGA -TACATGCACTCAGATCCTTACATTACTCGATTTTTTAAATACTTACTTAC -ATTCCTCATCACTATAATAATTCTAGTCACAGCTAACAACCTTTTCCAAC -TGTTCATCGGATGGGAGGGAGTAGGCATCATGTCATTCTTACTAATCGGA -TGATGATACGGCCGAACAGATGCCAACACCGCGGCCCTTCAAGCAATCCT -TTATAACCGCATCGGGGATATCGGCTTCATCATGGCCATAGCCTGATTCC -TATTCAACACCAACACATGAGACCTCCAACAAATCTTCATACTCGACCCC -AACCTTACCAACCTCCCGCTCCTAGGCCTCCTCCTAGCCGCAACTGGCAA -ATCCGCTCAATTTGGACTCCACCCATGACTTCCTTCAGCCATAGAGGGCC -CTACACCAGTCTCAGCCCTACTCCACTCCAGCACAATAGTTGTAGCAGGC -GTCTTCCTGCTAATCCGCTTCCATCCACTAATAGAAAACAACAAAACAAT -CCAGTCACTTACCCTATGCCTAGGAGCCATCACCACACTATTCACAGCAA -TCTGCGCACTCACTCAAAACGATATCAAAAAAATCATTGCTTTCTCCACC -TCCAGCCAACTAGGCCTGATAATCGTAACCATCGGTATCAATCAACCCTA -CCTAGCATTCCTCCACATTTGCACTCACGCATTCTTCAAAGCTATACTAT -TTATATGTTCCGGATCCATTATCCACAGCCTAAATGACGAGCAAGATATC -CGAAAAATAGGCGGACTATTTAATGCAATACCCTTCACCACCACATCTCT -AATTATTGGCAGCCTTGCACTCACCGGAATTCCTTTCCTCACAGGCTTCT -ACTCCAAAGACCTCATCATCGAAACCGCCAACACATCGTACACCAACGCC -TGAGCCCTACTAATAACTCTCATTGCCACATCCCTCACAGCTGTCTACAG -TACCCGAATCATCTTCTTTGCACTCCTAGGGCAACCCCGCTTCCTCCCTC -TGACCTCAATCAACGAAAATAACCCCTTTCTAATTAACTCCATCAAACGC -CTCTTAATTGGCAGCATTTTTGCCGGATTCTTCATCTCCAACAATATCTA -CCCCACAACCGTCCCAGAAATAACCATACCTACTTACATAAAACTCACCG -CCCTCGCAGTAACCATCCTAGGATTTACACTAGCCCTAGAACTAAGCTTG -ATAACCCATAACTTAAAACTAGAACACTCCACCAACGTATTCAAATTCTC -CAACCTCCTAGGATACTACCCAACAATTATACACCGACTCCCACCGCTCG -CTAACCTATCAATAAGCCAAAAATCAGCATCACTTCTACTAGACTCAATC -TGACTAGAAAACATCCTGCCAAAATCTATCTCCCAGTTCCAAATAAAAAC -CTCGATCCTAATTTCCACCCAAAAAGGACAAATCAAATTATATTTCCTCT -CATTCCTCATCACCCTTACCCTAAGCATACTACTTTTTAATCTCCACGAG -TAACCTCTAAAATTACCAAGACCCCAACAAGCAACGATCAACCAGTCACA -ATCACAACCCAAGCCCCATAACTATACAATGCAGCAGCCCCTATAATTTC -CTCACTAAACGCCCCAGAATCTCCAGTATCATAAATAGCTCAAGCCCCCA -CACCACTAAACTTAAACACTACCCCCACTTCCTCACTCTTCAGAACATAT -AAAACCAACATAACCTCCATCAACAACCCTAAAAGAAATACCCCCATAAC -AGTCGTATTAGACACCCATACCTCAGGATACTGCTCAGTAGCCATAGCCG -TTGTATAACCAAAAACAACCAACATTCCTCCCAAATAAATCAAAAACACC -ATCAACCCCAAAAAGGACCCTCCAAAATTCATAATAATACCACAACCTAC -CCCTCCACTTACAATCAGCACTAAACCCCCATAAATAGGTGAAGGTTTTG -AAGAAAACCCCACAAAACTAACAACAAAAATAACACTCAAAATAAACACA -ATATATGTCATCATTATTCCCACGTGGAATCTAACCACGACCAATGACAT -GAAAAATCATCGTTGTATTTCAACTATAAGAACACCAATGACAAACATCC -GGAAATCTCACCCACTAATTAAAATCATCAATCACTCTTTTATTGACCTA -CCAGCCCCCTCAAACATTTCATCATGATGAAACTTCGGCTCCCTCCTAGG -AATCTGCCTAATCCTCCAAATCTTAACAGGCCTATTCCTAGCCATACACT -ACACATCAGACACGACAACTGCCTTCTCATCCGTCACTCACATCTGCCGA -GACGTTAACTACGGATGAATTATTCGCTACCTCCATGCCAACGGAGCATC -AATATTTTTTATCTGCCTCTTCATTCACGTAGGACGCGGCCTCTACTACG -GCTCTTACACATTCCTAGAGACATGAAACATTGGAATCATCCTACTTTTC -ACAGTTATAGCTACAGCATTCATGGGCTATGTCCTACCATGAGGCCAAAT -ATCCTTTTGAGGAGCAACAGTCATCACGAACCTCCTATCAGCAATTCCCT -ACATCGGTACTACCCTCGTCGAGTGAATCTGAGGTGGATTCTCAGTAGAC -AAAGCCACCCTTACCCGATTTTTTGCTTTCCACTTCATCCTACCCTTCAT -CATCACAGCCCTGGTAGTCGTACATTTACTATTTCTTCACGAAACAGGAT -CTAATAACCCCTCAGGAATCCCATCCGATATGGACAAAATCCCATTCCAC -CCATATTATACAATTAAAGACATCCTAGGACTCCTCCTCCTGATCTTGCT -CCTACTAACTCTAGTATTATTCTCCCCCGACCTCCTAGGAGACCCAGACA -ACTACACCCCAGCTAACCCTCTCAGCACTCCCCCTCATATTAAACCAGAA -TGGTACTTCCTGTTTGCCTACGCCATCCTACGCTCCATTCCCAACAAACT -AGGCGGCGTATTAGCCCTAATCCTCTCCATCCTGATCCTAGCACTCATCC -CCACCCTCCACATATCAAAACAACGAAGCATAATATTCCGGCCTCTCAGC -CAATGCGTATTCTGACTCTTAGTGGCAGACTTACTGACACTAACATGAAT -CGGCGGACAGCCAGTGGAACACCCATACGTAATTATCGGCCAACTGGCCT -CAATCCTCTACTTCTCCCTAATTCTCATTTTTATACCACTCGCAAGCACC -ATCGAAAACAATCTTCTAAAATGAAGAGTCCCTGTAGTATATCGCACATT -ACCCTGGTCTTGTAAACCAGAAAAGGGGGAAAACGTTTCCTCCCAAGGAC -TATCAAGGAAGAAGCTCTAGCTCCACCATCAACACCCAAAGCTGAAATTC -TACTTAAACTATTCCTTGATTTCTTCCCCTAAACGACAACAATTTACCCT -CATGTGCTATGTCAGTATCAGATTATACCCCCACATAACACCATACCCAC -CTGACATGCAATATCTTATGAATGGCCTATGTACGTCGTGCATTAAATTG -TCTGCCCCATGAATAATAAGCATGTACATAATATCATTTATCTTACATAA -GTACATTATATTATTGATCGTGCATACCCCATCCAAGTCAAATCATTTCC -AGTCAACACGCATATCACAGCCCATGTTCCACGAGCTTAATCACCAAGCC -GCGGGAAATCAGCAACCCTCCCAACTACGTGTCCCAATCCTCGCTCCGGG -CCCATCCAAACGTGGGGGTTTCTACAATGAAACTATACCTGGCATCTGGT -TCTTTCTTCAGGGCCATTCCCACCCAACCTCGCCCATTCTTTCCCCTTAA -ATAAGACATCTCGATGGACTAATGACTAATCAGCCCATGCTCACACATAA -CTGTGATTTCATGCATTTGGTATCTTTTTATATTTGGGGATGCTATGACT -CAGCTATGGCCGTCAAAGGCCTCGACGCAGTCAATTAAATTGAAGCTGGA -CTTAAATTGAACGTTATTCCTCCGCATCAGCAACCATAAGGTGTTATTCA -GTCCATGGTAGCGGGACATAGGAAACAAgtgcacctgtgcacctgtgcac -ctgtgcacctgtgcacctgtgcacctgtgcacctgtgcacctgtgcacct -gtgcacctgtgcacctgtgcacctgtgcacctgtgcacctgtgcacctgt -gcacctgtgcacctgtgcacctgtgcacctgtgcacctgtgcacctgtgc -acctgtgcacctgtgcacctgtgcacctgtgcacctgtgcacctgtgcac -ctgtgcacctACCCGCGCAGTAAGCAAGTAATATAGCTTTCTTAATCAAA -CCCCCCCTACCCCCCATTAAACTCCACATATGTACATTCAACACAATCTT -GCCAAACCCCAAAAACAAGACTAAACAATGCACAATACTTCATGAAGCTT -AACCCTCGCATGCCAACCATAATAACTCAACACACCTAACAATCTTAACA -GAACTTTCCCCCCGCCATTAATACCAACATGCTACTTTAATCAATAAAAT -TTCCATAGACAGGCATCCCCCTAGATCTAATTTTCTAAATCTGTCAACCC -TTCTTCCCCC diff -r 93474ea5c366 -r 495dda4c0693 test-data/chr_m.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_m.fasta Fri Dec 04 19:51:46 2009 -0500 @@ -0,0 +1,335 @@ +>chrM +GTTAATGTAGCTTAATAATATAAAGCAAGGCACTGAAAATGCCTAGATGA +GTATTCTTACTCCATAAACACATAGGCTTGGTCCTAGCCTTTTTATTAGT +TATTAATAGAATTACACATGCAAGTATCCGCACCCCAGTGAGAATGCCCT +CTAAATCACGTCTCTACGATTAAAAGGAGCAGGTATCAAGCACACTAGAA +AGTAGCTCATAACACCTTGCTCAGCCACACCCCCACGGGACACAGCAGTG +ATAAAAATTAAGCTATGAACGAAAGTTCGACTAAGTCATATTAAATAAGG +GTTGGTAAATTTCGTGCCAGCCACCGCGGTCATACGATTAACCCAAATTA +ATAAATCTCCGGCGTAAAGCGTGTCAAAGACTAATACCAAAATAAAGTTA +AAACCCAGTTAAGCCGTAAAAAGCTACAACCAAAGTAAAATAGACTACGA +AAGTGACTTTAATACCTCTGACTACACGATAGCTAAGACCCAAACTGGGA +TTAGATACCCCACTATGCTTAGCCCTAAACTAAAATAGCTTACCACAACA +AAGCTATTCGCCAGAGTACTACTAGCAACAGCCTAAAACTCAAAGGACTT +GGCGGTGCTTTACATCCCTCTAGAGGAGCCTGTTCCATAATCGATAAACC +CCGATAAACCCCACCATCCCTTGCTAATTCAGCCTATATACCGCCATCTT +CAGCAAACCCTAAACAAGGTACCGAAGTAAGCACAAATATCCAACATAAA +AACGTTAGGTCAAGGTGTAGCCCATGGGATGGAGAGAAATGGGCTACATT +TTCTACCCTAAGAACAAGAACTTTAACCCGGACGAAAGTCTCCATGAAAC +TGGAGACTAAAGGAGGATTTAGCAGTAAATTAAGAATAGAGAGCTTAATT +GAATCAGGCCATGAAGCGCGCACACACCGCCCGTCACCCTCCTTAAATAT +CACAAATCATAACATAACATAAAACCGTGACCCAAACATATGAAAGGAGA +CAAGTCGTAACAAGGTAAGTATACCGGAAGGTGTACTTGGATAACCAAAG +TGTAGCTTAAACAAAGCATCCAGCTTACACCTAGAAGATTTCACTCAAAA +TGAACACTTTGAACTAAAGCTAGCCCAAACAATACCTAATTCAATTACCC +TTAGTCACTTAACTAAAACATTCACCAAACCATTAAAGTATAGGAGATAG +AAATTTTAACTTGGCGCTATAGAGAAAGTACCGTAAGGGAACGATGAAAG +ATGCATTAAAAGTACTAAACAGCAAAGCTTACCCCTTTTACCTTTTGCAT +AATGATTTAACTAGAATAAACTTAGCAAAGAGAACTTAAGCTAAGCACCC +CGAAACCAGACGAGCTACCTATGAACAGTTACAAATGAACCAACTCATCT +ATGTCGCAAAATAGTGAGAAGATTCGTAGGTAGAGGTGAAAAGCCCAACG +AGCCTGGTGATAGCTGGTTGTCCAGAAACAGAATTTCAGTTCAAATTTAA +ATTTACCTAAAAACTACTCAATTCTAATGTAAATTTAAATTATAGTCTAA +AAAGGTACAGCTTTTTAGATACAGGTTACAACCTTCATTAGAGAGTAAGA +ACAAGATAAACCCATAGTTGGCTTAAAAGCAGCCATCAATTAAGAAAGCG +TTCAAGCTCAACGACACATCTATCTTAATCCCAACAATCAACCCAAACTA +ACTCCTAATCTCATACTGGACTATTCTATCAACACATAGAAGCAATAATG +TTAATATGAGTAACAAGAATTATTTCTCCTTGCATAAGCTTATATCAGAA +CGAATACTCACTGATAGTTAACAACAAGATAGGGATAATCCAAAAACTAA +TCATCTATTTAAACCATTGTTAACCCAACACAGGCATGCATCTATAAGGA +AAGATTAAAAGAAGTAAAAGGAACTCGGCAAACACAAACCCCGCCTGTTT +ACCAAAAACATCACCTCTAGCATTTCCAGTATTAGAGGCACTGCCTGCCC +AGTGACATCTGTTtaaacggccgcggtatcctaaccgtgcaaaggtagca +taatcacttgttccctaaatagggacttgtatgaatggccacacgagggt +tttactgtctcttacttccaatcagtgaaattgaccttcccgtgaagagg +cgggaatgactaaataagacgagaagaccctatggagcttTAATTAACTG +ATTCACAAAAAACAACACACAAACCTTAACCTTCAGGGACAACAAAACTT +TTGATTGAATCAGCAATTTCGGTTGGGGTGACCTCGGAGAACAAAACAAC +CTCCGAGTGATTTAAATCCAGACTAACCAGTCAAAATATATAATCACTTA +TTGATCCAAACCATTGATCAACGGAACAAGTTACCCTAGGGATAACAGCG +CAATCCTATTCCAGAGTCCATATCGACAATTAGGGTTTACGACCTCGATG +TTGGATCAAGACATCCTAATGGTGCAACCGCTATTAAGGGTTCGTTTGTT +CAACGATTAAAGTCTTACGTGATCTGAGTTCAGACCGGAGTAATCCAGGT +CGGTTTCTATCTATTCTATACTTTTCCCAGTACGAAAGGACAAGAAAAGT +AGGGCCCACTTTACAAGAAGCGCCCTCAAACTAATAGATGACATAATCTA +AATCTAACTAATTTATAACTTCTACCGCCCTAGAACAGGGCTCgttaggg +tggcagagcccggaaattgcataaaacttaaacctttacactcagaggtt +caactcctctccctaacaacaTGTTCATAATTAACGTCCTCCTCCTAATT +GTCCCAATCTTGCTCGCCGTAGCATTCCTCACACTAGTTGAACGAAAAGT +CTTAGGCTATATGCAACTTCGCAAAGGACCCAACATCGTAGGCCCCTATG +GCCTACTACAACCTATTGCCGATGCCCTCAAACTATTTATCAAAGAGCCA +CTACAACCACTAACATCATCGACATCCATATTCATCATCGCACCAATCCT +AGCCCTAACCCTGGCCTTAACCATATGAATCCCTCTGCCCATACCATACC +CACTAATCAACATAAACCTAGGAATTCTATTCATACTAGCCATGTCCAGC +CTAGCTGTCTACTCAATCCTTTGATCAGGATGGGCCTCAAACTCAAAATA +CGCCCTAATTGGAGCTCTACGAGCAGTAGCACAAACCATCTCATACGAAG +TAACTCTAGCAATCATCCTACTCTCAGTCCTCCTAATAAGCGGATCATTC +ACATTATCAACACTTATTATTACCCAAGAATACCTCTGATTAATCTTCCC +ATCATGACCCTTAGCCATAATGTGATTCATCTCAACATTAGCCGAAACCA +ACCGAGCTCCATTTGACCTAACAGAAGGAGAATCAGAACTCGTCTCTGGA +TTCAACGTTGAATACGCAGCCGGCCCATTTGCTCTATTCTTCCTAGCAGA +ATACGCAAACATCATCATGATAAACATCTTCACAACAACCCTATTTCTAG +GAGCATTTCACAACCCCTACCTGCCAGAACTCTACTCAATTAATTTCACC +ATTAAAGCTCTCCTTCTAACATGTTCCTTCCTATGAATCCGAGCATCCTA +CCCACGATTCCGATATGACCAACTTATACACCTCCTATGAAAGAACTTCC +TACCACTCACACTAGCCCTCTGCATATGACACGTCTCACTTCCAATCATA +CTATCCAGCATCCCACCACAAACATAGGAAATATGTCTGACAAAAGAGTT +ACTTTGATAGAGTAAAACATAGAGGCTCAAACCCTCTTATTTctagaact +acaggaattgaacctgctcctgagaattcaaaatcctccgtgctaccgaa +ttacaccatgtcctaCAAGTAAGGTCAGCTAAATAAGCTATCGGGCCCAT +ACCCCGAAAATGTTGGATTACACCCTTCCCGTACTAATAAATCCCCTTAT +CTTCACAACTATTCTAATAACAGTTCTTCTAGGAACTATAATCGTTATAA +TAAGCTCACACTGACTAATAATCTGAATCGGATTTGAAATAAATCTACTA +GCCATTATCCCTATCCTAATAAAAAAGTACAATCCCCGAACCATAGAAGC +CTCCACCAAATATTTTCTAACCCAAGCCACCGCATCAATACTCCTCATAA +TAGCGATCATCATTAACCTCATACACTCAGGCCAATGAACAATCACAAAA +GTCTTCAACCCCACAGCGTCCATCATTATAACTTCAGCTCTCGCCATAAA +ACTTGGACTCACACCATTCCACTTCTGAGTACCCGAAGTCACACAGGGCA +TCTCATTAACATCAGGTCTCATCCTACTTACATGACAAAAACTAGCCCCA +ATATCAATCCTATATCAAATCTCACCCTCAATTAACCTAAATATCTTATT +AACTATAGCCGTACTGTCAATCCTAGTAGGAGGCTGAGGCGGTCTCAACC +AAACCCAACTACGAAAAATCATAGCATACTCGTCAATCGCGCATATAGGA +TGAATAACAGCTGTCCTAGTATATAACCCAACACTAACAATACTAAACAT +ATTAATTTACATTATAATAACACTCACAATATTCATACTATTTATCCACA +GCTCCTCTACTACAACACTATCACTCTCCCACACATGAAACAAAATACCT +CTAACCACTACACTAATCTTAATTACCTTACTATCCATAGGAGGCCTCCC +CCCACTATCAGGATTCATACCCAAATGAATAATCATTCAAGAGCTCACCA +AAAATAGCAGCATCATCCTCCCCACACTAATAGCCATTATAGCACTACTC +AACCTCTACTTCTACATACGACTAACCTATTCCACCTCACTGACCATATT +CCCATCCACAAACAACATAAAAATAAAATGACAATTCGAAACCAAACGAA +TTACTCTCTTACCCCCGTTAATTGTTATATCCTCCCTACTCCTCCCCCTA +ACCCCCATACTATCAATTTTGGACTAGGAATTTAGGTTAACATCCCAGAC +CAAGAGCCTTCAAAGCTCTAAGCAAGTGAATCCACTTAATTCCTGCATAC +TAAGGACTGCGAGACTCTATCTCACATCAATTGAACGCAAATCAAACTCT +TTTATTAAGCTAAGCCCTTACTAGATTGGTGGGCTACCATCCCACGAAAT +TTTAGTTAACAGCTAAATACCCTAATCAACTGGCTTCAATCTACTTCTCC +CGCCGCCTAGAAAAAAAGGCGGGAGAAGCCCCGGCAGAAATTGAAGCTGC +TCCTTTGAATTTGCAATTCAATGTGAAAATTCACCACGGGACTTGATAAG +AAGAGGATTCCAACCCCTGTCTTTAGATTTACAGTCTAATGCTTACTCAG +CCATCTTACCTATGTTCATCAACCGCTGACTATTTTCAACTAACCACAAA +GACATCGGCACTCTGTACCTCCTATTCGGCGCTTGAGCTGGAATAGTAGG +AACTGCCCTAAGCCTCCTAATCCGTGCTGAATTAGGCCAACCTGGGACCC +TACTAGGAGATGATCAGATCTACAATGTCATTGTAACCGCCCATGCATTC +GTAATAATTTTCTTTATGGTCATACCCATTATAATCGGAGGATTCGGAAA +CTGATTAGTCCCCCTGATAATTGGAGCACCTGATATAGCTTTCCCCCGAA +TAAACAACATAAGCTTCTGATTACTTCCCCCATCATTCCTACTTCTTCTC +GCTTCCTCAATAATTGAAGCAGGTGCCGGAACAGGCTGAACCGTATATCC +TCCTCTAGCTGGAAATCTGGCGCATGCAGGAGCCTCTGTTGACTTAACCA +TTTTCTCTCTCCACCTAGCTGGGGTGTCCTCGATTTTAGGTGCCATCAAC +TTTATTACCACAATCATTAACATAAAACCACCAGCCCTATCCCAATATCA +AACCCCCCTATTCGTTTGATCTGTCCTTATTACGGCAGTACTCCTTCTCC +TAGCCCTCCCGGTCCTAGCAGCAGGCATTACCATGCTTCTCACAGACCGT +AACCTGAACACTACTTTCTTCGACCCCGCAGGAGGAGGGGATCCAATCCT +TTATCAACACCTATTCTGATTCTTCGGACACCCCGAAGTCTATATTCTTA +TCCTACCAGGCTTCGGTATAATCTCACACATCGTCACATACTACTCAGGT +AAAAAGGAACCTTTTGGCTACATGGGTATAGTGTGAGCTATAATATCCAT +TGGCTTTCTAGGCTTCATCGTATGGGCTCACCACATGTTTACAGTAGGGA +TAGACGTTGACACACGAGCATACTTCACATCAGCTACCATAATCATCGCT +ATCCCTACTGGTGTAAAAGTATTCAGCTGACTAGCCACCCTGCACGGAGG +AAATATCAAATGATCTCCAGCTATACTCTGAGCTCTAGGCTTCATCTTCT +TATTCACAGTAGGAGGTCTAACAGGAATCGTCCTAGCTAACTCATCCCTA +GATATTGTTCTCCACGATACTTATTATGTAGTAGCACATTTCCATTATGT +CCTGTCTATAGGAGCAGTCTTCGCCATTATGGGGGGATTTGTACACTGAT +TCCCTCTATTCTCAGGATACACACTCAACCAAACCTGAGCAAAAATCCAC +TTTACAATTATATTCGTAGGGGTAAATATAACCTTCTTCCCACAACATTT +CCTTGGCCTCTCAGGAATGCCACGACGCTATTCTGATTATCCAGACGCAT +ATACAACATGAAATACCATCTCATCCATAGGATCTTTTATCTCACTTACA +GCAGTGATACTAATAATTTTCATAATTTGAGAAGCGTTCGCATCCAAACG +AGAAGTGTCTACAGTAGAATTAACCTCAACTAATCTGGAATGACTACACG +GATGCCCCCCACCATACCACACATTTGAAGAACCCACCTACGTAAACCTA +AAAtaagaaaggaaggaatcgaaccccctctaactggtttcaagccaata +tcataaccactatgtctttctcCATCAATTGAGGTATTAGTAAAAATTAC +ATGACTTTGTCAAAGTTAAATTATAGGTTAAACCCCTATATACCTCTATG +GCCTACCCCTTCCAACTAGGATTCCAAGACGCAACATCCCCTATTATAGA +AGAACTCCTACACTTCCACGACCACACACTAATAATCGTATTCCTAATTA +GCTCTCTAGTATTATATATTATCTCATCAATACTAACAACTAAATTAACC +CATACCAGCACCATAGATGCTCAAGAAGTAGAGACAATTTGAACGATTTT +ACCAGCCATCATCCTTATTCTAATCGCCCTCCCATCCCTACGAATTCTAT +ATATAATAGATGAAATCAATAATCCGTCCCTCACAGTCAAAACAATAGGC +CACCAATGATACTGAAGCTACGAGTATACCGATTACGAAGACTTGACCTT +TGACTCCTACATGATCCCCACATCAGACCTAAAACCAGGAGAATTACGTC +TTCTAGAAGTCGACAATCGAGTGGTTCTCCCCATAGAAATAACCATCCGA +ATGCTAATTTCATCCGAAGACGTCCTACACTCATGAGCTGTGCCCTCCCT +AGGCCTAAAAACAGACGCTATCCCTGGGCGCCTAAATCAGACAACTCTCG +TGGCCTCTCGACCAGGACTTTACTACGGTCAATGCTCAGAGATCTGCGGA +TCAAACCACAGCTTTATACCAATTGTCCTTGAACTAGTTCCACTGAAACA +CTTCGAAGAATGATCTGCATCAATATTATAAAGTCACTAAGAAGCTATTA +TAGCATTAACCTTTTAAGTTAAAGATTGAGGGTTCAACCCCCTCCCTAGT +GATATGCCACAGTTGGATACATCAACATGATTTATTAATATCGTCTCAAT +AATCCTAACTCTATTTATTGTATTTCAACTAAAAATCTCAAAGCACTCCT +ATCCGACACACCCAGAAGTAAAGACAACCAAAATAACAAAACACTCTGCC +CCTTGAGAATCAAAATGAACGAAAATCTATTCGCCTCTTTCGCTACCCCA +ACAATAGTAGGCCTCCCTATTGTAATTCTGATCATCATATTTCCCAGCAT +CCTATTCCCCTCACCCAACCGACTAATCAACAATCGCCTAATCTCAATTC +AACAATGGCTAGTCCAACTTACATCAAAACAAATAATAGCTATCCATAAC +AGCAAAGGACAAACCTGAACTCTTATACTCATATCACTGATCCTATTCAT +TGGCTCAACAAACTTATTAGGCCTACTACCTCACTCATTTACACCAACAA +CACAACTATCAATAAACCTAGGCATAGCTATTCCCCTATGGGCAGGGACA +GTATTCATAGGCTTTCGTCACAAAACAAAAGCAGCCCTAGCCCACTTTCT +ACCTCAAGGGACGCCCATTTTCCTCATCCCCATACTAGTAATTATCGAGA +CTATCAGCCTATTTATTCAACCTGTAGCCCTAGCCGTGCGGCTAACCGCT +AACATTACCGCCGGACACCTCCTAATACACCTCATCGGAGGGGCAACACT +AGCCCTCATAAGCATCAGCCCCTCAACAGCCCTTATTACGTTTATCATCC +TAATTCTACTAACTATCCTCGAATTCGCAGTAGCTATAATCCAAGCCTAC +GTATTCACTCTCCTGGTAAGCCTTTACTTACACGACAACACCTAATGACC +CACCAAACCCACGCTTACCACATAGTAAACCCCAGCCCATGACCACTTAC +AGGAGCCCTATCAGCCCTCCTGATAACATCAGGACTAGCCATGTGATTTC +ACTTTAACTCAACCTTACTTCTAGCTATAGGGCTATTAACTAACATCCTT +ACCATATATCAATGATGACGAGACATCATCCGAGAAAGCACATTCCAAGG +CCATCACACATCAATCGTTCAAAAGGGACTCCGATATGGCATAATCCTTT +TTATTATCTCAGAAGTCTTCTTCTTCTCTGGCTTCTTCTGAGCCTTTTAC +CACTCAAGCCTAGCCCCCACACCCGAACTAGGCGGCTGCTGACCACCCAC +AGGTATCCACCCCTTAAACCCCCTAGAAGTCCCCTTACTCAACACCTCAG +TGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCACCATAGCCTAATA +GAAGGAAACCGTAAAAATATGCTCCAAGGCCTATTCATCACAATTTCACT +AGGCGTATACTTCACCCTTCTCCAAGCCTCAGAATACTATGAAGCCTCAT +TTACTATTTCAGATGGAGTATACGGATCAACATTTTTCGTAGCAACAGGG +TTCCACGGACTACACGTAATTATCGGATCTACCTTCCTCATTGTATGTTT +CCTACGCCAACTAAAATTCCACTTTACATCCAGCCACCACTTCGGATTCG +AAGCAGCCGCTTGATACTGACACTTCGTCGACGTAGTCTGACTATTCTTG +TACGTCTCTATTTATTGATGAGGATCCTATTCTTTTAGTATTGACCAGTA +CAATTGACTTCCAATCAATCAGCTTCGGTATAACCCGAAAAAGAATAATA +AACCTCATACTGACACTCCTCACTAACACATTACTAGCCTCGCTACTCGT +ACTCATCGCATTCTGACTACCACAACTAAACATCTATGCAGAAAAAACCA +GCCCATATGAATGCGGATTTGACCCTATAGGGTCAGCACGCCTCCCCTTC +TCAATAAAATTTTTCTTAGTGGCCATTACATTTCTGCTATTCGACTTAGA +AATTGCCCTCCTATTACCCCTTCCATGAGCATCCCAAACAACTAACCTAA +ACACTATACTTATCATAGCACTAGTCCTAATCTCTCTTCTAGCCATCAGC +CTAGCCTACGAATGAACCCAAAAAGGACTAGAATGAACTGAGTATGGTAA +TTAGTTTAAACCAAAACAAATGATTTCGACTCATTAAACTATGATTAACT +TCATAATTACCAACATGTCACTAGTCCATATTAATATCTTCCTAGCATTC +ACAGTATCCCTCGTAGGCCTACTAATGTACCGATCCCACCTAATATCCTC +ACTCCTATGCCTAGAAGGAATAATACTATCACTATTCGTCATAGCAACCA +TAATAGTCCTAAACACCCACTTCACACTAGCTAGTATAATACCTATCATC +TTACTAGTATTTGCTGCCTGCGAACGAGCTCTAGGATTATCCCTACTAGT +CATAGTCTCCAATACTTATGGAGTAGACCACGTACAAAACCTTAACCTCC +TCCAATGCTAAAAATTATCATTCCCACAATCATACTTATGCCCCTTACAT +GACTATCAAAAAAGAATATAATCTGAATCAACACTACAACCTATAGTCTA +TTAATCAGCCTTATCAGCCTATCCCTCCTAAACCAACCTAGCAACAATAG +CCTAAACTTCTCACTAATATTCTTCTCCGATCCCCTATCAGCCCCACTTC +TGGTGTTGACAACATGACTACTGCCACTAATACTCATAGCCAGCCAACAC +CATCTATCTAAGGAACCACTAATCCGAAAAAAACTCTACATCACCATGCT +AACCATACTTCAAACTTTCCTAATCATGACTTTTACCGCCACAGAACTAA +TCTCCTTCTACATCCTATTTGAAGCCACATTAGTTCCAACACTAATTATC +ATCACCCGCTGAGGCAACCAAACAGAACGCCTGAACGCAGGCCTCTACTT +CCTATTCTACACACTAATAGGTTCCCTCCCACTCTTAGTTGCACTAATCT +CTATCCAAAACCTAACAGGCTCACTAAACTTCCTATTAATTCAATACTGA +AACCAAGCACTACCCGACTCTTGATCCAATATTTTCCTATGACTAGCATG +TATAATAGCATTCATAGTCAAAATACCGGTATATGGTCTTCACCTCTGAC +TCCCAAAAGCCCATGTAGAAGCCCCAATTGCCGGATCCATAGTGCTAGCA +GCCATTCTACTAAAACTAGGAGGCTACGGAATACTACGAATTACAACAAT +ACTAAACCCCCAAACTAGCTTTATAGCCTACCCCTTCCTCATACTATCCC +TGTGAGGAATAATCATAACTAGTTCCATCTGCTTGCGACAAACCGATCTA +AAATCACTTATTGCATACTCCTCTGTCAGCCACATAGCCCTAGTAATCGT +AGCCGTCCTCATCCAAACACCATGAAGTTATATAGGAGCTACAGCCCTAA +TAATCGCTCACGGCCTTACATCATCAATACTATTCTGCCTGGCAAACTCA +AATTACGAACGTACCCATAGCCGAACTATAATCCTAGCCCGCGGGCTTCA +AACACTTCTTCCCCTTATAGCAGCCTGATGACTATTAGCCAGCCTAACCA +ACCTGGCCCTCCCTCCCAGCATTAACCTAATTGGAGAGCTATTCGTAGTA +ATATCATCATTCTCATGATCAAATATTACCATTATCCTAATAGGAGCCAA +TATCACCATCACCGCCCTCTACTCCCTATACATACTAATCACAACACAAC +GAGGGAAATACACACACCATATCAACAGCATTAAACCTTCATTTACACGA +GAAAACGCACTCATGGCCCTCCACATGACTCCCCTACTACTCCTATCACT +TAACCCTAAAATTATCCTAGGCTTTACGTACTGTAAATATAGTTTAACAA +AAACACTAGATTGTGGATCTAGAAACAGAAACTTAATATTTCTTATTTAC +CGAGAAAGTATGCAAGAACTGCTAATTCATGCCCCCATGTCCAACAAACA +TGGCTCTCTCAAACTTTTAAAGGATAGGAGCTATCCGTTGGTCTTAGGAA +CCAAAAAATTGGTGCAACTCCAAATAAAAGTAATCAACATGTTCTCCTCC +CTCATACTAGTTTCACTATTAGTACTAACCCTCCCAATCATATTATCAAT +CTTCAATACCTACAAAAACAGCACGTTCCCGCATCATGTAAAAAACACTA +TCTCATATGCCTTCATTACTAGCCTAATTCCCACTATAATATTTATTCAC +TCTGGACAAGAAACAATTATCTCAAACTGACACTGAATAACCATACAAAC +CCTCAAACTATCCCTAAGCTTCAAACTAGATTACTTCTCAATAATTTTCG +TACCAGTAGCCCTATTCGTAACATGATCTATTATGGAATTCTCCCTATGA +TACATGCACTCAGATCCTTACATTACTCGATTTTTTAAATACTTACTTAC +ATTCCTCATCACTATAATAATTCTAGTCACAGCTAACAACCTTTTCCAAC +TGTTCATCGGATGGGAGGGAGTAGGCATCATGTCATTCTTACTAATCGGA +TGATGATACGGCCGAACAGATGCCAACACCGCGGCCCTTCAAGCAATCCT +TTATAACCGCATCGGGGATATCGGCTTCATCATGGCCATAGCCTGATTCC +TATTCAACACCAACACATGAGACCTCCAACAAATCTTCATACTCGACCCC +AACCTTACCAACCTCCCGCTCCTAGGCCTCCTCCTAGCCGCAACTGGCAA +ATCCGCTCAATTTGGACTCCACCCATGACTTCCTTCAGCCATAGAGGGCC +CTACACCAGTCTCAGCCCTACTCCACTCCAGCACAATAGTTGTAGCAGGC +GTCTTCCTGCTAATCCGCTTCCATCCACTAATAGAAAACAACAAAACAAT +CCAGTCACTTACCCTATGCCTAGGAGCCATCACCACACTATTCACAGCAA +TCTGCGCACTCACTCAAAACGATATCAAAAAAATCATTGCTTTCTCCACC +TCCAGCCAACTAGGCCTGATAATCGTAACCATCGGTATCAATCAACCCTA +CCTAGCATTCCTCCACATTTGCACTCACGCATTCTTCAAAGCTATACTAT +TTATATGTTCCGGATCCATTATCCACAGCCTAAATGACGAGCAAGATATC +CGAAAAATAGGCGGACTATTTAATGCAATACCCTTCACCACCACATCTCT +AATTATTGGCAGCCTTGCACTCACCGGAATTCCTTTCCTCACAGGCTTCT +ACTCCAAAGACCTCATCATCGAAACCGCCAACACATCGTACACCAACGCC +TGAGCCCTACTAATAACTCTCATTGCCACATCCCTCACAGCTGTCTACAG +TACCCGAATCATCTTCTTTGCACTCCTAGGGCAACCCCGCTTCCTCCCTC +TGACCTCAATCAACGAAAATAACCCCTTTCTAATTAACTCCATCAAACGC +CTCTTAATTGGCAGCATTTTTGCCGGATTCTTCATCTCCAACAATATCTA +CCCCACAACCGTCCCAGAAATAACCATACCTACTTACATAAAACTCACCG +CCCTCGCAGTAACCATCCTAGGATTTACACTAGCCCTAGAACTAAGCTTG +ATAACCCATAACTTAAAACTAGAACACTCCACCAACGTATTCAAATTCTC +CAACCTCCTAGGATACTACCCAACAATTATACACCGACTCCCACCGCTCG +CTAACCTATCAATAAGCCAAAAATCAGCATCACTTCTACTAGACTCAATC +TGACTAGAAAACATCCTGCCAAAATCTATCTCCCAGTTCCAAATAAAAAC +CTCGATCCTAATTTCCACCCAAAAAGGACAAATCAAATTATATTTCCTCT +CATTCCTCATCACCCTTACCCTAAGCATACTACTTTTTAATCTCCACGAG +TAACCTCTAAAATTACCAAGACCCCAACAAGCAACGATCAACCAGTCACA +ATCACAACCCAAGCCCCATAACTATACAATGCAGCAGCCCCTATAATTTC +CTCACTAAACGCCCCAGAATCTCCAGTATCATAAATAGCTCAAGCCCCCA +CACCACTAAACTTAAACACTACCCCCACTTCCTCACTCTTCAGAACATAT +AAAACCAACATAACCTCCATCAACAACCCTAAAAGAAATACCCCCATAAC +AGTCGTATTAGACACCCATACCTCAGGATACTGCTCAGTAGCCATAGCCG +TTGTATAACCAAAAACAACCAACATTCCTCCCAAATAAATCAAAAACACC +ATCAACCCCAAAAAGGACCCTCCAAAATTCATAATAATACCACAACCTAC +CCCTCCACTTACAATCAGCACTAAACCCCCATAAATAGGTGAAGGTTTTG +AAGAAAACCCCACAAAACTAACAACAAAAATAACACTCAAAATAAACACA +ATATATGTCATCATTATTCCCACGTGGAATCTAACCACGACCAATGACAT +GAAAAATCATCGTTGTATTTCAACTATAAGAACACCAATGACAAACATCC +GGAAATCTCACCCACTAATTAAAATCATCAATCACTCTTTTATTGACCTA +CCAGCCCCCTCAAACATTTCATCATGATGAAACTTCGGCTCCCTCCTAGG +AATCTGCCTAATCCTCCAAATCTTAACAGGCCTATTCCTAGCCATACACT +ACACATCAGACACGACAACTGCCTTCTCATCCGTCACTCACATCTGCCGA +GACGTTAACTACGGATGAATTATTCGCTACCTCCATGCCAACGGAGCATC +AATATTTTTTATCTGCCTCTTCATTCACGTAGGACGCGGCCTCTACTACG +GCTCTTACACATTCCTAGAGACATGAAACATTGGAATCATCCTACTTTTC +ACAGTTATAGCTACAGCATTCATGGGCTATGTCCTACCATGAGGCCAAAT +ATCCTTTTGAGGAGCAACAGTCATCACGAACCTCCTATCAGCAATTCCCT +ACATCGGTACTACCCTCGTCGAGTGAATCTGAGGTGGATTCTCAGTAGAC +AAAGCCACCCTTACCCGATTTTTTGCTTTCCACTTCATCCTACCCTTCAT +CATCACAGCCCTGGTAGTCGTACATTTACTATTTCTTCACGAAACAGGAT +CTAATAACCCCTCAGGAATCCCATCCGATATGGACAAAATCCCATTCCAC +CCATATTATACAATTAAAGACATCCTAGGACTCCTCCTCCTGATCTTGCT +CCTACTAACTCTAGTATTATTCTCCCCCGACCTCCTAGGAGACCCAGACA +ACTACACCCCAGCTAACCCTCTCAGCACTCCCCCTCATATTAAACCAGAA +TGGTACTTCCTGTTTGCCTACGCCATCCTACGCTCCATTCCCAACAAACT +AGGCGGCGTATTAGCCCTAATCCTCTCCATCCTGATCCTAGCACTCATCC +CCACCCTCCACATATCAAAACAACGAAGCATAATATTCCGGCCTCTCAGC +CAATGCGTATTCTGACTCTTAGTGGCAGACTTACTGACACTAACATGAAT +CGGCGGACAGCCAGTGGAACACCCATACGTAATTATCGGCCAACTGGCCT +CAATCCTCTACTTCTCCCTAATTCTCATTTTTATACCACTCGCAAGCACC +ATCGAAAACAATCTTCTAAAATGAAGAGTCCCTGTAGTATATCGCACATT +ACCCTGGTCTTGTAAACCAGAAAAGGGGGAAAACGTTTCCTCCCAAGGAC +TATCAAGGAAGAAGCTCTAGCTCCACCATCAACACCCAAAGCTGAAATTC +TACTTAAACTATTCCTTGATTTCTTCCCCTAAACGACAACAATTTACCCT +CATGTGCTATGTCAGTATCAGATTATACCCCCACATAACACCATACCCAC +CTGACATGCAATATCTTATGAATGGCCTATGTACGTCGTGCATTAAATTG +TCTGCCCCATGAATAATAAGCATGTACATAATATCATTTATCTTACATAA +GTACATTATATTATTGATCGTGCATACCCCATCCAAGTCAAATCATTTCC +AGTCAACACGCATATCACAGCCCATGTTCCACGAGCTTAATCACCAAGCC +GCGGGAAATCAGCAACCCTCCCAACTACGTGTCCCAATCCTCGCTCCGGG +CCCATCCAAACGTGGGGGTTTCTACAATGAAACTATACCTGGCATCTGGT +TCTTTCTTCAGGGCCATTCCCACCCAACCTCGCCCATTCTTTCCCCTTAA +ATAAGACATCTCGATGGACTAATGACTAATCAGCCCATGCTCACACATAA +CTGTGATTTCATGCATTTGGTATCTTTTTATATTTGGGGATGCTATGACT +CAGCTATGGCCGTCAAAGGCCTCGACGCAGTCAATTAAATTGAAGCTGGA +CTTAAATTGAACGTTATTCCTCCGCATCAGCAACCATAAGGTGTTATTCA +GTCCATGGTAGCGGGACATAGGAAACAAgtgcacctgtgcacctgtgcac +ctgtgcacctgtgcacctgtgcacctgtgcacctgtgcacctgtgcacct +gtgcacctgtgcacctgtgcacctgtgcacctgtgcacctgtgcacctgt +gcacctgtgcacctgtgcacctgtgcacctgtgcacctgtgcacctgtgc +acctgtgcacctgtgcacctgtgcacctgtgcacctgtgcacctgtgcac +ctgtgcacctACCCGCGCAGTAAGCAAGTAATATAGCTTTCTTAATCAAA +CCCCCCCTACCCCCCATTAAACTCCACATATGTACATTCAACACAATCTT +GCCAAACCCCAAAAACAAGACTAAACAATGCACAATACTTCATGAAGCTT +AACCCTCGCATGCCAACCATAATAACTCAACACACCTAACAATCTTAACA +GAACTTTCCCCCCGCCATTAATACCAACATGCTACTTTAATCAATAAAAT +TTCCATAGACAGGCATCCCCCTAGATCTAATTTTCTAAATCTGTCAACCC +TTCTTCCCCC diff -r 93474ea5c366 -r 495dda4c0693 test-data/sam_to_bam_in1.sam --- a/test-data/sam_to_bam_in1.sam Thu Dec 03 16:16:12 2009 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -HWI-EAS91_1_30788AAXX:1:1:1513:715 16 chrM 9563 25 36M * 0 0 CTGACTACCACAACTAAACATCTATGCNNAAAAAAC I+-II?IDIIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1698:516 16 chrM 2735 25 36M * 0 0 TTTACACTCAGAGGTTCAACTCCTCTCNNTAACAAC I9IIIII5IIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1491:637 16 chrM 10864 25 36M * 0 0 TGTAGAAGCCCCAATTGCCGGATCCATNNTGCTAGC DBAIIIIIIIIIIIFIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1711:249 16 chrM 10617 25 36M * 0 0 ACCAAACAGAACGCCTGAACGCAGGCCNNTACTTCC IIIIIIIIIIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1634:211 0 chrM 9350 25 36M * 0 0 GAAGCAGNNGCTTGATACTGACACTTCGTCGACGTA IIIIIII""IIIIIIIIIIIIIIIIIIIIII9IIDF NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1218:141 16 chrM 14062 25 36M * 0 0 ACAAAACTAACAACAAAAATAACACTCNNAATAAAC I+IIII1IIIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1398:854 16 chrM 3921 25 36M * 0 0 CACCCTTCCCGTACTAATAAATCCCCTNNTCTTCAC IIIII=AIIIIIIIIIIIIIIBIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1310:991 16 chrM 10002 25 36M * 0 0 CTCCTATGCCTAGAAGGAATAATACTANNACTATTC I:2IEI:IIDIIIIII4IIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1716:413 0 chrM 6040 25 36M * 0 0 GATCCAANNCTTTATCAACACCTATTCTGATTCTTC IIIIIII""IIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1630:59 16 chrM 12387 25 36M * 0 0 TCATACTCGACCCCAACCTTACCAACCNNCCGCTCC FIIHII;IIIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 diff -r 93474ea5c366 -r 495dda4c0693 test-data/sam_to_bam_in2.sam --- a/test-data/sam_to_bam_in2.sam Thu Dec 03 16:16:12 2009 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -HWI-EAS91_1_30788AAXX:1:1:1095:605 0 chrM 23 25 36M * 0 0 AAGCAAGNNACTGAAAATGCCTAGATGAGTATTCTT IIIIIII""IIIIIIIIIIIIIIIEIIIIIIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1650:1185 0 chrM 14956 25 36M * 0 0 ACCCCAGNNAACCCTCTCAGCACTCCCCCTCATATT IIIIIII""IIIIIIIIIIII6IIIIIIIII5I-II NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:799:192 16 chrM 8421 25 36M * 0 0 CCTGTAGCCCTAGCCGTGCGGCTAACCNNTAACATT II%::I<IIIIIEIII8IIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1082:719 16 chrM 7191 25 36M * 0 0 TAAATTAACCCATACCAGCACCATAGANNCTCAAGA <III0EII3+3I29I>III8AIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1746:1180 16 chrM 12013 25 36M * 0 0 CCTAAGCTTCAAACTAGATTACTTCTCNNTAATTTT IIIIIIIIFIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:606:460 0 chrM 4552 25 36M * 0 0 TTAATTTNNATTATAATAACACTCACAATATTCATA IIIIIII""IIIIIIIIIIIIIIIIII?I6IIIII6 NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1059:362 16 chrM 7348 25 36M * 0 0 GGCCACCAATGATACTGAAGCTACGAGNNTACCGAT II/<)2IIIIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1483:1161 16 chrM 15080 25 36M * 0 0 TCCTGATCCTAGCACTCATCCCCACCCNNCACATAT HIIIIIFIIAIHIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1273:600 16 chrM 13855 25 36M * 0 0 GTATTAGACACCCATACCTCAGGATACNNCTCAGTA IIIIIIIIIIIIIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 -HWI-EAS91_1_30788AAXX:1:1:1190:1283 16 chrM 15338 25 36M * 0 0 TATATCGCACATTACCCTGGTCTTGTANNCCAGAAA EIII?-IIIIIAIIIIIIIIIIIIIII""IIIIIII NM:i:1 X1:i:1 MD:Z:7N0N27 diff -r 93474ea5c366 -r 495dda4c0693 test-data/sam_to_bam_out1.bam Binary file test-data/sam_to_bam_out1.bam has changed diff -r 93474ea5c366 -r 495dda4c0693 test-data/sam_to_bam_out2.bam Binary file test-data/sam_to_bam_out2.bam has changed diff -r 93474ea5c366 -r 495dda4c0693 tools/data_source/data_source.py --- a/tools/data_source/data_source.py Thu Dec 03 16:16:12 2009 -0500 +++ b/tools/data_source/data_source.py Fri Dec 04 19:51:46 2009 -0500 @@ -12,6 +12,7 @@ sys.exit() def check_gzip( filename ): + # TODO: This needs to check for BAM files since they are compressed and must remain so ( see upload.py ) temp = open( filename, "U" ) magic_check = temp.read( 2 ) temp.close() @@ -66,6 +67,7 @@ out.write( chunk ) out.close() if check_gzip( filename ): + # TODO: This needs to check for BAM files since they are compressed and must remain so ( see upload.py ) fd, uncompressed = tempfile.mkstemp() gzipped_file = gzip.GzipFile( filename ) while 1: diff -r 93474ea5c366 -r 495dda4c0693 tools/data_source/upload.py --- a/tools/data_source/upload.py Thu Dec 03 16:16:12 2009 -0500 +++ b/tools/data_source/upload.py Fri Dec 04 19:51:46 2009 -0500 @@ -4,7 +4,7 @@ # WARNING: Changes in this tool (particularly as related to parsing) may need # to be reflected in galaxy.web.controllers.tool_runner and galaxy.tools -import urllib, sys, os, gzip, tempfile, shutil, re, gzip, zipfile, codecs +import urllib, sys, os, gzip, tempfile, shutil, re, gzip, zipfile, codecs, binascii from galaxy import eggs # need to import model before sniff to resolve a circular import dependency import galaxy.model @@ -18,7 +18,6 @@ def stop_err( msg, ret=1 ): sys.stderr.write( msg ) sys.exit( ret ) - def file_err( msg, dataset, json_file ): json_file.write( to_json_string( dict( type = 'dataset', ext = 'data', @@ -28,7 +27,6 @@ os.remove( dataset.path ) except: pass - def safe_dict(d): """ Recursively clone json structure with UTF-8 dictionary keys @@ -40,7 +38,6 @@ return [safe_dict(x) for x in d] else: return d - def check_html( temp_name, chunk=None ): if chunk is None: temp = open(temp_name, "U") @@ -64,7 +61,6 @@ if chunk is None: temp.close() return False - def check_binary( temp_name, chunk=None ): if chunk is None: temp = open( temp_name, "U" ) @@ -85,21 +81,42 @@ if chunk is None: temp.close() return False - def check_gzip( temp_name ): + # This is sort of hacky. BAM is compressed in the BGZF format, and must + # not be uncompressed in upon upload ( it will be detected as gzipped ). + # The tuple we're returning from here contains boolean values for + # ( is_compressed, is_valid, is_bam ). temp = open( temp_name, "U" ) magic_check = temp.read( 2 ) temp.close() if magic_check != util.gzip_magic: - return ( False, False ) + return ( False, False, False ) CHUNK_SIZE = 2**15 # 32Kb gzipped_file = gzip.GzipFile( temp_name ) chunk = gzipped_file.read( CHUNK_SIZE ) gzipped_file.close() - if check_html( temp_name, chunk=chunk ) or check_binary( temp_name, chunk=chunk ): - return( True, False ) - return ( True, True ) - + if check_html( temp_name, chunk=chunk ): + return ( True, False, False ) + if check_binary( temp_name, chunk=chunk ): + # We do support some binary data types, so check if the compressed binary file is valid + # We currently only check for [ 'sff', 'bam' ] + # TODO: this should be fixed to more easily support future-supported binary data types. + # This is currently just copied from the sniff methods. + # The first 4 bytes of any bam file is 'BAM\1', and the file is binary. + try: + header = gzip.open( temp_name ).read(4) + if binascii.b2a_hex( header ) == binascii.hexlify( 'BAM\1' ): + return ( True, True, True ) + except: + pass + try: + header = gzip.open( temp_name ).read(4) + if binascii.b2a_hex( header ) == binascii.hexlify( '.sff' ): + return ( True, True, False ) + except: + pass + return ( True, False, False ) + return ( True, True, False ) def check_zip( temp_name ): if not zipfile.is_zipfile( temp_name ): return ( False, False, None ) @@ -116,14 +133,12 @@ if ext != test_ext: return ( True, False, test_ext ) return ( True, True, test_ext ) - def parse_outputs( args ): rval = {} for arg in args: id, files_path, path = arg.split( ':', 2 ) rval[int( id )] = ( path, files_path ) return rval - def add_file( dataset, json_file, output_path ): data_type = None line_count = None @@ -153,15 +168,19 @@ ext = sniff.guess_ext( dataset.path, is_multi_byte=True ) else: # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress - is_gzipped, is_valid = check_gzip( dataset.path ) + is_gzipped, is_valid, is_bam = check_gzip( dataset.path ) if is_gzipped and not is_valid: file_err( 'The uploaded file contains inappropriate content', dataset, json_file ) return - elif is_gzipped and is_valid: - # We need to uncompress the temp_name file + elif is_gzipped and is_valid and is_bam: + ext = 'bam' + data_type = 'bam' + elif is_gzipped and is_valid and not is_bam: + # We need to uncompress the temp_name file, but BAM files must remain compressed + # in order for samtools to function on them CHUNK_SIZE = 2**20 # 1Mb - fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ) ) - gzipped_file = gzip.GzipFile( dataset.path ) + fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ), text=False ) + gzipped_file = gzip.GzipFile( dataset.path, 'rb' ) while 1: try: chunk = gzipped_file.read( CHUNK_SIZE ) @@ -229,7 +248,7 @@ if check_html( dataset.path ): file_err( 'The uploaded file contains inappropriate content', dataset, json_file ) return - if data_type != 'binary' and data_type != 'zip': + if data_type != 'bam' and data_type != 'binary' and data_type != 'zip': if dataset.space_to_tab: line_count = sniff.convert_newlines_sep2tabs( dataset.path ) else: diff -r 93474ea5c366 -r 495dda4c0693 tools/samtools/sam_to_bam.py --- a/tools/samtools/sam_to_bam.py Thu Dec 03 16:16:12 2009 -0500 +++ b/tools/samtools/sam_to_bam.py Fri Dec 04 19:51:46 2009 -0500 @@ -1,31 +1,27 @@ #! /usr/bin/python - """ -Converts SAM data to BAM format. - -usage: %prog [options] - -i, --input1=i: SAM file to be converted - -d, --dbkey=d: dbkey value - -r, --ref_file=r: Reference file if choosing from history - -o, --output1=o: BAM output - -x, --index_dir=x: Index directory - -usage: %prog input_file dbkey ref_list output_file +Converts SAM data to sorted BAM data. +usage: sam_to_bam.py [options] + --input1: SAM file to be converted + --dbkey: dbkey value + --ref_file: Reference file if choosing from history + --output1: output dataset in bam format + --index_dir: GALAXY_DATA_INDEX_DIR """ -import os, sys, tempfile +import optparse, os, sys, subprocess, tempfile, shutil, gzip from galaxy import eggs import pkg_resources; pkg_resources.require( "bx-python" ) from bx.cookbook import doc_optparse +from galaxy import util def stop_err( msg ): sys.stderr.write( "%s\n" % msg ) sys.exit() -def check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR ): - seq_file = "%s/sam_fa_indices.loc" % GALAXY_DATA_INDEX_DIR +def check_seq_file( dbkey, cached_seqs_pointer_file ): seq_path = '' - for line in open( seq_file ): + for line in open( cached_seqs_pointer_file ): line = line.rstrip( '\r\n' ) if line and not line.startswith( "#" ) and line.startswith( 'index' ): fields = line.split( '\t' ) @@ -38,48 +34,80 @@ def __main__(): #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - seq_path = check_seq_file( options.dbkey, options.index_dir ) + parser = optparse.OptionParser() + parser.add_option( '', '--input1', dest='input1', help='The input SAM dataset' ) + parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' ) + parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' ) + parser.add_option( '', '--output1', dest='output1', help='The output BAM dataset' ) + parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' ) + ( options, args ) = parser.parse_args() + + cached_seqs_pointer_file = "%s/sam_fa_indices.loc" % options.index_dir + if not os.path.exists( cached_seqs_pointer_file ): + stop_err( "The required file (%s) does not exist." % cached_seqs_pointer_file ) + # If found for the dbkey, seq_path will look something like /depot/data2/galaxy/equCab2/sam_index/equCab2.fa, + # and the equCab2.fa file will contain fasta sequences. + seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file ) tmp_dir = tempfile.gettempdir() - os.chdir(tmp_dir) - tmpf1 = tempfile.NamedTemporaryFile(dir=tmp_dir) - tmpf1fai = '%s.fai' % tmpf1.name - tmpf2 = tempfile.NamedTemporaryFile(dir=tmp_dir) - tmpf3 = tempfile.NamedTemporaryFile(dir=tmp_dir) - tmpf3bam = '%s.bam' % tmpf3.name if options.ref_file == "None": - full_path = "%s.fai" % seq_path - if not os.path.exists( full_path ): - stop_err( "No sequences are available for '%s', request them by reporting this error." % options.dbkey ) - cmd1 = "cp %s %s; cp %s %s" % (seq_path, tmpf1.name, full_path, tmpf1fai) + # We're using locally cached reference sequences( e.g., /depot/data2/galaxy/equCab2/sam_index/equCab2.fa ). + # The indexes for /depot/data2/galaxy/equCab2/sam_index/equCab2.fa will be contained in + # a file named /depot/data2/galaxy/equCab2/sam_index/equCab2.fa.fai + fai_index_file_path = "%s.fai" % seq_path + if not os.path.exists( fai_index_file_path ): + stop_err( "No sequences are available for build (%s), request them by reporting this error." % options.dbkey ) else: - cmd1 = "cp %s %s; samtools faidx %s 2>/dev/null" % (options.ref_file, tmpf1.name, tmpf1.name) - cmd2 = "samtools view -bt %s -o %s %s 2>/dev/null" % (tmpf1fai, tmpf2.name, options.input1) - cmd3 = "samtools sort %s %s 2>/dev/null" % (tmpf2.name, tmpf3.name) - cmd4 = "cp %s %s" % (tmpf3bam, options.output1) - # either create index based on fa file or copy provided index to temp directory + try: + # Create indexes for history reference ( e.g., ~/database/files/000/dataset_1.dat ) using samtools faidx, which will: + # - index reference sequence in the FASTA format or extract subsequence from indexed reference sequence + # - if no region is specified, faidx will index the file and create <ref.fasta>.fai on the disk + # - if regions are specified, the subsequences will be retrieved and printed to stdout in the FASTA format + # - the input file can be compressed in the RAZF format. + # IMPORTANT NOTE: a real weakness here is that we are creating indexes for the history dataset + # every time we run this tool. It would be nice if we could somehow keep track of user's specific + # index files so they could be re-used. + fai_index_file_path = os.path.join( tmp_dir, os.path.basename( options.ref_file ) ) + # At this point, fai_index_file_path will look something like /tmp/dataset_13.dat + os.symlink( options.ref_file, fai_index_file_path ) + command = "samtools faidx %s 2>/dev/null" % fai_index_file_path + proc = subprocess.Popen( args=command, shell=True ) + proc.wait() + except Exception, e: + stop_err( 'Error creating indexes from reference (%s), %s' % ( options.ref_file, str( e ) ) ) try: - os.system(cmd1) - except Exception, eq: - stop_err("Error creating the reference list index.\n" + str(eq)) - # create original bam file + # Extract all alignments from the input SAM file to BAM format ( since no region is specified, all the alignments will be extracted ). + tmp_aligns_file = tempfile.NamedTemporaryFile() + tmp_aligns_file_name = tmp_aligns_file.name + tmp_aligns_file.close() + # IMPORTANT NOTE: for some reason the samtools view command gzips the resulting bam file without warning, + # and the docs do not currently state that this occurs ( very bad ). + command = "samtools view -bt %s -o %s %s 2>/dev/null" % ( fai_index_file_path, tmp_aligns_file_name, options.input1 ) + proc = subprocess.Popen( args=command, shell=True ) + proc.wait() + except Exception, e: + stop_err( 'Error extracting alignments from (%s), %s' % ( options.input1, str( e ) ) ) try: - os.system(cmd2) - except Exception, eq: - stop_err("Error running view command.\n" + str(eq)) - # sort original bam file to produce sorted output bam file - try: - os.system(cmd3) - os.system(cmd4) - except Exception, eq: - stop_err("Error sorting data and creating output file.\n" + str(eq)) - # cleanup temp files - tmpf1.close() - tmpf2.close() - tmpf3.close() - if os.path.exists(tmpf1fai): - os.remove(tmpf1fai) - if os.path.exists(tmpf3bam): - os.remove(tmpf3bam) + # Sort alignments by leftmost coordinates. File <out.prefix>.bam will be created. This command + # may also create temporary files <out.prefix>.%d.bam when the whole alignment cannot be fitted + # into memory ( controlled by option -m ). + tmp_sorted_aligns_file = tempfile.NamedTemporaryFile() + tmp_sorted_aligns_file_name = tmp_sorted_aligns_file.name + tmp_sorted_aligns_file.close() + command = "samtools sort %s %s 2>/dev/null" % ( tmp_aligns_file_name, tmp_sorted_aligns_file_name ) + proc = subprocess.Popen( args=command, shell=True ) + proc.wait() + except Exception, e: + stop_err( 'Error sorting alignments from (%s), %s' % ( tmp_aligns_file_name, str( e ) ) ) + # Move tmp_aligns_file_name to our output dataset location + sorted_bam_file = '%s.bam' % tmp_sorted_aligns_file_name + shutil.move( sorted_bam_file, options.output1 ) + if options.ref_file != "None": + # Remove the symlink from /tmp/dataset_13.dat to ~/database/files/000/dataset_13.dat + os.unlink( fai_index_file_path ) + # Remove the index file + index_file_name = '%s.fai' % fai_index_file_path + os.unlink( index_file_name ) + # Remove the tmp_aligns_file_name + os.unlink( tmp_aligns_file_name ) if __name__=="__main__": __main__() diff -r 93474ea5c366 -r 495dda4c0693 tools/samtools/sam_to_bam.xml --- a/tools/samtools/sam_to_bam.xml Thu Dec 03 16:16:12 2009 -0500 +++ b/tools/samtools/sam_to_bam.xml Fri Dec 04 19:51:46 2009 -0500 @@ -1,32 +1,29 @@ <tool id="sam_to_bam" name="SAM-to-BAM" version="1.0.0"> <description>converts SAM format to BAM format</description> <command interpreter="python"> - sam_to_bam.py - --input1=$source.input1 - --dbkey=${input1.metadata.dbkey} - #if $source.indexSource == "history": - --ref_file=$ref_file - #else - --ref_file="None" - #end if - --output1=$output1 - --index_dir=${GALAXY_DATA_INDEX_DIR} +sam_to_bam.py --input1=$source.input1 --dbkey=${input1.metadata.dbkey} +#if $source.index_source == "history": +--ref_file=$source.ref_file +#else +--ref_file="None" +#end if +--output1=$output1 --index_dir=${GALAXY_DATA_INDEX_DIR} </command> <inputs> <conditional name="source"> - <param name="indexSource" type="select" label="Choose the source for the reference list"> - <option value="built_in">Built-in</option> + <param name="index_source" type="select" label="Choose the source for the reference list"> + <option value="cached">Locally cached</option> <option value="history">History</option> </param> - <when value="built_in"> + <when value="cached"> <param name="input1" type="data" format="sam" label="SAM File to Convert"> <validator type="unspecified_build" /> <validator type="dataset_metadata_in_file" filename="sam_fa_indices.loc" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." line_startswith="index" /> </param> </when> <when value="history"> - <param name="input1" type="data" format="sam" label="SAM File to Convert" /> - <param name="ref_file" type="data" format="fasta" label="Choose the reference file" /> + <param name="input1" type="data" format="sam" label="Convert SAM file" /> + <param name="ref_file" type="data" format="fasta" label="Using reference file" /> </when> </conditional> </inputs> @@ -34,19 +31,16 @@ <data name="output1" format="bam"/> </outputs> <tests> + <!-- + # IMPORTANT NOTE: for some reason the samtools view command gzips the resulting bam file without warning, + # and the docs do not currently state that this occurs ( very bad ). + --> <test> - <param name="indexSource" value="history" /> - <param name="input1" value="sam_to_bam_in1.sam" ftype="sam" /> - <param name="ref_file" value="chrM.fa" ftype="fasta" /> + <param name="index_source" value="history" /> + <param name="input1" value="3.sam" ftype="sam" /> + <param name="ref_file" value="chr_m.fasta" ftype="fasta" /> <output name="output1" file="sam_to_bam_out1.bam" /> </test> -<!-- chrM is not a built-in dbkey (in the builds.txt list) so can't be tested - <test> - <param name="indexSource" value="built_in" /> - <param name="input1" value="sam_to_bam_in2.sam" ftype="sam" dbkey="chrM" /> - <output name="output1" file="sam_to_bam_out2.bam" /> - </test> ---> </tests> <help> diff -r 93474ea5c366 -r 495dda4c0693 tools/sr_mapping/bowtie_wrapper.xml --- a/tools/sr_mapping/bowtie_wrapper.xml Thu Dec 03 16:16:12 2009 -0500 +++ b/tools/sr_mapping/bowtie_wrapper.xml Fri Dec 04 19:51:46 2009 -0500 @@ -338,7 +338,7 @@ </test> <test> <param name="genomeSource" value="history" /> - <param name="ownFile" value="chrM.fa" /> + <param name="ownFile" value="chr_m.fasta" /> <param name="index_settings" value="index_pre_set" /> <param name="sPaired" value="paired" /> <param name="input1" ftype="fastqsanger" value="bowtie_in2.fastq" /> @@ -349,7 +349,7 @@ </test> <test> <param name="genomeSource" value="history" /> - <param name="ownFile" value="chrM.fa" /> + <param name="ownFile" value="chr_m.fasta" /> <param name="index_settings" value="index_full" /> <param name="auto_b" value="set" /> <param name="packed" value="unpacked" /> diff -r 93474ea5c366 -r 495dda4c0693 tools/sr_mapping/lastz_wrapper.xml --- a/tools/sr_mapping/lastz_wrapper.xml Thu Dec 03 16:16:12 2009 -0500 +++ b/tools/sr_mapping/lastz_wrapper.xml Fri Dec 04 19:51:46 2009 -0500 @@ -125,10 +125,13 @@ <requirement type="binary">lastz</requirement> </requirements> <tests> - <test> <!-- Lastz command: lastz phiX.2bit/PHIX174[nickname=Ref] test-data/b1.fasta +nogfextend +nochain +gapped +strand=both +seed=12of19 +transition O=400 E=30 X=910 Y=9370 K=3000 L=3000 +noentropy +ambiguousn +nolaj +identity=0..100 +coverage=0 +format=sam- > lastz_wrapper_out2.sam - You need to point to phiX.2bit somewhere on your system. b1.fasta is located in galaxy's test-data - You will have to replace all the pluses before the commands with 2 dashes, - as double-dash can't appear in an XML comment --> + <test> + <!-- + Lastz command: + lastz phiX.2bit/PHIX174[nickname=Ref] test-data/b1.fasta +nogfextend +nochain +gapped +strand=both +seed=12of19 +transition O=400 E=30 X=910 Y=9370 K=3000 L=3000 +noentropy +ambiguousn +nolaj +identity=0..100 +coverage=0 +format=sam- > lastz_wrapper_out2.sam + You need to point to phiX.2bit somewhere on your system. b1.fasta is located in galaxy's test-data. You will have to replace all the pluses before the + commands with 2 dashes, as double-dash can't appear in an XML comment. + --> <param name="input2" value="b1.fasta" ftype="fasta" /> <param name="ref_source" value="cached" /> <param name="input1_2bit" value="phiX" /> @@ -156,10 +159,13 @@ <param name="num_threads" value="4" /> <output name="output1" file="lastz_wrapper_out2.sam" /> </test> - <test> <!-- Lastz command: lastz test-data/phiX.fasta test-data/b1.fasta[fullnames] +yasra95short +ambiguousn +nolaj +identity=0..100 +coverage=0 +format=diffs > lastz_wrapper_out3.tabular - phiX.fasta and b1.fasta are located in galaxy's test-data - You will have to replace all the pluses before the commands with 2 dashes, - as double-dash can't appear in an XML comment --> + <test> + <!-- + Lastz command: + lastz test-data/phiX.fasta test-data/b1.fasta[fullnames] +yasra95short +ambiguousn +nolaj +identity=0..100 +coverage=0 +format=diffs > lastz_wrapper_out3.tabular + phiX.fasta and b1.fasta are located in galaxy's test-data. You will have to replace all the pluses before the commands with 2 dashes, + as double-dash can't appear in an XML comment. + --> <param name="input2" value="b1.fasta" ftype="fasta" /> <param name="ref_source" value="history" /> <param name="input1" value="phiX.fasta" ftype="fasta" /> @@ -173,14 +179,7 @@ <param name="num_threads" value="4" /> <output name="output1" file="lastz_wrapper_out3.tabular" /> </test> - <test> <!-- Lastz command: first you will need to split the file phiX_split.fasta into two files, - phiX1.fasta and phiX2.fasta, each with 1 sequence (phiX1 and phiX2, respectively). Then: - lastz phiX1.fasta test-data/b1.fasta *yasra95short *ambiguousn *nolaj *identity=0..100 *coverage=0 *format=general:score,name1,strand1,size1,start1,zstart1,end1,length1,text1,name2,strand2,size2,start2,zstart2,end2,start2+,zstart2+,end2+,length2,text2,diff,cigar,identity,coverage,gaprate,diagonal,shingle > lastz_wrapper_out4.tabular - lastz phiX2.fasta test-data/b1.fasta *yasra95short *ambiguousn *nolaj *identity=0..100 *coverage=0 *format=general:score,name1,strand1,size1,start1,zstart1,end1,length1,text1,name2,strand2,size2,start2,zstart2,end2,start2+,zstart2+,end2+,length2,text2,diff,cigar,identity,coverage,gaprate,diagonal,shingle >> lastz_wrapper_out4.tabular - You need to point to phiX1.fasta and phiX2.fasta somewhere on your system. - phiX_split.fasta and b1.fasta are located in galaxy's test-data - You will have to replace all the asterisks before the commands with 2 dashes, - as double-dash can't appear in an XML comment --> + <test> <param name="input2" value="b1.fasta" ftype="fasta" /> <param name="ref_source" value="history" /> <param name="input1" value="phiX_split.fasta" ftype="fasta" />
participants (1)
-
Greg Von Kuster