commit/galaxy-central: 6 new changesets
6 new commits in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/47681422a0ca/ Changeset: 47681422a0ca Branch: mepcotterell/allow-pip-to-run-when-venv_directory-con-1373715782557 User: natefoo Date: 2013-07-30 18:31:59 Summary: Close branch. Affected #: 0 files https://bitbucket.org/galaxy/galaxy-central/commits/c51f98990299/ Changeset: c51f98990299 User: natefoo Date: 2013-07-30 18:32:17 Summary: Merge closed branch. Affected #: 0 files https://bitbucket.org/galaxy/galaxy-central/commits/73d7a93e2f8d/ Changeset: 73d7a93e2f8d User: natefoo Date: 2013-07-30 18:44:05 Summary: Backout af20b15f7eda, these changes will be reapplied after tools using the sam_fa_indices data table have been migrated to the Tool Shed in their original state and updated to use the new table format. Affected #: 2 files diff -r c51f98990299f40ac6ce13bf9ea7cee47158c615 -r 73d7a93e2f8d0bafbcb0b4dd9bf562d1fa6a88e0 tool-data/sam_fa_new_indices.loc.sample --- a/tool-data/sam_fa_new_indices.loc.sample +++ /dev/null @@ -1,30 +0,0 @@ -#This is a sample file distributed with Galaxy that enables tools -#to use a directory of Samtools indexed sequences data files. You will need -#to create these data files and then create a sam_fa_new_indices.loc file -#similar to this one (store it in this directory) that points to -#the directories in which those files are stored. The sam_fa_new_indices.loc -#file has this format (white space characters are TAB characters): -# -# <unique_build_id><dbkey><display_name><file_base_path> -# -#So, for example, if you had hg19 Canonical indexed stored in -# -# /depot/data2/galaxy/hg19/sam/, -# -#then the sam_fa_new_indices.loc entry would look like this: -# -#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /depot/data2/galaxy/hg19/sam/hg19canon.fa -# -#and your /depot/data2/galaxy/hg19/sam/ directory -#would contain hg19canon.fa and hg19canon.fa.fai files. -# -#Your sam_fa_new_indices.loc file should include an entry per line for -#each index set you have stored. The file in the path does actually -#exist, but it should never be directly used. Instead, the name serves -#as a prefix for the index file. For example: -# -#hg18canon hg18 Human (Homo sapiens): hg18 Canonical /depot/data2/galaxy/hg18/sam/hg18canon.fa -#hg18full hg18 Human (Homo sapiens): hg18 Full /depot/data2/galaxy/hg18/sam/hg18full.fa -#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /depot/data2/galaxy/hg19/sam/hg19canon.fa -#hg19full hg19 Human (Homo sapiens): hg19 Full /depot/data2/galaxy/hg19/sam/hg19full.fa - diff -r c51f98990299f40ac6ce13bf9ea7cee47158c615 -r 73d7a93e2f8d0bafbcb0b4dd9bf562d1fa6a88e0 tool_data_table_conf.xml.sample --- a/tool_data_table_conf.xml.sample +++ b/tool_data_table_conf.xml.sample @@ -55,26 +55,10 @@ <columns>value, name, path</columns><file path="tool-data/perm_color_index.loc" /></table> - <!-- Location of SAMTools indexes and other files (new version) - Warning: until Galaxy release_2013.06.03 the format of this - table was: - + <!-- Location of SAMTools indexes and other files --> + <table name="sam_fa_indexes" comment_char="#"><columns>line_type, value, path</columns><file path="tool-data/sam_fa_indices.loc" /> - - If you are updating your tool_data_table_conf.xml to the current - version you should first migrate your - tool-data/sam_fa_indices.loc file to a new - tool-data/sam_fa_new_indices.loc file with the format specified - below, which is explained in the relative sample file - tool-data/sam_fa_new_indices.loc.sample . 
- By using the new format it is possible to let the user choose - among multiple indexed genome variants having the same dbkey, - e.g. hg19canon vs. hg19full variants for hg19 dbkey. - --> - <table name="sam_fa_indexes" comment_char="#"> - <columns>value, dbkey, name, path</columns> - <file path="tool-data/sam_fa_new_indices.loc" /></table><!-- Location of Picard dict file and other files --><table name="picard_indexes" comment_char="#"> https://bitbucket.org/galaxy/galaxy-central/commits/c386fc668d0e/ Changeset: c386fc668d0e User: natefoo Date: 2013-07-30 18:50:29 Summary: See previous commit message. Note that cufflinks reverts to 0.0.6, not 0.0.7, due to a version increase that came after the changes that are being rolled back. Affected #: 8 files diff -r 73d7a93e2f8d0bafbcb0b4dd9bf562d1fa6a88e0 -r c386fc668d0ef5d829fcbd3ee314bcd35ac0e882 tools/ngs_rna/cuffcompare_wrapper.py --- a/tools/ngs_rna/cuffcompare_wrapper.py +++ b/tools/ngs_rna/cuffcompare_wrapper.py @@ -8,6 +8,20 @@ sys.stderr.write( '%s\n' % msg ) sys.exit() +# Copied from sam_to_bam.py: +def check_seq_file( dbkey, cached_seqs_pointer_file ): + seq_path = '' + for line in open( cached_seqs_pointer_file ): + line = line.rstrip( '\r\n' ) + if line and not line.startswith( '#' ) and line.startswith( 'index' ): + fields = line.split( '\t' ) + if len( fields ) < 3: + continue + if fields[1] == dbkey: + seq_path = fields[2].strip() + break + return seq_path + def __main__(): #Parse Command Line parser = optparse.OptionParser() @@ -16,7 +30,8 @@ parser.add_option( '-s', dest='use_seq_data', action="store_true", help='Causes cuffcompare to look into for fasta files with the underlying genomic sequences (one file per contig) against which your reads were aligned for some optional classification functions. For example, Cufflinks transcripts consisting mostly of lower-case bases are classified as repeats. Note that <seq_dir> must contain one fasta file per reference chromosome, and each file must be named after the chromosome, and have a .fa or .fasta extension.') # Wrapper / Galaxy options. - parser.add_option( '', '--index', dest='index', help='The path of the reference genome' ) + parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' ) + parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' ) parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' ) # Outputs. @@ -45,16 +60,21 @@ # Set/link to sequence file. if options.use_seq_data: - if options.ref_file: + if options.ref_file != 'None': # Sequence data from history. # Create symbolic link to ref_file so that index will be created in working directory. seq_path = "ref.fa" os.symlink( options.ref_file, seq_path ) else: # Sequence data from loc file. - if not os.path.exists( options.index ): - stop_err( 'Reference genome %s not present, request it by reporting this error.' % options.index ) - seq_path = options.index + cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' ) + if not os.path.exists( cached_seqs_pointer_file ): + stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file ) + # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa, + # and the equCab2.fa file will contain fasta sequences. + seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file ) + if seq_path == '': + stop_err( 'No sequence data found for dbkey %s, so sequence data cannot be used.' 
% options.dbkey ) # Build command. diff -r 73d7a93e2f8d0bafbcb0b4dd9bf562d1fa6a88e0 -r c386fc668d0ef5d829fcbd3ee314bcd35ac0e882 tools/ngs_rna/cuffcompare_wrapper.xml --- a/tools/ngs_rna/cuffcompare_wrapper.xml +++ b/tools/ngs_rna/cuffcompare_wrapper.xml @@ -1,4 +1,4 @@ -<tool id="cuffcompare" name="Cuffcompare" version="0.0.6"> +<tool id="cuffcompare" name="Cuffcompare" version="0.0.5"><!-- Wrapper supports Cuffcompare versions v1.3.0 and newer --><description>compare assembled transcripts to a reference annotation and track Cufflinks transcripts across multiple experiments</description><requirements> @@ -17,12 +17,14 @@ ## Use sequence data? #if $seq_data.use_seq_data == "Yes": - -s + -s #if $seq_data.seq_source.index_source == "history": --ref_file=$seq_data.seq_source.ref_file #else: - --index=${seq_data.seq_source.index.fields.path} + --ref_file="None" #end if + --dbkey=${first_input.metadata.dbkey} + --index_dir=${GALAXY_DATA_INDEX_DIR} #end if ## Outputs. @@ -64,14 +66,7 @@ <option value="cached">Locally cached</option><option value="history">History</option></param> - <when value="cached"> - <param name="index" type="select" label="Using reference genome"> - <options from_data_table="sam_fa_indexes"> - <filter type="data_meta" ref="first_input" key="dbkey" column="1" /> - <validator type="no_options" message="No reference genome is available for the build associated with the selected input dataset" /> - </options> - </param> - </when> + <when value="cached"></when><when value="history"><param name="ref_file" type="data" format="fasta" label="Using reference file" /></when> diff -r 73d7a93e2f8d0bafbcb0b4dd9bf562d1fa6a88e0 -r c386fc668d0ef5d829fcbd3ee314bcd35ac0e882 tools/ngs_rna/cuffdiff_wrapper.py --- a/tools/ngs_rna/cuffdiff_wrapper.py +++ b/tools/ngs_rna/cuffdiff_wrapper.py @@ -35,6 +35,20 @@ sys.stderr.write( "%s\n" % msg ) sys.exit() +# Copied from sam_to_bam.py: +def check_seq_file( dbkey, cached_seqs_pointer_file ): + seq_path = '' + for line in open( cached_seqs_pointer_file ): + line = line.rstrip( '\r\n' ) + if line and not line.startswith( '#' ) and line.startswith( 'index' ): + fields = line.split( '\t' ) + if len( fields ) < 3: + continue + if fields[1] == dbkey: + seq_path = fields[2].strip() + break + return seq_path + def __main__(): #Parse Command Line parser = optparse.OptionParser() @@ -69,7 +83,8 @@ # Bias correction options. parser.add_option( '-b', dest='do_bias_correction', action="store_true", help='Providing Cufflinks with a multifasta file via this option instructs it to run our new bias detection and correction algorithm which can significantly improve accuracy of transcript abundance estimates.') - parser.add_option( '', '--index', dest='index', help='The path of the reference genome' ) + parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' ) + parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' ) parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' ) # Outputs. @@ -108,16 +123,21 @@ # If doing bias correction, set/link to sequence file. if options.do_bias_correction: - if options.ref_file: + if options.ref_file != 'None': # Sequence data from history. # Create symbolic link to ref_file so that index will be created in working directory. seq_path = "ref.fa" os.symlink( options.ref_file, seq_path ) else: # Sequence data from loc file. 
- if not os.path.exists( options.index ): - stop_err( 'Reference genome %s not present, request it by reporting this error.' % options.index ) - seq_path = options.index + cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' ) + if not os.path.exists( cached_seqs_pointer_file ): + stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file ) + # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa, + # and the equCab2.fa file will contain fasta sequences. + seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file ) + if seq_path == '': + stop_err( 'No sequence data found for dbkey %s, so bias correction cannot be used.' % options.dbkey ) # Build command. diff -r 73d7a93e2f8d0bafbcb0b4dd9bf562d1fa6a88e0 -r c386fc668d0ef5d829fcbd3ee314bcd35ac0e882 tools/ngs_rna/cuffdiff_wrapper.xml --- a/tools/ngs_rna/cuffdiff_wrapper.xml +++ b/tools/ngs_rna/cuffdiff_wrapper.xml @@ -1,4 +1,4 @@ -<tool id="cuffdiff" name="Cuffdiff" version="0.0.6"> +<tool id="cuffdiff" name="Cuffdiff" version="0.0.5"><!-- Wrapper supports Cuffdiff versions 2.1.0-2.1.1 --><description>find significant changes in transcript expression, splicing, and promoter use</description><requirements> @@ -42,12 +42,14 @@ ## Bias correction? #if $bias_correction.do_bias_correction == "Yes": - -b + -b #if $bias_correction.seq_source.index_source == "history": --ref_file=$bias_correction.seq_source.ref_file #else: - --index=${bias_correction.seq_source.index.fields.path} + --ref_file="None" #end if + --dbkey=${gtf_input.metadata.dbkey} + --index_dir=${GALAXY_DATA_INDEX_DIR} #end if ## Inputs. @@ -131,14 +133,7 @@ <option value="cached">Locally cached</option><option value="history">History</option></param> - <when value="cached"> - <param name="index" type="select" label="Using reference genome"> - <options from_data_table="sam_fa_indexes"> - <filter type="data_meta" ref="gtf_input" key="dbkey" column="1" /> - <validator type="no_options" message="No reference genome is available for the build associated with the selected input dataset" /> - </options> - </param> - </when> + <when value="cached"></when><when value="history"><param name="ref_file" type="data" format="fasta" label="Using reference file" /></when> diff -r 73d7a93e2f8d0bafbcb0b4dd9bf562d1fa6a88e0 -r c386fc668d0ef5d829fcbd3ee314bcd35ac0e882 tools/ngs_rna/cufflinks_wrapper.py --- a/tools/ngs_rna/cufflinks_wrapper.py +++ b/tools/ngs_rna/cufflinks_wrapper.py @@ -10,6 +10,20 @@ sys.stderr.write( "%s\n" % msg ) sys.exit() +# Copied from sam_to_bam.py: +def check_seq_file( dbkey, cached_seqs_pointer_file ): + seq_path = '' + for line in open( cached_seqs_pointer_file ): + line = line.rstrip( '\r\n' ) + if line and not line.startswith( '#' ) and line.startswith( 'index' ): + fields = line.split( '\t' ) + if len( fields ) < 3: + continue + if fields[1] == dbkey: + seq_path = fields[2].strip() + break + return seq_path + def __main__(): #Parse Command Line parser = optparse.OptionParser() @@ -39,7 +53,8 @@ # Bias correction options. 
parser.add_option( '-b', dest='do_bias_correction', action="store_true", help='Providing Cufflinks with a multifasta file via this option instructs it to run our new bias detection and correction algorithm which can significantly improve accuracy of transcript abundance estimates.') - parser.add_option( '', '--index', dest='index', help='The path of the reference genome' ) + parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' ) + parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' ) parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' ) # Global model. @@ -68,16 +83,21 @@ # If doing bias correction, set/link to sequence file. if options.do_bias_correction: - if options.ref_file: + if options.ref_file != 'None': # Sequence data from history. # Create symbolic link to ref_file so that index will be created in working directory. seq_path = "ref.fa" os.symlink( options.ref_file, seq_path ) else: # Sequence data from loc file. - if not os.path.exists( options.index ): - stop_err( 'Reference genome %s not present, request it by reporting this error.' % options.index ) - seq_path = options.index + cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' ) + if not os.path.exists( cached_seqs_pointer_file ): + stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file ) + # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa, + # and the equCab2.fa file will contain fasta sequences. + seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file ) + if seq_path == '': + stop_err( 'No sequence data found for dbkey %s, so bias correction cannot be used.' % options.dbkey ) # Build command. diff -r 73d7a93e2f8d0bafbcb0b4dd9bf562d1fa6a88e0 -r c386fc668d0ef5d829fcbd3ee314bcd35ac0e882 tools/ngs_rna/cufflinks_wrapper.xml --- a/tools/ngs_rna/cufflinks_wrapper.xml +++ b/tools/ngs_rna/cufflinks_wrapper.xml @@ -1,4 +1,4 @@ -<tool id="cufflinks" name="Cufflinks" version="0.0.7"> +<tool id="cufflinks" name="Cufflinks" version="0.0.6"><!-- Wrapper supports Cufflinks versions v1.3.0 and newer --><description>transcript assembly and FPKM (RPKM) estimates for RNA-Seq data</description><requirements> @@ -29,12 +29,14 @@ ## Bias correction? #if $bias_correction.do_bias_correction == "Yes": - -b + -b #if $bias_correction.seq_source.index_source == "history": --ref_file=$bias_correction.seq_source.ref_file #else: - --index=${bias_correction.seq_source.index.fields.path} + --ref_file="None" #end if + --dbkey=${input.metadata.dbkey} + --index_dir=${GALAXY_DATA_INDEX_DIR} #end if ## Multi-read correct? 
@@ -65,15 +67,15 @@ <when value="No"></when><when value="Use reference annotation"><param format="gff3,gtf" name="reference_annotation_file" type="data" label="Reference Annotation" help="Gene annotation dataset in GTF or GFF3 format."/> - </when> - <when value="Use reference annotation guide"> + </when> + <when value="Use reference annotation guide"><param format="gff3,gtf" name="reference_annotation_guide_file" type="data" label="Reference Annotation" help="Gene annotation dataset in GTF or GFF3 format."/> - </when> + </when></conditional><conditional name="bias_correction"><param name="do_bias_correction" type="select" label="Perform Bias Correction" help="Bias detection and correction can significantly improve accuracy of transcript abundance estimates."><option value="No" selected="true">No</option> - <option value="Yes">Yes</option> + <option value="Yes">Yes</option></param><when value="Yes"><conditional name="seq_source"> @@ -81,14 +83,7 @@ <option value="cached" selected="true">Locally cached</option><option value="history">History</option></param> - <when value="cached"> - <param name="index" type="select" label="Using reference genome"> - <options from_data_table="sam_fa_indexes"> - <filter type="data_meta" ref="input" key="dbkey" column="1" /> - <validator type="no_options" message="No reference genome is available for the build associated with the selected input dataset" /> - </options> - </param> - </when> + <when value="cached"></when><when value="history"><param name="ref_file" type="data" format="fasta" label="Using reference file" /></when> diff -r 73d7a93e2f8d0bafbcb0b4dd9bf562d1fa6a88e0 -r c386fc668d0ef5d829fcbd3ee314bcd35ac0e882 tools/ngs_rna/cuffmerge_wrapper.py --- a/tools/ngs_rna/cuffmerge_wrapper.py +++ b/tools/ngs_rna/cuffmerge_wrapper.py @@ -8,6 +8,20 @@ sys.stderr.write( '%s\n' % msg ) sys.exit() +# Copied from sam_to_bam.py: +def check_seq_file( dbkey, cached_seqs_pointer_file ): + seq_path = '' + for line in open( cached_seqs_pointer_file ): + line = line.rstrip( '\r\n' ) + if line and not line.startswith( '#' ) and line.startswith( 'index' ): + fields = line.split( '\t' ) + if len( fields ) < 3: + continue + if fields[1] == dbkey: + seq_path = fields[2].strip() + break + return seq_path + def __main__(): #Parse Command Line parser = optparse.OptionParser() @@ -17,7 +31,8 @@ # Wrapper / Galaxy options. - parser.add_option( '', '--index', dest='index', help='The path of the reference genome' ) + parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' ) + parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' ) parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' ) # Outputs. @@ -46,16 +61,21 @@ # Set/link to sequence file. if options.use_seq_data: - if options.ref_file: + if options.ref_file != 'None': # Sequence data from history. # Create symbolic link to ref_file so that index will be created in working directory. seq_path = "ref.fa" os.symlink( options.ref_file, seq_path ) else: # Sequence data from loc file. - if not os.path.exists( options.index ): - stop_err( 'Reference genome %s not present, request it by reporting this error.' % options.index ) - seq_path = options.index + cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' ) + if not os.path.exists( cached_seqs_pointer_file ): + stop_err( 'The required file (%s) does not exist.' 
% cached_seqs_pointer_file ) + # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa, + # and the equCab2.fa file will contain fasta sequences. + seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file ) + if seq_path == '': + stop_err( 'No sequence data found for dbkey %s, so sequence data cannot be used.' % options.dbkey ) # Build command. diff -r 73d7a93e2f8d0bafbcb0b4dd9bf562d1fa6a88e0 -r c386fc668d0ef5d829fcbd3ee314bcd35ac0e882 tools/ngs_rna/cuffmerge_wrapper.xml --- a/tools/ngs_rna/cuffmerge_wrapper.xml +++ b/tools/ngs_rna/cuffmerge_wrapper.xml @@ -1,4 +1,4 @@ -<tool id="cuffmerge" name="Cuffmerge" version="0.0.6"> +<tool id="cuffmerge" name="Cuffmerge" version="0.0.5"><!-- Wrapper supports Cuffmerge versions 1.3 and newer --><description>merge together several Cufflinks assemblies</description><requirements> @@ -16,12 +16,14 @@ ## Use sequence data? #if $seq_data.use_seq_data == "Yes": - -s + -s #if $seq_data.seq_source.index_source == "history": --ref_file=$seq_data.seq_source.ref_file #else: - --index=${seq_data.seq_source.index.fields.path} + --ref_file="None" #end if + --dbkey=${first_input.metadata.dbkey} + --index_dir=${GALAXY_DATA_INDEX_DIR} #end if ## Outputs. @@ -62,14 +64,7 @@ <option value="cached">Locally cached</option><option value="history">History</option></param> - <when value="cached"> - <param name="index" type="select" label="Using reference genome"> - <options from_data_table="sam_fa_indexes"> - <filter type="data_meta" ref="first_input" key="dbkey" column="1" /> - <validator type="no_options" message="No reference genome is available for the build associated with the selected input dataset" /> - </options> - </param> - </when> + <when value="cached"></when><when value="history"><param name="ref_file" type="data" format="fasta" label="Using reference file" /></when> https://bitbucket.org/galaxy/galaxy-central/commits/951e853b0bcd/ Changeset: 951e853b0bcd User: natefoo Date: 2013-07-30 18:54:02 Summary: See previous commit message. 
Affected #: 5 files diff -r c386fc668d0ef5d829fcbd3ee314bcd35ac0e882 -r 951e853b0bcd2c62cedee0b95d46c9e36ab6c605 tools/samtools/sam_pileup.py --- a/tools/samtools/sam_pileup.py +++ b/tools/samtools/sam_pileup.py @@ -8,7 +8,8 @@ -o, --output1=o: Output pileup -R, --ref=R: Reference file type -n, --ownFile=n: User-supplied fasta reference file - -g, --index=g: Path of the indexed reference genome + -d, --dbkey=d: dbkey of user-supplied file + -x, --indexDir=x: Index directory -b, --bamIndex=b: BAM index file -s, --lastCol=s: Print the mapping quality as the last column -i, --indels=i: Only output lines containing indels @@ -30,9 +31,24 @@ sys.stderr.write( '%s\n' % msg ) sys.exit() +def check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR ): + seqFile = '%s/sam_fa_indices.loc' % GALAXY_DATA_INDEX_DIR + seqPath = '' + for line in open( seqFile ): + line = line.rstrip( '\r\n' ) + if line and not line.startswith( '#' ) and line.startswith( 'index' ): + fields = line.split( '\t' ) + if len( fields ) < 3: + continue + if fields[1] == dbkey: + seqPath = fields[2].strip() + break + return seqPath + def __main__(): #Parse Command Line options, args = doc_optparse.parse( __doc__ ) + seqPath = check_seq_file( options.dbkey, options.indexDir ) # output version # of tool try: tmp = tempfile.NamedTemporaryFile().name @@ -61,6 +77,7 @@ tmpf1 = tempfile.NamedTemporaryFile( dir=tmpDir ) tmpf1_name = tmpf1.name tmpf1.close() + tmpf1fai_name = '%s.fai' % tmpf1_name #link bam and bam index to working directory (can't move because need to leave original) os.symlink( options.input1, tmpf0bam_name ) os.symlink( options.bamIndex, tmpf0bambai_name ) @@ -83,9 +100,9 @@ try: #index reference if necessary and prepare pileup command if options.ref == 'indexed': - if not os.path.exists( "%s.fai" % options.index ): - raise Exception, "Indexed genome %s not present, request it by reporting this error." % options.index - cmd = cmd % ( opts, options.index, tmpf0bam_name, options.output1 ) + if not os.path.exists( "%s.fai" % seqPath ): + raise Exception, "No sequences are available for '%s', request them by reporting this error." % options.dbkey + cmd = cmd % ( opts, seqPath, tmpf0bam_name, options.output1 ) elif options.ref == 'history': os.symlink( options.ownFile, tmpf1_name ) cmdIndex = 'samtools faidx %s' % ( tmpf1_name ) diff -r c386fc668d0ef5d829fcbd3ee314bcd35ac0e882 -r 951e853b0bcd2c62cedee0b95d46c9e36ab6c605 tools/samtools/sam_pileup.xml --- a/tools/samtools/sam_pileup.xml +++ b/tools/samtools/sam_pileup.xml @@ -1,4 +1,4 @@ -<tool id="sam_pileup" name="Generate pileup" version="1.1.2"> +<tool id="sam_pileup" name="Generate pileup" version="1.1.1"><description>from BAM dataset</description><requirements><requirement type="package" version="0.1.16">samtools</requirement> @@ -11,8 +11,10 @@ #if $refOrHistory.reference == "history": --ownFile=$refOrHistory.ownFile #else: - --index=${refOrHistory.index.fields.path} + --ownFile="None" #end if + --dbkey=${input1.metadata.dbkey} + --indexDir=${GALAXY_DATA_INDEX_DIR} --bamIndex=${input1.metadata.bam_index} --lastCol=$lastCol --indels=$indels @@ -39,13 +41,7 @@ <when value="indexed"><param name="input1" type="data" format="bam" label="Select the BAM file to generate the pileup file for"><validator type="unspecified_build" /> - <validator type="dataset_metadata_in_data_table" table_name="sam_fa_indexes" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." 
/> - </param> - <param name="index" type="select" label="Using reference genome"> - <options from_data_table="sam_fa_indexes"> - <filter type="data_meta" ref="input1" key="dbkey" column="1" /> - <validator type="no_options" message="No reference genome is available for the build associated with the selected input dataset" /> - </options> + <validator type="dataset_metadata_in_file" filename="sam_fa_indices.loc" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." line_startswith="index" /></param></when><when value="history"> @@ -104,7 +100,6 @@ --><param name="reference" value="indexed" /><param name="input1" value="sam_pileup_in1.bam" ftype="bam" dbkey="equCab2" /> - <param name="index" value="chr_m" /><param name="lastCol" value="no" /><param name="indels" value="no" /><param name="mapCap" value="60" /> diff -r c386fc668d0ef5d829fcbd3ee314bcd35ac0e882 -r 951e853b0bcd2c62cedee0b95d46c9e36ab6c605 tools/samtools/sam_to_bam.py --- a/tools/samtools/sam_to_bam.py +++ b/tools/samtools/sam_to_bam.py @@ -3,24 +3,43 @@ Converts SAM data to sorted BAM data. usage: sam_to_bam.py [options] --input1: SAM file to be converted - --index: path of the indexed reference genome + --dbkey: dbkey value --ref_file: Reference file if choosing from history --output1: output dataset in bam format + --index_dir: GALAXY_DATA_INDEX_DIR """ -import optparse, os, sys, subprocess, tempfile, shutil +import optparse, os, sys, subprocess, tempfile, shutil, gzip +from galaxy import eggs +import pkg_resources; pkg_resources.require( "bx-python" ) +from bx.cookbook import doc_optparse +from galaxy import util def stop_err( msg ): sys.stderr.write( '%s\n' % msg ) sys.exit() +def check_seq_file( dbkey, cached_seqs_pointer_file ): + seq_path = '' + for line in open( cached_seqs_pointer_file ): + line = line.rstrip( '\r\n' ) + if line and not line.startswith( '#' ) and line.startswith( 'index' ): + fields = line.split( '\t' ) + if len( fields ) < 3: + continue + if fields[1] == dbkey: + seq_path = fields[2].strip() + break + return seq_path + def __main__(): #Parse Command Line parser = optparse.OptionParser() parser.add_option( '', '--input1', dest='input1', help='The input SAM dataset' ) - parser.add_option( '', '--index', dest='index', help='The path of the indexed reference genome' ) + parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' ) parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' ) parser.add_option( '', '--output1', dest='output1', help='The output BAM dataset' ) + parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' ) ( options, args ) = parser.parse_args() # output version # of tool @@ -42,17 +61,24 @@ except: sys.stdout.write( 'Could not determine Samtools version\n' ) + cached_seqs_pointer_file = '%s/sam_fa_indices.loc' % options.index_dir + if not os.path.exists( cached_seqs_pointer_file ): + stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file ) + # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa, + # and the equCab2.fa file will contain fasta sequences. + seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file ) tmp_dir = tempfile.mkdtemp( dir='.' ) if not options.ref_file or options.ref_file == 'None': # We're using locally cached reference sequences( e.g., /galaxy/data/equCab2/sam_index/equCab2.fa ). 
# The indexes for /galaxy/data/equCab2/sam_index/equCab2.fa will be contained in # a file named /galaxy/data/equCab2/sam_index/equCab2.fa.fai - fai_index_file_path = '%s.fai' % options.index + fai_index_file_base = seq_path + fai_index_file_path = '%s.fai' % seq_path if not os.path.exists( fai_index_file_path ): #clean up temp files if os.path.exists( tmp_dir ): shutil.rmtree( tmp_dir ) - stop_err( 'Indexed genome %s not present, request it by reporting this error.' % options.index ) + stop_err( 'No sequences are available for build (%s), request them by reporting this error.' % options.dbkey ) else: try: # Create indexes for history reference ( e.g., ~/database/files/000/dataset_1.dat ) using samtools faidx, which will: diff -r c386fc668d0ef5d829fcbd3ee314bcd35ac0e882 -r 951e853b0bcd2c62cedee0b95d46c9e36ab6c605 tools/samtools/sam_to_bam.xml --- a/tools/samtools/sam_to_bam.xml +++ b/tools/samtools/sam_to_bam.xml @@ -1,4 +1,4 @@ -<tool id="sam_to_bam" name="SAM-to-BAM" version="1.1.3"> +<tool id="sam_to_bam" name="SAM-to-BAM" version="1.1.2"><description>converts SAM format to BAM format</description><requirements><requirement type="package">samtools</requirement> @@ -7,11 +7,13 @@ sam_to_bam.py --input1=$source.input1 #if $source.index_source == "history": + --dbkey=${ref_file.metadata.dbkey} --ref_file=$source.ref_file #else - --index=${source.index.fields.path} + --dbkey=${input1.metadata.dbkey} #end if --output1=$output1 + --index_dir=${GALAXY_DATA_INDEX_DIR} </command><inputs><conditional name="source"> @@ -20,19 +22,13 @@ <option value="history">History</option></param><when value="cached"> - <param name="input1" type="data" format="sam" metadata_name="dbkey" label="SAM file to convert"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_data_table" table_name="sam_fa_indexes" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." /> - </param> - <param name="index" type="select" label="Using reference genome"> - <options from_data_table="sam_fa_indexes"> - <filter type="data_meta" ref="input1" key="dbkey" column="1" /> - <validator type="no_options" message="No reference genome is available for the build associated with the selected input dataset" /> - </options> + <param name="input1" type="data" format="sam" metadata_name="dbkey" label="SAM File to Convert"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="sam_fa_indices.loc" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." 
line_startswith="index" /></param></when><when value="history"> - <param name="input1" type="data" format="sam" label="SAM file to convert" /> + <param name="input1" type="data" format="sam" label="Convert SAM file" /><param name="ref_file" type="data" format="fasta" metadata_name="dbkey" label="Using reference file" /></when></conditional> @@ -80,7 +76,6 @@ --><param name="index_source" value="cached" /><param name="input1" value="sam_to_bam_in1.sam" ftype="sam" dbkey="chrM" /> - <param name="index" value="chr_m" /><output name="output1" file="sam_to_bam_out2.bam" ftype="bam" /></test></tests> diff -r c386fc668d0ef5d829fcbd3ee314bcd35ac0e882 -r 951e853b0bcd2c62cedee0b95d46c9e36ab6c605 tools/samtools/samtools_mpileup.xml --- a/tools/samtools/samtools_mpileup.xml +++ b/tools/samtools/samtools_mpileup.xml @@ -1,4 +1,4 @@ -<tool id="samtools_mpileup" name="MPileup" version="0.0.2"> +<tool id="samtools_mpileup" name="MPileup" version="0.0.1"><description>SNP and indel caller</description><requirements><requirement type="package">samtools</requirement> @@ -59,22 +59,22 @@ </param><when value="cached"><repeat name="input_bams" title="BAM file" min="1"> - <param name="input_bam" type="data" format="bam" label="BAM file"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_data_table" table_name="sam_fa_indexes" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." /><!-- fixme!!! this needs to be a select --> - </param> + <param name="input_bam" type="data" format="bam" label="BAM file"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_data_table" table_name="sam_fa_indexes" metadata_name="dbkey" metadata_column="value" message="Sequences are not currently available for the specified build." /><!-- fixme!!! this needs to be a select --> + </param></repeat><param name="ref_file" type="select" label="Using reference genome"><options from_data_table="sam_fa_indexes"> - <!-- <filter type="data_meta" ref="input_bam" key="dbkey" column="1" /> does not yet work in a repeat...--> + <!-- <filter type="data_meta" key="dbkey" ref="input_bam" column="value"/> does not yet work in a repeat...--></options></param></when><when value="history"><!-- FIX ME!!!! --><repeat name="input_bams" title="BAM file" min="1"> - <param name="input_bam" type="data" format="bam" label="BAM file"> - <validator type="metadata" check="bam_index" message="Metadata missing, click the pencil icon in the history item and use the auto-detect feature to correct this issue." /> - </param> + <param name="input_bam" type="data" format="bam" label="BAM file" > + <validator type="metadata" check="bam_index" message="Metadata missing, click the pencil icon in the history item and use the auto-detect feature to correct this issue."/> + </param></repeat><param name="ref_file" type="data" format="fasta" label="Using reference file" /></when> https://bitbucket.org/galaxy/galaxy-central/commits/866d8b29854f/ Changeset: 866d8b29854f Branch: next-stable User: natefoo Date: 2013-07-30 18:56:17 Summary: Open next-stable for the next release. 
Affected #: 360 files diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 .coveragerc --- /dev/null +++ b/.coveragerc @@ -0,0 +1,3 @@ +[run] +branch = True +include = lib/galaxy/* diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 .hgignore --- a/.hgignore +++ b/.hgignore @@ -60,7 +60,7 @@ job_conf.xml data_manager_conf.xml shed_data_manager_conf.xml -visualizations_conf.xml +config/visualizations/*.xml static/welcome.html.* static/welcome.html @@ -82,6 +82,9 @@ # Test output run_functional_tests.html test/tool_shed/tmp/* +.coverage +htmlcov +run_unit_tests.html # Project files *.kpf diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 .hgtags --- a/.hgtags +++ b/.hgtags @@ -2,3 +2,4 @@ 1c717491139269651bb59687563da9410b84c65d release_2013.02.08 75f09617abaadbc8cc732bb8ee519decaeb56ea7 release_2013.04.01 2cc8d10988e03257dc7b97f8bb332c7df745d1dd security_2013.04.08 +524f246ca85395082719ae7a6ff72260d7ad5612 release_2013.06.03 diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 README.txt --- a/README.txt +++ b/README.txt @@ -7,7 +7,7 @@ HOW TO START ============ -Galaxy requires Python 2.5, 2.6 or 2.7. To check your python version, run: +Galaxy requires Python 2.6 or 2.7. To check your python version, run: % python -V Python 2.7.3 diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 buildbot_setup.sh --- a/buildbot_setup.sh +++ b/buildbot_setup.sh @@ -65,6 +65,7 @@ " SAMPLES=" +tool_conf.xml.sample datatypes_conf.xml.sample universe_wsgi.ini.sample tool_data_table_conf.xml.sample diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 config/visualizations/circster.xml.sample --- /dev/null +++ b/config/visualizations/circster.xml.sample @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE visualization SYSTEM "visualization.dtd"> +<visualization name="circster"> + <data_sources> + <data_source> + <model_class>HistoryDatasetAssociation</model_class> + <test type="isinstance" test_attr="datatype" result_type="datatype">data.Data</test> + <to_param param_attr="id">dataset_id</to_param> + <to_param assign="hda">hda_ldda</to_param> + </data_source> + <data_source> + <model_class>LibraryDatasetDatasetAssociation</model_class> + <test type="isinstance" test_attr="datatype" result_type="datatype">data.Data</test> + <to_param param_attr="id">dataset_id</to_param> + <to_param assign="ldda">hda_ldda</to_param> + </data_source> + </data_sources> + <params> + <param type="visualization">id</param> + <param type="hda_or_ldda">dataset_id</param> + <param_modifier type="string" modifies="dataset_id">hda_ldda</param_modifier> + <param type="dbkey">dbkey</param> + </params> + <template>circster.mako</template> + <render_location>_top</render_location> +</visualization> diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 config/visualizations/phyloviz.xml.sample --- /dev/null +++ b/config/visualizations/phyloviz.xml.sample @@ -0,0 +1,18 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE visualization SYSTEM "visualization.dtd"> +<visualization name="phyloviz"> + <data_sources> + <data_source> + <model_class>HistoryDatasetAssociation</model_class> + <test type="isinstance" test_attr="datatype" result_type="datatype">data.Newick</test> + <test type="isinstance" test_attr="datatype" 
result_type="datatype">data.Nexus</test> + <to_param param_attr="id">dataset_id</to_param> + </data_source> + </data_sources> + <params> + <param type="dataset" var_name_in_template="hda" required="true">dataset_id</param> + <param type="integer" default="0">tree_index</param> + </params> + <template>phyloviz.mako</template> + <render_location>_top</render_location> +</visualization> diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 config/visualizations/scatterplot.xml.sample --- /dev/null +++ b/config/visualizations/scatterplot.xml.sample @@ -0,0 +1,15 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE visualization SYSTEM "visualization.dtd"> +<visualization name="scatterplot"> + <data_sources> + <data_source> + <model_class>HistoryDatasetAssociation</model_class> + <test type="isinstance" test_attr="datatype" result_type="datatype">tabular.Tabular</test> + <to_param param_attr="id">dataset_id</to_param> + </data_source> + </data_sources> + <params> + <param type="dataset" var_name_in_template="hda" required="true">dataset_id</param> + </params> + <template>scatterplot.mako</template> +</visualization> diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 config/visualizations/sweepster.xml.sample --- /dev/null +++ b/config/visualizations/sweepster.xml.sample @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE visualization SYSTEM "visualization.dtd"> +<visualization name="sweepster"> + <data_sources> + <data_source> + <model_class>HistoryDatasetAssociation</model_class> + <test type="isinstance" test_attr="datatype" result_type="datatype">data.Data</test> + <to_param param_attr="id">dataset_id</to_param> + <to_param assign="hda">hda_ldda</to_param> + </data_source> + <data_source> + <model_class>LibraryDatasetDatasetAssociation</model_class> + <test type="isinstance" test_attr="datatype" result_type="datatype">data.Data</test> + <to_param param_attr="id">dataset_id</to_param> + <to_param assign="ldda">hda_ldda</to_param> + </data_source> + </data_sources> + <params> + <param type="visualization" var_name_in_template="viz">visualization</param> + <param type="hda_or_ldda" var_name_in_template="dataset">dataset_id</param> + <param_modifier type="string" modifies="dataset_id">hda_ldda</param_modifier> + </params> + <template>sweepster.mako</template> + <render_location>_top</render_location> +</visualization> diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 config/visualizations/trackster.xml.sample --- /dev/null +++ b/config/visualizations/trackster.xml.sample @@ -0,0 +1,29 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE visualization SYSTEM "visualization.dtd"> +<visualization name="trackster"> + <!--not tested yet --> + <data_sources> + <data_source> + <model_class>HistoryDatasetAssociation</model_class> + <test type="isinstance" test_attr="datatype" result_type="datatype">data.Data</test> + <to_param param_attr="id">dataset_id</to_param> + <to_param assign="hda">hda_ldda</to_param> + <to_param param_attr="dbkey">dbkey</to_param> + </data_source> + <data_source> + <model_class>LibraryDatasetDatasetAssociation</model_class> + <test type="isinstance" test_attr="datatype" result_type="datatype">data.Data</test> + <to_param param_attr="id">dataset_id</to_param> + <to_param assign="ldda">hda_ldda</to_param> + </data_source> + </data_sources> + <params> + <param type="visualization">id</param> + <param type="dataset">dataset_id</param> + 
<param type="genome_region">genome_region</param> + <param type="dbkey">dbkey</param> + </params> + <template_root>tracks</template_root> + <template>browser.mako</template> + <render_location>_top</render_location> +</visualization> diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 config/visualizations/visualization.dtd --- /dev/null +++ b/config/visualizations/visualization.dtd @@ -0,0 +1,132 @@ +<!-- runnable on NIX with xmllint --> + +<!-- each visualization must have a template (all other elements are optional) --> +<!ELEMENT visualization (data_sources*,params*,template_root*,template,link_text*,render_location*)> +<!-- visualization name (e.g. 'trackster', 'scatterplot', etc.) is required --> +<!ATTLIST visualization + name CDATA #REQUIRED +> + +<!ELEMENT data_sources (data_source*)> +<!-- data sources are elements that describe what objects (HDAs, LDDAs, Job, User, etc.) + are applicable to a visualization. Often these are used to fetch applicable links + to the visualizations that use them. +--> + <!ELEMENT data_source (model_class,(test|to_param)*)> + <!ELEMENT model_class (#PCDATA)> + <!-- model_class is currently the class name of the object you want to make a visualization + applicable to (e.g. HistoryDatasetAssociation). Currently only classes in galaxy.model + can be used. + REQUIRED and currently limited to: 'HistoryDatasetAssociation', 'LibraryDatasetDatasetAssociation' + --> + <!ELEMENT test (#PCDATA)> + <!-- tests help define what conditions the visualization can be applied to the model_class/target. + Currently, all tests are OR'd and there is no logical grouping. Tests are run in order. + (text): the text of this element is what the given target will be compared to (REQUIRED) + type: what type of test to run (e.g. when the target is an HDA the test will often be of type 'isinstance' + and test whether the HDA's datatype isinstace of a class) + DEFAULT: string comparison. + test_attr: what attribute of the target object should be used in the test. For instance, 'datatype' + will attempt to get the HDA.datatype from a target HDA. If the given object doesn't have + that attribute the test will fail (with no error). test_attr can be dot separated attributes, + looking up each in turn. For example, if the target was a history, one could access the + history.user.email by setting test_attr to 'user.email' (why you would want that, I don't know) + DEFAULT: to comparing the object itself (and not any of it's attributes) + result_type: if the result (the text of the element mentioned above) needs to be parsed into + something other than a string, result_type will tell the registry how to do this. E.g. + if result_type is 'datatype' the registry will assume the text is a datatype class name + and parse it into the proper class before the test (often 'isinstance') is run. + DEFAULT: no parsing (result should be a string) + --> + <!ATTLIST test + type CDATA #IMPLIED + test_attr CDATA #IMPLIED + result_type CDATA #IMPLIED + > + + <!ELEMENT to_param (#PCDATA)> + <!-- to_param tells the registry how to parse the data_source into a query string param. + For example, HDA data_sources can set param_to text to 'dataset_id' and param_attr to 'id' and the + the target HDA (if it passes the tests) will be passed as "dataset_id=HDA.id" + (text): the query string param key this source will be parsed into (e.g. dataset_id) + REQUIRED + param_attr: the attribute of the data_source object to use as the value in the query string param. + E.g. 
param_attr='id' for an HDA data_source would use the (encoded) id. + NOTE: a to_param MUST have either a param_attr or assign + assign: you can use this to directly assign a value to a query string's param. E.g. if the + data_source is a LDDA we can set 'hda_or_ldda=ldda' using assign='ldda'. + NOTE: a to_param MUST have either a param_attr or assign + --> + <!ATTLIST to_param + param_attr CDATA #IMPLIED + assign CDATA #IMPLIED + > + +<!ELEMENT params ((param|param_modifier)*)> +<!-- params describe what data will be sent to a visualization template and + how to convert them from a query string in a URL into variables usable in a template. + For example, + param_modifiers are a special class of parameters that modify other params + (e.g. hda_ldda can be 'hda' or 'ldda' and modifies/informs dataset_id to fetch an HDA or LDDA) +--> + <!ELEMENT param (#PCDATA)> + <!-- param tells the registry how to parse the query string param back into a resource/data_source. + For example, if a query string has "dataset_id=NNN" and the type is 'dataset', the registry + will attempt to fetch the hda with id of NNN from the database and pass it to the template. + (text): the query string param key this source will be parsed from (e.g. dataset_id) + REQUIRED + type: the type of the resource. + Can be: str (DEFAULT), bool, int, float, json, visualization, dbkey, dataset, or hda_ldda. + default: if a param is not passed on the query string (and is not required) OR the given param + fails to parse, this value is used instead. + DEFAULT: None + required: set this to true if the param is required for the template. Rendering will with an error + if the param hasn't been sent. + DEFAULT: false + csv: set this to true if the param is a comma separated list. The registry will attempt to + parse each value as the given type and send the result as a list to the template. + DEFAULT: false + constrain_to: (currently unused) constain a param to a set of values, error if not valid. + DEFAULT: don't constrain + var_name_in_template: a new name for the resource/variable to use in the template. E.g. an initial + query string param key might be 'dataset_id' in the URL, the registry parses it into an HDA, + and if var_name_in_template is set to 'hda', the template will be able to access the HDA + with the variable name 'hda' (as in hda.title). + DEFAULT: keep the original query string name + --> + <!ATTLIST param + type CDATA #IMPLIED + default CDATA #IMPLIED + required CDATA #IMPLIED + csv CDATA #IMPLIED + constrain_to CDATA #IMPLIED + var_name_in_template CDATA #IMPLIED + > + <!-- param_modifiers are the same as param but have a REQUIRED 'modifies' attribute. + 'modifies' must point to the param name (the text part of param element) that it will modify. + E.g. <param_modifier modifies="dataset_id">hda_ldda</param_modifier> + --> + <!ELEMENT param_modifier (#PCDATA)> + <!ATTLIST param_modifier + modifies CDATA #REQUIRED + type CDATA #IMPLIED + default CDATA #IMPLIED + required CDATA #IMPLIED + csv CDATA #IMPLIED + constrain_to CDATA #IMPLIED + var_name_in_template CDATA #IMPLIED + > + +<!-- template_root: the directory to search for the template relative to templates/webapps/galaxy + (optional) DEFAULT: visualizations +--> +<!ELEMENT template_root (#PCDATA)> +<!-- template: the template used to render the visualization. 
REQUIRED --> +<!ELEMENT template (#PCDATA)> +<!-- link_text: the text component of an html anchor displayed when the registry builds the link information --> +<!ELEMENT link_text (#PCDATA)> +<!-- render_location: used as the target attribute of the link to the visualization. + Can be 'galaxy_main', '_top', '_blank'. DEFAULT: 'galaxy_main' +--> +<!-- TODO: rename -> render_target --> +<!ELEMENT render_location (#PCDATA)> diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 eggs.ini --- a/eggs.ini +++ b/eggs.ini @@ -19,7 +19,7 @@ mercurial = 2.2.3 MySQL_python = 1.2.3c1 numpy = 1.6.0 -pbs_python = 4.1.0 +pbs_python = 4.3.5 psycopg2 = 2.0.13 pycrypto = 2.5 pysam = 0.4.2 diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 job_conf.xml.sample_advanced --- a/job_conf.xml.sample_advanced +++ b/job_conf.xml.sample_advanced @@ -54,7 +54,15 @@ <param id="shell_hostname">foo.example.org</param><param id="Job_Execution_Time">24:00:00</param></destination> - <destination id="condor" runner="condor"/> + <destination id="condor" runner="condor"> + <!-- With no params, jobs are submitted to the 'vanilla' universe with: + notification = NEVER + getenv = true + Additional/override query ClassAd params can be specified with + <param> tags. + --> + <param id="request_cpus">8</param> + </destination></destinations><tools><!-- Tools can be configured to use specific destinations or handlers, diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 lib/galaxy/app.py --- a/lib/galaxy/app.py +++ b/lib/galaxy/app.py @@ -92,7 +92,7 @@ # Load additional entries defined by self.config.shed_tool_data_table_config into tool data tables. self.tool_data_tables.load_from_config_file( config_filename=self.config.shed_tool_data_table_config, tool_data_path=self.tool_data_tables.tool_data_path, - from_shed_config=True ) + from_shed_config=False ) # Initialize the job management configuration self.job_config = jobs.JobConfiguration(self) # Initialize the tools, making sure the list of tool configs includes the reserved migrated_tools_conf.xml file. @@ -123,8 +123,10 @@ # Load genome indexer tool. load_genome_index_tools( self.toolbox ) # visualizations registry: associates resources with visualizations, controls how to render - self.visualizations_registry = ( VisualizationsRegistry( self.config.root, self.config.visualizations_conf_path ) - if self.config.visualizations_conf_path else None ) + self.visualizations_registry = None + if self.config.visualizations_config_directory: + self.visualizations_registry = VisualizationsRegistry( self.config.root, + self.config.visualizations_config_directory ) # Load security policy. 
self.security_agent = self.model.security_agent self.host_security_agent = galaxy.security.HostAgent( model=self.security_agent.model, permitted_actions=self.security_agent.permitted_actions ) diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 lib/galaxy/config.py --- a/lib/galaxy/config.py +++ b/lib/galaxy/config.py @@ -64,21 +64,33 @@ tcf = kwargs[ 'tool_config_files' ] else: tcf = 'tool_conf.xml' + self.tool_filters = listify( kwargs.get( "tool_filters", [] ) ) + self.tool_label_filters = listify( kwargs.get( "tool_label_filters", [] ) ) + self.tool_section_filters = listify( kwargs.get( "tool_section_filters", [] ) ) self.tool_configs = [ resolve_path( p, self.root ) for p in listify( tcf ) ] + self.shed_tool_data_path = kwargs.get( "shed_tool_data_path", None ) + if self.shed_tool_data_path: + self.shed_tool_data_path = resolve_path( self.shed_tool_data_path, self.root ) + else: + self.shed_tool_data_path = self.tool_data_path self.tool_data_table_config_path = resolve_path( kwargs.get( 'tool_data_table_config_path', 'tool_data_table_conf.xml' ), self.root ) self.shed_tool_data_table_config = resolve_path( kwargs.get( 'shed_tool_data_table_config', 'shed_tool_data_table_conf.xml' ), self.root ) self.enable_tool_shed_check = string_as_bool( kwargs.get( 'enable_tool_shed_check', False ) ) + self.hours_between_check = kwargs.get( 'hours_between_check', 12 ) try: - self.hours_between_check = kwargs.get( 'hours_between_check', 12 ) - if isinstance( self.hours_between_check, float ): + hbc_test = int( self.hours_between_check ) + self.hours_between_check = hbc_test + if self.hours_between_check < 1 or self.hours_between_check > 24: + self.hours_between_check = 12 + except: + try: # Float values are supported for functional tests. + hbc_test = float( self.hours_between_check ) + self.hours_between_check = hbc_test if self.hours_between_check < 0.001 or self.hours_between_check > 24.0: self.hours_between_check = 12.0 - else: - if self.hours_between_check < 1 or self.hours_between_check > 24: - self.hours_between_check = 12 - except: - self.hours_between_check = 12 + except: + self.hours_between_check = 12 self.update_integrated_tool_panel = kwargs.get( "update_integrated_tool_panel", True ) self.enable_data_manager_user_view = string_as_bool( kwargs.get( "enable_data_manager_user_view", "False" ) ) self.data_manager_config_file = resolve_path( kwargs.get('data_manager_config_file', 'data_manager_conf.xml' ), self.root ) @@ -154,6 +166,7 @@ self.ucsc_display_sites = kwargs.get( 'ucsc_display_sites', "main,test,archaea,ucla" ).lower().split(",") self.gbrowse_display_sites = kwargs.get( 'gbrowse_display_sites', "modencode,sgd_yeast,tair,wormbase,wormbase_ws120,wormbase_ws140,wormbase_ws170,wormbase_ws180,wormbase_ws190,wormbase_ws200,wormbase_ws204,wormbase_ws210,wormbase_ws220,wormbase_ws225" ).lower().split(",") self.brand = kwargs.get( 'brand', None ) + self.welcome_url = kwargs.get( 'welcome_url', '/static/welcome.html' ) # Configuration for the message box directly below the masthead. 
self.message_box_visible = kwargs.get( 'message_box_visible', False ) self.message_box_content = kwargs.get( 'message_box_content', None ) @@ -275,8 +288,8 @@ self.fluent_log = string_as_bool( kwargs.get( 'fluent_log', False ) ) self.fluent_host = kwargs.get( 'fluent_host', 'localhost' ) self.fluent_port = int( kwargs.get( 'fluent_port', 24224 ) ) - # visualizations registry config path - self.visualizations_conf_path = kwargs.get( 'visualizations_conf_path', None ) + # visualization registries config directory + self.visualizations_config_directory = kwargs.get( 'visualizations_config_directory', None ) @property def sentry_dsn_public( self ): diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 lib/galaxy/datatypes/binary.py --- a/lib/galaxy/datatypes/binary.py +++ b/lib/galaxy/datatypes/binary.py @@ -22,6 +22,7 @@ from galaxy.datatypes.metadata import MetadataElement from galaxy.datatypes import metadata from galaxy.datatypes.sniff import * +import dataproviders log = logging.getLogger(__name__) @@ -74,6 +75,7 @@ trans.response.headers["Content-Disposition"] = 'attachment; filename="Galaxy%s-[%s].%s"' % (dataset.hid, fname, to_ext) return open( dataset.file_name ) + class Ab1( Binary ): """Class describing an ab1 binary sequence file""" file_ext = "ab1" @@ -93,12 +95,15 @@ Binary.register_unsniffable_binary_ext("ab1") + class GenericAsn1Binary( Binary ): """Class for generic ASN.1 binary format""" file_ext = "asn1-binary" Binary.register_unsniffable_binary_ext("asn1-binary") + +@dataproviders.decorators.has_dataproviders class Bam( Binary ): """Class describing a BAM binary file""" file_ext = "bam" @@ -255,9 +260,92 @@ return dataset.peek except: return "Binary bam alignments file (%s)" % ( data.nice_size( dataset.get_size() ) ) + + # ------------- Dataproviders + # pipe through samtools view + #ALSO: (as Sam) + # bam does not use '#' to indicate comments/headers - we need to strip out those headers from the std. providers + #TODO:?? seems like there should be an easier way to do/inherit this - metadata.comment_char? + #TODO: incorporate samtools options to control output: regions first, then flags, etc. 
+ @dataproviders.decorators.dataprovider_factory( 'line', dataproviders.line.FilteredLineDataProvider.settings ) + def line_dataprovider( self, dataset, **settings ): + samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset ) + settings[ 'comment_char' ] = '@' + return dataproviders.line.FilteredLineDataProvider( samtools_source, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'regex-line', dataproviders.line.RegexLineDataProvider.settings ) + def regex_line_dataprovider( self, dataset, **settings ): + samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset ) + settings[ 'comment_char' ] = '@' + return dataproviders.line.RegexLineDataProvider( samtools_source, **settings ) + @dataproviders.decorators.dataprovider_factory( 'column', dataproviders.column.ColumnarDataProvider.settings ) + def column_dataprovider( self, dataset, **settings ): + samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset ) + settings[ 'comment_char' ] = '@' + return dataproviders.column.ColumnarDataProvider( samtools_source, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'dict', dataproviders.column.DictDataProvider.settings ) + def dict_dataprovider( self, dataset, **settings ): + samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset ) + settings[ 'comment_char' ] = '@' + return dataproviders.column.DictDataProvider( samtools_source, **settings ) + + # these can't be used directly - may need BamColumn, BamDict (Bam metadata -> column/dict) + # OR - see genomic_region_dataprovider + #@dataproviders.decorators.dataprovider_factory( 'dataset-column', dataproviders.column.ColumnarDataProvider.settings ) + #def dataset_column_dataprovider( self, dataset, **settings ): + # settings[ 'comment_char' ] = '@' + # return super( Sam, self ).dataset_column_dataprovider( dataset, **settings ) + + #@dataproviders.decorators.dataprovider_factory( 'dataset-dict', dataproviders.column.DictDataProvider.settings ) + #def dataset_dict_dataprovider( self, dataset, **settings ): + # settings[ 'comment_char' ] = '@' + # return super( Sam, self ).dataset_dict_dataprovider( dataset, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'header', dataproviders.line.RegexLineDataProvider.settings ) + def header_dataprovider( self, dataset, **settings ): + # in this case we can use an option of samtools view to provide just what we need (w/o regex) + samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset, '-H' ) + return dataproviders.line.RegexLineDataProvider( samtools_source, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'id-seq-qual', dataproviders.column.DictDataProvider.settings ) + def id_seq_qual_dataprovider( self, dataset, **settings ): + settings[ 'indeces' ] = [ 0, 9, 10 ] + settings[ 'column_types' ] = [ 'str', 'str', 'str' ] + settings[ 'column_names' ] = [ 'id', 'seq', 'qual' ] + return self.dict_dataprovider( dataset, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'genomic-region', dataproviders.column.ColumnarDataProvider.settings ) + def genomic_region_dataprovider( self, dataset, **settings ): + # GenomicRegionDataProvider currently requires a dataset as source - may not be necc. + #TODO:?? consider (at least) the possible use of a kwarg: metadata_source (def. to source.dataset), + # or remove altogether... 
+ #samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset ) + #return dataproviders.dataset.GenomicRegionDataProvider( samtools_source, metadata_source=dataset, + # 2, 3, 3, **settings ) + + # instead, set manually and use in-class column gen + settings[ 'indeces' ] = [ 2, 3, 3 ] + settings[ 'column_types' ] = [ 'str', 'int', 'int' ] + return self.column_dataprovider( dataset, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict', dataproviders.column.DictDataProvider.settings ) + def genomic_region_dict_dataprovider( self, dataset, **settings ): + settings[ 'indeces' ] = [ 2, 3, 3 ] + settings[ 'column_types' ] = [ 'str', 'int', 'int' ] + settings[ 'column_names' ] = [ 'chrom', 'start', 'end' ] + return self.dict_dataprovider( dataset, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'samtools' ) + def samtools_dataprovider( self, dataset, **settings ): + """Generic samtools interface - all options available through settings.""" + dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + return dataproviders.dataset.SamtoolsDataProvider( dataset_source, **settings ) + Binary.register_sniffable_binary_format("bam", "bam", Bam) + class H5( Binary ): """Class describing an HDF5 file""" file_ext = "h5" @@ -277,6 +365,7 @@ Binary.register_unsniffable_binary_ext("h5") + class Scf( Binary ): """Class describing an scf binary sequence file""" file_ext = "scf" @@ -296,6 +385,7 @@ Binary.register_unsniffable_binary_ext("scf") + class Sff( Binary ): """ Standard Flowgram Format (SFF) """ file_ext = "sff" @@ -327,6 +417,7 @@ Binary.register_sniffable_binary_format("sff", "sff", Sff) + class BigWig(Binary): """ Accessing binary BigWig files from UCSC. @@ -363,6 +454,7 @@ Binary.register_sniffable_binary_format("bigwig", "bigwig", BigWig) + class BigBed(BigWig): """BigBed support from UCSC.""" @@ -375,6 +467,7 @@ Binary.register_sniffable_binary_format("bigbed", "bigbed", BigBed) + class TwoBit (Binary): """Class describing a TwoBit format nucleotide file""" @@ -399,3 +492,5 @@ return dataset.peek except: return "Binary TwoBit format nucleotide file (%s)" % (data.nice_size(dataset.get_size())) + +Binary.register_sniffable_binary_format("twobit", "twobit", TwoBit) diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 lib/galaxy/datatypes/converters/bam_to_bigwig_converter.xml --- a/lib/galaxy/datatypes/converters/bam_to_bigwig_converter.xml +++ b/lib/galaxy/datatypes/converters/bam_to_bigwig_converter.xml @@ -1,5 +1,9 @@ <tool id="CONVERTER_bam_to_bigwig_0" name="Convert BAM to BigWig" version="1.0.0" hidden="true"><!-- <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description> --> + <requirements> + <requirement type="package">ucsc_tools</requirement> + <requirement type="package">bedtools</requirement> + </requirements><command> bedtools genomecov -bg -split -ibam $input -g $chromInfo diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 lib/galaxy/datatypes/converters/bed_gff_or_vcf_to_bigwig_converter.xml --- a/lib/galaxy/datatypes/converters/bed_gff_or_vcf_to_bigwig_converter.xml +++ b/lib/galaxy/datatypes/converters/bed_gff_or_vcf_to_bigwig_converter.xml @@ -1,5 +1,9 @@ <tool id="CONVERTER_bed_gff_or_vcf_to_bigwig_0" name="Convert BED, GFF, or VCF to BigWig" version="1.0.0" hidden="true"><!-- <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description> --> + <requirements> + <requirement type="package">ucsc_tools</requirement> + 
<requirement type="package">bedtools</requirement> + </requirements><command> ## Remove comments and sort by chromosome. grep -v '^#' $input | sort -k1,1 | diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 lib/galaxy/datatypes/converters/interval_to_bigwig_converter.xml --- a/lib/galaxy/datatypes/converters/interval_to_bigwig_converter.xml +++ b/lib/galaxy/datatypes/converters/interval_to_bigwig_converter.xml @@ -1,6 +1,10 @@ <tool id="CONVERTER_interval_to_bigwig_0" name="Convert Genomic Intervals To Coverage"><!-- <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description> --><!-- Used on the metadata edit page. --> + <requirements> + <requirement type="package">ucsc_tools</requirement> + <requirement type="package">bedtools</requirement> + </requirements><command> ## Remove comments and sort by chromosome. diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 lib/galaxy/datatypes/converters/interval_to_interval_index_converter.py --- a/lib/galaxy/datatypes/converters/interval_to_interval_index_converter.py +++ b/lib/galaxy/datatypes/converters/interval_to_interval_index_converter.py @@ -11,15 +11,13 @@ from __future__ import division -import sys, fileinput, optparse +import optparse from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from galaxy.visualization.tracks.summary import * -from galaxy.datatypes.util.gff_util import convert_gff_coords_to_bed +eggs.require( "bx-python" ) from bx.interval_index_file import Indexes def main(): - + # Read options, args. parser = optparse.OptionParser() parser.add_option( '-c', '--chr-col', type='int', dest='chrom_col', default=1 ) @@ -27,12 +25,12 @@ parser.add_option( '-e', '--end-col', type='int', dest='end_col', default=3 ) (options, args) = parser.parse_args() input_fname, output_fname = args - + # Make column indices 0-based. options.chrom_col -= 1 options.start_col -= 1 options.end_col -= 1 - + # Do conversion. index = Indexes() offset = 0 @@ -46,9 +44,9 @@ chrom_end = int( feature[ options.end_col ] ) index.add( chrom, chrom_start, chrom_end, offset ) offset += len(line) - + index.write( open(output_fname, "w") ) -if __name__ == "__main__": +if __name__ == "__main__": main() - \ No newline at end of file + diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 lib/galaxy/datatypes/converters/pileup_to_interval_index_converter.py --- a/lib/galaxy/datatypes/converters/pileup_to_interval_index_converter.py +++ b/lib/galaxy/datatypes/converters/pileup_to_interval_index_converter.py @@ -8,20 +8,18 @@ from __future__ import division -import sys, fileinput, optparse +import optparse from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from galaxy.visualization.tracks.summary import * -from galaxy.datatypes.util.gff_util import convert_gff_coords_to_bed +eggs.require( "bx-python" ) from bx.interval_index_file import Indexes def main(): - + # Read options, args. parser = optparse.OptionParser() (options, args) = parser.parse_args() input_fname, output_fname = args - + # Do conversion. 
index = Indexes() offset = 0 @@ -31,9 +29,9 @@ start = int( start ) - 1 index.add( chrom, start, start + 1, offset ) offset += len( line ) - + index.write( open(output_fname, "w") ) -if __name__ == "__main__": +if __name__ == "__main__": main() - \ No newline at end of file + diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 lib/galaxy/datatypes/converters/sam_to_bigwig_converter.xml --- a/lib/galaxy/datatypes/converters/sam_to_bigwig_converter.xml +++ b/lib/galaxy/datatypes/converters/sam_to_bigwig_converter.xml @@ -1,4 +1,9 @@ <tool id="CONVERTER_sam_to_bigwig_0" name="Convert SAM to BigWig" version="1.0.0" hidden="true"> + <requirements> + <requirement type="package">ucsc_tools</requirement> + <requirement type="package">samtools</requirement> + <requirement type="package">bedtools</requirement> + </requirements><command> samtools view -bh $input | bedtools genomecov -bg -split -ibam stdin -g $chromInfo diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 lib/galaxy/datatypes/data.py --- a/lib/galaxy/datatypes/data.py +++ b/lib/galaxy/datatypes/data.py @@ -14,6 +14,8 @@ from galaxy.util.odict import odict from galaxy.util.sanitize_html import sanitize_html +import dataproviders + from galaxy import eggs eggs.require( "Paste" ) import paste @@ -56,6 +58,7 @@ cls.metadata_spec.update( base.metadata_spec ) #add contents of metadata spec of base class to cls metadata.Statement.process( cls ) +@dataproviders.decorators.has_dataproviders class Data( object ): """ Base class for all datatypes. Implements basic interfaces as well @@ -545,7 +548,13 @@ def has_resolution(self): return False - + def matches_any( self, target_datatypes ): + """ + Check if this datatype is of any of the target_datatypes or is + a subtype thereof. + """ + datatype_classes = tuple( [ datatype.__class__ for datatype in target_datatypes ] ) + return isinstance( self, datatype_classes ) def merge( split_files, output_file): """ Merge files with copy.copyfileobj() will not hit the @@ -572,6 +581,39 @@ return [ 'trackster', 'circster' ] return [] + # ------------- Dataproviders + def has_dataprovider( self, data_format ): + """ + Returns True if `data_format` is available in `dataproviders`. + """ + return ( data_format in self.dataproviders ) + + def dataprovider( self, dataset, data_format, **settings ): + """ + Base dataprovider factory for all datatypes that returns the proper provider + for the given `data_format` or raises a `NoProviderAvailable`. 
+ """ + if self.has_dataprovider( data_format ): + return self.dataproviders[ data_format ]( self, dataset, **settings ) + raise dataproviders.exceptions.NoProviderAvailable( self, data_format ) + + @dataproviders.decorators.dataprovider_factory( 'base' ) + def base_dataprovider( self, dataset, **settings ): + dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + return dataproviders.base.DataProvider( dataset_source, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'chunk', dataproviders.chunk.ChunkDataProvider.settings ) + def chunk_dataprovider( self, dataset, **settings ): + dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + return dataproviders.chunk.ChunkDataProvider( dataset_source, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'chunk64', dataproviders.chunk.Base64ChunkDataProvider.settings ) + def chunk64_dataprovider( self, dataset, **settings ): + dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + return dataproviders.chunk.Base64ChunkDataProvider( dataset_source, **settings ) + + +@dataproviders.decorators.has_dataproviders class Text( Data ): file_ext = 'txt' line_class = 'line' @@ -741,10 +783,31 @@ f.close() split = classmethod(split) + # ------------- Dataproviders + @dataproviders.decorators.dataprovider_factory( 'line', dataproviders.line.FilteredLineDataProvider.settings ) + def line_dataprovider( self, dataset, **settings ): + """ + Returns an iterator over the dataset's lines (that have been `strip`ed) + optionally excluding blank lines and lines that start with a comment character. + """ + dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + return dataproviders.line.FilteredLineDataProvider( dataset_source, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'regex-line', dataproviders.line.RegexLineDataProvider.settings ) + def regex_line_dataprovider( self, dataset, **settings ): + """ + Returns an iterator over the dataset's lines + optionally including/excluding lines that match one or more regex filters. + """ + dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + return dataproviders.line.RegexLineDataProvider( dataset_source, **settings ) + + class GenericAsn1( Text ): """Class for generic ASN.1 text format""" file_ext = 'asn1' + class LineCount( Text ): """ Dataset contains a single line with a single integer that denotes the @@ -752,6 +815,7 @@ """ pass + class Newick( Text ): """New Hampshire/Newick Format""" file_ext = "nhx" diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 lib/galaxy/datatypes/dataproviders/__init__.py --- /dev/null +++ b/lib/galaxy/datatypes/dataproviders/__init__.py @@ -0,0 +1,28 @@ + +#TODO: ---- This is a work in progress ---- +""" +Dataproviders are iterators with context managers that provide data to some +consumer datum by datum. + +As well as subclassing and overriding to get the proper data, Dataproviders +can be piped from one to the other. +..example:: + +.. note:: be careful to NOT pipe providers into subclasses of those providers. + Subclasses provide all the functionality of their superclasses, + so there's generally no need. + +.. note:: be careful to when using piped providers that accept the same keywords + in their __init__ functions (such as limit or offset) to pass those + keywords to the proper (often final) provider. These errors that result + can be hard to diagnose. 
+""" +import decorators +import exceptions + +import base +import chunk +import line +import column +import external +import dataset diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 lib/galaxy/datatypes/dataproviders/base.py --- /dev/null +++ b/lib/galaxy/datatypes/dataproviders/base.py @@ -0,0 +1,305 @@ +""" +Base class(es) for all DataProviders. +""" +# there's a blurry line between functionality here and functionality in datatypes module +# attempting to keep parsing to a minimum here and focus on chopping/pagination/reformat(/filtering-maybe?) +# and using as much pre-computed info/metadata from the datatypes module as possible +# also, this shouldn't be a replacement/re-implementation of the tool layer +# (which provides traceability/versioning/reproducibility) + +from collections import deque +import exceptions + +_TODO = """ +hooks into datatypes (define providers inside datatype modules) as factories +capture tell() when provider is done + def stop( self ): self.endpoint = source.tell(); raise StopIteration() +implement __len__ sensibly where it can be (would be good to have where we're giving some progress - '100 of 300') + seems like sniffed files would have this info +unit tests +add datum entry/exit point methods: possibly decode, encode + or create a class that pipes source through - how would decode work then? + +icorporate existing visualization/dataproviders +some of the sources (esp. in datasets) don't need to be re-created +YAGNI: InterleavingMultiSourceDataProvider, CombiningMultiSourceDataProvider + +datasets API entry point: + kwargs should be parsed from strings 2 layers up (in the DatasetsAPI) - that's the 'proper' place for that. + but how would it know how/what to parse if it doesn't have access to the classes used in the provider? + Building a giant list by sweeping all possible dprov classes doesn't make sense + For now - I'm burying them in the class __init__s - but I don't like that +""" + +import logging +log = logging.getLogger( __name__ ) + + +# ----------------------------------------------------------------------------- base classes +class HasSettings( type ): + """ + Metaclass for data providers that allows defining and inheriting + a dictionary named 'settings'. + + Useful for allowing class level access to expected variable types + passed to class `__init__` functions so they can be parsed from a query string. + """ + # yeah - this is all too acrobatic + def __new__( cls, name, base_classes, attributes ): + settings = {} + # get settings defined in base classes + for base_class in base_classes: + base_settings = getattr( base_class, 'settings', None ) + if base_settings: + settings.update( base_settings ) + # get settings defined in this class + new_settings = attributes.pop( 'settings', None ) + if new_settings: + settings.update( new_settings ) + attributes[ 'settings' ] = settings + return type.__new__( cls, name, base_classes, attributes ) + + +# ----------------------------------------------------------------------------- base classes +class DataProvider( object ): + """ + Base class for all data providers. 
Data providers: + (a) have a source (which must be another file-like object) + (b) implement both the iterator and context manager interfaces + (c) do not allow write methods + (but otherwise implement the other file object interface methods) + """ + # a definition of expected types for keyword arguments sent to __init__ + # useful for controlling how query string dictionaries can be parsed into correct types for __init__ + # empty in this base class + __metaclass__ = HasSettings + settings = {} + + def __init__( self, source, **kwargs ): + """ + :param source: the source that this iterator will loop over. + (Should implement the iterable interface and ideally have the + context manager interface as well) + """ + self.source = self.validate_source( source ) + + def validate_source( self, source ): + """ + Is this a valid source for this provider? + + :raises InvalidDataProviderSource: if the source is considered invalid. + + Meant to be overridden in subclasses. + """ + if not source or not hasattr( source, '__iter__' ): + # that's by no means a thorough check + raise exceptions.InvalidDataProviderSource( source ) + return source + + #TODO: (this might cause problems later...) + #TODO: some providers (such as chunk's seek and read) rely on this... remove + def __getattr__( self, name ): + if name == 'source': + # if we're inside this fn, source hasn't been set - provide some safety just for this attr + return None + # otherwise, try to get the attr from the source - allows us to get things like provider.encoding, etc. + if hasattr( self.source, name ): + return getattr( self.source, name ) + # raise the proper error + return self.__getattribute__( name ) + + # write methods should not be allowed + def truncate( self, size ): + raise NotImplementedError( 'Write methods are purposely disabled' ) + def write( self, string ): + raise NotImplementedError( 'Write methods are purposely disabled' ) + def writelines( self, sequence ): + raise NotImplementedError( 'Write methods are purposely disabled' ) + + #TODO: route read methods through next? + #def readline( self ): + # return self.next() + def readlines( self ): + return [ line for line in self ] + + # iterator interface + def __iter__( self ): + # it's generators all the way up, Timmy + with self as source: + for datum in self.source: + yield datum + def next( self ): + return self.source.next() + + # context manager interface + def __enter__( self ): + # make the source's context manager interface optional + if hasattr( self.source, '__enter__' ): + self.source.__enter__() + return self + def __exit__( self, *args ): + # make the source's context manager interface optional, call on source if there + if hasattr( self.source, '__exit__' ): + self.source.__exit__( *args ) + # alternately, call close() + elif hasattr( self.source, 'close' ): + self.source.close() + + def __str__( self ): + """ + String representation for easier debugging. + + Will call `__str__` on it's source so this will display piped dataproviders. + """ + # we need to protect against recursion (in __getattr__) if self.source hasn't been set + source_str = str( self.source ) if hasattr( self, 'source' ) else '' + return '%s(%s)' %( self.__class__.__name__, str( source_str ) ) + + +class FilteredDataProvider( DataProvider ): + """ + Passes each datum through a filter function and yields it if that function + returns a non-`None` value. + + Also maintains counters: + - `num_data_read`: how many data have been consumed from the source. 
+ - `num_valid_data_read`: how many data have been returned from `filter`. + - `num_data_returned`: how many data has this provider yielded. + """ + # not useful here - we don't want functions over the query string + #settings.update({ 'filter_fn': 'function' }) + + def __init__( self, source, filter_fn=None, **kwargs ): + """ + :param filter_fn: a lambda or function that will be passed a datum and + return either the (optionally modified) datum or None. + """ + super( FilteredDataProvider, self ).__init__( source, **kwargs ) + self.filter_fn = filter_fn if hasattr( filter_fn, '__call__' ) else None + # count how many data we got from the source + self.num_data_read = 0 + # how many valid data have we gotten from the source + # IOW, data that's passed the filter and been either provided OR have been skipped due to offset + self.num_valid_data_read = 0 + # how many lines have been provided/output + self.num_data_returned = 0 + + def __iter__( self ): + parent_gen = super( FilteredDataProvider, self ).__iter__() + for datum in parent_gen: + self.num_data_read += 1 + datum = self.filter( datum ) + if datum != None: + self.num_valid_data_read += 1 + self.num_data_returned += 1 + yield datum + + #TODO: may want to squash this into DataProvider + def filter( self, datum ): + """ + When given a datum from the provider's source, return None if the datum + 'does not pass' the filter or is invalid. Return the datum if it's valid. + + :param datum: the datum to check for validity. + :returns: the datum, a modified datum, or None + + Meant to be overridden. + """ + if self.filter_fn: + return self.filter_fn( datum ) + # also can be overriden entirely + return datum + + +class LimitedOffsetDataProvider( FilteredDataProvider ): + """ + A provider that uses the counters from FilteredDataProvider to limit the + number of data and/or skip `offset` number of data before providing. + + Useful for grabbing sections from a source (e.g. pagination). + """ + # define the expected types of these __init__ arguments so they can be parsed out from query strings + settings = { + 'limit' : 'int', + 'offset': 'int' + } + + #TODO: may want to squash this into DataProvider + def __init__( self, source, offset=0, limit=None, **kwargs ): + """ + :param offset: the number of data to skip before providing. + :param limit: the final number of data to provide. + """ + super( LimitedOffsetDataProvider, self ).__init__( source, **kwargs ) + + # how many valid data to skip before we start outputing data - must be positive + # (diff to support neg. indeces - must be pos.) + self.offset = max( offset, 0 ) + + # how many valid data to return - must be positive (None indicates no limit) + self.limit = limit + if self.limit != None: + self.limit = max( self.limit, 0 ) + + def __iter__( self ): + """ + Iterate over the source until `num_valid_data_read` is greater than + `offset`, begin providing datat, and stop when `num_data_returned` + is greater than `offset`. + """ + parent_gen = super( LimitedOffsetDataProvider, self ).__iter__() + for datum in parent_gen: + + if self.limit != None and self.num_data_returned > self.limit: + break + + if self.num_valid_data_read > self.offset: + yield datum + else: + # wot a cheezy way of doing this... 
+ self.num_data_returned -= 1 + + #TODO: skipping lines is inefficient - somehow cache file position/line_num pair and allow provider + # to seek to a pos/line and then begin providing lines + # the important catch here is that we need to have accurate pos/line pairs + # in order to preserve the functionality of limit and offset + #if file_seek and len( file_seek ) == 2: + # seek_pos, new_line_num = file_seek + # self.seek_and_set_curr_line( seek_pos, new_line_num ) + + #def seek_and_set_curr_line( self, file_seek, new_curr_line_num ): + # self.seek( file_seek, os.SEEK_SET ) + # self.curr_line_num = new_curr_line_num + + +class MultiSourceDataProvider( DataProvider ): + """ + A provider that iterates over a list of given sources and provides data + from one after another. + + An iterator over iterators. + """ + def __init__( self, source_list, **kwargs ): + """ + :param source_list: an iterator of iterables + """ + self.source_list = deque( source_list ) + + def __iter__( self ): + """ + Iterate over the source_list, then iterate over the data in each source. + + Skip a given source in `source_list` if it is `None` or invalid. + """ + for source in self.source_list: + # just skip falsy sources + if not source: + continue + try: + self.source = self.validate_source( source ) + except exceptions.InvalidDataProviderSource, invalid_source: + continue + + parent_gen = super( MultiSourceDataProvider, self ).__iter__() + for datum in parent_gen: + yield datum diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 lib/galaxy/datatypes/dataproviders/chunk.py --- /dev/null +++ b/lib/galaxy/datatypes/dataproviders/chunk.py @@ -0,0 +1,84 @@ +""" +Chunk (N number of bytes at M offset to a source's beginning) provider. + +Primarily for file sources but usable by any iterator that has both +seek and read( N ). +""" +import os +import base64 + +import base +import exceptions + +_TODO = """ +""" + +import logging +log = logging.getLogger( __name__ ) + + +# ----------------------------------------------------------------------------- +class ChunkDataProvider( base.DataProvider ): + """ + Data provider that yields chunks of data from it's file. + + Note: this version does not account for lines and works with Binary datatypes. + """ + MAX_CHUNK_SIZE = 2**16 + DEFAULT_CHUNK_SIZE = MAX_CHUNK_SIZE + settings = { + 'chunk_index' : 'int', + 'chunk_size' : 'int' + } + + #TODO: subclass from LimitedOffsetDataProvider? + # see web/framework/base.iterate_file, util/__init__.file_reader, and datatypes.tabular + def __init__( self, source, chunk_index=0, chunk_size=DEFAULT_CHUNK_SIZE, **kwargs ): + """ + :param chunk_index: if a source can be divided into N number of + `chunk_size` sections, this is the index of which section to + return. + :param chunk_size: how large are the desired chunks to return + (gen. in bytes). + """ + super( ChunkDataProvider, self ).__init__( source, **kwargs ) + self.chunk_size = int( chunk_size ) + self.chunk_pos = int( chunk_index ) * self.chunk_size + + def validate_source( self, source ): + """ + Does the given source have both the methods `seek` and `read`? + :raises InvalidDataProviderSource: if not. 
+ """ + source = super( ChunkDataProvider, self ).validate_source( source ) + if( ( not hasattr( source, 'seek' ) ) + or ( not hasattr( source, 'read' ) ) ): + raise exceptions.InvalidDataProviderSource( source ) + return source + + def __iter__( self ): + # not reeeally an iterator per se + self.__enter__() + self.source.seek( self.chunk_pos, os.SEEK_SET ) + chunk = self.encode( self.source.read( self.chunk_size ) ) + yield chunk + self.__exit__() + + def encode( self, chunk ): + """ + Called on the chunk before returning. + + Overrride to modify, encode, or decode chunks. + """ + return chunk + + +class Base64ChunkDataProvider( ChunkDataProvider ): + """ + Data provider that yields chunks of base64 encoded data from it's file. + """ + def encode( self, chunk ): + """ + Return chunks encoded in base 64. + """ + return base64.b64encode( chunk ) diff -r 7740b1dc41fad7bc6cfd80f1af41ae05dae2f4c3 -r 866d8b29854f4079250efae62a6fddf0204e51c4 lib/galaxy/datatypes/dataproviders/column.py --- /dev/null +++ b/lib/galaxy/datatypes/dataproviders/column.py @@ -0,0 +1,254 @@ +""" +Providers that provide lists of lists generally where each line of a source +is further subdivided into multiple data (e.g. columns from a line). +""" + +import line + +_TODO = """ +move ColumnarDataProvider parsers to more sensible location + +TransposedColumnarDataProvider: provides each column as a single array + - see existing visualizations/dataprovider/basic.ColumnDataProvider +""" + +import logging +log = logging.getLogger( __name__ ) + + +# ----------------------------------------------------------------------------- base classes +class ColumnarDataProvider( line.RegexLineDataProvider ): + """ + Data provider that provide a list of columns from the lines of it's source. + + Columns are returned in the order given in indeces, so this provider can + re-arrange columns. + + If any desired index is outside the actual number of columns + in the source, this provider will None-pad the output and you are guaranteed + the same number of columns as the number of indeces asked for (even if they + are filled with None). + """ + settings = { + 'indeces' : 'list:int', + 'column_count' : 'int', + 'column_types' : 'list:str', + 'parse_columns' : 'bool', + 'deliminator' : 'str' + } + + def __init__( self, source, indeces=None, + column_count=None, column_types=None, parsers=None, parse_columns=True, + deliminator='\t', **kwargs ): + """ + :param indeces: a list of indeces of columns to gather from each row + Optional: will default to `None`. + If `None`, this provider will return all rows (even when a + particular row contains more/less than others). + If a row/line does not contain an element at a given index, the + provider will-return/fill-with a `None` value as the element. + :type indeces: list or None + + :param column_count: an alternate means of defining indeces, use an int + here to effectively provide the first N columns. + Optional: will default to `None`. + :type column_count: int + + :param column_types: a list of string names of types that the + provider will use to look up an appropriate parser for the column. + (e.g. 'int', 'float', 'str', 'bool') + Optional: will default to parsing all columns as strings. + :type column_types: list of strings + + :param parsers: a dictionary keyed with column type strings + and with values that are functions to use when parsing those + types. + Optional: will default to using the function `_get_default_parsers`. 
+ :type parsers: dictionary + + :param parse_columns: attempt to parse columns? + Optional: defaults to `True`. + :type parse_columns: bool + + :param deliminator: character(s) used to split each row/line of the source. + Optional: defaults to the tab character. + :type deliminator: str + + .. note: that the subclass constructors are passed kwargs - so they're + params (limit, offset, etc.) are also applicable here. + """ + #TODO: other columnar formats: csv, etc. + super( ColumnarDataProvider, self ).__init__( source, **kwargs ) + + #IMPLICIT: if no indeces, column_count, or column_types passed: return all columns + self.selected_column_indeces = indeces + self.column_count = column_count + self.column_types = column_types or [] + # if no column count given, try to infer from indeces or column_types + if not self.column_count: + if self.selected_column_indeces: + self.column_count = len( self.selected_column_indeces ) + elif self.column_types: + self.column_count = len( self.column_types ) + # if no indeces given, infer from column_count + if not self.selected_column_indeces and self.column_count: + self.selected_column_indeces = list( xrange( self.column_count ) ) + + self.deliminator = deliminator + + # how/whether to parse each column value + self.parsers = {} + if parse_columns: + self.parsers = self.get_default_parsers() + # overwrite with user desired parsers + self.parsers.update( parsers or {} ) + + def get_default_parsers( self ): + """ + Return parser dictionary keyed for each columnar type + (as defined in datatypes). + + .. note: primitives only by default (str, int, float, boolean, None). + Other (more complex) types are retrieved as strings. + :returns: a dictionary of the form: + `{ <parser type name> : <function used to parse type> }` + """ + #TODO: move to module level (or datatypes, util) + return { + # str is default and not needed here + 'int' : int, + 'float' : float, + 'bool' : bool, + + # unfortunately, 'list' is used in dataset metadata both for + # query style maps (9th col gff) AND comma-sep strings. + # (disabled for now) + #'list' : lambda v: v.split( ',' ), + #'csv' : lambda v: v.split( ',' ), + ## i don't like how urlparses does sub-lists... + #'querystr' : lambda v: dict([ ( p.split( '=', 1 ) if '=' in p else ( p, True ) ) + # for p in v.split( ';', 1 ) ]) + + #'scifloat': #floating point which may be in scientific notation + + # always with the 1 base, biologists? + #'int1' : ( lambda i: int( i ) - 1 ), + + #'gffval': string or '.' for None + #'gffint': # int or '.' for None + #'gffphase': # 0, 1, 2, or '.' for None + #'gffstrand': # -, +, ?, or '.' for None, etc. + } + + def parse_value( self, val, type ): + """ + Attempt to parse and return the given value based on the given type. + + :param val: the column value to parse (often a string) + :param type: the string type 'name' used to find the appropriate parser + :returns: the parsed value + or `value` if no `type` found in `parsers` + or `None` if there was a parser error (ValueError) + """ + if type == 'str' or type == None: return val + try: + return self.parsers[ type ]( val ) + except KeyError, err: + # no parser - return as string + pass + except ValueError, err: + # bad value - return None + return None + return val + + def get_column_type( self, index ): + """ + Get the column type for the parser from `self.column_types` or `None` + if the type is unavailable. + :param index: the column index + :returns: string name of type (e.g. 'float', 'int', etc.) 
+ """ + try: + return self.column_types[ index ] + except IndexError, ind_err: + return None + + def parse_column_at_index( self, columns, parser_index, index ): + """ + Get the column type for the parser from `self.column_types` or `None` + if the type is unavailable. + """ + try: + return self.parse_value( columns[ index ], self.get_column_type( parser_index ) ) + # if a selected index is not within columns, return None + except IndexError, index_err: + return None + + def parse_columns_from_line( self, line ): + """ + Returns a list of the desired, parsed columns. + :param line: the line to parse + :type line: str + """ + #TODO: too much going on in this loop - the above should all be precomputed AMAP... + all_columns = line.split( self.deliminator ) + # if no indeces were passed to init, return all columns + selected_indeces = self.selected_column_indeces or list( xrange( len( all_columns ) ) ) + parsed_columns = [] + for parser_index, column_index in enumerate( selected_indeces ): + parsed_columns.append( self.parse_column_at_index( all_columns, parser_index, column_index ) ) + return parsed_columns + + def __iter__( self ): + parent_gen = super( ColumnarDataProvider, self ).__iter__() + for line in parent_gen: + columns = self.parse_columns_from_line( line ) + yield columns + + #TODO: implement column filters here and not below - flatten hierarchy + +class FilteredByColumnDataProvider( ColumnarDataProvider ): + """ + Data provider that provide a list of columns from the lines of it's source + _only_ if they pass a given filter function. + + e.g. column #3 is type int and > N + """ + # TODO: how to do this and still have limit and offset work? + def __init__( self, source, **kwargs ): + raise NotImplementedError() + super( FilteredByColumnDataProvider, self ).__init__( source, **kwargs ) + + +class DictDataProvider( ColumnarDataProvider ): + """ + Data provider that zips column_names and columns from the source's contents + into a dictionary. + + A combination use of both `column_names` and `indeces` allows 'picking' + key/value pairs from the source. + + .. note: that the subclass constructors are passed kwargs - so they're + params (limit, offset, etc.) are also applicable here. + """ + settings = { + 'column_names' : 'list:str', + } + + def __init__( self, source, column_names=None, **kwargs ): + """ + :param column_names: an ordered list of strings that will be used as the keys + for each column in the returned dictionaries. + The number of key, value pairs each returned dictionary has will + be as short as the number of column names provided. + :type column_names: + """ + #TODO: allow passing in a map instead of name->index { 'name1': index1, ... } + super( DictDataProvider, self ).__init__( source, **kwargs ) + self.column_names = column_names or [] + + def __iter__( self ): + parent_gen = super( DictDataProvider, self ).__iter__() + for column_values in parent_gen: + map = dict( zip( self.column_names, column_values ) ) + yield map This diff is so big that we needed to truncate the remainder. Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.