4 new commits in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/ca6940bbf946/ Changeset: ca6940bbf946 Branch: provenance User: Kyle Ellrott Date: 2013-06-19 23:55:37 Summary: Mainline merge Affected #: 210 files diff -r 65fbe93c7abe40826ce752462d2f906538efcab5 -r ca6940bbf946d7a6e937c4ef1652f8c8afbc1ef8 .hgtags --- a/.hgtags +++ b/.hgtags @@ -2,3 +2,4 @@ 1c717491139269651bb59687563da9410b84c65d release_2013.02.08 75f09617abaadbc8cc732bb8ee519decaeb56ea7 release_2013.04.01 2cc8d10988e03257dc7b97f8bb332c7df745d1dd security_2013.04.08 +524f246ca85395082719ae7a6ff72260d7ad5612 release_2013.06.03 diff -r 65fbe93c7abe40826ce752462d2f906538efcab5 -r ca6940bbf946d7a6e937c4ef1652f8c8afbc1ef8 job_conf.xml.sample_advanced --- a/job_conf.xml.sample_advanced +++ b/job_conf.xml.sample_advanced @@ -54,7 +54,15 @@ <param id="shell_hostname">foo.example.org</param><param id="Job_Execution_Time">24:00:00</param></destination> - <destination id="condor" runner="condor"/> + <destination id="condor" runner="condor"> + <!-- With no params, jobs are submitted to the 'vanilla' universe with: + notification = NEVER + getenv = true + Additional/override query ClassAd params can be specified with + <param> tags. + --> + <param id="request_cpus">8</param> + </destination></destinations><tools><!-- Tools can be configured to use specific destinations or handlers, diff -r 65fbe93c7abe40826ce752462d2f906538efcab5 -r ca6940bbf946d7a6e937c4ef1652f8c8afbc1ef8 lib/galaxy/app.py --- a/lib/galaxy/app.py +++ b/lib/galaxy/app.py @@ -62,7 +62,8 @@ self.config.database_engine_options, database_query_profiling_proxy = self.config.database_query_profiling_proxy, object_store = self.object_store, - trace_logger=self.trace_logger ) + trace_logger=self.trace_logger, + use_pbkdf2=self.config.get_bool( 'use_pbkdf2', True ) ) # Manage installed tool shed repositories. 
self.installed_repository_manager = tool_shed.galaxy_install.InstalledRepositoryManager( self ) # Create an empty datatypes registry. @@ -91,7 +92,7 @@ # Load additional entries defined by self.config.shed_tool_data_table_config into tool data tables. self.tool_data_tables.load_from_config_file( config_filename=self.config.shed_tool_data_table_config, tool_data_path=self.tool_data_tables.tool_data_path, - from_shed_config=True ) + from_shed_config=False ) # Initialize the job management configuration self.job_config = jobs.JobConfiguration(self) # Initialize the tools, making sure the list of tool configs includes the reserved migrated_tools_conf.xml file. diff -r 65fbe93c7abe40826ce752462d2f906538efcab5 -r ca6940bbf946d7a6e937c4ef1652f8c8afbc1ef8 lib/galaxy/config.py --- a/lib/galaxy/config.py +++ b/lib/galaxy/config.py @@ -64,21 +64,33 @@ tcf = kwargs[ 'tool_config_files' ] else: tcf = 'tool_conf.xml' + self.tool_filters = listify( kwargs.get( "tool_filters", [] ) ) + self.tool_label_filters = listify( kwargs.get( "tool_label_filters", [] ) ) + self.tool_section_filters = listify( kwargs.get( "tool_section_filters", [] ) ) self.tool_configs = [ resolve_path( p, self.root ) for p in listify( tcf ) ] + self.shed_tool_data_path = kwargs.get( "shed_tool_data_path", None ) + if self.shed_tool_data_path: + self.shed_tool_data_path = resolve_path( self.shed_tool_data_path, self.root ) + else: + self.shed_tool_data_path = self.tool_data_path self.tool_data_table_config_path = resolve_path( kwargs.get( 'tool_data_table_config_path', 'tool_data_table_conf.xml' ), self.root ) self.shed_tool_data_table_config = resolve_path( kwargs.get( 'shed_tool_data_table_config', 'shed_tool_data_table_conf.xml' ), self.root ) self.enable_tool_shed_check = string_as_bool( kwargs.get( 'enable_tool_shed_check', False ) ) + self.hours_between_check = kwargs.get( 'hours_between_check', 12 ) try: - self.hours_between_check = kwargs.get( 'hours_between_check', 12 ) - if isinstance( 
self.hours_between_check, float ): + hbc_test = int( self.hours_between_check ) + self.hours_between_check = hbc_test + if self.hours_between_check < 1 or self.hours_between_check > 24: + self.hours_between_check = 12 + except: + try: # Float values are supported for functional tests. + hbc_test = float( self.hours_between_check ) + self.hours_between_check = hbc_test if self.hours_between_check < 0.001 or self.hours_between_check > 24.0: self.hours_between_check = 12.0 - else: - if self.hours_between_check < 1 or self.hours_between_check > 24: - self.hours_between_check = 12 - except: - self.hours_between_check = 12 + except: + self.hours_between_check = 12 self.update_integrated_tool_panel = kwargs.get( "update_integrated_tool_panel", True ) self.enable_data_manager_user_view = string_as_bool( kwargs.get( "enable_data_manager_user_view", "False" ) ) self.data_manager_config_file = resolve_path( kwargs.get('data_manager_config_file', 'data_manager_conf.xml' ), self.root ) @@ -154,6 +166,7 @@ self.ucsc_display_sites = kwargs.get( 'ucsc_display_sites', "main,test,archaea,ucla" ).lower().split(",") self.gbrowse_display_sites = kwargs.get( 'gbrowse_display_sites', "modencode,sgd_yeast,tair,wormbase,wormbase_ws120,wormbase_ws140,wormbase_ws170,wormbase_ws180,wormbase_ws190,wormbase_ws200,wormbase_ws204,wormbase_ws210,wormbase_ws220,wormbase_ws225" ).lower().split(",") self.brand = kwargs.get( 'brand', None ) + self.welcome_url = kwargs.get( 'welcome_url', '/static/welcome.html' ) # Configuration for the message box directly below the masthead. 
self.message_box_visible = kwargs.get( 'message_box_visible', False ) self.message_box_content = kwargs.get( 'message_box_content', None ) diff -r 65fbe93c7abe40826ce752462d2f906538efcab5 -r ca6940bbf946d7a6e937c4ef1652f8c8afbc1ef8 lib/galaxy/datatypes/binary.py --- a/lib/galaxy/datatypes/binary.py +++ b/lib/galaxy/datatypes/binary.py @@ -22,6 +22,7 @@ from galaxy.datatypes.metadata import MetadataElement from galaxy.datatypes import metadata from galaxy.datatypes.sniff import * +import dataproviders log = logging.getLogger(__name__) @@ -74,6 +75,7 @@ trans.response.headers["Content-Disposition"] = 'attachment; filename="Galaxy%s-[%s].%s"' % (dataset.hid, fname, to_ext) return open( dataset.file_name ) + class Ab1( Binary ): """Class describing an ab1 binary sequence file""" file_ext = "ab1" @@ -93,12 +95,15 @@ Binary.register_unsniffable_binary_ext("ab1") + class GenericAsn1Binary( Binary ): """Class for generic ASN.1 binary format""" file_ext = "asn1-binary" Binary.register_unsniffable_binary_ext("asn1-binary") + +@dataproviders.decorators.has_dataproviders class Bam( Binary ): """Class describing a BAM binary file""" file_ext = "bam" @@ -255,9 +260,92 @@ return dataset.peek except: return "Binary bam alignments file (%s)" % ( data.nice_size( dataset.get_size() ) ) + + # ------------- Dataproviders + # pipe through samtools view + #ALSO: (as Sam) + # bam does not use '#' to indicate comments/headers - we need to strip out those headers from the std. providers + #TODO:?? seems like there should be an easier way to do/inherit this - metadata.comment_char? + #TODO: incorporate samtools options to control output: regions first, then flags, etc. 
+ @dataproviders.decorators.dataprovider_factory( 'line' ) + def line_dataprovider( self, dataset, **settings ): + samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset ) + settings[ 'comment_char' ] = '@' + return dataproviders.line.FilteredLineDataProvider( samtools_source, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'regex-line' ) + def regex_line_dataprovider( self, dataset, **settings ): + samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset ) + settings[ 'comment_char' ] = '@' + return dataproviders.line.RegexLineDataProvider( samtools_source, **settings ) + @dataproviders.decorators.dataprovider_factory( 'column' ) + def column_dataprovider( self, dataset, **settings ): + samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset ) + settings[ 'comment_char' ] = '@' + return dataproviders.column.ColumnarDataProvider( samtools_source, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'map' ) + def map_dataprovider( self, dataset, **settings ): + samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset ) + settings[ 'comment_char' ] = '@' + return dataproviders.column.MapDataProvider( samtools_source, **settings ) + + # these can't be used directly - may need BamColumn, BamMap (Bam metadata -> column/map) + # OR - see genomic_region_dataprovider + #@dataproviders.decorators.dataprovider_factory( 'dataset-column' ) + #def dataset_column_dataprovider( self, dataset, **settings ): + # settings[ 'comment_char' ] = '@' + # return super( Sam, self ).dataset_column_dataprovider( dataset, **settings ) + + #@dataproviders.decorators.dataprovider_factory( 'dataset-map' ) + #def dataset_map_dataprovider( self, dataset, **settings ): + # settings[ 'comment_char' ] = '@' + # return super( Sam, self ).dataset_map_dataprovider( dataset, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'header' ) + def header_dataprovider( self, dataset, **settings ): + # in this case 
we can use an option of samtools view to provide just what we need (w/o regex) + samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset, '-H' ) + return dataproviders.line.RegexLineDataProvider( samtools_source, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'id-seq-qual' ) + def id_seq_qual_dataprovider( self, dataset, **settings ): + settings[ 'indeces' ] = [ 0, 9, 10 ] + settings[ 'column_types' ] = [ 'str', 'str', 'str' ] + settings[ 'column_names' ] = [ 'id', 'seq', 'qual' ] + return self.map_dataprovider( dataset, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'genomic-region' ) + def genomic_region_dataprovider( self, dataset, **settings ): + # GenomicRegionDataProvider currently requires a dataset as source - may not be necc. + #TODO:?? consider (at least) the possible use of a kwarg: metadata_source (def. to source.dataset), + # or remove altogether... + #samtools_source = dataproviders.dataset.SamtoolsDataProvider( dataset ) + #return dataproviders.dataset.GenomicRegionDataProvider( samtools_source, metadata_source=dataset, + # 2, 3, 3, **settings ) + + # instead, set manually and use in-class column gen + settings[ 'indeces' ] = [ 2, 3, 3 ] + settings[ 'column_types' ] = [ 'str', 'int', 'int' ] + return self.column_dataprovider( dataset, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'genomic-region-map' ) + def genomic_region_map_dataprovider( self, dataset, **settings ): + settings[ 'indeces' ] = [ 2, 3, 3 ] + settings[ 'column_types' ] = [ 'str', 'int', 'int' ] + settings[ 'column_names' ] = [ 'chrom', 'start', 'end' ] + return self.map_dataprovider( dataset, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'samtools' ) + def samtools_dataprovider( self, dataset, **settings ): + """Generic samtools interface - all options available through settings.""" + dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + return 
dataproviders.dataset.SamtoolsDataProvider( dataset_source, **settings ) + Binary.register_sniffable_binary_format("bam", "bam", Bam) + class H5( Binary ): """Class describing an HDF5 file""" file_ext = "h5" @@ -277,6 +365,7 @@ Binary.register_unsniffable_binary_ext("h5") + class Scf( Binary ): """Class describing an scf binary sequence file""" file_ext = "scf" @@ -296,6 +385,7 @@ Binary.register_unsniffable_binary_ext("scf") + class Sff( Binary ): """ Standard Flowgram Format (SFF) """ file_ext = "sff" @@ -327,6 +417,7 @@ Binary.register_sniffable_binary_format("sff", "sff", Sff) + class BigWig(Binary): """ Accessing binary BigWig files from UCSC. @@ -363,6 +454,7 @@ Binary.register_sniffable_binary_format("bigwig", "bigwig", BigWig) + class BigBed(BigWig): """BigBed support from UCSC.""" @@ -375,6 +467,7 @@ Binary.register_sniffable_binary_format("bigbed", "bigbed", BigBed) + class TwoBit (Binary): """Class describing a TwoBit format nucleotide file""" @@ -399,3 +492,5 @@ return dataset.peek except: return "Binary TwoBit format nucleotide file (%s)" % (data.nice_size(dataset.get_size())) + +Binary.register_sniffable_binary_format("twobit", "twobit", TwoBit) diff -r 65fbe93c7abe40826ce752462d2f906538efcab5 -r ca6940bbf946d7a6e937c4ef1652f8c8afbc1ef8 lib/galaxy/datatypes/data.py --- a/lib/galaxy/datatypes/data.py +++ b/lib/galaxy/datatypes/data.py @@ -14,6 +14,8 @@ from galaxy.util.odict import odict from galaxy.util.sanitize_html import sanitize_html +import dataproviders + from galaxy import eggs eggs.require( "Paste" ) import paste @@ -56,6 +58,7 @@ cls.metadata_spec.update( base.metadata_spec ) #add contents of metadata spec of base class to cls metadata.Statement.process( cls ) +@dataproviders.decorators.has_dataproviders class Data( object ): """ Base class for all datatypes. 
Implements basic interfaces as well @@ -545,7 +548,13 @@ def has_resolution(self): return False - + def matches_any( self, target_datatypes ): + """ + Check if this datatype is of any of the target_datatypes or is + a subtype thereof. + """ + datatype_classes = tuple( [ datatype.__class__ for datatype in target_datatypes ] ) + return isinstance( self, datatype_classes ) def merge( split_files, output_file): """ Merge files with copy.copyfileobj() will not hit the @@ -572,6 +581,40 @@ return [ 'trackster', 'circster' ] return [] + # ------------- Dataproviders + def has_dataprovider( self, data_format ): + """ + Returns True if `data_format` is available in `dataproviders`. + """ + return ( data_format in self.dataproviders ) + + def dataprovider( self, dataset, data_format, **settings ): + """ + Base dataprovider factory for all datatypes that returns the proper provider + for the given `data_format` or raises a `NoProviderAvailable`. + """ + #TODO:?? is this handling super class providers? 
+ if self.has_dataprovider( data_format ): + return self.dataproviders[ data_format ]( self, dataset, **settings ) + raise dataproviders.exceptions.NoProviderAvailable( self, data_format ) + + @dataproviders.decorators.dataprovider_factory( 'base' ) + def base_dataprovider( self, dataset, **settings ): + dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + return dataproviders.base.DataProvider( dataset_source, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'chunk' ) + def chunk_dataprovider( self, dataset, **settings ): + dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + return dataproviders.chunk.ChunkDataProvider( dataset_source, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'chunk64' ) + def chunk64_dataprovider( self, dataset, **settings ): + dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + return dataproviders.chunk.Base64ChunkDataProvider( dataset_source, **settings ) + + +@dataproviders.decorators.has_dataproviders class Text( Data ): file_ext = 'txt' line_class = 'line' @@ -741,10 +784,31 @@ f.close() split = classmethod(split) + # ------------- Dataproviders + @dataproviders.decorators.dataprovider_factory( 'line' ) + def line_dataprovider( self, dataset, **settings ): + """ + Returns an iterator over the dataset's lines (that have been `strip`ed) + optionally excluding blank lines and lines that start with a comment character. + """ + dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + return dataproviders.line.FilteredLineDataProvider( dataset_source, **settings ) + + @dataproviders.decorators.dataprovider_factory( 'regex-line' ) + def regex_line_dataprovider( self, dataset, **settings ): + """ + Returns an iterator over the dataset's lines + optionally including/excluding lines that match one or more regex filters. 
+ """ + dataset_source = dataproviders.dataset.DatasetDataProvider( dataset ) + return dataproviders.line.RegexLineDataProvider( dataset_source, **settings ) + + class GenericAsn1( Text ): """Class for generic ASN.1 text format""" file_ext = 'asn1' + class LineCount( Text ): """ Dataset contains a single line with a single integer that denotes the @@ -752,6 +816,7 @@ """ pass + class Newick( Text ): """New Hampshire/Newick Format""" file_ext = "nhx" diff -r 65fbe93c7abe40826ce752462d2f906538efcab5 -r ca6940bbf946d7a6e937c4ef1652f8c8afbc1ef8 lib/galaxy/datatypes/dataproviders/__init__.py --- /dev/null +++ b/lib/galaxy/datatypes/dataproviders/__init__.py @@ -0,0 +1,28 @@ + +#TODO: ---- This is a work in progress ---- +""" +Dataproviders are iterators with context managers that provide data to some +consumer datum by datum. + +As well as subclassing and overriding to get the proper data, Dataproviders +can be piped from one to the other. +..example:: + +.. note:: be careful to NOT pipe providers into subclasses of those providers. + Subclasses provide all the functionality of their superclasses, + so there's generally no need. + +.. note:: be careful to when using piped providers that accept the same keywords + in their __init__ functions (such as limit or offset) to pass those + keywords to the proper (often final) provider. These errors that result + can be hard to diagnose. +""" +import decorators +import exceptions + +import base +import chunk +import line +import column +import external +import dataset diff -r 65fbe93c7abe40826ce752462d2f906538efcab5 -r ca6940bbf946d7a6e937c4ef1652f8c8afbc1ef8 lib/galaxy/datatypes/dataproviders/base.py --- /dev/null +++ b/lib/galaxy/datatypes/dataproviders/base.py @@ -0,0 +1,260 @@ +""" +Base class(es) for all DataProviders. +""" +# there's a blurry line between functionality here and functionality in datatypes module +# attempting to keep parsing to a minimum here and focus on chopping/pagination/reformat(/filtering-maybe?) 
+# and using as much pre-computed info/metadata from the datatypes module as possible +# also, this shouldn't be a replacement/re-implementation of the tool layer +# (which provides traceability/versioning/reproducibility) + +from collections import deque +import exceptions + +_TODO = """ +hooks into datatypes (define providers inside datatype modules) as factories +capture tell() when provider is done + def stop( self ): self.endpoint = source.tell(); raise StopIteration() +implement __len__ sensibly where it can be (would be good to have where we're giving some progress - '100 of 300') + seems like sniffed files would have this info +unit tests +add datum entry/exit point methods: possibly decode, encode + or create a class that pipes source through - how would decode work then? + +icorporate existing visualization/dataproviders +some of the sources (esp. in datasets) don't need to be re-created + +YAGNI: InterleavingMultiSourceDataProvider, CombiningMultiSourceDataProvider +""" + +import logging +log = logging.getLogger( __name__ ) + + +# ----------------------------------------------------------------------------- base classes +class DataProvider( object ): + """ + Base class for all data providers. Data providers: + (a) have a source (which must be another file-like object) + (b) implement both the iterator and context manager interfaces + (c) do not allow write methods + (but otherwise implement the other file object interface methods) + """ + def __init__( self, source, **kwargs ): + """ + :param source: the source that this iterator will loop over. + (Should implement the iterable interface and ideally have the + context manager interface as well) + """ + self.source = self.validate_source( source ) + + def validate_source( self, source ): + """ + Is this a valid source for this provider? + + :raises InvalidDataProviderSource: if the source is considered invalid. + + Meant to be overridden in subclasses. 
+ """ + if not source or not hasattr( source, '__iter__' ): + # that's by no means a thorough check + raise exceptions.InvalidDataProviderSource( source ) + return source + + #TODO: (this might cause problems later...) + #TODO: some providers (such as chunk's seek and read) rely on this... remove + def __getattr__( self, name ): + if name == 'source': + # if we're inside this fn, source hasn't been set - provide some safety just for this attr + return None + # otherwise, try to get the attr from the source - allows us to get things like provider.encoding, etc. + if hasattr( self.source, name ): + return getattr( self.source, name ) + # raise the proper error + return self.__getattribute__( name ) + + # write methods should not be allowed + def truncate( self, size ): + raise NotImplementedError( 'Write methods are purposely disabled' ) + def write( self, string ): + raise NotImplementedError( 'Write methods are purposely disabled' ) + def writelines( self, sequence ): + raise NotImplementedError( 'Write methods are purposely disabled' ) + + #TODO: route read methods through next? + #def readline( self ): + # return self.next() + def readlines( self ): + return [ line for line in self ] + + # iterator interface + def __iter__( self ): + # it's generators all the way up, Timmy + with self as source: + for datum in self.source: + yield datum + def next( self ): + return self.source.next() + + # context manager interface + def __enter__( self ): + # make the source's context manager interface optional + if hasattr( self.source, '__enter__' ): + self.source.__enter__() + return self + def __exit__( self, *args ): + # make the source's context manager interface optional, call on source if there + if hasattr( self.source, '__exit__' ): + self.source.__exit__( *args ) + # alternately, call close() + elif hasattr( self.source, 'close' ): + self.source.close() + + def __str__( self ): + """ + String representation for easier debugging. 
+ + Will call `__str__` on it's source so this will display piped dataproviders. + """ + # we need to protect against recursion (in __getattr__) if self.source hasn't been set + source_str = str( self.source ) if hasattr( self, 'source' ) else '' + return '%s(%s)' %( self.__class__.__name__, str( source_str ) ) + + +class FilteredDataProvider( DataProvider ): + """ + Passes each datum through a filter function and yields it if that function + returns a non-`None` value. + + Also maintains counters: + - `num_data_read`: how many data have been consumed from the source. + - `num_valid_data_read`: how many data have been returned from `filter`. + - `num_data_returned`: how many data has this provider yielded. + """ + def __init__( self, source, filter_fn=None, **kwargs ): + """ + :param filter_fn: a lambda or function that will be passed a datum and + return either the (optionally modified) datum or None. + """ + super( FilteredDataProvider, self ).__init__( source, **kwargs ) + self.filter_fn = filter_fn + # count how many data we got from the source + self.num_data_read = 0 + # how many valid data have we gotten from the source + # IOW, data that's passed the filter and been either provided OR have been skipped due to offset + self.num_valid_data_read = 0 + # how many lines have been provided/output + self.num_data_returned = 0 + + def __iter__( self ): + parent_gen = super( FilteredDataProvider, self ).__iter__() + for datum in parent_gen: + self.num_data_read += 1 + datum = self.filter( datum ) + if datum != None: + self.num_valid_data_read += 1 + self.num_data_returned += 1 + yield datum + + #TODO: may want to squash this into DataProvider + def filter( self, datum ): + """ + When given a datum from the provider's source, return None if the datum + 'does not pass' the filter or is invalid. Return the datum if it's valid. + + :param datum: the datum to check for validity. + :returns: the datum, a modified datum, or None + + Meant to be overridden. 
+ """ + if self.filter_fn: + return self.filter_fn( datum ) + # also can be overriden entirely + return datum + + +class LimitedOffsetDataProvider( FilteredDataProvider ): + """ + A provider that uses the counters from FilteredDataProvider to limit the + number of data and/or skip `offset` number of data before providing. + + Useful for grabbing sections from a source (e.g. pagination). + """ + #TODO: may want to squash this into DataProvider + def __init__( self, source, offset=0, limit=None, **kwargs ): + """ + :param offset: the number of data to skip before providing. + :param limit: the final number of data to provide. + """ + super( LimitedOffsetDataProvider, self ).__init__( source, **kwargs ) + + # how many valid data to skip before we start outputing data - must be positive + # (diff to support neg. indeces - must be pos.) + self.offset = max( offset, 0 ) + + # how many valid data to return - must be positive (None indicates no limit) + self.limit = limit + if self.limit != None: + self.limit = max( self.limit, 0 ) + + def __iter__( self ): + """ + Iterate over the source until `num_valid_data_read` is greater than + `offset`, begin providing datat, and stop when `num_data_returned` + is greater than `offset`. + """ + parent_gen = super( LimitedOffsetDataProvider, self ).__iter__() + for datum in parent_gen: + + if self.limit != None and self.num_data_returned > self.limit: + break + + if self.num_valid_data_read > self.offset: + yield datum + else: + # wot a cheezy way of doing this... 
+ self.num_data_returned -= 1 + + #TODO: skipping lines is inefficient - somehow cache file position/line_num pair and allow provider + # to seek to a pos/line and then begin providing lines + # the important catch here is that we need to have accurate pos/line pairs + # in order to preserve the functionality of limit and offset + #if file_seek and len( file_seek ) == 2: + # seek_pos, new_line_num = file_seek + # self.seek_and_set_curr_line( seek_pos, new_line_num ) + + #def seek_and_set_curr_line( self, file_seek, new_curr_line_num ): + # self.seek( file_seek, os.SEEK_SET ) + # self.curr_line_num = new_curr_line_num + + +class MultiSourceDataProvider( DataProvider ): + """ + A provider that iterates over a list of given sources and provides data + from one after another. + + An iterator over iterators. + """ + def __init__( self, source_list, **kwargs ): + """ + :param source_list: an iterator of iterables + """ + self.source_list = deque( source_list ) + + def __iter__( self ): + """ + Iterate over the source_list, then iterate over the data in each source. + + Skip a given source in `source_list` if it is `None` or invalid. + """ + for source in self.source_list: + # just skip falsy sources + if not source: + continue + try: + self.source = self.validate_source( source ) + except exceptions.InvalidDataProviderSource, invalid_source: + continue + + parent_gen = super( MultiSourceDataProvider, self ).__iter__() + for datum in parent_gen: + yield datum diff -r 65fbe93c7abe40826ce752462d2f906538efcab5 -r ca6940bbf946d7a6e937c4ef1652f8c8afbc1ef8 lib/galaxy/datatypes/dataproviders/chunk.py --- /dev/null +++ b/lib/galaxy/datatypes/dataproviders/chunk.py @@ -0,0 +1,80 @@ +""" +Chunk (N number of bytes at M offset to a source's beginning) provider. + +Primarily for file sources but usable by any iterator that has both +seek and read( N ). 
+""" +import os +import base64 + +import base +import exceptions + +_TODO = """ +""" + +import logging +log = logging.getLogger( __name__ ) + + +# ----------------------------------------------------------------------------- +class ChunkDataProvider( base.DataProvider ): + """ + Data provider that yields chunks of data from it's file. + + Note: this version does not account for lines and works with Binary datatypes. + """ + MAX_CHUNK_SIZE = 2**16 + DEFAULT_CHUNK_SIZE = MAX_CHUNK_SIZE + + #TODO: subclass from LimitedOffsetDataProvider? + # see web/framework/base.iterate_file, util/__init__.file_reader, and datatypes.tabular + def __init__( self, source, chunk_index=0, chunk_size=DEFAULT_CHUNK_SIZE, **kwargs ): + """ + :param chunk_index: if a source can be divided into N number of + `chunk_size` sections, this is the index of which section to + return. + :param chunk_size: how large are the desired chunks to return + (gen. in bytes). + """ + super( ChunkDataProvider, self ).__init__( source, **kwargs ) + self.chunk_size = chunk_size + self.chunk_pos = chunk_index * self.chunk_size + + def validate_source( self, source ): + """ + Does the given source have both the methods `seek` and `read`? + :raises InvalidDataProviderSource: if not. + """ + source = super( ChunkDataProvider, self ).validate_source( source ) + if( ( not hasattr( source, 'seek' ) ) + or ( not hasattr( source, 'read' ) ) ): + raise exceptions.InvalidDataProviderSource( source ) + return source + + def __iter__( self ): + # not reeeally an iterator per se + self.__enter__() + self.source.seek( self.chunk_pos, os.SEEK_SET ) + chunk = self.encode( self.source.read( self.chunk_size ) ) + yield chunk + self.__exit__() + + def encode( self, chunk ): + """ + Called on the chunk before returning. + + Overrride to modify, encode, or decode chunks. + """ + return chunk + + +class Base64ChunkDataProvider( ChunkDataProvider ): + """ + Data provider that yields chunks of base64 encoded data from it's file. 
+ """ + def encode( self, chunk ): + """ + Return chunks encoded in base 64. + """ + return base64.b64encode( chunk ) diff -r 65fbe93c7abe40826ce752462d2f906538efcab5 -r ca6940bbf946d7a6e937c4ef1652f8c8afbc1ef8 lib/galaxy/datatypes/dataproviders/column.py --- /dev/null +++ b/lib/galaxy/datatypes/dataproviders/column.py @@ -0,0 +1,242 @@ +""" +Providers that provide lists of lists generally where each line of a source +is further subdivided into multiple data (e.g. columns from a line). +""" + +import line + +_TODO = """ +move ColumnarDataProvider parsers to more sensible location + +TransposedColumnarDataProvider: provides each column as a single array + - see existing visualizations/dataprovider/basic.ColumnDataProvider +""" + +import logging +log = logging.getLogger( __name__ ) + + +# ----------------------------------------------------------------------------- base classes +class ColumnarDataProvider( line.RegexLineDataProvider ): + """ + Data provider that provide a list of columns from the lines of it's source. + + Columns are returned in the order given in indeces, so this provider can + re-arrange columns. + + If any desired index is outside the actual number of columns + in the source, this provider will None-pad the output and you are guaranteed + the same number of columns as the number of indeces asked for (even if they + are filled with None). + """ + def __init__( self, source, indeces=None, + column_count=None, column_types=None, parsers=None, parse_columns=True, + deliminator='\t', **kwargs ): + """ + :param indeces: a list of indeces of columns to gather from each row + Optional: will default to `None`. + If `None`, this provider will return all rows (even when a + particular row contains more/less than others). + If a row/line does not contain an element at a given index, the + provider will-return/fill-with a `None` value as the element. 
+ :type indeces: list or None + + :param column_count: an alternate means of defining indeces, use an int + here to effectively provide the first N columns. + Optional: will default to `None`. + :type column_count: int + + :param column_types: a list of string names of types that the + provider will use to look up an appropriate parser for the column. + (e.g. 'int', 'float', 'str', 'bool') + Optional: will default to parsing all columns as strings. + :type column_types: list of strings + + :param parsers: a dictionary keyed with column type strings + and with values that are functions to use when parsing those + types. + Optional: will default to using the function `_get_default_parsers`. + :type parsers: dictionary + + :param parse_columns: attempt to parse columns? + Optional: defaults to `True`. + :type parse_columns: bool + + :param deliminator: character(s) used to split each row/line of the source. + Optional: defaults to the tab character. + :type deliminator: str + + .. note: that the subclass constructors are passed kwargs - so they're + params (limit, offset, etc.) are also applicable here. + """ + #TODO: other columnar formats: csv, etc. 
+ super( ColumnarDataProvider, self ).__init__( source, **kwargs ) + + #IMPLICIT: if no indeces, column_count, or column_types passed: return all columns + self.selected_column_indeces = indeces + self.column_count = column_count + self.column_types = column_types or [] + # if no column count given, try to infer from indeces or column_types + if not self.column_count: + if self.selected_column_indeces: + self.column_count = len( self.selected_column_indeces ) + elif self.column_types: + self.column_count = len( self.column_types ) + # if no indeces given, infer from column_count + if not self.selected_column_indeces and self.column_count: + self.selected_column_indeces = list( xrange( self.column_count ) ) + + self.deliminator = deliminator + + # how/whether to parse each column value + self.parsers = {} + if parse_columns: + self.parsers = self._get_default_parsers() + # overwrite with user desired parsers + self.parsers.update( parsers or {} ) + + def _get_default_parsers( self ): + """ + Return parser dictionary keyed for each columnar type + (as defined in datatypes). + + .. note: primitives only by default (str, int, float, boolean, None). + Other (more complex) types are retrieved as strings. + :returns: a dictionary of the form: + `{ <parser type name> : <function used to parse type> }` + """ + #TODO: move to module level (or datatypes, util) + return { + # str is default and not needed here + 'int' : int, + 'float' : float, + 'bool' : bool, + + # unfortunately, 'list' is used in dataset metadata both for + # query style maps (9th col gff) AND comma-sep strings. + # (disabled for now) + #'list' : lambda v: v.split( ',' ), + #'csv' : lambda v: v.split( ',' ), + ## i don't like how urlparses does sub-lists... + #'querystr' : lambda v: dict([ ( p.split( '=', 1 ) if '=' in p else ( p, True ) ) + # for p in v.split( ';', 1 ) ]) + + #'scifloat': #floating point which may be in scientific notation + + # always with the 1 base, biologists? 
+ #'int1' : ( lambda i: int( i ) - 1 ), + + #'gffval': string or '.' for None + #'gffint': # int or '.' for None + #'gffphase': # 0, 1, 2, or '.' for None + #'gffstrand': # -, +, ?, or '.' for None, etc. + } + + def _parse_value( self, val, type ): + """ + Attempt to parse and return the given value based on the given type. + + :param val: the column value to parse (often a string) + :param type: the string type 'name' used to find the appropriate parser + :returns: the parsed value + or `value` if no `type` found in `parsers` + or `None` if there was a parser error (ValueError) + """ + if type == 'str' or type == None: return val + try: + return self.parsers[ type ]( val ) + except KeyError, err: + # no parser - return as string + pass + except ValueError, err: + # bad value - return None + return None + return val + + def _get_column_type( self, index ): + """ + Get the column type for the parser from `self.column_types` or `None` + if the type is unavailable. + :param index: the column index + :returns: string name of type (e.g. 'float', 'int', etc.) + """ + try: + return self.column_types[ index ] + except IndexError, ind_err: + return None + + def _parse_column_at_index( self, columns, parser_index, index ): + """ + Get the column type for the parser from `self.column_types` or `None` + if the type is unavailable. + """ + try: + return self._parse_value( columns[ index ], self._get_column_type( parser_index ) ) + # if a selected index is not within columns, return None + except IndexError, index_err: + return None + + def _parse_columns_from_line( self, line ): + """ + Returns a list of the desired, parsed columns. + :param line: the line to parse + :type line: str + """ + #TODO: too much going on in this loop - the above should all be precomputed AMAP... 
+ all_columns = line.split( self.deliminator ) + # if no indeces were passed to init, return all columns + selected_indeces = self.selected_column_indeces or list( xrange( len( all_columns ) ) ) + parsed_columns = [] + for parser_index, column_index in enumerate( selected_indeces ): + parsed_columns.append( self._parse_column_at_index( all_columns, parser_index, column_index ) ) + return parsed_columns + + def __iter__( self ): + parent_gen = super( ColumnarDataProvider, self ).__iter__() + for line in parent_gen: + columns = self._parse_columns_from_line( line ) + yield columns + + #TODO: implement column filters here and not below - flatten hierarchy + +class FilteredByColumnDataProvider( ColumnarDataProvider ): + """ + Data provider that provide a list of columns from the lines of it's source + _only_ if they pass a given filter function. + + e.g. column #3 is type int and > N + """ + # TODO: how to do this and still have limit and offset work? + def __init__( self, source, **kwargs ): + raise NotImplementedError() + super( FilteredByColumnDataProvider, self ).__init__( source, **kwargs ) + + +class MapDataProvider( ColumnarDataProvider ): + """ + Data provider that column_names and columns from the source's contents + into a dictionary. + + A combination use of both `column_names` and `indeces` allows 'picking' + key/value pairs from the source. + + .. note: that the subclass constructors are passed kwargs - so they're + params (limit, offset, etc.) are also applicable here. + """ + def __init__( self, source, column_names=None, **kwargs ): + """ + :param column_names: an ordered list of strings that will be used as the keys + for each column in the returned dictionaries. + The number of key, value pairs each returned dictionary has will + be as short as the number of column names provided. + :type column_names: + """ + #TODO: allow passing in a map instead of name->index { 'name1': index1, ... 
} + super( MapDataProvider, self ).__init__( source, **kwargs ) + self.column_names = column_names or [] + + def __iter__( self ): + parent_gen = super( MapDataProvider, self ).__iter__() + for column_values in parent_gen: + map = dict( zip( self.column_names, column_values ) ) + yield map diff -r 65fbe93c7abe40826ce752462d2f906538efcab5 -r ca6940bbf946d7a6e937c4ef1652f8c8afbc1ef8 lib/galaxy/datatypes/dataproviders/dataset.py --- /dev/null +++ b/lib/galaxy/datatypes/dataproviders/dataset.py @@ -0,0 +1,671 @@ +""" +Dataproviders that use either: + - the file contents and/or metadata from a Galaxy DatasetInstance as + their source. + - or provide data in some way relevant to bioinformatic data + (e.g. parsing genomic regions from their source) +""" + +import pkg_resources +pkg_resources.require( 'bx-python' ) +from bx import seq as bx_seq +from bx import wiggle as bx_wig + +import galaxy.model +import galaxy.datatypes +import galaxy.datatypes.data + +#TODO: can't import these due to circular ref in model/registry +#import galaxy.datatypes.binary +#import galaxy.datatypes.tabular + +import exceptions +import base +import line +import column +import external + +_TODO = """ +use bx as much as possible +the use of DatasetInstance seems to create some import problems +gff3 hierarchies +""" + +import logging +log = logging.getLogger( __name__ ) + + +# ----------------------------------------------------------------------------- base for using a Glx dataset +class DatasetDataProvider( base.DataProvider ): + """ + Class that uses the file contents and/or metadata from a Galaxy DatasetInstance + as it's source. + + DatasetDataProvider can be seen as the intersection between a datatype's + metadata and a dataset's file contents. It (so far) mainly provides helper + and conv. methods for using dataset metadata to set up and control how + the data is provided. 
+ """ + def __init__( self, dataset, **kwargs ): + """ + :param dataset: the Galaxy dataset whose file will be the source + :type dataset: model.DatasetInstance + + :raises exceptions.InvalidDataProviderSource: if not a DatsetInstance + """ + if not isinstance( dataset, galaxy.model.DatasetInstance ): + raise exceptions.InvalidDataProviderSource( "Data provider can only be used with a DatasetInstance" ) + self.dataset = dataset + # this dataset file is obviously the source + #TODO: this might be a good place to interface with the object_store... + super( DatasetDataProvider, self ).__init__( open( dataset.file_name, 'rb' ) ) + + #TODO: this is a bit of a mess + @classmethod + def get_column_metadata_from_dataset( cls, dataset ): + """ + Convenience class method to get column metadata from a dataset. + :returns: a dictionary of `column_count`, `column_types`, and `column_names` + if they're available, setting each to `None` if not. + """ + # re-map keys to fit ColumnarProvider.__init__ kwargs + params = {} + params[ 'column_count' ] = dataset.metadata.columns + params[ 'column_types' ] = dataset.metadata.column_types + params[ 'column_names' ] = dataset.metadata.column_names or getattr( dataset.datatype, 'column_names', None ) + return params + + def get_metadata_column_types( self, indeces=None ): + """ + Return the list of `column_types` for this dataset or `None` if unavailable. + :param indeces: the indeces for the columns of which to return the types. 
+ Optional: defaults to None (return all types) + :type indeces: list of ints + """ + metadata_column_types = ( self.dataset.metadata.column_types + or getattr( self.dataset.datatype, 'column_types', None ) + or None ) + if not metadata_column_types: + return metadata_column_types + if indeces: + column_types = [] + for index in indeces: + column_type = metadata_column_types[ index ] if index < len( metadata_column_types ) else None + column_types.append( column_type ) + return column_types + return metadata_column_types + + def get_metadata_column_names( self, indeces=None ): + """ + Return the list of `column_names` for this dataset or `None` if unavailable. + :param indeces: the indeces for the columns of which to return the names. + Optional: defaults to None (return all names) + :type indeces: list of ints + """ + metadata_column_names = ( self.dataset.metadata.column_names + or getattr( self.dataset.datatype, 'column_names', None ) + or None ) + if not metadata_column_names: + return metadata_column_names + if indeces: + column_names = [] + for index in indeces: + column_type = metadata_column_names[ index ] if index < len( metadata_column_names ) else None + column_names.append( column_type ) + return column_names + return metadata_column_names + + #TODO: merge the next two + def get_indeces_by_column_names( self, list_of_column_names ): + """ + Return the list of column indeces when given a list of column_names. + :param list_of_column_names: the names of the columns of which to get indeces. 
+ :type list_of_column_names: list of strs + :raises KeyError: if column_names are not found + :raises ValueError: if an entry in list_of_column_names is not in column_names + """ + metadata_column_names = ( self.dataset.metadata.column_names + or getattr( self.dataset.datatype, 'column_names', None ) + or None ) + if not metadata_column_names: + raise KeyError( 'No column_names found for ' + + 'datatype: %s, dataset: %s' %( str( self.dataset.datatype ), str( self.dataset ) ) ) + indeces = [] + for column_name in list_of_column_names: + indeces.append( metadata_column_names.index( column_name ) ) + return indeces + + def get_metadata_column_index_by_name( self, name ): + """ + Return the 0-based index of the source's column stored in the dataset's + metadata under `name` (e.g. 'chromCol'), or `None` if that metadata + value is not an int. + + :param name: the metadata attribute name that holds the column number + :type name: str + :returns: 0-based column index or `None` + """ + # metadata columns are 1-based indeces - shift to 0-based for list indexing + column = getattr( self.dataset.metadata, name ) + return ( column - 1 ) if isinstance( column, int ) else None + + def get_genomic_region_indeces( self, check=False ): + """ + Return a list of column indeces for 'chromCol', 'startCol', 'endCol' from + a source representing a genomic region. + + :param check: if True will raise a ValueError if any were not found. + :type check: bool + :raises ValueError: if check is `True` and one or more indeces were not found. + :returns: list of 0-based column indeces for the named columns + (an entry is `None` where the metadata did not provide an int). + """ + region_column_names = ( 'chromCol', 'startCol', 'endCol' ) + region_indeces = [ self.get_metadata_column_index_by_name( name ) for name in region_column_names ] + # validate the list that was just built (0 is a valid index, so compare against None explicitly) + if check and not all( i is not None for i in region_indeces ): + raise ValueError( "Could not determine proper column indeces for chrom, start, end: %s" %( str( region_indeces ) ) ) + return region_indeces + + +class ConvertedDatasetDataProvider( DatasetDataProvider ): + """ + Class that uses the file contents of a dataset after conversion to a different + format.
+ """ + def __init__( self, dataset, **kwargs ): + raise NotImplementedError( 'Abstract class' ) + self.original_dataset = dataset + self.converted_dataset = self.convert_dataset( dataset, **kwargs ) + super( ConvertedDatasetDataProvider, self ).__init__( self.converted_dataset, **kwargs ) + #NOTE: now self.converted_dataset == self.dataset + + def convert_dataset( self, dataset, **kwargs ): + """ + Convert the given dataset in some way. + """ + return dataset + + +# ----------------------------------------------------------------------------- uses metadata for settings +class DatasetColumnarDataProvider( column.ColumnarDataProvider ): + """ + Data provider that uses a DatasetDataProvider as it's source and the + dataset's metadata to buuild settings for the ColumnarDataProvider it's + inherited from. + """ + def __init__( self, dataset, **kwargs ): + """ + All kwargs are inherited from ColumnarDataProvider. + .. seealso:: column.ColumnarDataProvider + + If no kwargs are given, this class will attempt to get those kwargs + from the dataset source's metadata. + If any kwarg is given, it will override and be used in place of + any metadata available. + """ + dataset_source = DatasetDataProvider( dataset ) + if not kwargs.get( 'column_types', None ): + indeces = kwargs.get( 'indeces', None ) + kwargs[ 'column_types' ] = dataset_source.get_metadata_column_types( indeces=indeces ) + super( DatasetColumnarDataProvider, self ).__init__( dataset_source, **kwargs ) + + +class DatasetMapDataProvider( column.MapDataProvider ): + """ + Data provider that uses a DatasetDataProvider as it's source and the + dataset's metadata to buuild settings for the MapDataProvider it's + inherited from. + """ + def __init__( self, dataset, **kwargs ): + """ + All kwargs are inherited from MapDataProvider. + .. seealso:: column.MapDataProvider + + If no kwargs are given, this class will attempt to get those kwargs + from the dataset source's metadata. 
+ If any kwarg is given, it will override and be used in place of + any metadata available. + + The relationship between column_names and indeces is more complex: + +-----------------+-------------------------------+-----------------------+ + | | Indeces given | Indeces NOT given | + +=================+===============================+=======================+ + | Names given | pull indeces, rename w/ names | pull by name | + +=================+-------------------------------+-----------------------+ + | Names NOT given | pull indeces, name w/ meta | pull all, name w/meta | + +=================+-------------------------------+-----------------------+ + """ + dataset_source = DatasetDataProvider( dataset ) + + #TODO: getting too complicated - simplify at some lvl, somehow + # if no column_types given, get column_types from indeces (or all if indeces == None) + indeces = kwargs.get( 'indeces', None ) + column_names = kwargs.get( 'column_names', None ) + + #if indeces and column_names: + # # pull using indeces and re-name with given names - no need to alter (does as super would) + # pass + + if not indeces and column_names: + # pull columns by name + indeces = kwargs[ 'indeces' ] = dataset_source.get_indeces_by_column_names( column_names ) + + elif indeces and not column_names: + # pull using indeces, name with meta + column_names = kwargs[ 'column_names' ] = dataset_source.get_metadata_column_names( indeces=indeces ) + + elif not indeces and not column_names: + # pull all indeces and name using metadata + column_names = kwargs[ 'column_names' ] = dataset_source.get_metadata_column_names( indeces=indeces ) + + # if no column_types given, use metadata column_types + if not kwargs.get( 'column_types', None ): + kwargs[ 'column_types' ] = dataset_source.get_metadata_column_types( indeces=indeces ) + + super( DatasetMapDataProvider, self ).__init__( dataset_source, **kwargs ) + + +# ----------------------------------------------------------------------------- provides a 
bio-relevant datum +class GenomicRegionDataProvider( column.ColumnarDataProvider ): + """ + Data provider that parses chromosome, start, and end data from a file + using the datasets metadata settings. + + Is a ColumnarDataProvider that uses a DatasetDataProvider as it's source. + + If `named_columns` is true, will return dictionaries with the keys + 'chrom', 'start', 'end'. + """ + # dictionary keys when named_columns=True + COLUMN_NAMES = [ 'chrom', 'start', 'end' ] + + def __init__( self, dataset, chrom_column=None, start_column=None, end_column=None, named_columns=False, **kwargs ): + """ + :param dataset: the Galaxy dataset whose file will be the source + :type dataset: model.DatasetInstance + + :param chrom_column: optionally specify the chrom column index + :type chrom_column: int + :param start_column: optionally specify the start column index + :type start_column: int + :param end_column: optionally specify the end column index + :type end_column: int + + :param named_columns: optionally return dictionaries keying each column + with 'chrom', 'start', or 'end'. + Optional: defaults to False + :type named_columns: bool + """ + #TODO: allow passing in a string format e.g. 
"{chrom}:{start}-{end}" + dataset_source = DatasetDataProvider( dataset ) + + if chrom_column == None: + chrom_column = dataset_source.get_metadata_column_index_by_name( 'chromCol' ) + if start_column == None: + start_column = dataset_source.get_metadata_column_index_by_name( 'startCol' ) + if end_column == None: + end_column = dataset_source.get_metadata_column_index_by_name( 'endCol' ) + indeces = [ chrom_column, start_column, end_column ] + if not all( map( lambda i: i != None, indeces ) ): + raise ValueError( "Could not determine proper column indeces for" + + " chrom, start, end: %s" %( str( indeces ) ) ) + kwargs.update({ 'indeces' : indeces }) + + if not kwargs.get( 'column_types', None ): + kwargs.update({ 'column_types' : dataset_source.get_metadata_column_types( indeces=indeces ) }) + + self.named_columns = named_columns + if self.named_columns: + self.column_names = self.COLUMN_NAMES + + super( GenomicRegionDataProvider, self ).__init__( dataset_source, **kwargs ) + + def __iter__( self ): + parent_gen = super( GenomicRegionDataProvider, self ).__iter__() + for column_values in parent_gen: + if self.named_columns: + yield dict( zip( self.column_names, column_values ) ) + else: + yield column_values + + +#TODO: this optionally provides the same data as the above and makes GenomicRegionDataProvider redundant +# GenomicRegionDataProvider is a better name, tho +class IntervalDataProvider( column.ColumnarDataProvider ): + """ + Data provider that parses chromosome, start, and end data (as well as strand + and name if set in the metadata) using the dataset's metadata settings. + + If `named_columns` is true, will return dictionaries with the keys + 'chrom', 'start', 'end' (and 'strand' and 'name' if available). 
+ """ + COLUMN_NAMES = [ 'chrom', 'start', 'end', 'strand', 'name' ] + + def __init__( self, dataset, chrom_column=None, start_column=None, end_column=None, + strand_column=None, name_column=None, named_columns=False, **kwargs ): + """ + :param dataset: the Galaxy dataset whose file will be the source + :type dataset: model.DatasetInstance + + :param named_columns: optionally return dictionaries keying each column + with 'chrom', 'start', 'end', 'strand', or 'name'. + Optional: defaults to False + :type named_columns: bool + """ + #TODO: allow passing in a string format e.g. "{chrom}:{start}-{end}" + dataset_source = DatasetDataProvider( dataset ) + + # get genomic indeces and add strand and name + if chrom_column == None: + chrom_column = dataset_source.get_metadata_column_index_by_name( 'chromCol' ) + if start_column == None: + start_column = dataset_source.get_metadata_column_index_by_name( 'startCol' ) + if end_column == None: + end_column = dataset_source.get_metadata_column_index_by_name( 'endCol' ) + if strand_column == None: + strand_column = dataset_source.get_metadata_column_index_by_name( 'strandCol' ) + if name_column == None: + name_column = dataset_source.get_metadata_column_index_by_name( 'nameCol' ) + indeces = [ chrom_column, start_column, end_column, strand_column, name_column ] + kwargs.update({ 'indeces' : indeces }) + + if not kwargs.get( 'column_types', None ): + kwargs.update({ 'column_types' : dataset_source.get_metadata_column_types( indeces=indeces ) }) + + self.named_columns = named_columns + if self.named_columns: + self.column_names = self.COLUMN_NAMES + + super( IntervalDataProvider, self ).__init__( dataset_source, **kwargs ) + + def __iter__( self ): + parent_gen = super( IntervalDataProvider, self ).__iter__() + for column_values in parent_gen: + if self.named_columns: + yield dict( zip( self.column_names, column_values ) ) + else: + yield column_values + + +#TODO: ideally with these next two - you'd allow pulling some region from 
the sequence +# WITHOUT reading the entire seq into memory - possibly apply some version of limit/offset +class FastaDataProvider( base.FilteredDataProvider ): + """ + Class that returns fasta format data in a list of maps of the form: + { + id: <fasta header id>, + seq: <joined lines of nucleotide/amino data> + } + """ + def __init__( self, source, ids=None, **kwargs ): + """ + :param ids: optionally return only ids (and sequences) that are in this list. + Optional: defaults to None (provide all ids) + :type ids: list or None + """ + source = bx_seq.fasta.FastaReader( source ) + #TODO: validate is a fasta + super( FastaDataProvider, self ).__init__( source, **kwargs ) + self.ids = ids + # how to do ids? + + def __iter__( self ): + parent_gen = super( FastaDataProvider, self ).__iter__() + for fasta_record in parent_gen: + yield { + 'id' : fasta_record.name, + 'seq' : fasta_record.text + } + + +class TwoBitFastaDataProvider( DatasetDataProvider ): + """ + Class that returns fasta format data in a list of maps of the form: + { + id: <sequence id from the twobit file or from `ids`>, + seq: <the source's entry for that id> + } + """ + def __init__( self, source, ids=None, **kwargs ): + """ + :param source: passed to `bx_seq.twobit.TwoBitFile` to build the actual source + :param ids: optionally return only ids (and sequences) that are in this list. + Optional: defaults to None (provide all ids) + :type ids: list or None + """ + source = bx_seq.twobit.TwoBitFile( source ) + #TODO: validate is a 2bit + # NOTE(review): DatasetDataProvider.__init__ only accepts a DatasetInstance, so handing it + # a TwoBitFile looks like it will raise InvalidDataProviderSource - confirm the intended base class + super( TwoBitFastaDataProvider, self ).__init__( source, **kwargs ) + # could do in order provided with twobit + self.ids = ids or self.source.keys() + + def __iter__( self ): + # look each sequence up by the id being yielded + for id_ in self.ids: + yield { + 'id' : id_, + 'seq' : self.source[ id_ ] + } + + +#TODO: +class WiggleDataProvider( base.LimitedOffsetDataProvider ): + """ + Class that returns chrom, pos, data from a wiggle source.
+ """ + COLUMN_NAMES = [ 'chrom', 'pos', 'value' ] + + def __init__( self, source, named_columns=False, column_names=None, **kwargs ): + """ + :param named_columns: optionally return dictionaries keying each column + with 'chrom', 'start', 'end', 'strand', or 'name'. + Optional: defaults to False + :type named_columns: bool + + :param column_names: an ordered list of strings that will be used as the keys + for each column in the returned dictionaries. + The number of key, value pairs each returned dictionary has will + be as short as the number of column names provided. + :type column_names: + """ + #TODO: validate is a wig + # still good to maintain a ref to the raw source bc Reader won't + self.raw_source = source + self.parser = bx_wig.Reader( source ) + super( WiggleDataProvider, self ).__init__( self.parser, **kwargs ) + + self.named_columns = named_columns + self.column_names = column_names or self.COLUMN_NAMES + + def __iter__( self ): + parent_gen = super( WiggleDataProvider, self ).__iter__() + for three_tuple in parent_gen: + if self.named_columns: + yield dict( zip( self.column_names, three_tuple ) ) + else: + # list is not strictly necessary - but consistent + yield list( three_tuple ) + + +class BigWigDataProvider( base.LimitedOffsetDataProvider ): + """ + Class that returns chrom, pos, data from a wiggle source. + """ + COLUMN_NAMES = [ 'chrom', 'pos', 'value' ] + + def __init__( self, source, chrom, start, end, named_columns=False, column_names=None, **kwargs ): + """ + :param chrom: which chromosome within the bigbed file to extract data for + :type chrom: str + :param start: the start of the region from which to extract data + :type start: int + :param end: the end of the region from which to extract data + :type end: int + + :param named_columns: optionally return dictionaries keying each column + with 'chrom', 'start', 'end', 'strand', or 'name'. 
+ Optional: defaults to False + :type named_columns: bool + + :param column_names: an ordered list of strings that will be used as the keys + for each column in the returned dictionaries. + The number of key, value pairs each returned dictionary has will + be as short as the number of column names provided. + :type column_names: + """ + raise NotImplementedError( 'Work in progress' ) + #TODO: validate is a wig + # still good to maintain a ref to the raw source bc Reader won't + self.raw_source = source + self.parser = bx_bbi.bigwig_file.BigWigFile( source ) + super( BigWigDataProvider, self ).__init__( self.parser, **kwargs ) + + self.named_columns = named_columns + self.column_names = column_names or self.COLUMN_NAMES + + def __iter__( self ): + parent_gen = super( BigWigDataProvider, self ).__iter__() + for three_tuple in parent_gen: + if self.named_columns: + yield dict( zip( self.column_names, three_tuple ) ) + else: + # list is not strictly necessary - but consistent + yield list( three_tuple ) + + +# ----------------------------------------------------------------------------- binary, external conversion or tool +class DatasetSubprocessDataProvider( external.SubprocessDataProvider ): + """ + Create a source from running a subprocess on a dataset's file. + + Uses a subprocess as it's source and has a dataset (gen. as an input file + for the process). + """ + #TODO: below should be a subclass of this and not RegexSubprocess + def __init__( self, dataset, *args, **kwargs ): + """ + :param args: the list of strings used to build commands. + :type args: variadic function args + """ + raise NotImplementedError( 'Abstract class' ) + super( DatasetSubprocessDataProvider, self ).__init__( *args, **kwargs ) + self.dataset = dataset + + +class SamtoolsDataProvider( line.RegexLineDataProvider ): + """ + Data provider that uses samtools on a Sam or Bam file as it's source. + + This can be piped through other providers (column, map, genome region, etc.). + + .. 
note:: that only the samtools 'view' command is currently implemented. + """ + FLAGS_WO_ARGS = 'bhHSu1xXcB' + FLAGS_W_ARGS = 'fFqlrs' + VALID_FLAGS = FLAGS_WO_ARGS + FLAGS_W_ARGS + + def __init__( self, dataset, options_string='', options_dict=None, regions=None, **kwargs ): + """ + :param options_string: samtools options in string form (flags separated + by spaces) + Optional: defaults to '' + :type options_string: str + :param options_dict: dictionary of samtools options + Optional: defaults to None + :type options_dict: dict or None + :param regions: list of samtools regions strings + Optional: defaults to None + :type regions: list of str or None + """ + #TODO: into validate_source + + #TODO: have to import these here due to circular ref in model/datatypes + import galaxy.datatypes.binary + import galaxy.datatypes.tabular + if( not( isinstance( dataset.datatype, galaxy.datatypes.tabular.Sam ) + or isinstance( dataset.datatype, galaxy.datatypes.binary.Bam ) ) ): + raise exceptions.InvalidDataProviderSource( + 'dataset must be a Sam or Bam datatype: %s' %( str( dataset.datatype ) ) ) + self.dataset = dataset + + options_dict = options_dict or {} + # ensure regions are strings + regions = [ str( r ) for r in regions ] if regions else [] + + #TODO: view only for now + #TODO: not properly using overriding super's validate_opts, command here + subcommand = 'view' + #TODO:?? do we need a path to samtools? + subproc_args = self.build_command_list( subcommand, options_string, options_dict, regions ) +#TODO: the composition/inheritance here doesn't make a lot sense + subproc_provider = external.SubprocessDataProvider( *subproc_args ) + super( SamtoolsDataProvider, self ).__init__( subproc_provider, **kwargs ) + + def build_command_list( self, subcommand, options_string, options_dict, regions ): + """ + Convert all init args to list form. 
+ """ + command = [ 'samtools', subcommand ] + # add options and switches, input file, regions list (if any) + command.extend( self.to_options_list( options_string, options_dict ) ) + command.append( self.dataset.file_name ) + command.extend( regions ) + return command + + def to_options_list( self, options_string, options_dict ): + """ + Convert both options_string and options_dict to list form + while filtering out non-'valid' options. + """ + opt_list = [] + + # strip out any user supplied bash switch formating -> string of option chars + # then compress to single option string of unique, VALID flags with prefixed bash switch char '-' + options_string = options_string.strip( '- ' ) + validated_flag_list = set([ flag for flag in options_string if flag in self.FLAGS_WO_ARGS ]) + + # if sam add -S + if( ( isinstance( self.dataset.datatype, galaxy.datatypes.tabular.Sam ) + and ( 'S' not in validated_flag_list ) ) ): + validated_flag_list.append( 'S' ) + + if validated_flag_list: + opt_list.append( '-' + ''.join( validated_flag_list ) ) + + for flag, arg in options_dict.items(): + if flag in self.FLAGS_W_ARGS: + opt_list.extend([ '-' + flag, str( arg ) ]) + + return opt_list + + @classmethod + def extract_options_from_dict( cls, dictionary ): + """ + Separrates valid samtools key/value pair options from a dictionary and + returns both as a 2-tuple. + """ + # handy for extracting options from kwargs - but otherwise... + #TODO: could be abstracted to util.extract( dict, valid_keys_list ) + options_dict = {} + new_kwargs = {} + for key, value in dictionary.items(): + if key in cls.FLAGS_W_ARGS: + options_dict[ key ] = value + else: + new_kwargs[ key ] = value + return options_dict, new_kwargs + + +class BcftoolsDataProvider( line.RegexLineDataProvider ): + """ + Data provider that uses an bcftools on a bcf (or vcf?) file as it's source. + + This can be piped through other providers (column, map, genome region, etc.). 
+ """ + def __init__( self, dataset, **kwargs ): + #TODO: as samtools + raise NotImplementedError() + super( BCFDataProvider, self ).__init__( dataset, **kwargs ) + + +class BGzipTabixDataProvider( base.DataProvider ): + """ + Data provider that uses an g(un)zip on a file as it's source. + + This can be piped through other providers (column, map, genome region, etc.). + """ + def __init__( self, dataset, **kwargs ): + #TODO: as samtools - need more info on output format + raise NotImplementedError() + super( BGzipTabixDataProvider, self ).__init__( dataset, **kwargs ) diff -r 65fbe93c7abe40826ce752462d2f906538efcab5 -r ca6940bbf946d7a6e937c4ef1652f8c8afbc1ef8 lib/galaxy/datatypes/dataproviders/decorators.py --- /dev/null +++ b/lib/galaxy/datatypes/dataproviders/decorators.py @@ -0,0 +1,107 @@ +""" +DataProvider related decorators. +""" + +# I'd like to decorate the factory methods that give data_providers by the name they can be accessed from. e.g.: +#@provides( 'id_seq' ) # where 'id_seq' is some 'data_format' string/alias +#def get_id_seq_provider( dataset, **settings ): + +# then in some central dispatch (maybe data.Data), have it look up the proper method by the data_format string + +# also it would be good to have this decorator maintain a list of available providers (for a datatype) + +# i don't particularly want to cut up method names ( get_([\w_]*)_provider ) +#!/usr/bin/env python + +# adapted from: http://stackoverflow.com +# /questions/14095616/python-can-i-programmatically-decorate-class-methods-from-a-class-instance + +from functools import wraps +#from types import MethodType +import copy + +import logging +log = logging.getLogger( __name__ ) + + +# ----------------------------------------------------------------------------- +_DATAPROVIDER_CLASS_MAP_KEY = 'dataproviders' +_DATAPROVIDER_METHOD_NAME_KEY = '_dataprovider_name' + +# ----------------------------------------------------------------------------- +def has_dataproviders( cls ): + """ + Wraps 
a class (generally a Datatype), finds methods within that have been + decorated with `@dataprovider` and adds them, by their name, to a map + in the class. + + This allows a class to maintain a name -> method map, effectively + 'registering' dataprovider factory methods. + + .. example:: + @has_dataproviders + class MyDtype( data.Data ): + + @dataprovider_factory( 'bler' ) + def provide_some_bler( self, dataset, **settings ): + '''blerblerbler''' + dataset_source = providers.DatasetDataProvider( dataset ) + # ... chain other, intermidiate providers here + return providers.BlerDataProvider( dataset_source, **settings ) + + # use the base method in data.Data + provider = dataset.datatype.dataprovider( dataset, 'bler', + my_setting='blah', ... ) + # OR directly from the map + provider = dataset.datatype.dataproviders[ 'bler' ]( dataset, + my_setting='blah', ... ) + """ + #log.debug( 'has_dataproviders:', cls ) + # init the class dataproviders map if necc. + if not hasattr( cls, _DATAPROVIDER_CLASS_MAP_KEY ): + setattr( cls, _DATAPROVIDER_CLASS_MAP_KEY, {} ) + else: + # need to deepcopy or subclasses will modify super.dataproviders as well + existing_dataproviders = getattr( cls, _DATAPROVIDER_CLASS_MAP_KEY ) + copied_dataproviders = copy.deepcopy( existing_dataproviders ) + setattr( cls, _DATAPROVIDER_CLASS_MAP_KEY, copied_dataproviders ) + + dataproviders = getattr( cls, _DATAPROVIDER_CLASS_MAP_KEY ) + + # scan for methods with dataprovider names and add them to the map + # note: this has a 'cascading' effect + # where it's possible to override a super's provider with a sub's + for attr_key, attr_value in cls.__dict__.iteritems(): + #log.debug( '\t key:', attr_key ) + # can't use isinstance( attr_value, MethodType ) bc of wrapping + if( ( callable( attr_value ) ) + and ( not attr_key.startswith( "__" ) ) + and ( getattr( attr_value, _DATAPROVIDER_METHOD_NAME_KEY, None ) ) ): + #log.debug( '\t\t is a dataprovider', attr_key ) + name = getattr( attr_value, 
_DATAPROVIDER_METHOD_NAME_KEY ) + dataproviders[ name ] = attr_value + + #log.debug( 'dataproviders:' ) + #for name, fn in cls.dataproviders.items(): + # log.debug( '\t ', name, '->', fn.__name__, fn ) + # log.debug( '\t\t ', fn.__doc__ ) + return cls + +def dataprovider_factory( name ): + """ + Wraps a class method and marks it as a dataprovider factory. + + :param name: what name/key to register the factory under in `cls.dataproviders` + :param type: any hashable var + """ + #log.debug( 'dataprovider:', name ) + def named_dataprovider_factory( func ): + #log.debug( 'named_dataprovider_factory:', name, '->', func.__name__ ) + setattr( func, _DATAPROVIDER_METHOD_NAME_KEY, name ) + #log.debug( '\t setting:', getattr( func, _DATAPROVIDER_METHOD_NAME_KEY ) ) + @wraps( func ) + def wrapped_dataprovider_factory( self, *args, **kwargs ): + #log.debug( 'wrapped_dataprovider_factory', name, self, args, kwargs ) + return func( self, *args, **kwargs ) + return wrapped_dataprovider_factory + return named_dataprovider_factory diff -r 65fbe93c7abe40826ce752462d2f906538efcab5 -r ca6940bbf946d7a6e937c4ef1652f8c8afbc1ef8 lib/galaxy/datatypes/dataproviders/exceptions.py --- /dev/null +++ b/lib/galaxy/datatypes/dataproviders/exceptions.py @@ -0,0 +1,33 @@ +""" +DataProvider related exceptions. +""" + +class InvalidDataProviderSource( TypeError ): + """ + Raised when a unusable source is passed to a provider. + """ + def __init__( self, source=None, msg='' ): + msg = msg or 'Invalid source for provider: %s' %( source ) + super( InvalidDataProviderSource, self ).__init__( msg ) + + +class NoProviderAvailable( TypeError ): + """ + Raised when no provider is found for the given `format_requested`. + + :param factory_source: the item that the provider was requested from + :param format_requested: the format_requested (a hashable key to access + `factory_source.datatypes` with) + + Both params are attached to this class and accessible to the try-catch + receiver. 
+ + Meant to be used within a class that builds dataproviders (e.g. a Datatype) + """ + def __init__( self, factory_source, format_requested=None, msg='' ): + self.factory_source = factory_source + self.format_requested = format_requested + msg = msg or 'No provider available in factory_source "%s" for format requested' %( str( factory_source ) ) + if self.format_requested: + msg += ': "%s"' %( self.format_requested ) + super( NoProviderAvailable, self ).__init__( msg ) This diff is so big that we needed to truncate the remainder. https://bitbucket.org/galaxy/galaxy-central/commits/06f8e1ce0bae/ Changeset: 06f8e1ce0bae Branch: provenance User: Kyle Ellrott Date: 2013-06-20 00:05:07 Summary: Provenance api element was missing security mixin. Affected #: 1 file diff -r ca6940bbf946d7a6e937c4ef1652f8c8afbc1ef8 -r 06f8e1ce0bae142044294ae663d059a4e0f8362b lib/galaxy/webapps/galaxy/api/provenance.py --- a/lib/galaxy/webapps/galaxy/api/provenance.py +++ b/lib/galaxy/webapps/galaxy/api/provenance.py @@ -3,13 +3,13 @@ """ import logging from galaxy import web -from galaxy.web.base.controller import BaseAPIController +from galaxy.web.base.controller import BaseAPIController, SharableItemSecurityMixin from paste.httpexceptions import HTTPNotImplemented, HTTPBadRequest log = logging.getLogger( __name__ ) -class BaseProvenanceController( BaseAPIController ): +class BaseProvenanceController( BaseAPIController, SharableItemSecurityMixin ): """ """ @web.expose_api https://bitbucket.org/galaxy/galaxy-central/commits/0bb601dceb65/ Changeset: 0bb601dceb65 Branch: provenance User: kellrott Date: 2013-06-20 00:43:25 Summary: Updating SharableItemSecurityMixin import to UsesHistoryMixin as suggested. 
Affected #: 1 file diff -r 06f8e1ce0bae142044294ae663d059a4e0f8362b -r 0bb601dceb65bf7a6dbf23ab7566054e4feead0b lib/galaxy/webapps/galaxy/api/provenance.py --- a/lib/galaxy/webapps/galaxy/api/provenance.py +++ b/lib/galaxy/webapps/galaxy/api/provenance.py @@ -3,13 +3,13 @@ """ import logging from galaxy import web -from galaxy.web.base.controller import BaseAPIController, SharableItemSecurityMixin +from galaxy.web.base.controller import BaseAPIController, UsesHistoryMixin from paste.httpexceptions import HTTPNotImplemented, HTTPBadRequest log = logging.getLogger( __name__ ) -class BaseProvenanceController( BaseAPIController, SharableItemSecurityMixin ): +class BaseProvenanceController( BaseAPIController, UsesHistoryMixin ): """ """ @web.expose_api https://bitbucket.org/galaxy/galaxy-central/commits/36d9e5bcd2cb/ Changeset: 36d9e5bcd2cb User: dannon Date: 2013-06-20 00:45:38 Summary: Merged in kellrott/galaxy-central/provenance (pull request #185) Provenance Bug Fix Affected #: 1 file diff -r 2a6a9157ff83744e8526538042f65ebe891d5526 -r 36d9e5bcd2cbcd5b34b2ae0e7839a71a55350011 lib/galaxy/webapps/galaxy/api/provenance.py --- a/lib/galaxy/webapps/galaxy/api/provenance.py +++ b/lib/galaxy/webapps/galaxy/api/provenance.py @@ -3,13 +3,13 @@ """ import logging from galaxy import web -from galaxy.web.base.controller import BaseAPIController +from galaxy.web.base.controller import BaseAPIController, UsesHistoryMixin from paste.httpexceptions import HTTPNotImplemented, HTTPBadRequest log = logging.getLogger( __name__ ) -class BaseProvenanceController( BaseAPIController ): +class BaseProvenanceController( BaseAPIController, UsesHistoryMixin ): """ """ @web.expose_api Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.