galaxy-commits
July 2014
commit/galaxy-central: jmchilton: Merged in jmchilton/galaxy-central-fork-1 (pull request #440)
by commits-noreply@bitbucket.org 28 Jul '14
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/d5870d822b24/
Changeset: d5870d822b24
User: jmchilton
Date: 2014-07-28 19:15:32
Summary: Merged in jmchilton/galaxy-central-fork-1 (pull request #440)
Initial BibTeX/DOI citation support in tools and histories.
Affected #: 16 files
diff -r 6292ada3115b4d2b658d1e574d20b08d887e7bec -r d5870d822b2492d99cc46fc31ab21c5b9b237cae .hgignore
--- a/.hgignore
+++ b/.hgignore
@@ -16,6 +16,7 @@
# Database stuff
database/beaker_sessions
+database/citations
database/community_files
database/compiled_templates
database/files
diff -r 6292ada3115b4d2b658d1e574d20b08d887e7bec -r d5870d822b2492d99cc46fc31ab21c5b9b237cae lib/galaxy/config.py
--- a/lib/galaxy/config.py
+++ b/lib/galaxy/config.py
@@ -376,6 +376,10 @@
# Default chunk size for chunkable datatypes -- 64k
self.display_chunk_size = int( kwargs.get( 'display_chunk_size', 65536) )
+ self.citation_cache_type = kwargs.get( "citation_cache_type", "file" )
+ self.citation_cache_data_dir = self.resolve_path( kwargs.get( "citation_cache_data_dir", "database/citations/data" ) )
+ self.citation_cache_lock_dir = self.resolve_path( kwargs.get( "citation_cache_lock_dir", "database/citations/locks" ) )
+
@property
def sentry_dsn_public( self ):
"""
@@ -570,6 +574,10 @@
tool_configs = self.config.tool_configs
if self.config.migrated_tools_config not in tool_configs:
tool_configs.append( self.config.migrated_tools_config )
+
+ from galaxy.managers.citations import CitationsManager
+ self.citations_manager = CitationsManager( self )
+
from galaxy import tools
self.toolbox = tools.ToolBox( tool_configs, self.config.tool_path, self )
# Search support for tools
diff -r 6292ada3115b4d2b658d1e574d20b08d887e7bec -r d5870d822b2492d99cc46fc31ab21c5b9b237cae lib/galaxy/managers/citations.py
--- /dev/null
+++ b/lib/galaxy/managers/citations.py
@@ -0,0 +1,170 @@
+import pkg_resources
+
+import functools
+import os
+import urllib2
+
+pkg_resources.require('Beaker')
+from beaker.cache import CacheManager
+from beaker.util import parse_cache_config_options
+
+import logging
+log = logging.getLogger( __name__ )
+
+
+class CitationsManager( object ):
+
+ def __init__( self, app ):
+ self.app = app
+ self.doi_cache = DoiCache( app.config )
+
+ def citations_for_tool( self, tool ):
+ return tool.citations
+
+ def citations_for_tool_ids( self, tool_ids ):
+ citation_collection = CitationCollection()
+ for tool_id in tool_ids:
+ tool = self._get_tool( tool_id )
+ for citation in self.citations_for_tool( tool ):
+ citation_collection.add( citation )
+ return citation_collection.citations
+
+ def parse_citation( self, citation_elem, tool_directory ):
+ return parse_citation( citation_elem, tool_directory, self )
+
+ def _get_tool( self, tool_id ):
+ tool = self.app.toolbox.get_tool( tool_id )
+ return tool
+
+
+class DoiCache( object ):
+
+ def __init__( self, config ):
+ cache_opts = {
+ 'cache.type': getattr( config, 'citation_cache_type', 'file'),
+ 'cache.data_dir': getattr( config, 'citation_cache_data_dir', None),
+ 'cache.lock_dir': getattr( config, 'citation_cache_lock_dir', None),
+ }
+ self._cache = CacheManager(**parse_cache_config_options(cache_opts)).get_cache('doi')
+
+ def _raw_get_bibtex( self, doi ):
+ dx_url = "http://dx.doi.org/" + doi
+ headers = {'Accept': "text/bibliography; style=bibtex" }
+ req = urllib2.Request(dx_url, data="", headers=headers)
+ response = urllib2.urlopen(req)
+ bibtex = response.read()
+ return bibtex
+
+ def get_bibtex( self, doi ):
+ createfunc = functools.partial(self._raw_get_bibtex, doi)
+ return self._cache.get(key=doi, createfunc=createfunc)
+
+
+def parse_citation( elem, directory, citation_manager ):
+ """ Parse an abstract citation entry from the specified XML element.
+ The directory parameter should be used to find external files for this
+ citation.
+ """
+ citation_type = elem.attrib.get( 'type', None )
+ citation_class = CITATION_CLASSES.get( citation_type, None )
+ if not citation_class:
+ log.warn("Unknown or unspecified citation type: %s" % citation_type)
+ return None
+ return citation_class( elem, directory, citation_manager )
+
+
+class CitationCollection( object ):
+
+ def __init__( self ):
+ self.citations = []
+
+ def __iter__( self ):
+ return self.citations.__iter__()
+
+ def __len__( self ):
+ return len( self.citations )
+
+ def add( self, new_citation ):
+ for citation in self.citations:
+ if citation.equals( new_citation ):
+ # TODO: We have two equivalent citations, pick the more
+ # informative/complete/correct.
+ return False
+
+ self.citations.append( new_citation )
+ return True
+
+
+class BaseCitation( object ):
+
+ def to_dict( self, citation_format ):
+ if citation_format == "bibtex":
+ return dict(
+ format="bibtex",
+ content=self.to_bibtex(),
+ )
+ else:
+ raise Exception("Unknown citation format %s" % citation_format)
+
+ def equals( self, other_citation ):
+ if self.has_doi() and other_citation.has_doi():
+ return self.doi() == other_citation.doi()
+ else:
+ # TODO: Do a better job figuring out if this is the same citation.
+ return self.to_bibtex() == other_citation.to_bibtex()
+
+ def has_doi( self ):
+ return False
+
+
+class BibtexCitation( BaseCitation ):
+
+ def __init__( self, elem, directory, citation_manager ):
+ bibtex_file = elem.attrib.get("file", None)
+ if bibtex_file:
+ raw_bibtex = open(os.path.join(directory, bibtex_file), "r").read()
+ else:
+ raw_bibtex = elem.text.strip()
+ self._set_raw_bibtex( raw_bibtex )
+
+ def _set_raw_bibtex( self, raw_bibtex ):
+ self.raw_bibtex = raw_bibtex
+
+ def to_bibtex( self ):
+ return self.raw_bibtex
+
+
+class DoiCitation( BaseCitation ):
+ BIBTEX_UNSET = object()
+
+ def __init__( self, elem, directory, citation_manager ):
+ self.__doi = elem.text.strip()
+ self.doi_cache = citation_manager.doi_cache
+ self.raw_bibtex = DoiCitation.BIBTEX_UNSET
+
+ def has_doi( self ):
+ return True
+
+ def doi( self ):
+ return self.__doi
+
+ def to_bibtex( self ):
+ if self.raw_bibtex is DoiCitation.BIBTEX_UNSET:
+ try:
+ self.raw_bibtex = self.doi_cache.get_bibtex(self.__doi)
+ except Exception:
+ log.exception("Failed to fetch bibtex for DOI %s" % self.__doi)
+
+ if self.raw_bibtex is DoiCitation.BIBTEX_UNSET:
+ return """@MISC{%s,
+ DOI = '%s',
+ note = 'Failed to fetch BibTeX for DOI.'
+ }""" % (self.__doi, self.__doi)
+ else:
+ return self.raw_bibtex
+
+
+CITATION_CLASSES = dict(
+ bibtex=BibtexCitation,
+ doi=DoiCitation,
+)
diff -r 6292ada3115b4d2b658d1e574d20b08d887e7bec -r d5870d822b2492d99cc46fc31ab21c5b9b237cae lib/galaxy/tools/__init__.py
--- a/lib/galaxy/tools/__init__.py
+++ b/lib/galaxy/tools/__init__.py
@@ -1360,6 +1360,9 @@
requirements, containers = parse_requirements_from_xml( root )
self.requirements = requirements
self.containers = containers
+
+ self.citations = self._parse_citations( root )
+
# Determine if this tool can be used in workflows
self.is_workflow_compatible = self.check_workflow_compatible(root)
# Trackster configuration.
@@ -1686,6 +1689,20 @@
trace_msg = repr( traceback.format_tb( trace ) )
log.error( "Traceback: %s" % trace_msg )
+ def _parse_citations( self, root ):
+ citations = []
+ citations_elem = root.find("citations")
+ if not citations_elem:
+ return citations
+
+ for citation_elem in citations_elem:
+ if citation_elem.tag != "citation":
+ pass
+ citation = self.app.citations_manager.parse_citation( citation_elem, self.tool_dir )
+ if citation:
+ citations.append( citation )
+ return citations
+
# TODO: This method doesn't have to be part of the Tool class.
def parse_error_level( self, err_level ):
"""
diff -r 6292ada3115b4d2b658d1e574d20b08d887e7bec -r d5870d822b2492d99cc46fc31ab21c5b9b237cae lib/galaxy/webapps/galaxy/api/histories.py
--- a/lib/galaxy/webapps/galaxy/api/histories.py
+++ b/lib/galaxy/webapps/galaxy/api/histories.py
@@ -18,7 +18,7 @@
from galaxy.web.base.controller import ExportsHistoryMixin
from galaxy.web.base.controller import ImportsHistoryMixin
-from galaxy.managers import histories
+from galaxy.managers import histories, citations
from galaxy import util
from galaxy.util import string_as_bool
@@ -34,6 +34,7 @@
def __init__( self, app ):
super( HistoriesController, self ).__init__( app )
+ self.citations_manager = citations.CitationsManager( app )
self.mgrs = util.bunch.Bunch(
histories=histories.HistoryManager()
)
@@ -117,6 +118,20 @@
history_data[ 'contents_url' ] = url_for( 'history_contents', history_id=history_id )
return history_data
+ @expose_api_anonymous
+ def citations( self, trans, history_id, **kwd ):
+ history = self.mgrs.histories.get( trans, self._decode_id( trans, history_id ), check_ownership=False, check_accessible=True )
+ tool_ids = set([])
+ for dataset in history.datasets:
+ job = dataset.creating_job
+ if not job:
+ continue
+ tool_id = job.tool_id
+ if not tool_id:
+ continue
+ tool_ids.add(tool_id)
+ return map( lambda citation: citation.to_dict( "bibtex" ), self.citations_manager.citations_for_tool_ids( tool_ids ) )
+
@expose_api
def set_as_current( self, trans, id, **kwd ):
"""
diff -r 6292ada3115b4d2b658d1e574d20b08d887e7bec -r d5870d822b2492d99cc46fc31ab21c5b9b237cae lib/galaxy/webapps/galaxy/api/tools.py
--- a/lib/galaxy/webapps/galaxy/api/tools.py
+++ b/lib/galaxy/webapps/galaxy/api/tools.py
@@ -1,6 +1,8 @@
import urllib
+from galaxy import exceptions
from galaxy import web, util
+from galaxy.web import _future_expose_api_anonymous
from galaxy.web.base.controller import BaseAPIController
from galaxy.web.base.controller import UsesVisualizationMixin
from galaxy.web.base.controller import UsesHistoryMixin
@@ -44,7 +46,7 @@
trans.response.status = 500
return { 'error': str( exc ) }
- @web.expose_api
+ @_future_expose_api_anonymous
def show( self, trans, id, **kwd ):
"""
GET /api/tools/{tool_id}
@@ -52,18 +54,16 @@
"""
io_details = util.string_as_bool( kwd.get( 'io_details', False ) )
link_details = util.string_as_bool( kwd.get( 'link_details', False ) )
- try:
- id = urllib.unquote_plus( id )
- tool = self.app.toolbox.get_tool( id )
- if not tool:
- trans.response.status = 404
- return { 'error': 'tool not found', 'id': id }
- return tool.to_dict( trans, io_details=io_details, link_details=link_details )
+ tool = self._get_tool( id )
+ return tool.to_dict( trans, io_details=io_details, link_details=link_details )
- except Exception, exc:
- log.error( 'could not convert tool (%s) to dictionary: %s', id, str( exc ), exc_info=True )
- trans.response.status = 500
- return { 'error': str( exc ) }
+ @_future_expose_api_anonymous
+ def citations( self, trans, id, **kwds ):
+ tool = self._get_tool( id )
+ rval = []
+ for citation in tool.citations:
+ rval.append( citation.to_dict( 'bibtex' ) )
+ return rval
@web.expose_api_anonymous
def create( self, trans, payload, **kwd ):
@@ -170,6 +170,12 @@
#
# -- Helper methods --
#
+ def _get_tool( self, id ):
+ id = urllib.unquote_plus( id )
+ tool = self.app.toolbox.get_tool( id )
+ if not tool:
+ raise exceptions.ObjectNotFound("Could not find tool with id '%s'" % id)
+ return tool
def _rerun_tool( self, trans, payload, **kwargs ):
"""
diff -r 6292ada3115b4d2b658d1e574d20b08d887e7bec -r d5870d822b2492d99cc46fc31ab21c5b9b237cae lib/galaxy/webapps/galaxy/buildapp.py
--- a/lib/galaxy/webapps/galaxy/buildapp.py
+++ b/lib/galaxy/webapps/galaxy/buildapp.py
@@ -174,6 +174,7 @@
webapp.mapper.resource( 'ftp_file', 'ftp_files', path_prefix='/api' )
webapp.mapper.resource( 'group', 'groups', path_prefix='/api' )
webapp.mapper.resource_with_deleted( 'quota', 'quotas', path_prefix='/api' )
+ webapp.mapper.connect( '/api/tools/{id:.+?}/citations', action='citations', controller="tools" )
webapp.mapper.connect( '/api/tools/{id:.+?}', action='show', controller="tools" )
webapp.mapper.resource( 'tool', 'tools', path_prefix='/api' )
webapp.mapper.resource_with_deleted( 'user', 'users', path_prefix='/api' )
@@ -181,6 +182,7 @@
webapp.mapper.resource( 'visualization', 'visualizations', path_prefix='/api' )
webapp.mapper.resource( 'workflow', 'workflows', path_prefix='/api' )
webapp.mapper.resource_with_deleted( 'history', 'histories', path_prefix='/api' )
+ webapp.mapper.connect( '/api/histories/{history_id}/citations', action='citations', controller="histories" )
webapp.mapper.resource( 'configuration', 'configuration', path_prefix='/api' )
webapp.mapper.resource( 'datatype',
'datatypes',
diff -r 6292ada3115b4d2b658d1e574d20b08d887e7bec -r d5870d822b2492d99cc46fc31ab21c5b9b237cae lib/galaxy/webapps/galaxy/controllers/history.py
--- a/lib/galaxy/webapps/galaxy/controllers/history.py
+++ b/lib/galaxy/webapps/galaxy/controllers/history.py
@@ -448,6 +448,13 @@
# ......................................................................... html
@web.expose
+ def citations( self, trans ):
+ # Get history
+ history = trans.history
+ history_id = trans.security.encode_id( history.id )
+ return trans.fill_template( "history/citations.mako", history=history, history_id=history_id )
+
+ @web.expose
def display_structured( self, trans, id=None ):
"""
Display a history as a nested structure showing the jobs and workflow
This diff is so big that we needed to truncate the remainder.
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
--
This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.
2 new commits in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/8536414e0358/
Changeset: 8536414e0358
User: jmchilton
Date: 2014-07-23 22:43:13
Summary: Initial BibTeX/DOI citation support in tools and histories.
Allow tool authors to specify citations using a DOI or BibTeX. BibTeX can be specified either by pointing at a BibTeX file in the tool directory or by embedding BibTeX entries directly in the tool's citation blocks. If referencing a file parallel to the tool, the file should contain only a single BibTeX entry; this restriction could easily be lifted by adding a BibTeX parser as a Python dependency for Galaxy - but I do not have permission to do this.
These citations will appear at the bottom of the tool form in a formatted way, but the user will have the option to select raw BibTeX for copying and pasting. Likewise, the history menu now has an option allowing users to aggregate a comparable list of citations across an analysis.
UI interactions are implemented using a Backbone model and view, and data is fetched from the Galaxy server as BibTeX via the API. Two API entry points have been added - one to fetch the BibTeX entries for a tool and another for a history.
BibTeX entries for citations annotated with DOIs will be fetched from http://dx.doi.org/ and cached using Beaker.
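For context, this is roughly what the citation blocks described above look like to the new parser - a sketch with placeholder DOI and BibTeX values; the element names and the 'type' attribute come from parse_citation() and CITATION_CLASSES in the lib/galaxy/managers/citations.py diff below:

import xml.etree.ElementTree as ElementTree

CITATIONS_XML = """
<citations>
    <citation type="doi">10.1000/example.doi</citation>
    <citation type="bibtex">@misc{example2014, title={Placeholder entry}}</citation>
</citations>
"""

citations_elem = ElementTree.fromstring(CITATIONS_XML)
for citation_elem in citations_elem:
    # parse_citation() dispatches on this attribute: "doi" maps to DoiCitation,
    # "bibtex" to BibtexCitation; unknown types are logged and skipped.
    print citation_elem.attrib.get('type'), citation_elem.text.strip()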
Additional Limitations:
- I am not super happy with a few of the GUI elements of this. It is ugly, and while I didn't write the BibTeX parser, I did write the code that takes parsed BibTeX and converts it to a formatted entry. If merged, I will open a Trello card to follow up: improve the UI and find a more standard way to build a formatted HTML citation from a parsed BibTeX entry.
- BibTeX Limitations: LaTeX embedded in the BibTeX entries doesn't render properly when producing a "pretty" citation in the GUI (should still be exported to citation managers properly though). Cross references aren't supported at this time.
Alternative Implementations:
BibTex/DOI vs. PROV:
There was some discussion of encoding citation information as PROV on the development mailing list. The citation tags on tools are typed, so this could certainly be added - but this was discussed at the BOSC 2014 codefest, and there was some consensus that tool authors are more likely to already have BibTeX or DOIs available for a tool's references, and that the major reference managers end users will plug these citations into while writing papers are more likely to consume BibTeX than anything else.
The bench biologist using Galaxy is the consumer of this work I am most concerned with - if we need to convert citations into other formats such as EndNote or Word's bibliography support, there is a suite of tools we could optionally plug into Galaxy (http://sourceforge.net/p/bibutils/home/Bibutils/) to enable this down the road. PROV does not seem to have such an ecosystem to leverage - I could not even find a tool to convert it to BibTeX.
Parse BibTeX Client vs Server:
As mentioned above, it would be nice in some ways to be able to parse and reason about BibTeX on the backend - but it would require adding a new Python dependency to Galaxy. Since we have to ship BibTeX to the browser anyway to allow users to copy and paste it, I decided it was easier to start with parsing and formatting BibTeX on the client side. To enable this I added a BSD-licensed JavaScript dependency, https://github.com/mayanklahiri/bib2json. I would be happy to revisit this decision and produce formatted entries server side - there seem to be more Python options for doing this than JavaScript.
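As a usage sketch of the two new API entry points (the routes come from the buildapp.py changes below; the host, tool id, and encoded history id are placeholders, not values from this commit):

import json
import urllib2

GALAXY = "http://localhost:8080"  # placeholder Galaxy instance

def fetch_citations(path):
    # Both endpoints allow anonymous access and return a list of
    # {"format": "bibtex", "content": ...} dictionaries (see to_dict() below).
    return json.loads(urllib2.urlopen(GALAXY + path).read())

# Citations declared by a single tool (the tool id is illustrative).
for citation in fetch_citations("/api/tools/my_tool/citations"):
    print citation["content"]

# Citations aggregated across every tool used in a history.
for citation in fetch_citations("/api/histories/ebfb8f50c6abde6d/citations"):
    print citation["content"]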
Affected #: 16 files
diff -r 6fbe4d95a8dc64bd222dbf1170bdadcc5321e855 -r 8536414e03580828831dbd68ba6fd8a888630cd9 .hgignore
--- a/.hgignore
+++ b/.hgignore
@@ -16,6 +16,7 @@
# Database stuff
database/beaker_sessions
+database/citations
database/community_files
database/compiled_templates
database/files
diff -r 6fbe4d95a8dc64bd222dbf1170bdadcc5321e855 -r 8536414e03580828831dbd68ba6fd8a888630cd9 lib/galaxy/config.py
--- a/lib/galaxy/config.py
+++ b/lib/galaxy/config.py
@@ -378,6 +378,10 @@
# Default chunk size for chunkable datatypes -- 64k
self.display_chunk_size = int( kwargs.get( 'display_chunk_size', 65536) )
+ self.citation_cache_type = kwargs.get( "citation_cache_type", "file" )
+ self.citation_cache_data_dir = self.resolve_path( kwargs.get( "citation_cache_data_dir", "database/citations/data" ) )
+ self.citation_cache_lock_dir = self.resolve_path( kwargs.get( "citation_cache_lock_dir", "database/citations/locks" ) )
+
@property
def sentry_dsn_public( self ):
"""
@@ -572,6 +576,10 @@
tool_configs = self.config.tool_configs
if self.config.migrated_tools_config not in tool_configs:
tool_configs.append( self.config.migrated_tools_config )
+
+ from galaxy.managers.citations import CitationsManager
+ self.citations_manager = CitationsManager( self )
+
from galaxy import tools
self.toolbox = tools.ToolBox( tool_configs, self.config.tool_path, self )
# Search support for tools
diff -r 6fbe4d95a8dc64bd222dbf1170bdadcc5321e855 -r 8536414e03580828831dbd68ba6fd8a888630cd9 lib/galaxy/managers/citations.py
--- /dev/null
+++ b/lib/galaxy/managers/citations.py
@@ -0,0 +1,170 @@
+import pkg_resources
+
+import functools
+import os
+import urllib2
+
+pkg_resources.require('Beaker')
+from beaker.cache import CacheManager
+from beaker.util import parse_cache_config_options
+
+import logging
+log = logging.getLogger( __name__ )
+
+
+class CitationsManager( object ):
+
+ def __init__( self, app ):
+ self.app = app
+ self.doi_cache = DoiCache( app.config )
+
+ def citations_for_tool( self, tool ):
+ return tool.citations
+
+ def citations_for_tool_ids( self, tool_ids ):
+ citation_collection = CitationCollection()
+ for tool_id in tool_ids:
+ tool = self._get_tool( tool_id )
+ for citation in self.citations_for_tool( tool ):
+ citation_collection.add( citation )
+ return citation_collection.citations
+
+ def parse_citation( self, citation_elem, tool_directory ):
+ return parse_citation( citation_elem, tool_directory, self )
+
+ def _get_tool( self, tool_id ):
+ tool = self.app.toolbox.get_tool( tool_id )
+ return tool
+
+
+class DoiCache( object ):
+
+ def __init__( self, config ):
+ cache_opts = {
+ 'cache.type': getattr( config, 'citation_cache_type', 'file'),
+ 'cache.data_dir': getattr( config, 'citation_cache_data_dir', None),
+ 'cache.lock_dir': getattr( config, 'citation_cache_lock_dir', None),
+ }
+ self._cache = CacheManager(**parse_cache_config_options(cache_opts)).get_cache('doi')
+
+ def _raw_get_bibtex( self, doi ):
+ dx_url = "http://dx.doi.org/" + doi
+ headers = {'Accept': "text/bibliography; style=bibtex" }
+ req = urllib2.Request(dx_url, data="", headers=headers)
+ response = urllib2.urlopen(req)
+ bibtex = response.read()
+ return bibtex
+
+ def get_bibtex( self, doi ):
+ createfunc = functools.partial(self._raw_get_bibtex, doi)
+ return self._cache.get(key=doi, createfunc=createfunc)
+
+
+def parse_citation( elem, directory, citation_manager ):
+ """ Parse an abstract citation entry from the specified XML element.
+ The directory parameter should be used to find external files for this
+ citation.
+ """
+ citation_type = elem.attrib.get( 'type', None )
+ citation_class = CITATION_CLASSES.get( citation_type, None )
+ if not citation_class:
+ log.warn("Unknown or unspecified citation type: %s" % citation_type)
+ return None
+ return citation_class( elem, directory, citation_manager )
+
+
+class CitationCollection( object ):
+
+ def __init__( self ):
+ self.citations = []
+
+ def __iter__( self ):
+ return self.citations.__iter__()
+
+ def __len__( self ):
+ return len( self.citations )
+
+ def add( self, new_citation ):
+ for citation in self.citations:
+ if citation.equals( new_citation ):
+ # TODO: We have two equivalent citations, pick the more
+ # informative/complete/correct.
+ return False
+
+ self.citations.append( new_citation )
+ return True
+
+
+class BaseCitation( object ):
+
+ def to_dict( self, citation_format ):
+ if citation_format == "bibtex":
+ return dict(
+ format="bibtex",
+ content=self.to_bibtex(),
+ )
+ else:
+ raise Exception("Unknown citation format %s" % citation_format)
+
+ def equals( self, other_citation ):
+ if self.has_doi() and other_citation.has_doi():
+ return self.doi() == other_citation.doi()
+ else:
+ # TODO: Do a better job figuring out if this is the same citation.
+ return self.to_bibtex() == other_citation.to_bibtex()
+
+ def has_doi( self ):
+ return False
+
+
+class BibtexCitation( BaseCitation ):
+
+ def __init__( self, elem, directory, citation_manager ):
+ bibtex_file = elem.attrib.get("file", None)
+ if bibtex_file:
+ raw_bibtex = open(os.path.join(directory, bibtex_file), "r").read()
+ else:
+ raw_bibtex = elem.text.strip()
+ self._set_raw_bibtex( raw_bibtex )
+
+ def _set_raw_bibtex( self, raw_bibtex ):
+ self.raw_bibtex = raw_bibtex
+
+ def to_bibtex( self ):
+ return self.raw_bibtex
+
+
+class DoiCitation( BaseCitation ):
+ BIBTEX_UNSET = object()
+
+ def __init__( self, elem, directory, citation_manager ):
+ self.__doi = elem.text.strip()
+ self.doi_cache = citation_manager.doi_cache
+ self.raw_bibtex = DoiCitation.BIBTEX_UNSET
+
+ def has_doi( self ):
+ return True
+
+ def doi( self ):
+ return self.__doi
+
+ def to_bibtex( self ):
+ if self.raw_bibtex is DoiCitation.BIBTEX_UNSET:
+ try:
+ self.raw_bibtex = self.doi_cache.get_bibtex(self.__doi)
+ except Exception:
+ log.exception("Failed to fetch bibtex for DOI %s" % self.__doi)
+
+ if self.raw_bibtex is DoiCitation.BIBTEX_UNSET:
+ return """@MISC{%s,
+ DOI = '%s',
+ note = 'Failed to fetch BibTeX for DOI.'
+ }""" % (self.__doi, self.__doi)
+ else:
+ return self.raw_bibtex
+
+
+CITATION_CLASSES = dict(
+ bibtex=BibtexCitation,
+ doi=DoiCitation,
+)
diff -r 6fbe4d95a8dc64bd222dbf1170bdadcc5321e855 -r 8536414e03580828831dbd68ba6fd8a888630cd9 lib/galaxy/tools/__init__.py
--- a/lib/galaxy/tools/__init__.py
+++ b/lib/galaxy/tools/__init__.py
@@ -1360,6 +1360,9 @@
requirements, containers = parse_requirements_from_xml( root )
self.requirements = requirements
self.containers = containers
+
+ self.citations = self._parse_citations( root )
+
# Determine if this tool can be used in workflows
self.is_workflow_compatible = self.check_workflow_compatible(root)
# Trackster configuration.
@@ -1686,6 +1689,20 @@
trace_msg = repr( traceback.format_tb( trace ) )
log.error( "Traceback: %s" % trace_msg )
+ def _parse_citations( self, root ):
+ citations = []
+ citations_elem = root.find("citations")
+ if not citations_elem:
+ return citations
+
+ for citation_elem in citations_elem:
+ if citation_elem.tag != "citation":
+ pass
+ citation = self.app.citations_manager.parse_citation( citation_elem, self.tool_dir )
+ if citation:
+ citations.append( citation )
+ return citations
+
# TODO: This method doesn't have to be part of the Tool class.
def parse_error_level( self, err_level ):
"""
diff -r 6fbe4d95a8dc64bd222dbf1170bdadcc5321e855 -r 8536414e03580828831dbd68ba6fd8a888630cd9 lib/galaxy/webapps/galaxy/api/histories.py
--- a/lib/galaxy/webapps/galaxy/api/histories.py
+++ b/lib/galaxy/webapps/galaxy/api/histories.py
@@ -18,7 +18,7 @@
from galaxy.web.base.controller import ExportsHistoryMixin
from galaxy.web.base.controller import ImportsHistoryMixin
-from galaxy.managers import histories
+from galaxy.managers import histories, citations
from galaxy import util
from galaxy.util import string_as_bool
@@ -34,6 +34,7 @@
def __init__( self, app ):
super( HistoriesController, self ).__init__( app )
+ self.citations_manager = citations.CitationsManager( app )
self.mgrs = util.bunch.Bunch(
histories=histories.HistoryManager()
)
@@ -117,6 +118,20 @@
history_data[ 'contents_url' ] = url_for( 'history_contents', history_id=history_id )
return history_data
+ @expose_api_anonymous
+ def citations( self, trans, history_id, **kwd ):
+ history = self.mgrs.histories.get( trans, self._decode_id( trans, history_id ), check_ownership=False, check_accessible=True )
+ tool_ids = set([])
+ for dataset in history.datasets:
+ job = dataset.creating_job
+ if not job:
+ continue
+ tool_id = job.tool_id
+ if not tool_id:
+ continue
+ tool_ids.add(tool_id)
+ return map( lambda citation: citation.to_dict( "bibtex" ), self.citations_manager.citations_for_tool_ids( tool_ids ) )
+
@expose_api
def set_as_current( self, trans, id, **kwd ):
"""
diff -r 6fbe4d95a8dc64bd222dbf1170bdadcc5321e855 -r 8536414e03580828831dbd68ba6fd8a888630cd9 lib/galaxy/webapps/galaxy/api/tools.py
--- a/lib/galaxy/webapps/galaxy/api/tools.py
+++ b/lib/galaxy/webapps/galaxy/api/tools.py
@@ -1,6 +1,8 @@
import urllib
+from galaxy import exceptions
from galaxy import web, util
+from galaxy.web import _future_expose_api_anonymous
from galaxy.web.base.controller import BaseAPIController
from galaxy.web.base.controller import UsesVisualizationMixin
from galaxy.web.base.controller import UsesHistoryMixin
@@ -44,7 +46,7 @@
trans.response.status = 500
return { 'error': str( exc ) }
- @web.expose_api
+ @_future_expose_api_anonymous
def show( self, trans, id, **kwd ):
"""
GET /api/tools/{tool_id}
@@ -52,18 +54,16 @@
"""
io_details = util.string_as_bool( kwd.get( 'io_details', False ) )
link_details = util.string_as_bool( kwd.get( 'link_details', False ) )
- try:
- id = urllib.unquote_plus( id )
- tool = self.app.toolbox.get_tool( id )
- if not tool:
- trans.response.status = 404
- return { 'error': 'tool not found', 'id': id }
- return tool.to_dict( trans, io_details=io_details, link_details=link_details )
+ tool = self._get_tool( id )
+ return tool.to_dict( trans, io_details=io_details, link_details=link_details )
- except Exception, exc:
- log.error( 'could not convert tool (%s) to dictionary: %s', id, str( exc ), exc_info=True )
- trans.response.status = 500
- return { 'error': str( exc ) }
+ @_future_expose_api_anonymous
+ def citations( self, trans, id, **kwds ):
+ tool = self._get_tool( id )
+ rval = []
+ for citation in tool.citations:
+ rval.append( citation.to_dict( 'bibtex' ) )
+ return rval
@web.expose_api_anonymous
def create( self, trans, payload, **kwd ):
@@ -170,6 +170,12 @@
#
# -- Helper methods --
#
+ def _get_tool( self, id ):
+ id = urllib.unquote_plus( id )
+ tool = self.app.toolbox.get_tool( id )
+ if not tool:
+ raise exceptions.ObjectNotFound("Could not find tool with id '%s'" % id)
+ return tool
def _rerun_tool( self, trans, payload, **kwargs ):
"""
diff -r 6fbe4d95a8dc64bd222dbf1170bdadcc5321e855 -r 8536414e03580828831dbd68ba6fd8a888630cd9 lib/galaxy/webapps/galaxy/buildapp.py
--- a/lib/galaxy/webapps/galaxy/buildapp.py
+++ b/lib/galaxy/webapps/galaxy/buildapp.py
@@ -174,6 +174,7 @@
webapp.mapper.resource( 'ftp_file', 'ftp_files', path_prefix='/api' )
webapp.mapper.resource( 'group', 'groups', path_prefix='/api' )
webapp.mapper.resource_with_deleted( 'quota', 'quotas', path_prefix='/api' )
+ webapp.mapper.connect( '/api/tools/{id:.+?}/citations', action='citations', controller="tools" )
webapp.mapper.connect( '/api/tools/{id:.+?}', action='show', controller="tools" )
webapp.mapper.resource( 'tool', 'tools', path_prefix='/api' )
webapp.mapper.resource_with_deleted( 'user', 'users', path_prefix='/api' )
@@ -181,6 +182,7 @@
webapp.mapper.resource( 'visualization', 'visualizations', path_prefix='/api' )
webapp.mapper.resource( 'workflow', 'workflows', path_prefix='/api' )
webapp.mapper.resource_with_deleted( 'history', 'histories', path_prefix='/api' )
+ webapp.mapper.connect( '/api/histories/{history_id}/citations', action='citations', controller="histories" )
webapp.mapper.resource( 'configuration', 'configuration', path_prefix='/api' )
webapp.mapper.resource( 'datatype',
'datatypes',
diff -r 6fbe4d95a8dc64bd222dbf1170bdadcc5321e855 -r 8536414e03580828831dbd68ba6fd8a888630cd9 lib/galaxy/webapps/galaxy/controllers/history.py
--- a/lib/galaxy/webapps/galaxy/controllers/history.py
+++ b/lib/galaxy/webapps/galaxy/controllers/history.py
@@ -448,6 +448,13 @@
# ......................................................................... html
@web.expose
+ def citations( self, trans ):
+ # Get history
+ history = trans.history
+ history_id = trans.security.encode_id( history.id )
+ return trans.fill_template( "history/citations.mako", history=history, history_id=history_id )
+
+ @web.expose
def display_structured( self, trans, id=None ):
"""
Display a history as a nested structure showing the jobs and workflow
This diff is so big that we needed to truncate the remainder.
https://bitbucket.org/galaxy/galaxy-central/commits/d5870d822b24/
Changeset: d5870d822b24
User: jmchilton
Date: 2014-07-28 19:15:32
Summary: Merged in jmchilton/galaxy-central-fork-1 (pull request #440)
Initial BibTeX/DOI citation support in tools and histories.
Affected #: 16 files
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
--
This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.
commit/galaxy-central: jmchilton: Bugfix: Fix shutil.move for converted files in upload.py.
by commits-noreply@bitbucket.org 28 Jul '14
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/6292ada3115b/
Changeset: 6292ada3115b
User: jmchilton
Date: 2014-06-10 16:36:28
Summary: Bugfix: Fix shutil.move for converted files in upload.py.
Affected #: 2 files
diff -r 44081a39d8f0f36d92991a6faece6636914c1169 -r 6292ada3115b4d2b658d1e574d20b08d887e7bec lib/galaxy/datatypes/sniff.py
--- a/lib/galaxy/datatypes/sniff.py
+++ b/lib/galaxy/datatypes/sniff.py
@@ -90,7 +90,7 @@
f.close()
return False
-def convert_newlines( fname, in_place=True ):
+def convert_newlines( fname, in_place=True, tmp_dir=None, tmp_prefix=None ):
"""
Converts in place a file from universal line endings
to Posix line endings.
@@ -102,7 +102,7 @@
>>> file(fname).read()
'1 2\\n3 4\\n'
"""
- fd, temp_name = tempfile.mkstemp()
+ fd, temp_name = tempfile.mkstemp( prefix=tmp_prefix, dir=tmp_dir )
fp = os.fdopen( fd, "wt" )
i = None
for i, line in enumerate( file( fname, "U" ) ):
@@ -150,7 +150,7 @@
else:
return ( i, temp_name )
-def convert_newlines_sep2tabs( fname, in_place=True, patt="\\s+" ):
+def convert_newlines_sep2tabs( fname, in_place=True, patt="\\s+", tmp_dir=None, tmp_prefix=None ):
"""
Combines above methods: convert_newlines() and sep2tabs()
so that files do not need to be read twice
@@ -163,7 +163,7 @@
'1\\t2\\n3\\t4\\n'
"""
regexp = re.compile( patt )
- fd, temp_name = tempfile.mkstemp()
+ fd, temp_name = tempfile.mkstemp( prefix=tmp_prefix, dir=tmp_dir )
fp = os.fdopen( fd, "wt" )
for i, line in enumerate( file( fname, "U" ) ):
line = line.rstrip( '\r\n' )
diff -r 44081a39d8f0f36d92991a6faece6636914c1169 -r 6292ada3115b4d2b658d1e574d20b08d887e7bec tools/data_source/upload.py
--- a/tools/data_source/upload.py
+++ b/tools/data_source/upload.py
@@ -272,10 +272,12 @@
# so that is becomes possible to upload gzip, bz2 or zip files with binary data without
# corrupting the content of those files.
if dataset.to_posix_lines:
+ tmpdir = output_adjacent_tmpdir( output_path )
+ tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
if dataset.space_to_tab:
- line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place )
+ line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
else:
- line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place )
+ line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
if dataset.file_type == 'auto':
ext = sniff.guess_ext( dataset.path, registry.sniff_order )
else:
@@ -343,10 +345,12 @@
dataset.path = temp_name
dp = temp_name
if not value.is_binary:
+ tmpdir = output_adjacent_tmpdir( output_path )
+ tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ):
- sniff.convert_newlines_sep2tabs( dp )
+ sniff.convert_newlines_sep2tabs( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
else:
- sniff.convert_newlines( dp )
+ sniff.convert_newlines( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
shutil.move( dp, os.path.join( files_path, name ) )
# Move the dataset to its "real" path
shutil.move( dataset.primary_file, output_path )
@@ -356,6 +360,15 @@
stdout = 'uploaded %s file' % dataset.file_type )
json_file.write( to_json_string( info ) + "\n" )
+
+def output_adjacent_tmpdir( output_path ):
+ """ For temp files that will ultimately be moved to output_path anyway
+ just create the file directly in output_path's directory so shutil.move
+ will work optimially.
+ """
+ return os.path.dirname( output_path )
+
+
def __main__():
if len( sys.argv ) < 4:
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
--
This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.
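The new output_adjacent_tmpdir() helper above exists because shutil.move() degrades to a copy-and-delete when source and destination live on different filesystems, but is a cheap rename when they share one. A minimal sketch of the pattern, with placeholder paths and content:

import os
import shutil
import tempfile

output_path = "/galaxy/files/dataset_42.dat"  # placeholder final location

# Create the temp file in the output's own directory so the final
# shutil.move() is a same-filesystem rename rather than a data copy.
fd, temp_name = tempfile.mkstemp(prefix="data_id_42_convert_",
                                 dir=os.path.dirname(output_path))
with os.fdopen(fd, "wt") as fp:
    fp.write("converted content\n")
shutil.move(temp_name, output_path)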
commit/galaxy-central: davebgx: Improve fetching of api key for newly created user.
by commits-noreply@bitbucket.org 28 Jul '14
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/44081a39d8f0/
Changeset: 44081a39d8f0
User: davebgx
Date: 2014-07-28 18:17:47
Summary: Improve fetching of api key for newly created user.
Affected #: 1 file
diff -r 799ceff557bbbfea80cbfd164a297a0364b0c699 -r 44081a39d8f0f36d92991a6faece6636914c1169 lib/tool_shed/scripts/bootstrap_tool_shed/bootstrap_tool_shed.sh
--- a/lib/tool_shed/scripts/bootstrap_tool_shed/bootstrap_tool_shed.sh
+++ b/lib/tool_shed/scripts/bootstrap_tool_shed/bootstrap_tool_shed.sh
@@ -73,12 +73,11 @@
fi
done
-echo -n "Retrieving admin user's API key..."
-api_key_json=`curl -s --user $admin_user_email:$admin_user_password $local_shed_url/api/authenticate/baseauth/`
-api_key=`echo $api_key_json | grep api_key | awk 'BEGIN { FS="\"" } ; { print \$4 }' | sed 's/\\s\+//'`
+echo -n "Retrieving admin user's API key from $local_shed_url..."
+api_key=`curl -s --user $admin_user_email:$admin_user_password $local_shed_url/api/authenticate/baseauth/ | sed 's/.\+api_key[^0-9a-f]\+\([0-9a-f]\+\).\+/\1/'`
if [[ -z $api_key && ${api_key+x} ]] ; then
- stop_err "Error getting API key for user $admin_user_email."
+ stop_err "Error getting API key for user $admin_user_email."
fi
echo " done."
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
--
This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.
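The sed pipeline above extracts the key from the JSON returned by /api/authenticate/baseauth/. For comparison, a sketch of the same request in Python - the endpoint and the api_key field are taken from the script being replaced; the URL and credentials are placeholders:

import base64
import json
import urllib2

shed_url = "http://localhost:9009"  # placeholder Tool Shed URL
email, password = "admin@example.org", "secret"  # placeholder credentials

request = urllib2.Request(shed_url + "/api/authenticate/baseauth/")
request.add_header("Authorization",
                   "Basic " + base64.b64encode("%s:%s" % (email, password)))
print json.loads(urllib2.urlopen(request).read())["api_key"]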
2 new commits in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/42cc410f02da/
Changeset: 42cc410f02da
User: davebgx
Date: 2014-07-28 17:58:52
Summary: Migrate tools from the distribution to the tool shed.
Affected #: 33 files
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 lib/tool_shed/galaxy_install/migrate/versions/0012_tools.py
--- /dev/null
+++ b/lib/tool_shed/galaxy_install/migrate/versions/0012_tools.py
@@ -0,0 +1,48 @@
+"""
+The following tools have been eliminated from the distribution:
+
+1: Compute an expression on every row
+2: Correlation for numeric columns
+3: Count GFF Features
+4: Filter on ambiguities in polymorphism datasets
+5: Generate A Matrix for using PC and LDA
+6: Histogram of a numeric column
+7: Perform Linear Discriminant Analysis
+8: Maximal Information-based Nonparametric Exploration
+9: Pearson and apos Correlation between any two numeric columns
+10: Convert from pgSnp to gd_snp
+11: Draw ROC plot on "Perform LDA" output
+12: Scatterplot of two numeric columns
+13: snpFreq significant SNPs in case-control data
+14: Build custom track for UCSC genome browser
+15: VCF to pgSnp
+
+The tools are now available in the repositories respectively:
+
+1: column_maker
+2: correlation
+3: count_gff_features
+4: dna_filtering
+5: generate_pc_lda_matrix
+6: histogram
+7: lda_analysis
+8: mine
+9: pearson_correlation
+10: pgsnp2gd_snp
+11: plot_from_lda
+12: scatterplot
+13: snpfreq
+14: ucsc_custom_track
+15: vcf2pgsnp
+
+from the main Galaxy tool shed at http://toolshed.g2.bx.psu.edu
+and will be installed into your local Galaxy instance at the
+location discussed above by running the following command.
+
+"""
+
+def upgrade( migrate_engine ):
+ print __doc__
+
+def downgrade( migrate_engine ):
+ pass
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 scripts/migrate_tools/0012_tools.sh
--- /dev/null
+++ b/scripts/migrate_tools/0012_tools.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+cd `dirname $0`/../..
+python ./scripts/migrate_tools/migrate_tools.py 0012_tools.xml $@
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 scripts/migrate_tools/0012_tools.xml
--- /dev/null
+++ b/scripts/migrate_tools/0012_tools.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0"?>
+<toolshed name="toolshed.g2.bx.psu.edu">
+ <repository changeset_revision="08a01b2ce4cd" owner="devteam" name="column_maker" description="Compute an expression on every row">
+ <tool file="stats/column_maker.xml" id="Add_a_column1" version="1.1.0" />
+ </repository>
+ <repository changeset_revision="24e01abf9e34" owner="devteam" name="correlation" description="Correlation for numeric columns">
+ <tool file="stats/cor.xml" id="cor2" version="1.0.0" />
+ </repository>
+ <repository changeset_revision="fabda887a71f" owner="devteam" name="count_gff_features" description="Count GFF Features">
+ <tool file="stats/count_gff_features.xml" id="count_gff_features" version="0.1" />
+ </repository>
+ <repository changeset_revision="a6f0d355b05f" owner="devteam" name="dna_filtering" description="Filter on ambiguities in polymorphism datasets">
+ <tool file="stats/dna_filtering.xml" id="dna_filter" version="1.0.0" />
+ </repository>
+ <repository changeset_revision="04cdbd00dcec" owner="devteam" name="generate_pc_lda_matrix" description="Generate A Matrix for using PC and LDA">
+ <tool file="stats/generate_matrix_for_pca_lda.xml" id="generate_matrix_for_pca_and_lda1" version="1.0.0" />
+ </repository>
+ <repository changeset_revision="6ff47de059a0" owner="devteam" name="histogram" description="Histogram of a numeric column">
+ <tool file="plotting/histogram2.xml" id="histogram_rpy" version="1.0.3" />
+ </repository>
+ <repository changeset_revision="f38763b52f33" owner="devteam" name="lda_analysis" description="Perform Linear Discriminant Analysis">
+ <tool file="stats/lda_analy.xml" id="lda_analy1" version="1.0.1" />
+ </repository>
+ <repository changeset_revision="783d91de9e6d" owner="devteam" name="mine" description="Maximal Information-based Nonparametric Exploration">
+ <tool file="stats/MINE.xml" id="maximal_information_based_nonparametric_exploration" version="0.0.1" />
+ </repository>
+ <repository changeset_revision="5ebbb889236a" owner="devteam" name="pearson_correlation" description="Pearson and apos Correlation between any two numeric columns">
+ <tool file="stats/correlation.xml" id="Pearson_and_apos_Correlation1" version="1.0.0" />
+ </repository>
+ <repository changeset_revision="d281062566f9" owner="devteam" name="pgsnp2gd_snp" description="Convert from pgSnp to gd_snp">
+ <tool file="phenotype_association/pgSnp2gd_snp.xml" id="pgSnp2gd_snp" version="1.0.0" />
+ </repository>
+ <repository changeset_revision="c5ab37076128" owner="devteam" name="plot_from_lda" description="Draw ROC plot on "Perform LDA" output">
+ <tool file="stats/plot_from_lda.xml" id="plot_for_lda_output1" version="1.0.1" />
+ </repository>
+ <repository changeset_revision="c12b0759203b" owner="devteam" name="scatterplot" description="Scatterplot of two numeric columns">
+ <tool file="plotting/scatterplot.xml" id="scatterplot_rpy" version="1.0.0" />
+ </repository>
+ <repository changeset_revision="72ea0d13dd66" owner="devteam" name="snpfreq" description="snpFreq significant SNPs in case-control data">
+ <tool file="phenotype_association/snpFreq.xml" id="hgv_snpFreq" version="1.0.1" />
+ </repository>
+ <repository changeset_revision="618e56c3109b" owner="devteam" name="ucsc_custom_track" description="Build custom track for UCSC genome browser">
+ <tool file="visualization/build_ucsc_custom_track.xml" id="build_ucsc_custom_track_1" version="1.0.0" />
+ </repository>
+ <repository changeset_revision="5fca46616675" owner="devteam" name="vcf2pgsnp" description="VCF to pgSnp">
+ <tool file="phenotype_association/vcf2pgSnp.xml" id="vcf2pgSnp" version="1.0.0" />
+ </repository>
+</toolshed>
\ No newline at end of file
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tool_conf.xml.sample
--- a/tool_conf.xml.sample
+++ b/tool_conf.xml.sample
@@ -36,7 +36,6 @@
   </section>
   <section id="textutil" name="Text Manipulation">
     <tool file="filters/fixedValueColumn.xml" />
-    <tool file="stats/column_maker.xml" />
     <tool file="filters/catWrapper.xml" />
     <tool file="filters/cutWrapper.xml" />
     <tool file="filters/mergeCols.xml" />
@@ -52,7 +51,6 @@
     <tool file="filters/trimmer.xml" />
     <tool file="filters/wc_gnu.xml" />
     <tool file="filters/secure_hash_message_digest.xml" />
-    <tool file="stats/dna_filtering.xml" />
   </section>
   <section id="filter" name="Filter and Sort">
     <tool file="stats/filtering.xml" />
@@ -113,22 +111,11 @@
   <section id="stats" name="Statistics">
     <tool file="stats/gsummary.xml" />
     <tool file="filters/uniq.xml" />
-    <tool file="stats/cor.xml" />
-    <tool file="stats/generate_matrix_for_pca_lda.xml" />
-    <tool file="stats/lda_analy.xml" />
-    <tool file="stats/plot_from_lda.xml" />
-    <tool file="stats/MINE.xml" />
-
-    <label id="gff" text="GFF" />
-    <tool file="stats/count_gff_features.xml" />
   </section>
   <section id="plots" name="Graph/Display Data">
-    <tool file="plotting/histogram2.xml" />
-    <tool file="plotting/scatterplot.xml" />
     <tool file="plotting/bar_chart.xml" />
     <tool file="plotting/boxplot.xml" />
     <tool file="visualization/LAJ.xml" />
-    <tool file="visualization/build_ucsc_custom_track.xml" />
     <tool file="maf/vcf_to_maf_customtrack.xml" />
     <tool file="mutation/visualize.xml" />
   </section>
@@ -189,13 +176,11 @@
     <tool file="phenotype_association/sift.xml" />
     <tool file="phenotype_association/linkToGProfile.xml" />
     <tool file="phenotype_association/linkToDavid.xml" />
-    <tool file="phenotype_association/snpFreq.xml" />
     <tool file="phenotype_association/ldtools.xml" />
     <tool file="phenotype_association/pass.xml" />
     <tool file="phenotype_association/gpass.xml" />
     <tool file="phenotype_association/beam.xml" />
     <tool file="phenotype_association/lps.xml" />
     <tool file="phenotype_association/master2pg.xml" />
-    <tool file="phenotype_association/vcf2pgSnp.xml" />
   </section>
 </toolbox>
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/phenotype_association/pgSnp2gd_snp.pl
--- a/tools/phenotype_association/pgSnp2gd_snp.pl
+++ /dev/null
@@ -1,208 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-#convert from pgSnp file to snp table (Webb format?)
-
-#snp table format:
-#1. chr
-#2. position (0 based)
-#3. ref allele
-#4. second allele
-#5. overall quality
-#foreach individual (6-9, 10-13, ...)
-#a. count of allele in 3
-#b. count of allele in 4
-#c. genotype call (-1, or count of ref allele)
-#d. quality of genotype call (quality of non-ref allele from masterVar)
-
-if (!@ARGV) {
- print "usage: pgSnp2gd_snp.pl file.pgSnp[.gz|.bz2] [-tab=snpTable.txt -addColsOnly -build=hg19 -name=na -ref=#1based -chr=#1based ] > newSnpTable.txt\n";
- exit;
-}
-
-my $in = shift @ARGV;
-my $tab;
-my $tabOnly;
-my $build;
-my $name;
-my $ref;
-my $binChr = 1; #position of chrom column, indicates if bin is added
-foreach (@ARGV) {
- if (/-tab=(.*)/) { $tab = $1; }
- elsif (/-addColsOnly/) { $tabOnly = 1; }
- elsif (/-build=(.*)/) { $build = $1; }
- elsif (/-name=(.*)/) { $name = $1; }
- elsif (/-ref=(\d+)/) { $ref = $1 - 1; } #go to index
- elsif (/-chr=(\d+)/) { $binChr = $1; }
-}
-
-if ($binChr == 2 && $ref) { $ref--; } #shift over by 1, we will delete bin
-if ((!$tab or !$tabOnly) && !$ref) {
- print "Error the reference allele must be in a column in the file if not just adding to a previous SNP table.\n";
- exit;
-}
-
-#WARNING loads snp table in memory, this could take > 1G ram
-my %old;
-my $colcnt = 0;
-my @head;
-if ($tab) {
- open(FH, $tab) or die "Couldn't open $tab, $!\n";
- while (<FH>) {
- chomp;
- if (/^#/) { push(@head, $_); next; }
- my @f = split(/\t/);
- $old{"$f[0]:$f[1]"} = join("\t", @f);
- $colcnt = scalar @f;
- }
- close FH or die "Couldn't close $tab, $!\n";
-}
-
-if ($in =~ /.gz$/) {
- open(FH, "zcat $in |") or die "Couldn't open $in, $!\n";
-}elsif ($in =~ /.bz2$/) {
- open(FH, "bzcat $in |") or die "Couldn't open $in, $!\n";
-}else {
- open(FH, $in) or die "Couldn't open $in, $!\n";
-}
-prepHeader();
-if (@head) { #keep old header, add new?
- print join("\n", @head), "\n";
-}
-while (<FH>) {
- chomp;
- if (/^#/) { next; }
- if (/^\s*$/) { next; }
- my @f = split(/\t/);
- if ($binChr == 2) { #must have a bin column prepended on the beginning
- shift @f; #delete it
- }
- if (!$f[3]) { next; } #WHAT? most likely still zipped?
- if ($f[4] > 2) { next; } #can only do cases of 2 alleles
- if ($f[2] == $f[1] or $f[2] - $f[1] != 1) { next; } #no indels
- if ($f[3] =~ /-/) { next; } #no indels
- #if creating a new table need the reference allele in a column
- if (%old && $old{"$f[0]:$f[1]"}) {
- my @o = split(/\t/, $old{"$f[0]:$f[1]"});
- my $freq = 0;
- my $freq2 = 0;
- my $sc;
- my $g = 1; #genotype == ref allele count
- if ($f[4] == 1) { #should be homozygous
- if ($f[3] eq $o[2]) { $g = 2; $freq = $f[5]; }
- elsif ($f[3] eq $o[3]) { $g = 0; $freq2 = $f[5]; }
- else { next; } #doesn't match either allele, skip
- $sc = $f[6];
- }else {
- my $a = 0; #index of a alleles, freq, scores
- my $b = 1; #same for b
- my @all = split(/\//, $f[3]);
- if ($o[2] ne $all[0] && $o[2] ne $all[1]) { next; } #must match one
- if ($o[3] ne $all[0] && $o[3] ne $all[1]) { next; }
- if ($o[2] eq $all[1]) { #switch indexes
- $a = 1;
- $b = 0;
- }
- my @fr = split(/,/, $f[5]);
- $freq = $fr[$a];
- $freq2 = $fr[$b];
- my @s = split(/,/, $f[6]);
- $sc = $s[$b];
- }
- #print old
- print $old{"$f[0]:$f[1]"};
- #add new columns
- print "\t$freq\t$freq2\t$g\t$sc\n";
- $old{"$f[0]:$f[1]"} = '';
- }elsif (!$tabOnly) { #new table, or don't have this SNP
- #need reference allele
- if ($f[3] !~ /$f[$ref]/ && $f[4] == 2) { next; } #no reference allele
- my $freq = 0;
- my $freq2 = 0;
- my $sc;
- my $g = 1; #genotype == ref allele count
- my $alt;
- if ($f[4] == 1) { #should be homozygous
- if ($f[3] eq $f[$ref]) { $g = 2; $freq = $f[5]; $alt = 'N'; }
- else { $g = 0; $freq2 = $f[5]; $alt = $f[3]; } #matches alternate
- $sc = $f[6];
- }else {
- my $a = 0; #index of a alleles, freq, scores
- my $b = 1; #same for b
- my @all = split(/\//, $f[3]);
- if ($f[$ref] ne $all[0] && $f[$ref] ne $all[1]) { next; } #must match one
- if ($f[$ref] eq $all[1]) { #switch indexes
- $a = 1;
- $b = 0;
- }
- my @fr = split(/,/, $f[5]);
- $freq = $fr[$a];
- $freq2 = $fr[$b];
- my @s = split(/,/, $f[6]);
- $sc = $s[$b];
- $alt = $all[$b];
- }
- #print initial columns
- print "$f[0]\t$f[1]\t$f[$ref]\t$alt\t-1";
- #pad for other individuals if needed
- my $i = 5;
- while ($i < $colcnt) {
- print "\t-1\t-1\t-1\t-1";
- $i += 4;
- }
- #add new columns
- print "\t$freq\t$freq2\t$g\t$sc\n";
- }
-}
-close FH or die "Couldn't close $in, $!\n";
-
-#if adding to a snp table, now we need to finish those not in the latest set
-foreach my $k (keys %old) {
- if ($old{$k} ne '') { #not printed yet
- print $old{$k}, "\t-1\t-1\t-1\t-1\n"; #plus blank for this one
- }
-}
-
-exit;
-
-#parse old header and add or create new
-sub prepHeader {
- if (!$build) { $build = 'hg19'; } #set default
- my @cnames;
- my @ind;
- my $n;
- if (@head) { #parse previous header
- my $h = join("", @head); #may split between lines
- if ($h =~ /"column_names":\[(.*?)\]/) {
- my @t = split(/,/, $1);
- foreach (@t) { s/"//g; }
- @cnames = @t;
- $n = $cnames[$#cnames];
- $n =~ s/Q//;
- $n++;
- }
- if ($h =~ /"dbkey":"(.*?)"/) { $build = $1; }
- if ($h =~ /"individuals":\[(.*)\]/) {
- my $t = $1;
- $t =~ s/\]\].*/]/; #remove if there are more categories
- @ind = split(/,/, $t);
- }
- }else { #start new header
- @cnames = ("chr", "pos", "A", "B", "Q");
- $n = 1;
- }
- #add current
- if (!$name) { $name= 'na'; }
- my $stcol = $colcnt + 1;
- if ($stcol == 1) { $stcol = 6; } #move past initial columns
- push(@ind, "[\"$name\",$stcol]");
- push(@cnames, "${n}A", "${n}B", "${n}G", "${n}Q");
- #reassign head
- undef @head;
- foreach (@cnames) { $_ = "\"$_\""; } #quote name
- $head[0] = "#{\"column_names\":[" . join(",", @cnames) . "],";
- $head[1] = "#\"individuals\":[" . join(",", @ind) . "],";
- $head[2] = "#\"dbkey\":\"$build\",\"pos\":2,\"rPos\":2,\"ref\":1,\"scaffold\":1,\"species\":\"$build\"}";
-}
-####End
-
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/phenotype_association/pgSnp2gd_snp.xml
--- a/tools/phenotype_association/pgSnp2gd_snp.xml
+++ /dev/null
@@ -1,97 +0,0 @@
-<tool id="pgSnp2gd_snp" name="pgSnp to gd_snp" hidden="false">
- <description>Convert from pgSnp to gd_snp</description>
- <command interpreter="perl">
- #if $snptab.tab2 == "yes"
- #if $snptab.colsOnly == "addColsOnly" #pgSnp2gd_snp.pl $input1 -tab=$snptab.input2 -name=$indName -build=${input1.metadata.dbkey} -addColsOnly -chr=${input1.metadata.chromCol} > $out_file1
- #else #pgSnp2gd_snp.pl $input1 -tab=$snptab.input2 -name=$indName -build=${input1.metadata.dbkey} -ref=${ref} -chr=${input1.metadata.chromCol} > $out_file1
- #end if
- #else #pgSnp2gd_snp.pl $input1 -name=$indName -build=${input1.metadata.dbkey} -ref=${ref} -chr=${input1.metadata.chromCol} > $out_file1
- #end if
- </command>
- <inputs>
- <param format="tab" name="input1" type="data" label="pgSnp dataset" />
- <conditional name="snptab">
- <param name="tab2" type="select" label="Append to gd_snp dataset in history">
- <option value="yes">yes</option>
- <option value="no" selected="true">no</option>
- </param>
- <when value="yes">
- <param format="gd_snp" name="input2" type="data" label="gd_snp dataset" />
- <conditional name="needRef">
- <param name="colsOnly" type="select" label="Skip new SNPs">
- <option value="no" selected="true">no</option>
- <option value="addColsOnly">yes</option>
- </param>
- <when value="no">
- <param name="ref" type="data_column" data_ref="input1" label="Column with reference allele" />
- </when>
- <when value="addColsOnly"><!-- do nothing -->
- </when>
- </conditional>
- </when>
- <when value="no">
- <param name="ref" type="data_column" data_ref="input1" label="Column with reference allele" />
- </when>
- </conditional>
- <param name="indName" type="text" size="20" label="Label for new individual/group" value="na" />
- </inputs>
- <outputs>
- <data format="gd_snp" name="out_file1" />
- </outputs>
- <tests>
- <test>
- <param name='input1' value='pgSnpTest.ref.txt' ftype='interval' />
- <param name='tab2' value='no' />
- <param name='ref' value='8' />
- <param name='indName' value='na' />
- <output name="output" file="pgSnp2snp_output.txt" />
- </test>
- </tests>
-
- <help>
-
-**Dataset formats**
-
-The input dataset is of Galaxy datatype interval_, with the additional columns
-required for pgSnp_ format.
-Any further columns beyond those defined for pgSnp will be ignored.
-The output dataset is a gd_snp_ table. (`Dataset missing?`_)
-
-.. _interval: ./static/formatHelp.html#interval
-.. _pgSnp: ./static/formatHelp.html#pgSnp
-.. _gd_snp: ./static/formatHelp.html#gd_snp
-.. _Dataset missing?: ./static/formatHelp.html
-
------
-
-**What it does**
-
-This tool converts a pgSnp dataset to gd_snp format, either starting a new
-dataset or appending to an old one. When appending,
-if any new SNPs appear only in the pgSnp file they can either be skipped entirely, or
-backfilled with "-1" (meaning "unknown") for previous individuals/groups in the
-input gd_snp dataset.
-If any new SNPs are being added (either by creating a new table or by backfilling),
-then an extra column with the reference allele must be supplied in the pgSnp dataset,
-as shown in the example below.
-
------
-
-**Example**
-
-- input pgSnp file, with reference allele added::
-
- chr1 1888681 1888682 C/T 2 4,3 0.8893,0.8453 T
- chr1 3118325 3118326 T 1 8 0.8796 C
- chr1 3211457 3211458 A/C 2 17,10 0.8610,0.8576 A
- etc.
-
-- gd_snp output::
-
- chr1 1888681 T C -1 3 4 1 0.8893
- chr1 3118325 C T -1 0 8 0 0.8796
- chr1 3211457 A C -1 17 10 1 0.8576
- etc.
-
-</help>
-</tool>
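
The help text above spells out the pgSnp-to-gd_snp column mapping that this
changeset removes. A minimal Python sketch of the per-row conversion, for
reference only (not part of the changeset; it assumes bi-allelic sites and
omits the append/backfill path and the -1 padding for earlier individuals)::

    # Sketch: convert one pgSnp row (with a trailing reference-allele column)
    # into gd_snp columns, mirroring the removed pgSnp2gd_snp.pl "new table" path.
    def pgsnp_row_to_gd_snp(fields, ref_col):
        """fields: a split pgSnp line; ref_col: 0-based index of the ref allele."""
        chrom, start, alleles, count = fields[0], fields[1], fields[3], int(fields[4])
        ref = fields[ref_col]
        if count == 1:                        # homozygous call
            match = alleles == ref
            freq_ref, freq_alt = (fields[5], 0) if match else (0, fields[5])
            genotype = 2 if match else 0
            alt = 'N' if match else alleles
            score = fields[6]
        elif count == 2:                      # heterozygous: two alleles, two freqs
            a, b = alleles.split('/')
            if ref not in (a, b):
                return None                   # reference must match one allele
            freqs = fields[5].split(',')
            scores = fields[6].split(',')
            swap = (a != ref)                 # put the reference allele first
            freq_ref, freq_alt = (freqs[1], freqs[0]) if swap else (freqs[0], freqs[1])
            alt = a if swap else b
            genotype = 1
            score = scores[0] if swap else scores[1]
        else:
            return None                       # only bi-allelic sites are handled
        return [chrom, start, ref, alt, '-1', str(freq_ref), str(freq_alt),
                str(genotype), str(score)]

    line = "chr1\t1888681\t1888682\tC/T\t2\t4,3\t0.8893,0.8453\tT"
    print('\t'.join(pgsnp_row_to_gd_snp(line.split('\t'), ref_col=7)))
    # -> chr1  1888681  T  C  -1  3  4  1  0.8893, as in the example above
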
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/phenotype_association/snpFreq.xml
--- a/tools/phenotype_association/snpFreq.xml
+++ /dev/null
@@ -1,124 +0,0 @@
-<tool id="hgv_snpFreq" name="snpFreq" version="1.0.1">
- <description>significant SNPs in case-control data</description>
-
- <command interpreter="perl">
- snpFreq2.pl $inTypeCond.inType 0.05 $input $output
- #if $inTypeCond.inType == "tab"
- $inTypeCond.group1_1 $inTypeCond.group1_2 $inTypeCond.group1_3
- $inTypeCond.group2_1 $inTypeCond.group2_2 $inTypeCond.group2_3 0.05
- #else if $inTypeCond.inType == "snp"
- $group1 $group2
- #end if
- </command>
-
- <inputs>
- <conditional name="inTypeCond">
- <param name="inType" type="select" label="Format of input" >
- <option value="tab">Alleles pre-counted</option>
- <option value="snp">SNP table</option>
- </param>
- <when value="tab">
- <param format="tabular" name="input" type="data" label="Dataset" />
- <param name="group1_1" label="Column with genotype 1 count for group 1" type="data_column" data_ref="input" />
- <param name="group1_2" label="Column with genotype 2 count for group 1" type="data_column" data_ref="input" />
- <param name="group1_3" label="Column with genotype 3 count for group 1" type="data_column" data_ref="input" />
- <param name="group2_1" label="Column with genotype 1 count for group 2" type="data_column" data_ref="input" />
- <param name="group2_2" label="Column with genotype 2 count for group 2" type="data_column" data_ref="input" />
- <param name="group2_3" label="Column with genotype 3 count for group 2" type="data_column" data_ref="input" />
- </when>
- <when value="snp">
- <param format="snp" name="input" type="data" label="SNP Dataset" />
- <param format="ind" name="group1" type="data" label="Group 1" />
- <param format="ind" name="group2" type="data" label="Group 2" />
- </when>
- </conditional>
- </inputs>
-
- <outputs>
- <data format="tabular" name="output" />
- </outputs>
-
- <requirements>
- <requirement type="binary">R</requirement>
- </requirements>
-
- <tests>
- <test>
- <param name="inType" value="tab" />
- <param name="input" ftype="tabular" value="snpFreqInput.txt" dbkey="hg18" />
- <param name="group1_1" value="4" />
- <param name="group1_2" value="5" />
- <param name="group1_3" value="6" />
- <param name="group2_1" value="7" />
- <param name="group2_2" value="8" />
- <param name="group2_3" value="9" />
- <output name="output" file="snpFreqTestOut.txt" />
- </test>
- </tests>
-
- <help>
-
-**Dataset formats**
-
-The input is tabular_, with six columns of allele counts. The output is also tabular,
-and includes all of the input data plus the additional columns described below.
-(`Dataset missing?`_)
-
-.. _tabular: ${static_path}/formatHelp.html#tab
-.. _Dataset missing?: ${static_path}/formatHelp.html
-
------
-
-**What it does**
-
-This tool performs a basic analysis of bi-allelic SNPs in case-control
-data, using the R statistical environment and Fisher's exact test to
-identify SNPs with a significant difference in the allele frequencies
-between the two groups. R's "qvalue" package is used to correct for
-multiple testing.
-
-The input file includes counts for each allele combination (AA aa Aa)
-for each group at each SNP position. The assignment of codes (1 2 3)
-to these genotypes is arbitrary, as long as it is consistent for both
-groups. Any other input columns are ignored in the computation, but
-are copied to the output. The output appends eight additional columns,
-namely the minimum expected counts of the three genotypes for each
-group, the p-value, and the q-value.
-
------
-
-**Example**
-
-- input file::
-
- chr1 210 211 38 4 15 56 0 1 x
- chr1 228 229 55 0 2 56 0 1 x
- chr1 230 231 46 0 11 55 0 2 x
- chr1 234 235 43 0 14 55 0 2 x
- chr1 236 237 55 0 2 13 10 34 x
- chr1 437 438 55 0 2 46 0 11 x
- chr1 439 440 56 0 1 55 0 2 x
- chr1 449 450 56 0 1 13 20 24 x
- chr1 518 519 56 0 1 38 4 15 x
-
-Here the group 1 genotype counts are in columns 4 - 6, while those
-for group 2 are in columns 7 - 9.
-
-Note that the "x" column has no meaning. It was added to this example
-to show that extra columns can be included, and to make it easier
-to see where the new columns are appended in the output.
-
-- output file::
-
- chr1 210 211 38 4 15 56 0 1 x 47 2 8 47 2 8 1.50219088598917e-05 6.32501425679652e-06
- chr1 228 229 55 0 2 56 0 1 x 55.5 0 1.5 55.5 0 1.5 1 0.210526315789474
- chr1 230 231 46 0 11 55 0 2 x 50.5 0 6.5 50.5 0 6.5 0.0155644201009862 0.00409590002657532
- chr1 234 235 43 0 14 55 0 2 x 49 0 8 49 0 8 0.00210854461554067 0.000739840215979182
- chr1 236 237 55 0 2 13 10 34 x 34 5 18 34 5 18 6.14613878554783e-17 4.31307984950725e-17
- chr1 437 438 55 0 2 46 0 11 x 50.5 0 6.5 50.5 0 6.5 0.0155644201009862 0.00409590002657532
- chr1 439 440 56 0 1 55 0 2 x 55.5 0 1.5 55.5 0 1.5 1 0.210526315789474
- chr1 449 450 56 0 1 13 20 24 x 34.5 10 12.5 34.5 10 12.5 2.25757007974134e-18 2.37638955762246e-18
- chr1 518 519 56 0 1 38 4 15 x 47 2 8 47 2 8 1.50219088598917e-05 6.32501425679652e-06
-
- </help>
-</tool>
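
The removed tool's test is a per-SNP contingency-table comparison. A rough
Python analogue for one SNP (not from this changeset: scipy's fisher_exact
handles only 2x2 tables, so a chi-square test stands in for R's exact test
on the 2x3 genotype table)::

    # Test one SNP's genotype counts (group 1 vs. group 2), using the first
    # example row above: "chr1 210 211 38 4 15 56 0 1 x".
    from scipy.stats import chi2_contingency

    table = [[38, 4, 15],    # group 1: genotype 1, 2, 3 counts
             [56, 0, 1]]     # group 2: genotype 1, 2, 3 counts

    chi2, p_value, dof, expected = chi2_contingency(table)
    print("p = %.3g" % p_value)
    print("expected counts:", expected.round(3))  # the six appended columns
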
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/phenotype_association/snpFreq2.pl
--- a/tools/phenotype_association/snpFreq2.pl
+++ /dev/null
@@ -1,196 +0,0 @@
-#!/usr/bin/env perl
-
-use strict;
-use warnings;
-
-#using large SNP tables (~1G) may require large memory ~15G in R
-
-#expected input: path to file, cols of counts (2 sets of 3), threshold
-if (!@ARGV or scalar @ARGV != 11) {
- if (!@ARGV or scalar @ARGV != 6) { #snpTable usage
- print "usage for tab separated allele counts\n",
- "snpFreq.pl inputType #threshold /path/to/snps.txt outfile <6 column numbers(1 based) with counts for alleles, first one group then another>\n";
- print "OR for SNP tables\n";
- print "usage snpFreq.pl inputType #threshold /path/to/snpTable.txt outfile group1File group2File\n";
- exit 1;
- }
-}
-
-#get and verify inputs
-my ($file, $a1, $a2, $a3, $b1, $b2, $b3, $thresh, $outfile);
-if ($ARGV[0] eq 'tab') {
- shift @ARGV;
- $thresh = shift @ARGV;
- if ($thresh !~ /^\d*\.?\d+$/) {
- print "Error the threshold must be a number. Got $thresh\n";
- exit 1;
- }elsif ($thresh > .3) {
- print "Error the threshold can not be greater than 0.3 got $thresh\n";
- exit 1;
- }
- $file = shift @ARGV;
- $outfile = shift @ARGV;
- $a1 = shift @ARGV;
- if ($a1 =~ /\D/ or $a1 < 1) {
- print "Error the column number, must be an integer greater than or equal to 1. Got $a1\n";
- exit 1;
- }
- $a2 = shift @ARGV;
- if ($a2 =~ /\D/ or $a2 < 1) {
- print "Error the column number, must be an integer greater than or equal to 1. Got $a2\n";
- exit 1;
- }
- $a3 = shift @ARGV;
- if ($a3 =~ /\D/ or $a3 < 1) {
- print "Error the column number, must be an integer greater than or equal to 1. Got $a3\n";
- exit 1;
- }
- $b1 = shift @ARGV;
- if ($b1 =~ /\D/ or $b1 < 1) {
- print "Error the column number, must be an integer greater than or equal to 1. Got $b1\n";
- exit 1;
- }
- $b2 = shift @ARGV;
- if ($b2 =~ /\D/ or $b2 < 1) {
- print "Error the column number, must be an integer greater than or equal to 1. Got $b2\n";
- exit 1;
- }
- $b3 = shift @ARGV;
- if ($b3 =~ /\D/ or $b3 < 1) {
- print "Error the column number, must be an integer greater than or equal to 1. Got $b3\n";
- exit 1;
- }
-}else { #snp table convert and assign variables
- #snpTable.txt #threshold outfile workingdir
- shift @ARGV;
- $thresh = shift @ARGV;
- if ($thresh !~ /^\d*\.?\d+$/) {
- print "Error the threshold must be a number. Got $thresh\n";
- exit 1;
- }elsif ($thresh > .3) {
- print "Error the threshold can not be greater than 0.3 got $thresh\n";
- exit 1;
- }
- $file = shift @ARGV;
- $outfile = shift @ARGV;
- my $grpFile = shift @ARGV;
- my @g1;
- open(FH, $grpFile) or die "Couldn't open $grpFile, $!\n";
- while (<FH>) {
- chomp;
- if (/^(\d+)\s/) { push(@g1, $1); }
- }
- close FH or die "Couldn't close $grpFile, $!\n";
- $grpFile = shift @ARGV;
- my @g2;
- open(FH, $grpFile) or die "Couldn't open $grpFile, $!\n";
- while (<FH>) {
- chomp;
- if (/^(\d+)\s/) { push(@g2, $1); }
- }
- close FH or die "Couldn't close $grpFile, $!\n";
- if ($file =~ /.gz$/) {
- open(FH, "zcat $file |") or die "Couldn't read $file, $!\n";
- }else {
- open(FH, $file) or die "Couldn't read $file, $!\n";
- }
- open(OUT, ">", "snpTable.txt") or die "Couldn't open snpTable.txt, $!\n";
- my $size;
- while (<FH>) {
- chomp;
- if (/^#/) { next; } #header
- my @f = split(/\t/);
- $size = scalar @f;
- my @gc1 = (0, 0, 0);
- my @gc2 = (0, 0, 0);
- foreach my $g (@g1) {
- my $i = $g+1; #g is 1 based first col want 0 based snp call column
- if ($i > $#f) { die "ERROR looking for index $i which is greater than the list $#f\n"; }
- if ($f[$i] == -1 or $f[$i] == 2) { #treat unknown as ref
- $gc1[0]++;
- }elsif ($f[$i] == 1) {
- $gc1[2]++;
- }elsif ($f[$i] == 0) {
- $gc1[1]++;
- }else { die "Unexpected value for genotype $f[$i] in ", join(" ", @f), "\n"; }
- }
- foreach my $g (@g2) {
- my $i = $g+1; #g is 1 based first col want 0 based snp call column
- if ($f[$i] == -1 or $f[$i] == 2) { #treat unknown as ref
- $gc2[0]++;
- }elsif ($f[$i] == 1) {
- $gc2[2]++;
- }elsif ($f[$i] == 0) {
- $gc2[1]++;
- }else { die "Unexpected value for genotype $f[$i] in ", join(" ", @f), "\n"; }
- }
- print OUT join("\t", @f), "\t", join("\t", @gc1), "\t", join("\t", @gc2),
- "\n";
- }
- close FH or die "Couldn't close $file, $!\n";
- close OUT or die "Couldn't close snpTable.txt, $!\n";
- my $i = $size + 1; #next 1 based column after input data
- $a1 = $i++;
- $a2 = $i++;
- $a3 = $i++;
- $b1 = $i++;
- $b2 = $i++;
- $b3 = $i++;
- $file = "snpTable.txt";
-}
-
-#run a fishers exact test (using R) on whole table
-my $cmd = qq|options(warn=-1)
- tab <- read.table('$file', sep="\t")
- size <- length(tab[,1])
- width <- length(tab[1,])
- x <- 1:size
- y <- matrix(data=0, nr=size, nc=6)
- for(i in 1:size) {
- m <- matrix(c(tab[i,$a1], tab[i,$b1], tab[i,$a2], tab[i,$b2], tab[i,$a3], tab[i,$b3]), nrow=2)
- t <- fisher.test(m)
- x[i] <- t\$p.value
- if (x[i] >= 1) {
- x[i] <- .999
- }
- n <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3] + tab[i,$b1] + tab[i,$b2] + tab[i,$b3])
- n_a <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3])
- y[i,1] <- ((tab[i,$a1] + tab[i,$b1])*(n_a))/n
- y[i,1] <- round(y[i,1],3)
- y[i,2] <- ((tab[i,$a2] + tab[i,$b2])*(n_a))/n
- y[i,2] <- round(y[i,2],3)
- y[i,3] <- ((tab[i,$a3] + tab[i,$b3])*(n_a))/n
- y[i,3] <- round(y[i,3],3)
- n_b <- (tab[i,$b1] + tab[i,$b2] + tab[i,$b3])
- y[i,4] <- ((tab[i,$a1] + tab[i,$b1])*(n_b))/n
- y[i,4] <- round(y[i,4],3)
- y[i,5] <- ((tab[i,$a2] + tab[i,$b2])*(n_b))/n
- y[i,5] <- round(y[i,5],3)
- y[i,6] <- ((tab[i,$a3] + tab[i,$b3])*(n_b))/n
- y[i,6] <- round(y[i,6],3)
- }|;
- #results <- data.frame(tab[1:size,1:width], x[1:size])
- #write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t")
- #q()|;
-
-#my $cmd2 = qq|suppressPackageStartupMessages(library(lib.loc='/afs/bx.psu.edu/home/giardine/lib/R', qvalue))
-my $cmd2 = qq|suppressPackageStartupMessages(library(qvalue))
- qobj <- qvalue(x[1:size], lambda=seq(0,0.90,$thresh), pi0.method="bootstrap", fdr.level=0.1, robust=FALSE, smooth.log.pi0 = FALSE)
- q <- qobj\$qvalues
- results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size], q[1:size])
- write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t")
- q()|;
-
-#for TESTING
-my $pr = qq|results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size])
- write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t")
- q()|;
-
-open(FT, "| R --slave --vanilla")
- or die "Couldn't call fisher.text, $!\n";
-print FT $cmd, "\n"; #fisher test
-print FT $cmd2, "\n"; #qvalues and results
-#print FT $pr, "\n";
-close FT or die "Couldn't finish fisher.test, $!\n";
-
-exit;
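
The script above hands its per-SNP p-values to R's qvalue package. A hedged
Python analogue (Benjamini-Hochberg FDR via statsmodels rather than the
Storey q-value estimator, so the corrected values will differ)::

    # FDR-adjust the p-values from the snpFreq example output above.
    from statsmodels.stats.multitest import multipletests

    p_values = [1.50219e-05, 1.0, 0.0155644, 0.00210854, 6.14614e-17,
                0.0155644, 1.0, 2.25757e-18, 1.50219e-05]
    reject, q_values, _, _ = multipletests(p_values, alpha=0.1, method="fdr_bh")
    for p, q, sig in zip(p_values, q_values, reject):
        print("p=%-10.3g q=%-10.3g significant=%s" % (p, q, sig))
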
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/phenotype_association/vcf2pgSnp.pl
--- a/tools/phenotype_association/vcf2pgSnp.pl
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-#convert from a vcf file to a pgSnp file.
-#frequency count = chromosome count
-#either a single column/individual
-#or all columns as a population
-
-my $in;
-my $stCol = 9;
-my $endCol;
-if (@ARGV && scalar @ARGV == 2) {
- $stCol = shift @ARGV;
- $in = shift @ARGV;
- if ($stCol eq 'all') { $stCol = 10; }
- else { $endCol = $stCol - 1; } #zero based as well, so only the one column is read
- $stCol--; #go from 1 based to zero based column number
- if ($stCol < 9) {
- print "ERROR genotype fields don't start until column 10\n";
- exit;
- }
-}elsif (@ARGV && scalar @ARGV == 1) {
- $in = shift @ARGV;
-}elsif (@ARGV) {
- print "usage: vcf2pgSnp.pl [indColNum default=all] file.vcf > file.pgSnp\n";
- exit;
-}
-
-open(FH, $in) or die "Couldn't open $in, $!\n";
-while (<FH>) {
- chomp;
- if (/^\s*#/) { next; } #skip comments/headers
- if (/^\s*$/) { next; } #skip blank lines
- my @f = split(/\t/);
- #chr pos1base ID refNt altNt[,|D#|Int] quality filter info format geno1 ...
- my $a;
- my %nt;
- my %all;
- my $cnt = 0;
- my $var;
- if ($f[3] eq 'N') { next; } #ignore ref=N
- if ($f[4] =~ /[DI]/ or $f[3] =~ /[DI]/) { next; } #don't do microsatellite
- #if ($f[4] =~ /[ACTG],[ACTG]/) { next; } #only do positions with single alternate
- if ($f[6] && !($f[6] eq '.' or $f[6] eq 'PASS')) { next; } #filtered for some reason
- my $ind = 0;
- if ($f[8] ne 'GT') { #more than just genotype
- my @t = split(/:/, $f[8]);
- foreach (@t) { if ($_ eq 'GT') { last; } $ind++; }
- if ($ind == 0 && $f[8] !~ /^GT/) { die "ERROR couldn't find genotype in format $f[8]\n"; }
- }
- #count 0's, 1's, 2's
- if (!$endCol) { $endCol = $#f; }
- foreach my $col ($stCol .. $endCol) {
- if ($ind > 0) {
- my @t = split(/:/, $f[$col]);
- $f[$col] = $t[$ind] . ":"; #only keep genotype part
- }
- if ($f[$col] =~ /^(0|1|2).(0|1|2)/) {
- $nt{$1}++;
- $nt{$2}++;
- }elsif ($f[$col] =~ /^(0|1|2):/) { #chrY or male chrX, single
- $nt{$1}++;
- } #else ignore
- }
- if (%nt) {
- if ($f[0] !~ /chr/) { $f[0] = "chr$f[0]"; }
- print "$f[0]\t", ($f[1]-1), "\t$f[1]\t"; #position info
- my $cnt = scalar(keys %nt);
- my $fr;
- my $sc;
- my $all;
- if (exists $nt{0}) {
- $all = uc($f[3]);
- $fr = $nt{0};
- $sc = 0;
- }
- if (!exists $nt{0} && exists $nt{1}) {
- if ($f[4] =~ /([ACTG]),?/) {
- $all = $1;
- $fr = $nt{1};
- $sc = 0;
- }else { die "bad variant nt $f[4] for nt 1"; }
- }elsif (exists $nt{1}) {
- if ($f[4] =~ /([ACTG]),?/) {
- $all .= '/' . $1;
- $fr .= ",$nt{1}";
- $sc .= ",0";
- }else { die "bad variant nt $f[4] for nt 1"; }
- }
- if (exists $nt{2}) {
- if ($f[4] =~ /^[ACTG],([ACTG]),?/) {
- $all .= '/' . $1;
- $fr .= ",$nt{2}";
- $sc .= ",0";
- }else { die "bad variant nt $f[4] for nt 2"; }
- }
- if (exists $nt{3}) {
- if ($f[4] =~ /^[ACTG],[ACTG],([ACTG])/) {
- $all .= '/' . $1;
- $fr .= ",$nt{3}";
- $sc .= ",0";
- }else { die "bad variant nt $f[4] for nt 3"; }
- }
- if (exists $nt{4}) {
- if ($f[4] =~ /^[ACTG],[ACTG],[ACTG],([ACTG])/) {
- $all .= '/' . $1;
- $fr .= ",$nt{4}";
- $sc .= ",0";
- }else { die "bad variant nt $f[4] for nt 4"; }
- }
- print "$all\t$cnt\t$fr\t$sc\n";
- }
-}
-close FH;
-
-exit;
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/phenotype_association/vcf2pgSnp.xml
--- a/tools/phenotype_association/vcf2pgSnp.xml
+++ /dev/null
@@ -1,79 +0,0 @@
-<tool id="vcf2pgSnp" name="VCF to pgSnp" hidden="false">
- <description>Convert from VCF to pgSnp format</description>
- <command interpreter="perl">
- #if $inType.how == "all" #vcf2pgSnp.pl all $input1 > $out_file1
- #else #vcf2pgSnp.pl $inType.ind_column $input1 > $out_file1
- #end if
- </command>
- <inputs>
- <param format="vcf" name="input1" type="data" label="VCF dataset" />
- <conditional name="inType">
- <param name="how" type="select" label="How to treat individuals">
- <option value="all">Group all as a population</option>
- <option value="one">Do just one individual</option>
- </param>
- <when value="one">
- <param name="ind_column" type="data_column" data_ref="input1" label="Column to convert" value="10" />
- </when>
- <when value="all">
- <!-- do nothing -->
- </when>
- </conditional>
- </inputs>
- <outputs>
- <data format="interval" name="out_file1" />
- </outputs>
- <tests>
- <test>
- <param name="input1" value="vcf2pgSnp_input.vcf" ftype="vcf" />
- <param name="how" value="all" />
- <output name="output" file="vcf2pgSnp_output.pgSnp" />
- </test>
- </tests>
-
- <help>
-**Dataset formats**
-
-The input dataset is VCF_ format.
-The output dataset is pgSnp_. (`Dataset missing?`_)
-
-.. _Dataset missing?: ./static/formatHelp.html
-.. _VCF: ./static/formatHelp.html#vcf
-.. _pgSnp: ./static/formatHelp.html#pgSnp
-
------
-
-**What it does**
-
-This tool converts a VCF dataset to pgSnp format, with the frequency counts
-taken as chromosome counts. If there is more than one genotype column, it
-either accumulates all columns as a single population or converts only the
-indicated column to pgSnp.
-
------
-
-**Examples**
-
-- input::
-
- 1 13327 rs144762171 G C 100 PASS VT=SNP;SNPSOURCE=LOWCOV GT:DS:GL 0|0:0.000:-0.03,-1.11,-5.00 0|1:1.000:-1.97,-0.01,-2.51 0|0:0.050:-0.01,-1.69,-5.00 0|0:0.100:-0.48,-0.48,-0.48
- 1 13980 rs151276478 T C 100 PASS VT=SNP;SNPSOURCE=LOWCOV GT:DS:GL 0|0:0.100:-0.48,-0.48,-0.48 0|1:0.950:-0.48,-0.48,-0.48 0|0:0.050:-0.48,-0.48,-0.48 0|0:0.050:-0.48,-0.48,-0.48
- 1 30923 rs140337953 G T 100 PASS VT=SNP;SNPSOURCE=LOWCOV GT:DS:GL 1|1:1.950:-5.00,-0.61,-0.12 0|0:0.450:-0.10,-0.69,-2.81 0|0:0.450:-0.11,-0.64,-3.49 1|1:1.500:-0.48,-0.48,-0.48
- etc.
-
-- output as a population::
-
- chr1 13326 13327 G/C 2 7,1 0,0
- chr1 13979 13980 T/C 2 7,1 0,0
- chr1 30922 30923 G/T 2 4,4 0,0
- etc.
-
-- output for each column separately::
-
- chr1 13326 13327 G 1 2 0 G/C 2 1,1 0,0 G 1 2 0 G 1 2 0
- chr1 13979 13980 T 1 2 0 T/C 2 1,1 0,0 T 1 2 0 T 1 2 0
- chr1 30922 30923 T 1 2 0 G 1 2 0 G 1 2 0 T 1 2 0
- etc.
-
-</help>
-</tool>
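
For reference, the core of the removed converter is a tally of GT allele
codes across a record's genotype columns. A simplified Python sketch (not
part of the changeset; it assumes GT is the first sub-field and skips the
filtering and multi-allelic ordering details the Perl handles)::

    import re
    from collections import Counter

    def vcf_record_to_pgsnp(line):
        """Treat all samples as one population (chromosome counts)."""
        f = line.rstrip("\n").split("\t")
        chrom, pos, ref, alt = f[0], int(f[1]), f[3], f[4].split(",")
        alleles = [ref] + alt                # code 0 = ref, 1.. = alternates
        counts = Counter()
        for sample in f[9:]:
            gt = sample.split(":")[0]        # GT assumed first sub-field
            for code in re.split(r"[|/]", gt):
                if code.isdigit():
                    counts[int(code)] += 1
        seen = sorted(counts)                # allele codes actually observed
        if not chrom.startswith("chr"):
            chrom = "chr" + chrom
        return "\t".join([chrom, str(pos - 1), str(pos),
                          "/".join(alleles[i] for i in seen),
                          str(len(seen)),
                          ",".join(str(counts[i]) for i in seen),
                          ",".join("0" for _ in seen)])

    rec = ("1\t13327\trs144762171\tG\tC\t100\tPASS\tVT=SNP\tGT:DS:GL\t"
           "0|0:0:0\t0|1:1:0\t0|0:0:0\t0|0:0:0")
    print(vcf_record_to_pgsnp(rec))  # chr1  13326  13327  G/C  2  7,1  0,0
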
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/plotting/histogram.py
--- a/tools/plotting/histogram.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/usr/bin/env python
-#Greg Von Kuster
-
-import sys
-from rpy import *
-
-assert sys.version_info[:2] >= ( 2, 4 )
-
-def stop_err(msg):
- sys.stderr.write(msg)
- sys.exit()
-
-def main():
-
- # Handle input params
- in_fname = sys.argv[1]
- out_fname = sys.argv[2]
- try:
- column = int( sys.argv[3] ) - 1
- except:
- stop_err( "Column not specified, your query does not contain a column of numerical data." )
- title = sys.argv[4]
- xlab = sys.argv[5]
- breaks = int( sys.argv[6] )
- if breaks == 0:
- breaks = "Sturges"
- if sys.argv[7] == "true":
- density = True
- else: density = False
- if len( sys.argv ) >= 9 and sys.argv[8] == "true":
- frequency = True
- else: frequency = False
-
- matrix = []
- skipped_lines = 0
- first_invalid_line = 0
- invalid_value = ''
- i = 0
- for i, line in enumerate( file( in_fname ) ):
- valid = True
- line = line.rstrip('\r\n')
- # Skip comments
- if line and not line.startswith( '#' ):
- # Extract values and convert to floats
- row = []
- try:
- fields = line.split( "\t" )
- val = fields[column]
- if val.lower() == "na":
- row.append( float( "nan" ) )
- except:
- valid = False
- skipped_lines += 1
- if not first_invalid_line:
- first_invalid_line = i+1
- else:
- try:
- row.append( float( val ) )
- except ValueError:
- valid = False
- skipped_lines += 1
- if not first_invalid_line:
- first_invalid_line = i+1
- invalid_value = fields[column]
- else:
- valid = False
- skipped_lines += 1
- if not first_invalid_line:
- first_invalid_line = i+1
-
- if valid:
- matrix += row
-
- if skipped_lines < i:
- try:
- a = r.array( matrix )
- r.pdf( out_fname, 8, 8 )
- histogram = r.hist( a, probability=not frequency, main=title, xlab=xlab, breaks=breaks )
- if density:
- density = r.density( a )
- if frequency:
- scale_factor = len( matrix ) * ( histogram['mids'][1] - histogram['mids'][0] ) #uniform bandwidth taken from first 2 midpoints
- density[ 'y' ] = map( lambda x: x * scale_factor, density[ 'y' ] )
- r.lines( density )
- r.dev_off()
- except Exception, exc:
- stop_err( "%s" %str( exc ) )
- else:
- if i == 0:
- stop_err("Input dataset is empty.")
- else:
- stop_err( "All values in column %s are non-numeric." %sys.argv[3] )
-
- print "Histogram of column %s. " %sys.argv[3]
- if skipped_lines > 0:
- print "Skipped %d invalid lines starting with line #%d, '%s'." % ( skipped_lines, first_invalid_line, invalid_value )
-
- r.quit( save="no" )
-
-if __name__ == "__main__":
- main()
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/plotting/histogram2.xml
--- a/tools/plotting/histogram2.xml
+++ /dev/null
@@ -1,77 +0,0 @@
-<tool id="histogram_rpy" name="Histogram" version="1.0.3">
- <description>of a numeric column</description>
- <command interpreter="python">histogram.py $input $out_file1 $numerical_column "$title" "$xlab" $breaks $density $frequency</command>
- <inputs>
- <param name="input" type="data" format="tabular" label="Dataset" help="Dataset missing? See TIP below"/>
- <param name="numerical_column" type="data_column" data_ref="input" numerical="True" label="Numerical column for x axis" />
- <param name="breaks" type="integer" size="4" value="0" label="Number of breaks (bars)"/>
- <param name="title" type="text" size="30" value="Histogram" label="Plot title"/>
- <param name="xlab" type="text" size="30" value="V1" label="Label for x axis"/>
- <param name="density" type="boolean" checked="yes" label="Include smoothed density"/>
- <param name="frequency" type="boolean" checked="no" label="Plot as frequency (counts)"/>
- </inputs>
- <outputs>
- <data format="pdf" name="out_file1" />
- </outputs>
- <tests>
- <test>
- <param name="input" value="histogram_in1.tabular" ftype="tabular"/>
- <param name="numerical_column" value="2"/>
- <param name="breaks" value="0"/>
- <param name="title" value="Histogram"/>
- <param name="xlab" value="V1"/>
- <param name="density" value="true"/>
- <param name="frequency" value="false"/>
- <output name="out_file1" file="histogram_out1.pdf"/>
- </test>
- </tests>
- <requirements>
- <requirement type="python-module">rpy</requirement>
- <requirement type="package">R</requirement>
- </requirements>
- <help>
-
-.. class:: infomark
-
-**TIP:** To remove comment lines that do not begin with a *#* character, use *Text Manipulation->Remove beginning*
-
-.. class:: infomark
-
-**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
-
------
-
-**Syntax**
-
-This tool computes a histogram of the numerical values in a column of a dataset.
-
-- All invalid, blank and comment lines in the dataset are skipped. The number of skipped lines is displayed in the resulting history item.
-- **Column for x axis** - only numerical columns are possible.
-- **Number of breaks (bars)** - breakpoints between histogram cells. A value of '0' determines the breaks automatically (R's Sturges rule).
-- **Plot title** - the histogram title.
-- **Label for x axis** - the label of the x axis for the histogram.
-- **Include smoothed density** - if checked, a smoothed density curve is overlaid on the histogram.
-
------
-
-**Example**
-
-- Input file::
-
- 1 68 4.1
- 2 71 4.6
- 3 62 3.8
- 4 75 4.4
- 5 58 3.2
- 6 60 3.1
- 7 67 3.8
- 8 68 4.1
- 9 71 4.3
- 10 69 3.7
-
-- Create a histogram on column 2 of the above dataset.
-
-.. image:: ${static_path}/images/histogram2.png
-
-</help>
-</tool>
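
The removed implementation drives R through rpy. A present-day sketch of
the same plot in matplotlib (an illustration, not the tool's code; the
density overlay uses scipy's gaussian_kde in place of R's density())::

    import numpy as np
    import matplotlib
    matplotlib.use("Agg")                    # render to file, no display
    import matplotlib.pyplot as plt
    from scipy.stats import gaussian_kde

    # column 2 of the example input above
    values = np.array([68, 71, 62, 75, 58, 60, 67, 68, 71, 69], dtype=float)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.hist(values, bins="sturges", density=True, edgecolor="black")
    ax.set_title("Histogram")
    ax.set_xlabel("V1")

    xs = np.linspace(values.min(), values.max(), 200)
    ax.plot(xs, gaussian_kde(values)(xs))    # the "Include smoothed density" option
    fig.savefig("histogram.pdf")
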
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/plotting/scatterplot.py
--- a/tools/plotting/scatterplot.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/usr/bin/env python
-#Greg Von Kuster
-
-import sys
-from rpy import *
-
-def stop_err(msg):
- sys.stderr.write(msg)
- sys.exit()
-
-def main():
-
- in_fname = sys.argv[1]
- out_fname = sys.argv[2]
- try:
- columns = int( sys.argv[3] ) - 1, int( sys.argv[4] ) - 1
- except:
- stop_err( "Columns not specified, your query does not contain a column of numerical data." )
- title = sys.argv[5]
- xlab = sys.argv[6]
- ylab = sys.argv[7]
-
- matrix = []
- skipped_lines = 0
- first_invalid_line = 0
- invalid_value = ''
- invalid_column = 0
- i = 0
- for i, line in enumerate( file( in_fname ) ):
- valid = True
- line = line.rstrip( '\r\n' )
- if line and not line.startswith( '#' ):
- row = []
- fields = line.split( "\t" )
- for column in columns:
- try:
- val = fields[column]
- if val.lower() == "na":
- row.append( float( "nan" ) )
- else:
- row.append( float( fields[column] ) )
- except:
- valid = False
- skipped_lines += 1
- if not first_invalid_line:
- first_invalid_line = i + 1
- try:
- invalid_value = fields[column]
- except:
- invalid_value = ''
- invalid_column = column + 1
- break
- else:
- valid = False
- skipped_lines += 1
- if not first_invalid_line:
- first_invalid_line = i+1
-
- if valid:
- matrix.append( row )
-
- if skipped_lines < i:
- try:
- r.pdf( out_fname, 8, 8 )
- r.plot( array( matrix ), type="p", main=title, xlab=xlab, ylab=ylab, col="blue", pch=19 )
- r.dev_off()
- except Exception, exc:
- stop_err( "%s" %str( exc ) )
- else:
- stop_err( "All values in both columns %s and %s are non-numeric or empty." % ( sys.argv[3], sys.argv[4] ) )
-
- print "Scatter plot on columns %s, %s. " % ( sys.argv[3], sys.argv[4] )
- if skipped_lines > 0:
- print "Skipped %d lines starting with line #%d, value '%s' in column %d is not numeric." % ( skipped_lines, first_invalid_line, invalid_value, invalid_column )
-
- r.quit( save="no" )
-
-if __name__ == "__main__":
- main()
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/plotting/scatterplot.xml
--- a/tools/plotting/scatterplot.xml
+++ /dev/null
@@ -1,71 +0,0 @@
-<tool id="scatterplot_rpy" name="Scatterplot">
- <description>of two numeric columns</description>
- <command interpreter="python">scatterplot.py $input $out_file1 $col1 $col2 "$title" "$xlab" "$ylab"</command>
- <inputs>
- <param name="input" type="data" format="tabular" label="Dataset" help="Dataset missing? See TIP below"/>
- <param name="col1" type="data_column" data_ref="input" numerical="True" label="Numerical column for x axis" />
- <param name="col2" type="data_column" data_ref="input" numerical="True" label="Numerical column for y axis" />
- <param name="title" size="30" type="text" value="Scatterplot" label="Plot title"/>
- <param name="xlab" size="30" type="text" value="V1" label="Label for x axis"/>
- <param name="ylab" size="30" type="text" value="V2" label="Label for y axis"/>
- </inputs>
- <outputs>
- <data format="pdf" name="out_file1" />
- </outputs>
- <requirements>
- <requirement type="python-module">rpy</requirement>
- </requirements>
- <!-- TODO: uncomment the following test when we have tools.update_state() working for
- multiple dependents with the same dependency.
- <tests>
- <test>
- <param name="input" value="scatterplot_in1.tabular" ftype="tabular"/>
- <param name="col1" value="2"/>
- <param name="col2" value="3"/>
- <param name="title" value="Scatterplot"/>
- <param name="xlab" value="V1"/>
- <param name="ylab" value="V2"/>
- <output name="out_file1" file="scatterplot_out1.pdf" />
- </test>
- </tests>
- -->
- <help>
-
-.. class:: infomark
-
-**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
-
------
-
-**Syntax**
-
-This tool creates a simple scatter plot of two numeric columns from the selected dataset.
-
-- All invalid, blank and comment lines in the dataset are skipped. The number of skipped lines is displayed in the resulting history item.
-
-- **Plot title** The scatterplot title
-- **Label for x axis** and **Label for y axis** The labels for x and y axis of the scatterplot.
-
------
-
-**Example**
-
-- Input file::
-
- 1 68 4.1
- 2 71 4.6
- 3 62 3.8
- 4 75 4.4
- 5 58 3.2
- 6 60 3.1
- 7 67 3.8
- 8 68 4.1
- 9 71 4.3
- 10 69 3.7
-
-- Create a simple scatterplot between the variables in column 2 and column 3 of the above dataset.
-
-.. image:: ${static_path}/images/scatterplot.png
-
-</help>
-</tool>
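
As with the histogram tool, the rpy-based plotting goes away here. A
minimal matplotlib equivalent of the example (illustration only)::

    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    # columns 2 and 3 of the example input above
    x = [68, 71, 62, 75, 58, 60, 67, 68, 71, 69]
    y = [4.1, 4.6, 3.8, 4.4, 3.2, 3.1, 3.8, 4.1, 4.3, 3.7]

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.scatter(x, y, color="blue")           # pch=19 ~ filled circles
    ax.set(title="Scatterplot", xlabel="V1", ylabel="V2")
    fig.savefig("scatterplot.pdf")
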
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/MINE.xml
--- a/tools/stats/MINE.xml
+++ /dev/null
@@ -1,82 +0,0 @@
-<tool id="maximal_information_based_nonparametric_exploration" name="MINE" version="0.0.1">
- <description>- Maximal Information-based Nonparametric Exploration</description>
- <requirements>
- <requirement type="package" version="1.0">MINE</requirement>
- </requirements>
- <command interpreter="python">mine_wrapper.py
- --jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/mine/MINE.jar"
-
- --infile "${input_file}"
-
- #if str( $master_variable_type.master_variable_type_selector ) in [ 'allPairs', 'adjacentPairs' ]:
- --master_variable "${master_variable_type.master_variable_type_selector}"
- #else:
- --master_variable "${master_variable_type.master_variable}"
- #end if
-
- --cv "${cv}"
-
- --exp "${exp}"
-
- --c "${c}"
-
- ##--gc ##skip
-
-
- #if str( $master_variable_type.master_variable_type_selector ) != 'allPairs' and $master_variable_type.permute:
- --permute
- #end if
-
- --output_results "${output_results}"
-
- --output_log "${output_log}"
- </command>
- <inputs>
- <param name="input_file" type="data" format="csv" label="CSV file" />
-
- <conditional name="master_variable_type">
- <param name="master_variable_type_selector" type="select" label="Choose the master variable type">
- <option value="allPairs">allPairs</option>
- <option value="adjacentPairs">adjacentPairs</option>
- <option value="compare_against_ith" selected="True">compare against i-th</option>
- </param>
- <when value="compare_against_ith">
- <param type="integer" value="0" name="master_variable" />
- <param type="boolean" truevalue="--permute" false_value="" name="permute" checked="False" />
- </when>
- <when value="adjacentPairs">
- <param type="boolean" truevalue="--permute" false_value="" name="permute" checked="False" />
- </when>
- </conditional>
-
- <param type="float" value="0" name="cv" />
-
- <param type="float" value="0.6" name="exp" />
-
- <param type="float" value="15" name="c" />
-
- </inputs>
- <outputs>
- <data format="csv" name="output_results" label="${tool.name} on ${on_string} (Results)" />
- <data format="txt" name="output_log" label="${tool.name} on ${on_string} (log)" />
- </outputs>
- <tests>
- <!-- TODO -->
- </tests>
- <help>
-**What it does**
-
-Applies the Maximal Information-based Nonparametric Exploration strategy to an input dataset.
-
-See http://www.exploredata.net/ for more information.
-
-------
-
-**Citation**
-
-For the underlying tool, please cite `David N. Reshef, Yakir A. Reshef, Hilary K. Finucane, Sharon R. Grossman, Gilean McVean, Peter J. Turnbaugh, Eric S. Lander, Michael Mitzenmacher, Pardis C. Sabeti. Detecting Novel Associations in Large Data Sets. Science. 2011 Dec. <http://www.sciencemag.org/content/334/6062/1518>`_
-
-If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.*
-
- </help>
-</tool>
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/column_maker.py
--- a/tools/stats/column_maker.py
+++ /dev/null
@@ -1,125 +0,0 @@
-#!/usr/bin/env python
-# This tool takes a tab-delimited textfile as input and creates another column in the file which is the result of
-# a computation performed on every row in the original file. The tool will skip over invalid lines within the file,
-# informing the user about the number of lines skipped.
-
-import sys, re
-# These functions may be used in compute expression:
-from math import log,exp,sqrt,ceil,floor
-
-
-assert sys.version_info[:2] >= ( 2, 4 )
-
-def stop_err( msg ):
- sys.stderr.write( msg )
- sys.exit()
-
-inp_file = sys.argv[1]
-out_file = sys.argv[2]
-expr = sys.argv[3]
-round_result = sys.argv[4]
-try:
- in_columns = int( sys.argv[5] )
-except:
- stop_err( "Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." )
-if in_columns < 2:
- # To be considered tabular, data must fulfill requirements of the sniff.is_column_based() method.
- stop_err( "Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." )
-try:
- in_column_types = sys.argv[6].split( ',' )
-except:
- stop_err( "Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." )
-if len( in_column_types ) != in_columns:
- stop_err( "The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." )
-
-# Unescape if input has been escaped
-mapped_str = {
- '__lt__': '<',
- '__le__': '<=',
- '__eq__': '==',
- '__ne__': '!=',
- '__gt__': '>',
- '__ge__': '>=',
- '__sq__': '\'',
- '__dq__': '"',
-}
-for key, value in mapped_str.items():
- expr = expr.replace( key, value )
-
-operators = 'is|not|or|and'
-builtin_and_math_functions = 'abs|all|any|bin|chr|cmp|complex|divmod|float|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor'
-string_and_list_methods = [ name for name in dir('') + dir([]) if not name.startswith('_') ]
-whitelist = "^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s)*$" % (operators, builtin_and_math_functions, '|'.join(string_and_list_methods))
-if not re.compile(whitelist).match(expr):
- stop_err("Invalid expression")
-
-# Prepare the column variable names and wrappers for column data types
-cols, type_casts = [], []
-for col in range( 1, in_columns + 1 ):
- col_name = "c%d" % col
- cols.append( col_name )
- col_type = in_column_types[ col - 1 ].strip()
- if round_result == 'no' and col_type == 'int':
- col_type = 'float'
- type_cast = "%s(%s)" % ( col_type, col_name )
- type_casts.append( type_cast )
-
-col_str = ', '.join( cols ) # 'c1, c2, c3, c4'
-type_cast_str = ', '.join( type_casts ) # 'str(c1), int(c2), int(c3), str(c4)'
-assign = "%s = line.split( '\\t' )" % col_str
-wrap = "%s = %s" % ( col_str, type_cast_str )
-skipped_lines = 0
-first_invalid_line = 0
-invalid_line = None
-lines_kept = 0
-total_lines = 0
-out = open( out_file, 'wt' )
-
-# Read input file, skipping invalid lines, and perform computation that will result in a new column
-code = '''
-for i, line in enumerate( file( inp_file ) ):
- total_lines += 1
- line = line.rstrip( '\\r\\n' )
- if not line or line.startswith( '#' ):
- skipped_lines += 1
- if not invalid_line:
- first_invalid_line = i + 1
- invalid_line = line
- continue
- try:
- %s
- %s
- new_val = %s
- if round_result == "yes":
- new_val = int( round( new_val ) )
- new_line = line + '\\t' + str( new_val )
- print >> out, new_line
- lines_kept += 1
- except:
- skipped_lines += 1
- if not invalid_line:
- first_invalid_line = i + 1
- invalid_line = line
-''' % ( assign, wrap, expr )
-
-valid_expr = True
-try:
- exec code
-except Exception, e:
- out.close()
- if str( e ).startswith( 'invalid syntax' ):
- valid_expr = False
- stop_err( 'Expression "%s" likely invalid. See tool tips, syntax and examples.' % expr )
- else:
- stop_err( str( e ) )
-
-if valid_expr:
- out.close()
- valid_lines = total_lines - skipped_lines
- print 'Creating column %d with expression %s' % ( in_columns + 1, expr )
- if valid_lines > 0:
- print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
- else:
- print 'Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.' % expr
- if skipped_lines > 0:
- print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/column_maker.xml
--- a/tools/stats/column_maker.xml
+++ /dev/null
@@ -1,83 +0,0 @@
-<tool id="Add_a_column1" name="Compute" version="1.1.0">
- <description>an expression on every row</description>
- <command interpreter="python">
- column_maker.py $input $out_file1 "$cond" $round ${input.metadata.columns} "${input.metadata.column_types}"
- </command>
- <inputs>
- <param name="cond" size="40" type="text" value="c3-c2" label="Add expression"/>
- <param format="tabular" name="input" type="data" label="as a new column to" help="Dataset missing? See TIP below"/>
- <param name="round" type="select" label="Round result?">
- <option value="no">NO</option>
- <option value="yes">YES</option>
- </param>
- </inputs>
- <outputs>
- <data format="input" name="out_file1" metadata_source="input"/>
- </outputs>
- <tests>
- <test>
- <param name="cond" value="c3-c2"/>
- <param name="input" value="1.bed"/>
- <param name="round" value="no"/>
- <output name="out_file1" file="column_maker_out1.interval"/>
- </test>
- <test>
- <param name="cond" value="c4*1"/>
- <param name="input" value="1.interval"/>
- <param name="round" value="no"/>
- <output name="out_file1" file="column_maker_out2.interval"/>
- </test>
- <test>
- <param name="cond" value="c4*1"/>
- <param name="input" value="1.interval"/>
- <param name="round" value="yes"/>
- <output name="out_file1" file="column_maker_out3.interval"/>
- </test>
- </tests>
- <help>
-
-.. class:: infomark
-
-**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
-
------
-
-**What it does**
-
-This tool computes an expression for every row of a dataset and appends the result as a new column (field).
-
-- Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file
-
-- **c3-c2** will add a length column to the dataset if **c2** and **c3** are start and end position
-
------
-
-**Example**
-
-If this is your input::
-
- chr1 151077881 151077918 2 200 -
- chr1 151081985 151082078 3 500 +
-
-computing "c4*c5" will produce::
-
- chr1 151077881 151077918 2 200 - 400.0
- chr1 151081985 151082078 3 500 + 1500.0
-
-if, at the same time, "Round result?" is set to **YES** results will look like this::
-
- chr1 151077881 151077918 2 200 - 400
- chr1 151081985 151082078 3 500 + 1500
-
-You can also use this tool to evaluate expressions. For example, computing "c3>=c2" for Input will result in the following::
-
- chr1 151077881 151077918 2 200 - True
- chr1 151081985 151082078 3 500 + True
-
-or computing "type(c2)==type('') for Input will return::
-
- chr1 151077881 151077918 2 200 - False
- chr1 151081985 151082078 3 500 + False
-
-</help>
-</tool>
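
The help above describes c1..cN expressions evaluated per row. A condensed
sketch of that mechanism (illustration only; the removed column_maker.py
additionally whitelists the expression and casts columns by the dataset's
column_types metadata)::

    # Append eval("c3-c2") to each row, the core of the removed tool.
    rows = ["chr1\t151077881\t151077918\t2\t200\t-",
            "chr1\t151081985\t151082078\t3\t500\t+"]
    expr = "c3-c2"

    for line in rows:
        fields = line.split("\t")
        env = {}                              # bind c1, c2, ... per row
        for i, val in enumerate(fields, start=1):
            try:
                env["c%d" % i] = float(val)
            except ValueError:
                env["c%d" % i] = val
        # eval is unsafe on untrusted input; the real tool whitelists expr
        new_val = eval(expr, {"__builtins__": {}}, env)
        print(line + "\t" + str(new_val))     # appends 37.0 and 93.0
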
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/cor.py
--- a/tools/stats/cor.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/env python
-#Greg Von Kuster
-"""
-Calculate correlations between numeric columns in a tab delim file.
-usage: %prog infile output.txt columns method
-"""
-
-import sys
-from rpy import *
-
-def stop_err(msg):
- sys.stderr.write(msg)
- sys.exit()
-
-def main():
- method = sys.argv[4]
- assert method in ( "pearson", "kendall", "spearman" )
-
- try:
- columns = map( int, sys.argv[3].split( ',' ) )
- except:
- stop_err( "Problem determining columns, perhaps your query does not contain a column of numerical data." )
-
- matrix = []
- skipped_lines = 0
- first_invalid_line = 0
- invalid_value = ''
- invalid_column = 0
-
- for i, line in enumerate( file( sys.argv[1] ) ):
- valid = True
- line = line.rstrip('\n\r')
-
- if line and not line.startswith( '#' ):
- # Extract values and convert to floats
- row = []
- for column in columns:
- column -= 1
- fields = line.split( "\t" )
- if len( fields ) <= column:
- valid = False
- else:
- val = fields[column]
- if val.lower() == "na":
- row.append( float( "nan" ) )
- else:
- try:
- row.append( float( fields[column] ) )
- except:
- valid = False
- skipped_lines += 1
- if not first_invalid_line:
- first_invalid_line = i+1
- invalid_value = fields[column]
- invalid_column = column+1
- else:
- valid = False
- skipped_lines += 1
- if not first_invalid_line:
- first_invalid_line = i+1
-
- if valid:
- matrix.append( row )
-
- if skipped_lines < i:
- try:
- out = open( sys.argv[2], "w" )
- except:
- stop_err( "Unable to open output file" )
-
- # Run correlation
- try:
- value = r.cor( array( matrix ), use="pairwise.complete.obs", method=method )
- except Exception, exc:
- out.close()
- stop_err("%s" %str( exc ))
- for row in value:
- print >> out, "\t".join( map( str, row ) )
- out.close()
-
- if skipped_lines > 0:
- msg = "..Skipped %d lines starting with line #%d. " %( skipped_lines, first_invalid_line )
- if invalid_value and invalid_column > 0:
- msg += "Value '%s' in column %d is not numeric." % ( invalid_value, invalid_column )
- print msg
-
-if __name__ == "__main__":
- main()
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/cor.xml
--- a/tools/stats/cor.xml
+++ /dev/null
@@ -1,101 +0,0 @@
-<tool id="cor2" name="Correlation">
- <description>for numeric columns</description>
- <command interpreter="python">cor.py $input1 $out_file1 $numeric_columns $method</command>
- <inputs>
- <param format="tabular" name="input1" type="data" label="Dataset" help="Dataset missing? See TIP below"/>
- <param name="numeric_columns" label="Numerical columns" type="data_column" numerical="True" multiple="True" data_ref="input1" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" />
- <param name="method" type="select" label="Method">
- <option value="pearson">Pearson</option>
- <option value="kendall">Kendall rank</option>
- <option value="spearman">Spearman rank</option>
- </param>
- </inputs>
- <outputs>
- <data format="txt" name="out_file1" />
- </outputs>
- <requirements>
- <requirement type="python-module">rpy</requirement>
- </requirements>
- <tests>
- <!--
- Test a tabular input with the first line being a comment without a # character to start
- -->
- <test>
- <param name="input1" value="cor.tabular" />
- <param name="numeric_columns" value="2,3" />
- <param name="method" value="pearson" />
- <output name="out_file1" file="cor_out.txt" />
- </test>
- </tests>
- <help>
-
-.. class:: infomark
-
-**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
-
-.. class:: warningmark
-
-Missing data ("nan") removed from each pairwise comparison
-
------
-
-**Syntax**
-
-This tool computes the matrix of correlation coefficients between numeric columns.
-
-- All invalid, blank and comment lines are skipped when performing computations. The number of skipped lines is displayed in the resulting history item.
-
-- **Pearson's Correlation** reflects the degree of linear relationship between two variables. It ranges from +1 to -1. A correlation of +1 means that there is a perfect positive linear relationship between variables. The formula for Pearson's correlation is:
-
- .. image:: ${static_path}/images/pearson.png
-
- where n is the number of items
-
-- **Kendall's rank correlation** is used to measure the degree of correspondence between two rankings and to assess the significance of this correspondence. The formula for Kendall's rank correlation is:
-
- .. image:: ${static_path}/images/kendall.png
-
- where n is the number of items, and P is the sum.
-
-- **Spearman's rank correlation** assesses how well an arbitrary monotonic function could describe the relationship between two variables, without making any assumptions about the frequency distribution of the variables. The formula for Spearman's rank correlation is
-
- .. image:: ${static_path}/images/spearman.png
-
- where D is the difference between the ranks of corresponding values of X and Y, and N is the number of pairs of values.
-
------
-
-**Example**
-
-- Input file::
-
- #Person Height Self Esteem
- 1 68 4.1
- 2 71 4.6
- 3 62 3.8
- 4 75 4.4
- 5 58 3.2
- 6 60 3.1
- 7 67 3.8
- 8 68 4.1
- 9 71 4.3
- 10 69 3.7
- 11 68 3.5
- 12 67 3.2
- 13 63 3.7
- 14 62 3.3
- 15 60 3.4
- 16 63 4.0
- 17 65 4.1
- 18 67 3.8
- 19 63 3.4
- 20 61 3.6
-
-- Computing the correlation coefficients between columns 2 and 3 of the above file (using Pearson's Correlation), the output is::
-
- 1.0 0.730635686279
- 0.730635686279 1.0
-
- So the correlation for our twenty cases is .73, which is a fairly strong positive relationship.
- </help>
-</tool>
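
The correlation matrix the removed cor.py requests from R is easy to
reproduce; a sketch with scipy (illustration only), using the Pearson
example from the help text above::

    from scipy.stats import pearsonr, kendalltau, spearmanr

    height = [68, 71, 62, 75, 58, 60, 67, 68, 71, 69,
              68, 67, 63, 62, 60, 63, 65, 67, 63, 61]
    esteem = [4.1, 4.6, 3.8, 4.4, 3.2, 3.1, 3.8, 4.1, 4.3, 3.7,
              3.5, 3.2, 3.7, 3.3, 3.4, 4.0, 4.1, 3.8, 3.4, 3.6]

    for name, fn in [("pearson", pearsonr), ("kendall", kendalltau),
                     ("spearman", spearmanr)]:
        print("%-8s %.6f" % (name, fn(height, esteem)[0]))
    # pearson prints ~0.730636, matching the help example
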
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/correlation.pl
--- a/tools/stats/correlation.pl
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/usr/bin/perl
-
-###########################################################################
-# Purpose: To calculate the correlation of two sets of scores in one file.
-# Usage: correlation.pl infile.bed output.txt column1 column2
-# (column start from 1)
-# Written by: Yi Zhang (June, 2005)
-###########################################################################
-if (!$ARGV[0] || !$ARGV[1] || !defined($ARGV[2]) || !defined($ARGV[3]) ) {
- print STDERR "Usage: correlation.pl infile.bed output.txt column1 column2\n";
- print STDERR " (column start from 1)\n";
- exit;
-}
-my $file = $ARGV[0];
-my $out = $ARGV[1];
-
-die "<font color=\"yellow\">The input columns contain numerical values: $ARGV[2], $ARGV[3]</font>.\n" if ($ARGV[2] =~ /[a-zA-Z]+/ || $ARGV[3] =~ /[a-zA-Z]+/);
-
-my $col1 = $ARGV[2] - 1;
-my $col2 = $ARGV[3] - 1;
-
-my ($f, $o);
-my (@a, @b);
-
-my $n_t = 0;
-open($f, $file) or die "Could't open $file, $!\n";
-while(<$f>) {
- chomp;
- my @t = split(/\t/);
- if ($n_t == 0) {
- $n_t = scalar(@t) - 1;
- die "<font color=\"yellow\">The input column number exceeds the size of the file: $col1, $col2, $n_t</font>\n" if ( $col1 > $n_t || $col2 > $n_t );
- }
- die "<font color=\"yellow\">The columns you have selected contain non numeric characters:$t[$col1] and $t[$col2] \n</font>" if ($t[$col1] =~ /[a-zA-Z]+/ || $t[$col2] =~ /[a-zA-Z]+/);
- push(@a, $t[$col1]);
- push(@b, $t[$col2]);
-}
-close($f);
-
-my $result = correlation(\@a, \@b);
-
-open($o, ">$out") or die "Couldn't open $out, $!\n";
-$col1 = $col1 + 1;
-$col2 = $col2 + 1;
-print $o "The correlation of column $col1 and $col2 is $result\n";
-close($o);
-print "The correlation of column $col1 and $col2 is $result\n";
-
-sub correlation {
- my ($array1ref, $array2ref) = @_;
- my ($sum1, $sum2);
- my ($sum1_squared, $sum2_squared);
- foreach (@$array1ref) { $sum1 += $_; $sum1_squared += $_**2; }
- foreach (@$array2ref) { $sum2 += $_; $sum2_squared += $_**2; }
- my $numerator = (@$array1ref**2) * covariance($array1ref, $array2ref);
- my $denominator = sqrt(((@$array1ref * $sum1_squared) - ($sum1**2)) *
- ((@$array1ref * $sum2_squared) - ($sum2**2)));
- my $r;
- if ($denominator == 0) {
- print STDERR "The denominator is 0.\n";
- exit 0;
- } else {
- $r = $numerator / $denominator;
- }
- return $r;
-}
-
-sub covariance {
- my ($array1ref, $array2ref) = @_;
- my ($i, $result);
- for ($i = 0; $i < @$array1ref; $i++) {
- $result += $array1ref->[$i] * $array2ref->[$i];
- }
- $result /= @$array1ref;
- $result -= mean($array1ref) * mean($array2ref);
-}
-
-sub mean {
- my ($arrayref) = @_;
- my $result;
- foreach (@$arrayref) { $result += $_; }
- return $result/@$arrayref;
-}
-
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/correlation.xml
--- a/tools/stats/correlation.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<tool id="Pearson_and_apos_Correlation1" name="Pearson and apos Correlation">
- <description>between any two numeric columns</description>
- <command interpreter="perl">correlation.pl $input $out_file1 $col1 $col2</command>
- <inputs>
-<!-- <display>on column $col1 and column $col2 of $input</display> -->
- <param name="col1" size="3" type="text" value="5" label="Correlate data in column"/>
- <param name="col2" size="3" type="text" value="6" label="with data in column"/>
- <param format="txt" name="input" type="data" label="in Query"/>
- </inputs>
- <outputs>
- <data format="txt" name="out_file1" />
- </outputs>
- <help>Computes Pearsons correlation coefficient between any two numerical columns. Column numbers start at 1.
-</help>
-</tool>
diff -r cdc80d206d540234649a7034a70e00cfa676b044 -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 tools/stats/count_gff_features.py
--- a/tools/stats/count_gff_features.py
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/usr/bin/env python
-# This tool takes a gff file as input and counts the number of features in it.
-
-import sys, fileinput
-from galaxy import eggs
-from galaxy.datatypes.util.gff_util import GFFReaderWrapper
-from bx.intervals.io import GenomicInterval
-
-# Get args.
-input_file = sys.argv[1:]
-
-# Count features.
-count = 0
-for feature in GFFReaderWrapper( fileinput.FileInput( input_file ), fix_strand=True ):
- if isinstance( feature, GenomicInterval ):
- count += 1
-
-print count
\ No newline at end of file
This diff is so big that we needed to truncate the remainder.
https://bitbucket.org/galaxy/galaxy-central/commits/799ceff557bb/
Changeset: 799ceff557bb
User: davebgx
Date: 2014-07-28 18:07:07
Summary: Remove references to migrated tools from tool_conf.xml.main
Affected #: 1 file
diff -r 42cc410f02da8ee62dfc3b0de9f7b1c6c2848f99 -r 799ceff557bbbfea80cbfd164a297a0364b0c699 tool_conf.xml.main
--- a/tool_conf.xml.main
+++ b/tool_conf.xml.main
@@ -27,7 +27,6 @@
</section><section id="textutil" name="Text Manipulation"><tool file="filters/fixedValueColumn.xml" />
- <tool file="stats/column_maker.xml" /><tool file="filters/catWrapper.xml" /><tool file="filters/condense_characters.xml" /><tool file="filters/convert_characters.xml" />
@@ -58,7 +57,6 @@
<tool file="stats/filtering.xml" /><tool file="filters/sorter.xml" /><tool file="filters/grep.xml" />
- <tool file="stats/dna_filtering.xml" /><label id="gff" text="GFF" /><tool file="filters/gff/extract_GFF_Features.xml" /><tool file="filters/gff/gff_filter_by_attribute.xml" />
@@ -96,16 +94,9 @@
<section id="stats" name="Statistics"><tool file="stats/gsummary.xml" /><tool file="filters/uniq.xml" />
- <tool file="stats/cor.xml" />
- <tool file="stats/generate_matrix_for_pca_lda.xml" />
- <tool file="stats/lda_analy.xml" />
- <tool file="stats/plot_from_lda.xml" /></section><section id="plots" name="Graph/Display Data">
- <tool file="plotting/histogram2.xml" />
- <tool file="plotting/scatterplot.xml" /><tool file="plotting/boxplot.xml" />
- <tool file="visualization/build_ucsc_custom_track.xml" /><tool file="maf/vcf_to_maf_customtrack.xml" /><tool file="mutation/visualize.xml" /></section>
@@ -115,14 +106,12 @@
<tool file="phenotype_association/sift.xml" /><tool file="phenotype_association/linkToGProfile.xml" /><tool file="phenotype_association/linkToDavid.xml" />
- <tool file="phenotype_association/snpFreq.xml" /><tool file="phenotype_association/ldtools.xml" /><tool file="phenotype_association/pass.xml" /><tool file="phenotype_association/gpass.xml" /><tool file="phenotype_association/beam.xml" /><tool file="phenotype_association/lps.xml" /><tool file="phenotype_association/master2pg.xml" />
- <tool file="phenotype_association/vcf2pgSnp.xml" /></section><label id="ngs" text="NGS Toolbox Beta" /><section id="cshl_library_information" name="NGS: QC and manipulation">
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
--
This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.
commit/galaxy-central: natefoo: Remove config options for old data management features that were already removed.
by commits-noreply@bitbucket.org 26 Jul '14
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/cdc80d206d54/
Changeset: cdc80d206d54
User: natefoo
Date: 2014-07-27 05:54:22
Summary: Remove config options for old data management features that were already removed.
Affected #: 2 files
diff -r 03dba199c52645b104dc46c78b364b3ee763cfac -r cdc80d206d540234649a7034a70e00cfa676b044 lib/galaxy/config.py
--- a/lib/galaxy/config.py
+++ b/lib/galaxy/config.py
@@ -64,8 +64,6 @@
tempfile.tempdir = self.new_file_path
self.openid_consumer_cache_path = resolve_path( kwargs.get( "openid_consumer_cache_path", "database/openid_consumer_cache" ), self.root )
self.cookie_path = kwargs.get( "cookie_path", "/" )
- self.genome_data_path = kwargs.get( "genome_data_path", "tool-data/genome" )
- self.rsync_url = kwargs.get( "rsync_url", "rsync://datacache.galaxyproject.org/indexes" )
# Galaxy OpenID settings
self.enable_openid = string_as_bool( kwargs.get( 'enable_openid', False ) )
self.openid_config = kwargs.get( 'openid_config_file', 'openid_conf.xml' )
diff -r 03dba199c52645b104dc46c78b364b3ee763cfac -r cdc80d206d540234649a7034a70e00cfa676b044 universe_wsgi.ini.sample
--- a/universe_wsgi.ini.sample
+++ b/universe_wsgi.ini.sample
@@ -124,12 +124,6 @@
# -- Files and directories
-# Path where genome builds are stored. This defaults to tool-data/genome
-#genome_data_path = tool-data/genome
-
-# URL for rsync server to download pre-built indexes.
-#rsync_url = rsync://datacache.galaxyproject.org/indexes
-
# Dataset files are stored in this directory.
#file_path = database/files
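The config.py hunk above follows Galaxy's usual pattern: each option is read from the parsed ini kwargs with a default, then normalized. A minimal sketch of that pattern, using simplified stand-ins for the real galaxy.util helpers string_as_bool and resolve_path:

    import os

    def string_as_bool(value):
        # Accept common ini spellings of a boolean (simplified stand-in).
        return str(value).lower() in ('true', 'yes', 'on', '1')

    def resolve_path(path, root):
        # Resolve relative paths against Galaxy's root directory.
        return path if os.path.isabs(path) else os.path.join(root, path)

    class Config(object):
        def __init__(self, root, **kwargs):
            self.root = root
            self.cookie_path = kwargs.get('cookie_path', '/')
            self.enable_openid = string_as_bool(kwargs.get('enable_openid', False))
            self.openid_consumer_cache_path = resolve_path(
                kwargs.get('openid_consumer_cache_path', 'database/openid_consumer_cache'),
                self.root)

    cfg = Config('/srv/galaxy', enable_openid='True')
    print(cfg.openid_consumer_cache_path)  # /srv/galaxy/database/openid_consumer_cache

Removing an option, as this commit does, is then just a matter of deleting its kwargs.get line and the matching stanza in the sample ini.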
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
commit/galaxy-central: dan: Provide some clarification for the 'Manage your information' menu item.
by commits-noreply@bitbucket.org 25 Jul '14
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/03dba199c526/
Changeset: 03dba199c526
User: dan
Date: 2014-07-25 23:32:53
Summary: Provide some clarification for the 'Manage your information' menu item.
Affected #: 1 file
diff -r 7601982d2180182671bb68349159947fc415800b -r 03dba199c52645b104dc46c78b364b3ee763cfac templates/user/index.mako
--- a/templates/user/index.mako
+++ b/templates/user/index.mako
@@ -11,7 +11,7 @@
<ul>
%if t.webapp.name == 'galaxy':
%if not trans.app.config.use_remote_user:
- <li><a href="${h.url_for( controller='user', action='manage_user_info', cntrller=cntrller )}">${_('Manage your information')}</a></li>
+ <li><a href="${h.url_for( controller='user', action='manage_user_info', cntrller=cntrller )}">${_('Manage your information')}</a> (email, password, etc.)</li>
%endif
<li><a href="${h.url_for( controller='user', action='set_default_permissions', cntrller=cntrller )}">${_('Change default permissions')}</a> for new histories</li><li><a href="${h.url_for( controller='user', action='api_keys', cntrller=cntrller )}">${_('Manage your API keys')}</a></li>
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
commit/galaxy-central: jmchilton: Workflow test utility: build ds_map more easily.
by commits-noreply@bitbucket.org 25 Jul '14
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/7601982d2180/
Changeset: 7601982d2180
User: jmchilton
Date: 2014-07-25 21:47:34
Summary: Workflow test utility: build ds_map more easily.
We should probably do something like this server-side so the workflow API can be used more deterministically (clients wouldn't need to import a workflow and then hit the API just to learn how to invoke it).
Affected #: 1 file
diff -r f1123017cfcd1f1c2531b62624ceac4e28aa6dd4 -r 7601982d2180182671bb68349159947fc415800b test/api/test_workflows.py
--- a/test/api/test_workflows.py
+++ b/test/api/test_workflows.py
@@ -412,28 +412,30 @@
def _setup_workflow_run( self, workflow, history_id=None ):
uploaded_workflow_id = self.workflow_populator.create_workflow( workflow )
- workflow_inputs = self._workflow_inputs( uploaded_workflow_id )
- step_1 = step_2 = None
- for key, value in workflow_inputs.iteritems():
- label = value[ "label" ]
- if label == "WorkflowInput1":
- step_1 = key
- if label == "WorkflowInput2":
- step_2 = key
if not history_id:
history_id = self.dataset_populator.new_history()
hda1 = self.dataset_populator.new_dataset( history_id, content="1 2 3" )
hda2 = self.dataset_populator.new_dataset( history_id, content="4 5 6" )
+ label_map = {
+ 'WorkflowInput1': self._ds_entry(hda1),
+ 'WorkflowInput2': self._ds_entry(hda2)
+ }
workflow_request = dict(
history="hist_id=%s" % history_id,
workflow_id=uploaded_workflow_id,
- ds_map=dumps( {
- step_1: self._ds_entry(hda1),
- step_2: self._ds_entry(hda2),
- } ),
+ ds_map=self._build_ds_map( uploaded_workflow_id, label_map ),
)
return workflow_request, history_id
+ def _build_ds_map( self, workflow_id, label_map ):
+ workflow_inputs = self._workflow_inputs( workflow_id )
+ ds_map = {}
+ for key, value in workflow_inputs.iteritems():
+ label = value[ "label" ]
+ if label in label_map:
+ ds_map[ key ] = label_map[ label ]
+ return dumps( ds_map )
+
def _setup_random_x2_workflow( self, name ):
workflow = self.workflow_populator.load_random_x2_workflow( name )
uploaded_workflow_id = self.workflow_populator.create_workflow( workflow )
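A standalone rendering of the _build_ds_map helper added above, showing the intended usage: map workflow input labels to datasets, then look up each step's label to produce the ds_map JSON for the run-workflow API. The {"src": "hda", "id": ...} entry shape mirrors what _ds_entry produces and is assumed here for illustration.

    import json

    def build_ds_map(workflow_inputs, label_map):
        # workflow_inputs: {step_id: {"label": ...}} as returned by the API.
        ds_map = {}
        for step_id, value in workflow_inputs.items():
            label = value["label"]
            if label in label_map:
                ds_map[step_id] = label_map[label]
        return json.dumps(ds_map)

    inputs = {"0": {"label": "WorkflowInput1"}, "1": {"label": "WorkflowInput2"}}
    label_map = {"WorkflowInput1": {"src": "hda", "id": "abc123"},
                 "WorkflowInput2": {"src": "hda", "id": "def456"}}
    print(build_ds_map(inputs, label_map))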
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
2 new commits in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/fe0b5d1b527f/
Changeset: fe0b5d1b527f
User: jmchilton
Date: 2014-07-25 18:51:10
Summary: Extend API test timeouts.
Some buildbot API tests are timing out while waiting but pass locally; I suspect this will fix the problem.
Affected #: 1 file
diff -r f002131cb905ad4e578635c8a8b655e07cf701f0 -r fe0b5d1b527fadd503baeaa15089b19c9f75fbde test/api/helpers.py
--- a/test/api/helpers.py
+++ b/test/api/helpers.py
@@ -12,7 +12,7 @@
# row - first grabbing 8 lines at random and then 6.
workflow_random_x2_str = resource_string( __name__, "test_workflow_2.ga" )
-DEFAULT_HISTORY_TIMEOUT = 5 # Secs to wait on history to turn ok
+DEFAULT_HISTORY_TIMEOUT = 10 # Secs to wait on history to turn ok
def skip_without_tool( tool_id ):
https://bitbucket.org/galaxy/galaxy-central/commits/f1123017cfcd/
Changeset: f1123017cfcd
User: jmchilton
Date: 2014-07-25 18:51:11
Summary: Fix API tests for 879c485...
That changeset made tool multirun inputs link/match instead of cross-producting (a terminology for this is still to be figured out :)).
Affected #: 1 file
diff -r fe0b5d1b527fadd503baeaa15089b19c9f75fbde -r f1123017cfcd1f1c2531b62624ceac4e28aa6dd4 test/api/test_tools.py
--- a/test/api/test_tools.py
+++ b/test/api/test_tools.py
@@ -189,13 +189,16 @@
],
}
outputs = self._cat1_outputs( history_id, inputs=inputs )
- self.assertEquals( len( outputs ), 4 )
+ self.assertEquals( len( outputs ), 2 )
self.dataset_populator.wait_for_history( history_id, assert_ok=True )
outputs_contents = [ self._get_content( history_id, dataset=o ).strip() for o in outputs ]
assert "123\n789" in outputs_contents
- assert "456\n789" in outputs_contents
- assert "123\n0ab" in outputs_contents
assert "456\n0ab" in outputs_contents
+ # TODO: Once cross production (instead of linking inputs) is an option
+ # again redo test with these checks...
+ # self.assertEquals( len( outputs ), 4 )
+ # assert "123\n0ab" in outputs_contents
+ # assert "456\n789" in outputs_contents
@skip_without_tool( "cat1" )
def test_map_over_collection( self ):
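The distinction the updated test encodes, sketched in plain Python: with inputs ["123", "456"] and ["789", "0ab"], "linked" multirun matches inputs positionally and yields 2 jobs, while a cross product would yield 4.

    from itertools import product

    first = ["123", "456"]
    second = ["789", "0ab"]

    linked = list(zip(first, second))       # [('123', '789'), ('456', '0ab')] -> 2 outputs
    crossed = list(product(first, second))  # 4 outputs, adding ('123', '0ab') and ('456', '789')
    print(linked)
    print(crossed)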
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
commit/galaxy-central: jgoecks: Trackster: draw incomplete features only once.
by commits-noreply@bitbucket.org 25 Jul '14
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/f002131cb905/
Changeset: f002131cb905
User: jgoecks
Date: 2014-07-25 18:01:34
Summary: Trackster: draw incomplete features only once.
Affected #: 1 file
diff -r e359c370aeac38cafa61c01112c09c1560153523 -r f002131cb905ad4e578635c8a8b655e07cf701f0 static/scripts/viz/trackster/tracks.js
--- a/static/scripts/viz/trackster/tracks.js
+++ b/static/scripts/viz/trackster/tracks.js
@@ -1907,6 +1907,8 @@
this.feature_mapper = feature_mapper;
this.has_icons = false;
this.incomplete_features = incomplete_features;
+ // Features drawn based on data from other tiles.
+ this.other_tiles_features_drawn = {};
this.seq_data = seq_data;
// Add message + action icons to tile's html.
@@ -3714,7 +3716,8 @@
});
//
- // Draw incomplete features across tiles.
+ // Finish drawing of features that span multiple tiles. Features that span multiple tiles
+ // are labeled incomplete on the tile level because they cannot be completely drawn.
//
if (line_track_tiles.length === 0) {
// Gather incomplete features together.
@@ -3728,19 +3731,31 @@
// Draw incomplete features on each tile.
var self = this;
_.each(tiles, function(tile) {
- // To draw incomplete features, copy original canvas and then draw incomplete features
- // on the canvas.
- var features = { data: _.values( all_incomplete_features ) },
- new_canvas = self.view.canvas_manager.new_canvas();
- new_canvas.height = self.get_canvas_height(features, tile.mode, tile.w_scale, 100);
- new_canvas.width = tile.canvas.width;
- new_canvas.getContext('2d').drawImage(tile.canvas, 0, 0);
- new_canvas.getContext('2d').translate(track.left_offset, 0);
- var new_tile = self.draw_tile(features, new_canvas.getContext('2d'),
- tile.mode, tile.region, tile.w_scale, tile.seq_data);
- $(new_tile.canvas).addClass('incomplete_features');
- $(tile.canvas).replaceWith($(new_tile.canvas));
- tile.canvas = new_canvas;
+ // Remove features already drawn on tile originally.
+ var tile_incomplete_features =_.omit(all_incomplete_features,
+ _.map(tile.incomplete_features, function(f) { return f[0]; }));
+
+ // Remove features already drawn on tile in past postdraw actions.
+ tile_incomplete_features = _.omit(tile_incomplete_features, _.keys(tile.other_tiles_features_drawn));
+
+ // Draw tile's incomplete features.
+ if (_.size(tile_incomplete_features) !== 0) {
+ // To draw incomplete features, create new canvas, copy original canvas/tile onto new
+ // canvas, and then draw incomplete features on the new canvas.
+ var features = { data: _.values( tile_incomplete_features ) },
+ new_canvas = self.view.canvas_manager.new_canvas(),
+ new_canvas_ctx = new_canvas.getContext('2d');
+ new_canvas.height = Math.max(tile.canvas.height,
+ self.get_canvas_height(features, tile.mode, tile.w_scale, 100));
+ new_canvas.width = tile.canvas.width;
+ new_canvas_ctx.drawImage(tile.canvas, 0, 0);
+ new_canvas_ctx.translate(track.left_offset, 0);
+ var new_tile = self.draw_tile(features, new_canvas_ctx, tile.mode,
+ tile.region, tile.w_scale, tile.seq_data);
+ $(tile.canvas).replaceWith($(new_tile.canvas));
+ tile.canvas = new_canvas;
+ _.extend(tile.other_tiles_features_drawn, all_incomplete_features);
+ }
});
}
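The JavaScript above, rendered as a Python sketch (class and method names are illustrative): each tile tracks which cross-tile features it has already drawn, so a feature spanning several tiles is drawn at most once per tile even across repeated postdraw passes.

    class Tile(object):
        def __init__(self, incomplete_features):
            # Features whose drawing started on this tile but could not finish.
            self.incomplete_features = set(incomplete_features)
            # Features drawn on this tile from other tiles' data (the new
            # other_tiles_features_drawn bookkeeping).
            self.other_tiles_features_drawn = set()

        def features_to_draw(self, all_incomplete):
            # Omit features drawn originally or in a past postdraw action.
            pending = (all_incomplete - self.incomplete_features
                       - self.other_tiles_features_drawn)
            self.other_tiles_features_drawn |= all_incomplete
            return pending

    tile = Tile(incomplete_features={"geneA"})
    print(tile.features_to_draw({"geneA", "geneB"}))  # geneB drawn now
    print(tile.features_to_draw({"geneA", "geneB"}))  # empty: nothing redrawn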
Repository URL: https://bitbucket.org/galaxy/galaxy-central/