September 2008 - galaxy-dev - lists.galaxyproject.org

[hg] galaxy 1528: Eliminate enable_beta_features ( and the concept of 'beta' from the controllers ).
by greg＠scofield.bx.psu.edu 24 Sep '08

24 Sep '08

details: http://www.bx.psu.edu/hg/galaxy/rev/447c74d98fe5 changeset: 1528:447c74d98fe5 user: Greg Von Kuster <greg(a)bx.psu.edu> date: Wed Sep 24 15:05:38 2008 -0400 description: Eliminate enable_beta_features ( and the concept of 'beta' from the controllers ). 8 file(s) affected in this change: lib/galaxy/config.py lib/galaxy/web/base/controller.py lib/galaxy/web/buildapp.py lib/galaxy/web/controllers/workflow.py lib/galaxy/webapps/reports/base/controller.py lib/galaxy/webapps/reports/buildapp.py templates/history/options.mako templates/root/tool_menu.mako diffs (133 lines): diff -r b6420d2f247c -r 447c74d98fe5 lib/galaxy/config.py --- a/lib/galaxy/config.py Wed Sep 24 13:52:01 2008 -0400 +++ b/lib/galaxy/config.py Wed Sep 24 15:05:38 2008 -0400 @@ -23,7 +23,6 @@ def __init__( self, **kwargs ): self.config_dict = kwargs self.root = kwargs.get( 'root_dir', '.' ) - self.enable_beta_features = kwargs.get( "enable_beta_features", False ) # Database related configuration self.database = resolve_path( kwargs.get( "database_file", "database/universe.d" ), self.root ) self.database_connection = kwargs.get( "database_connection", False ) diff -r b6420d2f247c -r 447c74d98fe5 lib/galaxy/web/base/controller.py --- a/lib/galaxy/web/base/controller.py Wed Sep 24 13:52:01 2008 -0400 +++ b/lib/galaxy/web/base/controller.py Wed Sep 24 15:05:38 2008 -0400 @@ -16,8 +16,6 @@ """ Base class for Galaxy web application controllers. 
""" - - beta = False def __init__( self, app ): """Initialize an interface for application 'app'""" diff -r b6420d2f247c -r 447c74d98fe5 lib/galaxy/web/buildapp.py --- a/lib/galaxy/web/buildapp.py Wed Sep 24 13:52:01 2008 -0400 +++ b/lib/galaxy/web/buildapp.py Wed Sep 24 15:05:38 2008 -0400 @@ -41,8 +41,7 @@ for key in dir( module ): T = getattr( module, key ) if isclass( T ) and T is not BaseController and issubclass( T, BaseController ): - if app.config.enable_beta_features or not ( T.beta ): - webapp.add_controller( name, T( app ) ) + webapp.add_controller( name, T( app ) ) def app_factory( global_conf, **kwargs ): """ diff -r b6420d2f247c -r 447c74d98fe5 lib/galaxy/web/controllers/workflow.py --- a/lib/galaxy/web/controllers/workflow.py Wed Sep 24 13:52:01 2008 -0400 +++ b/lib/galaxy/web/controllers/workflow.py Wed Sep 24 15:05:38 2008 -0400 @@ -13,7 +13,6 @@ from galaxy.model.mapping import desc class WorkflowController( BaseController ): - beta = True @web.expose @web.require_login( "use Galaxy workflows" ) diff -r b6420d2f247c -r 447c74d98fe5 lib/galaxy/webapps/reports/base/controller.py --- a/lib/galaxy/webapps/reports/base/controller.py Wed Sep 24 13:52:01 2008 -0400 +++ b/lib/galaxy/webapps/reports/base/controller.py Wed Sep 24 15:05:38 2008 -0400 @@ -8,7 +8,6 @@ class BaseController( object ): """Base class for Galaxy webapp application controllers.""" - beta = False def __init__( self, app ): """Initialize an interface for application 'app'""" self.app = app \ No newline at end of file diff -r b6420d2f247c -r 447c74d98fe5 lib/galaxy/webapps/reports/buildapp.py --- a/lib/galaxy/webapps/reports/buildapp.py Wed Sep 24 13:52:01 2008 -0400 +++ b/lib/galaxy/webapps/reports/buildapp.py Wed Sep 24 15:05:38 2008 -0400 @@ -40,7 +40,6 @@ for key in dir( module ): T = getattr( module, key ) if isclass( T ) and T is not BaseController and issubclass( T, BaseController ): - #if app.config.enable_beta_features or not ( T.beta ): webapp.add_controller( name, T( app ) ) 
def app_factory( global_conf, **kwargs ): diff -r b6420d2f247c -r 447c74d98fe5 templates/history/options.mako --- a/templates/history/options.mako Wed Sep 24 13:52:01 2008 -0400 +++ b/templates/history/options.mako Wed Sep 24 15:05:38 2008 -0400 @@ -16,9 +16,7 @@ %if len( history.active_datasets ) > 0: <li><a href="${h.url_for('/history_new')}">Create</a> a new empty history</li> %endif - %if app.config.enable_beta_features: <li><a href="${h.url_for( controller='workflow', action='build_from_current_history' )}">Construct workflow</a> from the current history</li> - %endif <li><a href="${h.url_for( action='history_share' )}" target="galaxy_main">Share</a> current history</div> %endif <li><a href="${h.url_for( action='history_delete', id=history.id )}" confirm="Are you sure you want to delete the current history?">Delete</a> current history</div> diff -r b6420d2f247c -r 447c74d98fe5 templates/root/tool_menu.mako --- a/templates/root/tool_menu.mako Wed Sep 24 13:52:01 2008 -0400 +++ b/templates/root/tool_menu.mako Wed Sep 24 15:05:38 2008 -0400 @@ -82,27 +82,25 @@ ## at least some workflows will appear here (the user should be able to ## configure which of their stored workflows appear in the tools menu). 
-%if app.config.enable_beta_features: - <div class="toolSectionPad"></div> - <div class="toolSectionPad"></div> - <div class="toolSectionTitle" id="title_XXinternalXXworkflow"> - <span>Workflow <i>(beta)</i></span> - </div> - <div id="XXinternalXXworkflow" class="toolSectionBody"> - <div class="toolSectionBg"> - <div class="toolTitle"> - <a href="${h.url_for( controller='workflow', action='index' )}" target="galaxy_main">Manage</a> workflows - </div> - %if t.user: - %for m in t.user.stored_workflow_menu_entries: - <div class="toolTitle"> - <a href="${h.url_for( controller='workflow', action='run', id=trans.security.encode_id(m.stored_workflow_id) )}" target="galaxy_main">${m.stored_workflow.name}</a> - </div> - %endfor - %endif - </div> - </div> -%endif +<div class="toolSectionPad"></div> +<div class="toolSectionPad"></div> +<div class="toolSectionTitle" id="title_XXinternalXXworkflow"> + <span>Workflow <i>(beta)</i></span> +</div> +<div id="XXinternalXXworkflow" class="toolSectionBody"> + <div class="toolSectionBg"> + <div class="toolTitle"> + <a href="${h.url_for( controller='workflow', action='index' )}" target="galaxy_main">Manage</a> workflows + </div> + %if t.user: + %for m in t.user.stored_workflow_menu_entries: + <div class="toolTitle"> + <a href="${h.url_for( controller='workflow', action='run', id=trans.security.encode_id(m.stored_workflow_id) )}" target="galaxy_main">${m.stored_workflow.name}</a> + </div> + %endfor + %endif + </div> +</div> </div>

1 0

[hg] galaxy 1524: Integrate with intermine ( data source ) and epigraph ( data destination ).
by greg＠scofield.bx.psu.edu 24 Sep '08

24 Sep '08

details: http://www.bx.psu.edu/hg/galaxy/rev/aae4754d6828 changeset: 1524:aae4754d6828 user: Greg Von Kuster <greg(a)bx.psu.edu> date: Mon Sep 22 10:36:34 2008 -0400 description: Integrate with intermine ( data source ) and epigraph ( data destination ). Receiving data from epigraph coming soon. Data is sent to epigraph using a combination of DATA_URL and REDIRECT_URL tool params. This tool creates jobs, but does not queue them for execution. 12 file(s) affected in this change: lib/galaxy/tools/__init__.py lib/galaxy/tools/actions/__init__.py lib/galaxy/tools/parameters/basic.py lib/galaxy/web/controllers/async.py lib/galaxy/web/controllers/tool_runner.py templates/root/redirect.mako templates/tool_form.tmpl tool_conf.xml.sample tools/data_destination/epigraph.xml tools/data_source/flymine.xml tools/data_source/flymine_filter_code.py tools/data_source/intermine.py diffs (429 lines): diff -r dabed25dfbaf -r aae4754d6828 lib/galaxy/tools/__init__.py --- a/lib/galaxy/tools/__init__.py Sun Sep 21 17:36:28 2008 -0400 +++ b/lib/galaxy/tools/__init__.py Mon Sep 22 10:36:34 2008 -0400 @@ -239,6 +239,16 @@ self.command = interpreter + " " + self.command else: self.command = '' + # Parameters used to build URL for redirection to external app + redirect_url_params = root.find( "redirect_url_params" ) + if redirect_url_params is not None and redirect_url_params.text is not None: + # get rid of leading / trailing white space + redirect_url_params = redirect_url_params.text.strip() + # Replace remaining white space with something we can safely split on later + # when we are building the params + self.redirect_url_params = redirect_url_params.replace( ' ', '**^**' ) + else: + self.redirect_url_params = '' # Short description of the tool self.description = util.xml_text(root, "description") # Job runner @@ -677,7 +687,7 @@ return "tool_form.tmpl", dict( errors=errors, tool_state=state, incoming=incoming, error_message=error_message ) # If we've completed the last page we can 
execute the tool elif state.page == self.last_page: - out_data = self.execute( trans, params ) + out_data = self.execute( trans, incoming=params ) return 'tool_executed.tmpl', dict( out_data=out_data ) # Otherwise move on to the next page else: @@ -689,8 +699,8 @@ # Just a refresh, render the form with updated state and errors. return 'tool_form.tmpl', dict( errors=errors, tool_state=state ) - def update_state( self, trans, inputs, state, incoming, - prefix="", context=None, update_only=False, old_errors={}, changed_dependencies={} ): + def update_state( self, trans, inputs, state, incoming, prefix="", context=None, + update_only=False, old_errors={}, changed_dependencies={} ): """ Update the tool state in `state` using the user input in `incoming`. This is designed to be called recursively: `inputs` contains the @@ -877,14 +887,14 @@ raise Exception( "Unexpected parameter type" ) return args - def execute( self, trans, incoming={}, set_output_hid = True ): + def execute( self, trans, incoming={}, set_output_hid=True ): """ Execute the tool using parameter values in `incoming`. This just dispatches to the `ToolAction` instance specified by `self.tool_action`. In general this will create a `Job` that when run will build the tool's outputs, e.g. `DefaultToolAction`. """ - return self.tool_action.execute( self, trans, incoming, set_output_hid = set_output_hid ) + return self.tool_action.execute( self, trans, incoming=incoming, set_output_hid=set_output_hid ) def params_to_strings( self, params, app ): return params_to_strings( self.inputs, params, app ) @@ -1045,7 +1055,54 @@ #e.args = ( 'Error substituting into command line. 
Params: %r, Command: %s' % ( param_dict, self.command ) ) raise return command_line - + + def build_redirect_url_params( self, param_dict ): + """Substitute parameter values into self.redirect_url_params""" + if not self.redirect_url_params: + return + redirect_url_params = None + # Substituting parameter values into the url params + redirect_url_params = fill_template( self.redirect_url_params, context=param_dict ) + # Remove newlines + redirect_url_params = redirect_url_params.replace( "\n", " " ).replace( "\r", " " ) + return redirect_url_params + + def parse_redirect_url( self, inp_data, param_dict ): + """Parse the REDIRECT_URL tool param""" + # Tools that send data to an external application via a redirect must include the following 3 + # tool params: + # REDIRECT_URL - the url to which the data is being sent + # DATA_URL - the url to which the receiving application will send an http post to retrieve the Galaxy data + # GALAXY_URL - the to which the external application may post data as a response + redirect_url = param_dict.get( 'REDIRECT_URL' ) + redirect_url_params = self.build_redirect_url_params( param_dict ) + # Add the parameters to the redirect url. We're splitting the param string on '**^**' + # because the self.parse() method replaced white space with that separator. + params = redirect_url_params.split( '**^**' ) + rup_dict = {} + for param in params: + p_list = param.split( '=' ) + p_name = p_list[0] + p_val = p_list[1] + rup_dict[ p_name ] = p_val + DATA_URL = param_dict.get( 'DATA_URL', None ) + assert DATA_URL is not None, "DATA_URL parameter missing in tool config." 
+ # Get the dataset - there should only be 1 + for name in inp_data.keys(): + data = inp_data[ name ] + DATA_URL += "/%s/display" % str( data.id ) + redirect_url += "?DATA_URL=%s" % DATA_URL + # Add the redirect_url_params to redirect_url + for p_name in rup_dict: + redirect_url += "&%s=%s" % ( p_name, rup_dict[ p_name ] ) + # Add the current user email to redirect_url + if data.history.user: + USERNAME = str( data.history.user.email ) + else: + USERNAME = 'Anonymous' + redirect_url += "&USERNAME=%s" % USERNAME + return redirect_url + def call_hook( self, hook_name, *args, **kwargs ): """ Call the custom code hook function identified by 'hook_name' if any, diff -r dabed25dfbaf -r aae4754d6828 lib/galaxy/tools/actions/__init__.py --- a/lib/galaxy/tools/actions/__init__.py Sun Sep 21 17:36:28 2008 -0400 +++ b/lib/galaxy/tools/actions/__init__.py Mon Sep 22 10:36:34 2008 -0400 @@ -2,6 +2,8 @@ from galaxy.tools.parameters import * from galaxy.util.template import fill_template from galaxy.util.none_like import NoneDataset +from galaxy.web import url_for +from galaxy.jobs import JOB_OK import logging log = logging.getLogger( __name__ ) @@ -63,7 +65,7 @@ tool.visit_inputs( param_values, visitor ) return input_datasets - def execute(self, tool, trans, incoming={}, set_output_hid = True ): + def execute(self, tool, trans, incoming={}, set_output_hid=True ): out_data = {} # Collect any input datasets from the incoming parameters inp_data = self.collect_input_datasets( tool, incoming, trans ) @@ -90,15 +92,12 @@ on_text = '%s, %s, and others' % tuple(input_names[0:2]) else: on_text = "" - # Add the dbkey to the incoming parameters incoming[ "dbkey" ] = input_dbkey - # Keep track of parent / child relationships, we'll create all the # datasets first, then create the associations parent_to_child_pairs = [] child_dataset_names = set() - for name, output in tool.outputs.items(): if output.parent: parent_to_child_pairs.append( ( output.parent, name ) ) @@ -149,23 +148,19 @@ 
out_data[ name ] = data # Store all changes to database trans.app.model.flush() - # Add all the top-level (non-child) datasets to the history for name in out_data.keys(): if name not in child_dataset_names and name not in incoming: #don't add children; or already existing datasets, i.e. async created data = out_data[ name ] trans.history.add_dataset( data, set_hid = set_output_hid ) data.flush() - # Add all the children to their parents for parent_name, child_name in parent_to_child_pairs: parent_dataset = out_data[ parent_name ] child_dataset = out_data[ child_name ] parent_dataset.children.append( child_dataset ) - # Store data after custom code runs trans.app.model.flush() - # Create the job object job = trans.app.model.Job() job.session_id = trans.get_galaxy_session( create=True ).id @@ -189,8 +184,19 @@ for name, dataset in out_data.iteritems(): job.add_output_dataset( name, dataset ) trans.app.model.flush() - - # Queue the job for execution - trans.app.job_queue.put( job.id, tool ) - trans.log_event( "Added job to the job queue, id: %s" % str(job.id), tool_id=job.tool_id ) - return out_data + # Some tools are not really executable, but jobs are still created for them ( for record keeping ). + # Examples include tools that redirect to other applications ( epigraph ). These special tools must + # include something that can be retrieved from the params ( e.g., REDIRECT_URL ) to keep the job + # from being queued. 
+ if 'REDIRECT_URL' in incoming: + redirect_url = tool.parse_redirect_url( inp_data, incoming ) + # Job should not be queued, so set state to ok + job.state = JOB_OK + job.info = "Redirected to: %s" % redirect_url + job.flush() + trans.response.send_redirect( url_for( controller='tool_runner', action='redirect', redirect_url=redirect_url ) ) + else: + # Queue the job for execution + trans.app.job_queue.put( job.id, tool ) + trans.log_event( "Added job to the job queue, id: %s" % str(job.id), tool_id=job.tool_id ) + return out_data diff -r dabed25dfbaf -r aae4754d6828 lib/galaxy/tools/parameters/basic.py --- a/lib/galaxy/tools/parameters/basic.py Sun Sep 21 17:36:28 2008 -0400 +++ b/lib/galaxy/tools/parameters/basic.py Mon Sep 22 10:36:34 2008 -0400 @@ -332,6 +332,8 @@ return form_builder.HiddenField( self.name, self.value ) def get_initial_value( self, trans, context ): return self.value + def get_label( self ): + return None ## This is clearly a HACK, parameters should only be used for things the user ## can change, there needs to be a different way to specify this. 
I'm leaving @@ -354,6 +356,9 @@ return form_builder.HiddenField( self.name, self.get_value( trans ) ) def get_initial_value( self, trans, context ): return self.value + def get_label( self ): + # BaseURLToolParameters are ultimately "hidden" parameters + return None class SelectToolParameter( ToolParameter ): """ diff -r dabed25dfbaf -r aae4754d6828 lib/galaxy/web/controllers/async.py --- a/lib/galaxy/web/controllers/async.py Sun Sep 21 17:36:28 2008 -0400 +++ b/lib/galaxy/web/controllers/async.py Mon Sep 22 10:36:34 2008 -0400 @@ -68,8 +68,8 @@ galaxy_url = trans.request.base + '/async/%s/%s/%s' % ( tool_id, data.id, key ) galaxy_url = params.get("GALAXY_URL",galaxy_url) params = dict( url=URL, GALAXY_URL=galaxy_url ) - params[tool.outputs.keys()[0]] = data.id #assume there is exactly one output file possible - #tool.execute( app=self.app, history=history, incoming=params ) + # Assume there is exactly one output file possible + params[tool.outputs.keys()[0]] = data.id tool.execute( trans, incoming=params ) else: log.debug('async error -> %s' % STATUS) diff -r dabed25dfbaf -r aae4754d6828 lib/galaxy/web/controllers/tool_runner.py --- a/lib/galaxy/web/controllers/tool_runner.py Sun Sep 21 17:36:28 2008 -0400 +++ b/lib/galaxy/web/controllers/tool_runner.py Mon Sep 22 10:36:34 2008 -0400 @@ -51,3 +51,10 @@ add_frame.wiki_url = trans.app.config.wiki_url add_frame.from_noframe = True return trans.fill_template( template, history=history, toolbox=toolbox, tool=tool, util=util, add_frame=add_frame, **vars ) + + @web.expose + def redirect( self, trans, redirect_url=None, **kwd ): + if not redirect_url: + return trans.show_error_message( "Required URL for redirection missing" ) + trans.log_event( "Redirecting to: %s" % redirect_url ) + return trans.fill_template( 'root/redirect.mako', redirect_url=redirect_url ) diff -r dabed25dfbaf -r aae4754d6828 templates/root/redirect.mako --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/templates/root/redirect.mako Mon Sep 22 
10:36:34 2008 -0400 @@ -0,0 +1,5 @@ +<%inherit file="/base.mako"/> + +<script type="text/javascript"> + top.location.href = '${redirect_url}'; +</script> \ No newline at end of file diff -r dabed25dfbaf -r aae4754d6828 templates/tool_form.tmpl --- a/templates/tool_form.tmpl Sun Sep 21 17:36:28 2008 -0400 +++ b/templates/tool_form.tmpl Mon Sep 22 10:36:34 2008 -0400 @@ -73,10 +73,12 @@ #set cls = "form-row" #end if <div class="$cls"> - <label> - ${param.get_label()}: - </label> - + #set label = $param.get_label() + #if $label: + <label> + $label: + </label> + #end if #set field = $param.get_html_field( $caller, $parent_state[ $param.name ], $context ) #set $field.refresh_on_change = $param.refresh_on_change <div style="float: left; width: 250px; margin-right: 10px;">$field.get_html( $prefix )</div> diff -r dabed25dfbaf -r aae4754d6828 tool_conf.xml.sample --- a/tool_conf.xml.sample Sun Sep 21 17:36:28 2008 -0400 +++ b/tool_conf.xml.sample Mon Sep 22 10:36:34 2008 -0400 @@ -9,6 +9,7 @@ <tool file="data_source/biomart.xml" /> <tool file="data_source/biomart_test.xml" /> <tool file="data_source/gbrowse_elegans.xml" /> + <tool file="data_source/flymine.xml" /> <tool file="data_source/encode_db.xml" /> <tool file="data_source/hbvar.xml" /> <tool file="validation/fix_errors.xml" /> @@ -20,6 +21,9 @@ <tool file="data_source/encode_import_transcription_regulation.xml"/> <tool file="data_source/encode_import_all_latest_datasets.xml" /> <tool file="data_source/encode_import_gencode.xml" /> + </section> + <section name="Send Data" id="send"> + <tool file="data_destination/epigraph.xml" /> </section> <section name="ENCODE Tools" id="EncodeTools"> <tool file="encode/gencode_partition.xml" /> diff -r dabed25dfbaf -r aae4754d6828 tools/data_destination/epigraph.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/data_destination/epigraph.xml Mon Sep 22 10:36:34 2008 -0400 @@ -0,0 +1,21 @@ +<?xml version="1.0"?> +<tool name="Perform EpiGRAPH" id="epigraph"> + <description> 
Genome analysis and prediction</description> + <redirect_url_params>GENOME=${input1.dbkey} NAME=${input1.name} INFO=${input1.info}</redirect_url_params> + <inputs> + <param format="bed" name="input1" type="data" label="Send this dataset to EpiGRAPH"> + <validator type="unspecified_build" /> + </param> + <param name="REDIRECT_URL" type="hidden" value="http://epigraph.mpi-inf.mpg.de/WebGRAPH_Public_Test/faces/DataImport.jsp" /> + <param name="DATA_URL" type="baseurl" value="/datasets" /> + <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=epigraph_import" /> + </inputs> + <outputs/> + <help> +**What it does** + +This tool sends the selected dataset to EpiGRAPH for in-depth analysis and prediction. + + </help> +</tool> + diff -r dabed25dfbaf -r aae4754d6828 tools/data_source/flymine.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/data_source/flymine.xml Mon Sep 22 10:36:34 2008 -0400 @@ -0,0 +1,16 @@ +<?xml version="1.0"?> +<tool name="Flymine" id="flymine"> + <description>server</description> + <command interpreter="python">intermine.py $output</command> + <inputs action="http://preview.flymine.org/preview/begin.do" check_values="false" method="get" target="_top"> + <display>go to Flymine server $GALAXY_URL</display> + <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=flymine" /> + </inputs> + <uihints minwidth="800"/> + <code file="flymine_filter_code.py"/> + <outputs> + <data name="output" format="txt" /> + </outputs> + <options sanitize="False" refresh="True"/> +</tool> + diff -r dabed25dfbaf -r aae4754d6828 tools/data_source/flymine_filter_code.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/data_source/flymine_filter_code.py Mon Sep 22 10:36:34 2008 -0400 @@ -0,0 +1,31 @@ +# Code for direct connection to flymine +from galaxy.datatypes import sniff +import urllib + +import logging +log = logging.getLogger( __name__ ) + +def exec_before_job( app, inp_data, out_data, param_dict, tool=None ): + """Sets the 
attributes of the data""" + items = out_data.items() + for name, data in items: + data.dbkey = param_dict.get( 'dbkey', '?' ) + # Store flymine parameters temporarily in output file + out = open( data.file_name, 'w' ) + for key, value in param_dict.items(): + out.write( "%s\t%s\n" % ( key, value ) ) + out.close() + out_data[ name ] = data + +def exec_after_process( app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None ): + """Verifies the data after the run""" + name, data = out_data.items()[0] + if data.state == data.states.OK: + data.info = data.name + if data.extension == 'txt': + data_type = sniff.guess_ext( data.file_name, sniff_order=app.datatypes_registry.sniff_order ) + data = app.datatypes_registry.change_datatype( data, data_type ) + data.set_peek() + data.set_size() + data.flush() + diff -r dabed25dfbaf -r aae4754d6828 tools/data_source/intermine.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/data_source/intermine.py Mon Sep 22 10:36:34 2008 -0400 @@ -0,0 +1,45 @@ +#!/usr/bin/env python +#Retreives data from intermine and stores in a file. Intermine parameters are provided in the input/output file. +import urllib, sys, os, gzip, tempfile, shutil +from galaxy import eggs +from galaxy.datatypes import data + +assert sys.version_info[:2] >= ( 2, 4 ) + +def stop_err( msg ): + sys.stderr.write( msg ) + sys.exit() + +def __main__(): + filename = sys.argv[1] + params = {} + + for line in open( filename, 'r' ): + try: + line = line.strip() + fields = line.split( '\t' ) + params[ fields[0] ] = fields[1] + except: + continue + + URL = params.get( 'URL', None ) + if not URL: + open( filename, 'w' ).write( "" ) + stop_err( 'Datasource has not sent back a URL parameter.' 
) + + CHUNK_SIZE = 2**20 # 1Mb + try: + page = urllib.urlopen( URL ) + except Exception, exc: + raise Exception( 'Problems connecting to %s (%s)' % ( URL, exc ) ) + sys.exit( 1 ) + + fp = open( filename, 'wb' ) + while 1: + chunk = page.read( CHUNK_SIZE ) + if not chunk: + break + fp.write( chunk ) + fp.close() + +if __name__ == "__main__": __main__()

1 0

[hg] galaxy 1525: Modified INDEL tools under Regional variation section.
by greg＠scofield.bx.psu.edu 24 Sep '08

24 Sep '08

details: http://www.bx.psu.edu/hg/galaxy/rev/675ad84ae008 changeset: 1525:675ad84ae008 user: guru date: Mon Sep 22 12:02:35 2008 -0400 description: Modified INDEL tools under Regional variation section. 6 file(s) affected in this change: test-data/indelrates_3way.tabular test-data/indels_3way.tabular tools/regVariation/getIndelRates_3way.py tools/regVariation/getIndelRates_3way.xml tools/regVariation/getIndels_3way.xml tools/regVariation/parseMAF_smallIndels.pl diffs (908 lines): diff -r aae4754d6828 -r 675ad84ae008 test-data/indelrates_3way.tabular --- a/test-data/indelrates_3way.tabular Mon Sep 22 10:36:34 2008 -0400 +++ b/test-data/indelrates_3way.tabular Mon Sep 22 12:02:35 2008 -0400 @@ -1,13 +1,15 @@ -#Window Species Window_Start Window_End Insertion_Rate Deletion_Rate -1 canFam2 3356001 3357001 3.80e-03 2.17e-03 -2 canFam2 3357001 3358001 2.46e-03 1.85e-03 -3 canFam2 3358001 3359001 1.23e-03 3.08e-03 -4 canFam2 3359001 3360001 6.76e-03 8.39e-03 -5 canFam2 3360001 3361001 8.41e-03 1.12e-02 -6 canFam2 3361001 3362001 9.27e-03 1.85e-02 -7 canFam2 3362001 3363001 1.17e-02 6.67e-03 -8 canFam2 3363001 3364001 1.97e-02 5.62e-03 -9 canFam2 3364001 3365001 5.92e-03 9.65e-03 -10 canFam2 3366001 3367001 3.69e-03 2.63e-03 -11 canFam2 3367001 3368001 9.89e-03 4.95e-03 -12 canFam2 3368001 3369001 6.90e-03 1.38e-02 +#Block hg18_InsRate panTro2_InsRate canFam2_InsRate hg18_DelRate panTro2_DelRate canFam2_DelRate +1 0.00e+00 6.90e-03 6.90e-03 0.00e+00 0.00e+00 1.38e-02 +2 0.00e+00 0.00e+00 1.27e-02 7.25e-04 0.00e+00 6.36e-03 +3 0.00e+00 0.00e+00 6.21e-03 0.00e+00 0.00e+00 6.21e-03 +4 0.00e+00 0.00e+00 2.07e-02 0.00e+00 0.00e+00 5.18e-03 +5 0.00e+00 0.00e+00 2.54e-02 0.00e+00 0.00e+00 1.69e-02 +6 0.00e+00 0.00e+00 9.61e-04 0.00e+00 0.00e+00 0.00e+00 +7 0.00e+00 0.00e+00 1.97e-02 0.00e+00 0.00e+00 5.62e-03 +8 0.00e+00 1.71e-03 1.17e-02 0.00e+00 1.71e-03 6.67e-03 +9 0.00e+00 0.00e+00 1.05e-02 2.72e-03 0.00e+00 1.83e-02 +10 0.00e+00 3.26e-03 1.51e-02 0.00e+00 0.00e+00 1.89e-02 
+12 1.25e-03 0.00e+00 8.70e-03 0.00e+00 0.00e+00 1.24e-02 +13 0.00e+00 0.00e+00 6.76e-03 0.00e+00 0.00e+00 4.73e-02 +14 0.00e+00 0.00e+00 4.93e-03 1.23e-03 1.23e-03 5.55e-03 +15 0.00e+00 0.00e+00 2.29e-02 0.00e+00 0.00e+00 1.38e-02 diff -r aae4754d6828 -r 675ad84ae008 test-data/indels_3way.tabular --- a/test-data/indels_3way.tabular Mon Sep 22 10:36:34 2008 -0400 +++ b/test-data/indels_3way.tabular Mon Sep 22 12:02:35 2008 -0400 @@ -1,149 +1,149 @@ -#block indel_type indel_length ingroup1 ingroup1_start ingroup1_end ingroup1_orient ingroup2 ingroup2_start ingroup2_end ingroup2_orient outgroup outgroup_start outgroup_end outgroup_orient -12 hg18.chr1:802_insert 1 hg18.chr1:802 57213 57213 + panTro2.chrUn:801 9724391 9724392 + canFam2.chr30:805 3360548 3360549 - -2 hg18.chr1:1380_delete 4 hg18.chr1:1380 48103 48104 + panTro2.chrUn:1384 9713713 9713716 + canFam2.chr30:1415 3367755 3367758 - -9 hg18.chr1:367_delete 35 hg18.chr1:367 55389 55390 + panTro2.chrUn:402 9721584 9721618 + canFam2.chr30:382 3361661 3361695 - -14 hg18.chr1:1628_delete 1 hg18.chr1:1628 58770 58771 + panTro2.chrUn:1631 9725949 9725949 + canFam2.chr30:1623 3357056 3357056 - -14 hg18.chr1:1628_delete 4 hg18.chr1:1628 58770 58771 + panTro2.chrUn:1631 9725950 9725953 + canFam2.chr30:1623 3357058 3357061 - -1 panTro2.chrUn:145_insert 2 hg18.chr1:143 46857 46858 + panTro2.chrUn:145 9712465 9712466 + canFam2.chr30:145 3368190 3368191 - -8 panTro2.chrUn:585_insert 4 hg18.chr1:582 54744 54745 + panTro2.chrUn:585 9720936 9720939 + canFam2.chr30:600 3362635 3362636 - -10 panTro2.chrUn:307_insert 35 hg18.chr1:272 55984 55985 + panTro2.chrUn:307 9721981 9722015 + canFam2.chr30:265 3361105 3361106 - -8 panTro2.chrUn:585_delete 1 hg18.chr1:582 54979 54979 + panTro2.chrUn:585 9721173 9721174 + canFam2.chr30:600 3362890 3362890 - -14 panTro2.chrUn:1631_delete 1 hg18.chr1:1628 59015 59015 + panTro2.chrUn:1631 9726197 9726198 + canFam2.chr30:1623 3357328 3357328 - -14 panTro2.chrUn:1631_delete 1 hg18.chr1:1628 60211 
60211 + panTro2.chrUn:1631 9727392 9727393 + canFam2.chr30:1623 3358490 3358490 - -1 canFam2.chr30:145_insert 4 hg18.chr1:143 46796 46797 + panTro2.chrUn:145 9712403 9712404 + canFam2.chr30:145 3368127 3368130 - -2 canFam2.chr30:1415_insert 8 hg18.chr1:1380 47138 47139 + panTro2.chrUn:1384 9712747 9712748 + canFam2.chr30:1415 3366730 3366737 - -2 canFam2.chr30:1415_insert 3 hg18.chr1:1380 47263 47264 + panTro2.chrUn:1384 9712872 9712873 + canFam2.chr30:1415 3366862 3366864 - -2 canFam2.chr30:1415_insert 1 hg18.chr1:1380 47293 47294 + panTro2.chrUn:1384 9712902 9712903 + canFam2.chr30:1415 3366895 3366895 - -2 canFam2.chr30:1415_insert 1 hg18.chr1:1380 47312 47313 + panTro2.chrUn:1384 9712921 9712922 + canFam2.chr30:1415 3366915 3366915 - -2 canFam2.chr30:1415_insert 7 hg18.chr1:1380 47440 47441 + panTro2.chrUn:1384 9713049 9713050 + canFam2.chr30:1415 3367044 3367050 - -2 canFam2.chr30:1415_insert 1 hg18.chr1:1380 47528 47529 + panTro2.chrUn:1384 9713137 9713138 + canFam2.chr30:1415 3367138 3367138 - -2 canFam2.chr30:1415_insert 10 hg18.chr1:1380 47546 47547 + panTro2.chrUn:1384 9713155 9713156 + canFam2.chr30:1415 3367157 3367166 - -2 canFam2.chr30:1415_insert 4 hg18.chr1:1380 47562 47563 + panTro2.chrUn:1384 9713171 9713172 + canFam2.chr30:1415 3367183 3367186 - -2 canFam2.chr30:1415_insert 1 hg18.chr1:1380 47648 47649 + panTro2.chrUn:1384 9713257 9713258 + canFam2.chr30:1415 3367273 3367273 - -2 canFam2.chr30:1415_insert 3 hg18.chr1:1380 47672 47673 + panTro2.chrUn:1384 9713281 9713282 + canFam2.chr30:1415 3367298 3367300 - -2 canFam2.chr30:1415_insert 5 hg18.chr1:1380 47734 47735 + panTro2.chrUn:1384 9713343 9713344 + canFam2.chr30:1415 3367361 3367365 - -2 canFam2.chr30:1415_insert 2 hg18.chr1:1380 47759 47760 + panTro2.chrUn:1384 9713368 9713369 + canFam2.chr30:1415 3367391 3367392 - -2 canFam2.chr30:1415_insert 2 hg18.chr1:1380 47835 47836 + panTro2.chrUn:1384 9713444 9713445 + canFam2.chr30:1415 3367469 3367470 - -2 canFam2.chr30:1415_insert 1 
hg18.chr1:1380 48017 48018 + panTro2.chrUn:1384 9713626 9713627 + canFam2.chr30:1415 3367653 3367653 - -2 canFam2.chr30:1415_insert 1 hg18.chr1:1380 48035 48036 + panTro2.chrUn:1384 9713644 9713645 + canFam2.chr30:1415 3367672 3367672 - -2 canFam2.chr30:1415_insert 4 hg18.chr1:1380 48091 48092 + panTro2.chrUn:1384 9713700 9713701 + canFam2.chr30:1415 3367729 3367732 - -2 canFam2.chr30:1415_insert 10 hg18.chr1:1380 48103 48104 + panTro2.chrUn:1384 9713712 9713713 + canFam2.chr30:1415 3367745 3367754 - -2 canFam2.chr30:1415_insert 1 hg18.chr1:1380 48232 48233 + panTro2.chrUn:1384 9713845 9713846 + canFam2.chr30:1415 3367870 3367870 - -3 canFam2.chr30:483_insert 15 hg18.chr1:455 48379 48380 + panTro2.chrUn:455 9714289 9714290 + canFam2.chr30:483 3366038 3366052 - -3 canFam2.chr30:483_insert 21 hg18.chr1:455 48426 48427 + panTro2.chrUn:455 9714336 9714337 + canFam2.chr30:483 3366096 3366116 - -3 canFam2.chr30:483_insert 1 hg18.chr1:455 48523 48524 + panTro2.chrUn:455 9714433 9714434 + canFam2.chr30:483 3366213 3366213 - -4 canFam2.chr30:193_insert 8 hg18.chr1:183 52535 52536 + panTro2.chrUn:183 9718438 9718439 + canFam2.chr30:193 3364655 3364662 - -4 canFam2.chr30:193_insert 1 hg18.chr1:183 52555 52556 + panTro2.chrUn:183 9718458 9718459 + canFam2.chr30:193 3364683 3364683 - -4 canFam2.chr30:193_insert 1 hg18.chr1:183 52588 52589 + panTro2.chrUn:183 9718491 9718492 + canFam2.chr30:193 3364717 3364717 - -4 canFam2.chr30:193_insert 4 hg18.chr1:183 52691 52692 + panTro2.chrUn:183 9718594 9718595 + canFam2.chr30:193 3364817 3364820 - -5 canFam2.chr30:118_insert 3 hg18.chr1:115 52709 52710 + panTro2.chrUn:115 9718612 9718613 + canFam2.chr30:118 3364525 3364527 - -5 canFam2.chr30:118_insert 1 hg18.chr1:115 52754 52755 + panTro2.chrUn:115 9718657 9718658 + canFam2.chr30:118 3364571 3364571 - -5 canFam2.chr30:118_insert 4 hg18.chr1:115 52763 52764 + panTro2.chrUn:115 9718666 9718667 + canFam2.chr30:118 3364581 3364584 - -6 canFam2.chr30:1041_insert 1 hg18.chr1:1040 53848 53849 
+ panTro2.chrUn:1040 9719751 9719752 + canFam2.chr30:1041 3364511 3364511 - -7 canFam2.chr30:356_insert 4 hg18.chr1:335 53869 53870 + panTro2.chrUn:335 9719772 9719773 + canFam2.chr30:356 3363136 3363139 - -7 canFam2.chr30:356_insert 1 hg18.chr1:335 53899 53900 + panTro2.chrUn:335 9719802 9719803 + canFam2.chr30:356 3363168 3363168 - -7 canFam2.chr30:356_insert 1 hg18.chr1:335 53921 53922 + panTro2.chrUn:335 9719824 9719825 + canFam2.chr30:356 3363191 3363191 - -7 canFam2.chr30:356_insert 8 hg18.chr1:335 53938 53939 + panTro2.chrUn:335 9719841 9719842 + canFam2.chr30:356 3363209 3363216 - -7 canFam2.chr30:356_insert 7 hg18.chr1:335 53970 53971 + panTro2.chrUn:335 9719873 9719874 + canFam2.chr30:356 3363249 3363255 - -7 canFam2.chr30:356_insert 5 hg18.chr1:335 54101 54102 + panTro2.chrUn:335 9720004 9720005 + canFam2.chr30:356 3363383 3363387 - -7 canFam2.chr30:356_insert 1 hg18.chr1:335 54164 54165 + panTro2.chrUn:335 9720067 9720068 + canFam2.chr30:356 3363451 3363451 - -8 canFam2.chr30:600_insert 1 hg18.chr1:582 54579 54580 + panTro2.chrUn:585 9720770 9720771 + canFam2.chr30:600 3362472 3362472 - -8 canFam2.chr30:600_insert 1 hg18.chr1:582 54654 54655 + panTro2.chrUn:585 9720845 9720846 + canFam2.chr30:600 3362547 3362547 - -8 canFam2.chr30:600_insert 1 hg18.chr1:582 54691 54692 + panTro2.chrUn:585 9720882 9720883 + canFam2.chr30:600 3362585 3362585 - -8 canFam2.chr30:600_insert 18 hg18.chr1:582 54775 54776 + panTro2.chrUn:585 9720970 9720971 + canFam2.chr30:600 3362667 3362684 - -8 canFam2.chr30:600_insert 1 hg18.chr1:582 54799 54800 + panTro2.chrUn:585 9720994 9720995 + canFam2.chr30:600 3362709 3362709 - -8 canFam2.chr30:600_insert 4 hg18.chr1:582 54944 54945 + panTro2.chrUn:585 9721139 9721140 + canFam2.chr30:600 3362851 3362854 - -8 canFam2.chr30:600_insert 1 hg18.chr1:582 54969 54970 + panTro2.chrUn:585 9721164 9721165 + canFam2.chr30:600 3362880 3362880 - -9 canFam2.chr30:382_insert 1 hg18.chr1:367 55360 55361 + panTro2.chrUn:402 9721554 9721555 + 
canFam2.chr30:382 3361632 3361632 - -9 canFam2.chr30:382_insert 1 hg18.chr1:367 55378 55379 + panTro2.chrUn:402 9721572 9721573 + canFam2.chr30:382 3361651 3361651 - -9 canFam2.chr30:382_insert 1 hg18.chr1:367 55396 55397 + panTro2.chrUn:402 9721625 9721626 + canFam2.chr30:382 3361703 3361703 - -9 canFam2.chr30:382_insert 2 hg18.chr1:367 55444 55445 + panTro2.chrUn:402 9721673 9721674 + canFam2.chr30:382 3361752 3361753 - -10 canFam2.chr30:265_insert 1 hg18.chr1:272 55776 55777 + panTro2.chrUn:307 9721772 9721773 + canFam2.chr30:265 3360902 3360902 - -10 canFam2.chr30:265_insert 5 hg18.chr1:272 55864 55865 + panTro2.chrUn:307 9721860 9721861 + canFam2.chr30:265 3360991 3360995 - -10 canFam2.chr30:265_insert 2 hg18.chr1:272 55969 55970 + panTro2.chrUn:307 9721965 9721966 + canFam2.chr30:265 3361087 3361088 - -10 canFam2.chr30:265_insert 2 hg18.chr1:272 55980 55981 + panTro2.chrUn:307 9721976 9721977 + canFam2.chr30:265 3361100 3361101 - -12 canFam2.chr30:805_insert 33 hg18.chr1:802 56722 56723 + panTro2.chrUn:801 9723901 9723902 + canFam2.chr30:805 3360035 3360067 - -12 canFam2.chr30:805_insert 1 hg18.chr1:802 56746 56747 + panTro2.chrUn:801 9723925 9723926 + canFam2.chr30:805 3360092 3360092 - -12 canFam2.chr30:805_insert 1 hg18.chr1:802 56836 56837 + panTro2.chrUn:801 9724015 9724016 + canFam2.chr30:805 3360183 3360183 - -12 canFam2.chr30:805_insert 1 hg18.chr1:802 56864 56865 + panTro2.chrUn:801 9724043 9724044 + canFam2.chr30:805 3360212 3360212 - -12 canFam2.chr30:805_insert 1 hg18.chr1:802 56894 56895 + panTro2.chrUn:801 9724073 9724074 + canFam2.chr30:805 3360243 3360243 - -12 canFam2.chr30:805_insert 2 hg18.chr1:802 57161 57162 + panTro2.chrUn:801 9724340 9724341 + canFam2.chr30:805 3360496 3360497 - -12 canFam2.chr30:805_insert 1 hg18.chr1:802 57352 57353 + panTro2.chrUn:801 9724530 9724531 + canFam2.chr30:805 3360680 3360680 - -13 canFam2.chr30:148_insert 2 hg18.chr1:176 58529 58530 + panTro2.chrUn:176 9725707 9725708 + canFam2.chr30:148 3359565 3359566 - 
-14 canFam2.chr30:1623_insert 1 hg18.chr1:1628 58700 58701 + panTro2.chrUn:1631 9725878 9725879 + canFam2.chr30:1623 3356984 3356984 - -14 canFam2.chr30:1623_insert 1 hg18.chr1:1628 58709 58710 + panTro2.chrUn:1631 9725887 9725888 + canFam2.chr30:1623 3356994 3356994 - -14 canFam2.chr30:1623_insert 1 hg18.chr1:1628 58770 58771 + panTro2.chrUn:1631 9725949 9725950 + canFam2.chr30:1623 3357057 3357057 - -14 canFam2.chr30:1623_insert 1 hg18.chr1:1628 58772 58773 + panTro2.chrUn:1631 9725955 9725956 + canFam2.chr30:1623 3357064 3357064 - -14 canFam2.chr30:1623_insert 15 hg18.chr1:1628 58800 58801 + panTro2.chrUn:1631 9725983 9725984 + canFam2.chr30:1623 3357093 3357107 - -14 canFam2.chr30:1623_insert 9 hg18.chr1:1628 58808 58809 + panTro2.chrUn:1631 9725991 9725992 + canFam2.chr30:1623 3357116 3357124 - -14 canFam2.chr30:1623_insert 1 hg18.chr1:1628 60032 60033 + panTro2.chrUn:1631 9727214 9727215 + canFam2.chr30:1623 3358319 3358319 - -14 canFam2.chr30:1623_insert 4 hg18.chr1:1628 60148 60149 + panTro2.chrUn:1631 9727330 9727331 + canFam2.chr30:1623 3358426 3358429 - -15 canFam2.chr30:218_insert 1 hg18.chr1:230 60326 60327 + panTro2.chrUn:230 9727507 9727508 + canFam2.chr30:218 3356643 3356643 - -15 canFam2.chr30:218_insert 2 hg18.chr1:230 60353 60354 + panTro2.chrUn:230 9727534 9727535 + canFam2.chr30:218 3356671 3356672 - -15 canFam2.chr30:218_insert 1 hg18.chr1:230 60391 60392 + panTro2.chrUn:230 9727572 9727573 + canFam2.chr30:218 3356711 3356711 - -15 canFam2.chr30:218_insert 1 hg18.chr1:230 60466 60467 + panTro2.chrUn:230 9727647 9727648 + canFam2.chr30:218 3356783 3356783 - -15 canFam2.chr30:218_insert 5 hg18.chr1:230 60476 60477 + panTro2.chrUn:230 9727657 9727658 + canFam2.chr30:218 3356794 3356798 - -1 canFam2.chr30:145_delete 1 hg18.chr1:143 46857 46857 + panTro2.chrUn:145 9712464 9712464 + canFam2.chr30:145 3368190 3368191 - -1 canFam2.chr30:145_delete 1 hg18.chr1:143 46900 46900 + panTro2.chrUn:145 9712509 9712509 + canFam2.chr30:145 3368232 3368233 - -2 
canFam2.chr30:1415_delete 7 hg18.chr1:1380 47080 47086 + panTro2.chrUn:1384 9712689 9712695 + canFam2.chr30:1415 3366677 3366678 - -2 canFam2.chr30:1415_delete 1 hg18.chr1:1380 47168 47168 + panTro2.chrUn:1384 9712777 9712777 + canFam2.chr30:1415 3366766 3366767 - -2 canFam2.chr30:1415_delete 1 hg18.chr1:1380 47482 47482 + panTro2.chrUn:1384 9713091 9713091 + canFam2.chr30:1415 3367091 3367092 - -2 canFam2.chr30:1415_delete 2 hg18.chr1:1380 47703 47704 + panTro2.chrUn:1384 9713312 9713313 + canFam2.chr30:1415 3367330 3367331 - -2 canFam2.chr30:1415_delete 15 hg18.chr1:1380 48139 48153 + panTro2.chrUn:1384 9713752 9713766 + canFam2.chr30:1415 3367793 3367794 - -2 canFam2.chr30:1415_delete 1 hg18.chr1:1380 48160 48160 + panTro2.chrUn:1384 9713773 9713773 + canFam2.chr30:1415 3367799 3367800 - -2 canFam2.chr30:1415_delete 1 hg18.chr1:1380 48174 48174 + panTro2.chrUn:1384 9713787 9713787 + canFam2.chr30:1415 3367812 3367813 - -2 canFam2.chr30:1415_delete 1 hg18.chr1:1380 48197 48197 + panTro2.chrUn:1384 9713810 9713810 + canFam2.chr30:1415 3367834 3367835 - -2 canFam2.chr30:1415_delete 5 hg18.chr1:1380 48280 48284 + panTro2.chrUn:1384 9713893 9713897 + canFam2.chr30:1415 3367917 3367918 - -3 canFam2.chr30:483_delete 4 hg18.chr1:455 48344 48347 + panTro2.chrUn:455 9714254 9714257 + canFam2.chr30:483 3366005 3366006 - -3 canFam2.chr30:483_delete 4 hg18.chr1:455 48410 48413 + panTro2.chrUn:455 9714320 9714323 + canFam2.chr30:483 3366082 3366083 - -3 canFam2.chr30:483_delete 1 hg18.chr1:455 48470 48470 + panTro2.chrUn:455 9714380 9714380 + canFam2.chr30:483 3366159 3366160 - -4 canFam2.chr30:193_delete 4 hg18.chr1:183 52639 52642 + panTro2.chrUn:183 9718542 9718545 + canFam2.chr30:193 3364767 3364768 - -5 canFam2.chr30:118_delete 3 hg18.chr1:115 52707 52709 + panTro2.chrUn:115 9718610 9718612 + canFam2.chr30:118 3364524 3364525 - -5 canFam2.chr30:118_delete 2 hg18.chr1:115 52743 52744 + panTro2.chrUn:115 9718646 9718647 + canFam2.chr30:118 3364560 3364561 - -7 
canFam2.chr30:356_delete 2 hg18.chr1:335 53879 53880 + panTro2.chrUn:335 9719782 9719783 + canFam2.chr30:356 3363148 3363149 - -7 canFam2.chr30:356_delete 4 hg18.chr1:335 54052 54055 + panTro2.chrUn:335 9719955 9719958 + canFam2.chr30:356 3363336 3363337 - -8 canFam2.chr30:600_delete 1 hg18.chr1:582 54634 54634 + panTro2.chrUn:585 9720825 9720825 + canFam2.chr30:600 3362526 3362527 - -8 canFam2.chr30:600_delete 3 hg18.chr1:582 54710 54712 + panTro2.chrUn:585 9720901 9720903 + canFam2.chr30:600 3362603 3362604 - -8 canFam2.chr30:600_delete 4 hg18.chr1:582 54876 54879 + panTro2.chrUn:585 9721071 9721074 + canFam2.chr30:600 3362785 3362786 - -8 canFam2.chr30:600_delete 1 hg18.chr1:582 54984 54984 + panTro2.chrUn:585 9721178 9721178 + canFam2.chr30:600 3362894 3362895 - -9 canFam2.chr30:382_delete 1 hg18.chr1:367 55131 55131 + panTro2.chrUn:402 9721325 9721325 + canFam2.chr30:382 3361420 3361421 - -9 canFam2.chr30:382_delete 6 hg18.chr1:367 55213 55218 + panTro2.chrUn:402 9721407 9721412 + canFam2.chr30:382 3361501 3361502 - -9 canFam2.chr30:382_delete 1 hg18.chr1:367 55308 55308 + panTro2.chrUn:402 9721502 9721502 + canFam2.chr30:382 3361590 3361591 - -9 canFam2.chr30:382_delete 4 hg18.chr1:367 55324 55327 + panTro2.chrUn:402 9721518 9721521 + canFam2.chr30:382 3361605 3361606 - -9 canFam2.chr30:382_delete 7 hg18.chr1:367 55340 55346 + panTro2.chrUn:402 9721534 9721540 + canFam2.chr30:382 3361617 3361618 - -9 canFam2.chr30:382_delete 2 hg18.chr1:367 55386 55387 + panTro2.chrUn:402 9721580 9721581 + canFam2.chr30:382 3361658 3361659 - -9 canFam2.chr30:382_delete 4 hg18.chr1:367 55457 55460 + panTro2.chrUn:402 9721686 9721689 + canFam2.chr30:382 3361765 3361766 - -10 canFam2.chr30:265_delete 11 hg18.chr1:272 55893 55903 + panTro2.chrUn:307 9721889 9721899 + canFam2.chr30:265 3361023 3361024 - -10 canFam2.chr30:265_delete 1 hg18.chr1:272 55924 55924 + panTro2.chrUn:307 9721920 9721920 + canFam2.chr30:265 3361043 3361044 - -10 canFam2.chr30:265_delete 1 hg18.chr1:272 
55939 55939 + panTro2.chrUn:307 9721935 9721935 + canFam2.chr30:265 3361057 3361058 - -10 canFam2.chr30:265_delete 1 hg18.chr1:272 55959 55959 + panTro2.chrUn:307 9721955 9721955 + canFam2.chr30:265 3361076 3361077 - -10 canFam2.chr30:265_delete 3 hg18.chr1:272 56001 56003 + panTro2.chrUn:307 9722032 9722034 + canFam2.chr30:265 3361121 3361122 - -12 canFam2.chr30:805_delete 6 hg18.chr1:802 56611 56616 + panTro2.chrUn:801 9723790 9723795 + canFam2.chr30:805 3359929 3359930 - -12 canFam2.chr30:805_delete 1 hg18.chr1:802 56696 56696 + panTro2.chrUn:801 9723875 9723875 + canFam2.chr30:805 3360008 3360009 - -12 canFam2.chr30:805_delete 7 hg18.chr1:802 56911 56917 + panTro2.chrUn:801 9724090 9724096 + canFam2.chr30:805 3360259 3360260 - -12 canFam2.chr30:805_delete 3 hg18.chr1:802 56991 56993 + panTro2.chrUn:801 9724170 9724172 + canFam2.chr30:805 3360332 3360333 - -12 canFam2.chr30:805_delete 5 hg18.chr1:802 57110 57114 + panTro2.chrUn:801 9724289 9724293 + canFam2.chr30:805 3360448 3360449 - -12 canFam2.chr30:805_delete 1 hg18.chr1:802 57217 57217 + panTro2.chrUn:801 9724395 9724395 + canFam2.chr30:805 3360551 3360552 - -12 canFam2.chr30:805_delete 5 hg18.chr1:802 57247 57251 + panTro2.chrUn:801 9724425 9724429 + canFam2.chr30:805 3360580 3360581 - -12 canFam2.chr30:805_delete 2 hg18.chr1:802 57310 57311 + panTro2.chrUn:801 9724488 9724489 + canFam2.chr30:805 3360638 3360639 - -12 canFam2.chr30:805_delete 2 hg18.chr1:802 57379 57380 + panTro2.chrUn:801 9724557 9724558 + canFam2.chr30:805 3360706 3360707 - -12 canFam2.chr30:805_delete 4 hg18.chr1:802 57387 57390 + panTro2.chrUn:801 9724565 9724568 + canFam2.chr30:805 3360712 3360713 - -13 canFam2.chr30:148_delete 3 hg18.chr1:176 58409 58411 + panTro2.chrUn:176 9725587 9725589 + canFam2.chr30:148 3359468 3359469 - -13 canFam2.chr30:148_delete 1 hg18.chr1:176 58434 58434 + panTro2.chrUn:176 9725612 9725612 + canFam2.chr30:148 3359490 3359491 - -13 canFam2.chr30:148_delete 2 hg18.chr1:176 58446 58447 + panTro2.chrUn:176 
9725624 9725625 + canFam2.chr30:148 3359501 3359502 - -13 canFam2.chr30:148_delete 8 hg18.chr1:176 58470 58477 + panTro2.chrUn:176 9725648 9725655 + canFam2.chr30:148 3359523 3359524 - -13 canFam2.chr30:148_delete 5 hg18.chr1:176 58488 58492 + panTro2.chrUn:176 9725666 9725670 + canFam2.chr30:148 3359533 3359534 - -13 canFam2.chr30:148_delete 6 hg18.chr1:176 58510 58515 + panTro2.chrUn:176 9725688 9725693 + canFam2.chr30:148 3359550 3359551 - -13 canFam2.chr30:148_delete 5 hg18.chr1:176 58565 58569 + panTro2.chrUn:176 9725743 9725747 + canFam2.chr30:148 3359601 3359602 - -14 canFam2.chr30:1623_delete 1 hg18.chr1:1628 58669 58669 + panTro2.chrUn:1631 9725847 9725847 + canFam2.chr30:1623 3356952 3356953 - -14 canFam2.chr30:1623_delete 3 hg18.chr1:1628 58849 58851 + panTro2.chrUn:1631 9726032 9726034 + canFam2.chr30:1623 3357164 3357165 - -14 canFam2.chr30:1623_delete 10 hg18.chr1:1628 59342 59351 + panTro2.chrUn:1631 9726524 9726533 + canFam2.chr30:1623 3357654 3357655 - -14 canFam2.chr30:1623_delete 1 hg18.chr1:1628 59561 59561 + panTro2.chrUn:1631 9726743 9726743 + canFam2.chr30:1623 3357863 3357864 - -14 canFam2.chr30:1623_delete 11 hg18.chr1:1628 59887 59897 + panTro2.chrUn:1631 9727069 9727079 + canFam2.chr30:1623 3358188 3358189 - -14 canFam2.chr30:1623_delete 5 hg18.chr1:1628 59908 59912 + panTro2.chrUn:1631 9727090 9727094 + canFam2.chr30:1623 3358198 3358199 - -14 canFam2.chr30:1623_delete 6 hg18.chr1:1628 60052 60057 + panTro2.chrUn:1631 9727234 9727239 + canFam2.chr30:1623 3358338 3358339 - -14 canFam2.chr30:1623_delete 4 hg18.chr1:1628 60111 60114 + panTro2.chrUn:1631 9727293 9727296 + canFam2.chr30:1623 3358391 3358392 - -14 canFam2.chr30:1623_delete 2 hg18.chr1:1628 60183 60184 + panTro2.chrUn:1631 9727365 9727366 + canFam2.chr30:1623 3358463 3358464 - -15 canFam2.chr30:218_delete 4 hg18.chr1:230 60400 60403 + panTro2.chrUn:230 9727581 9727584 + canFam2.chr30:218 3356719 3356720 - -15 canFam2.chr30:218_delete 8 hg18.chr1:230 60506 60513 + 
panTro2.chrUn:230 9727687 9727694 + canFam2.chr30:218 3356827 3356828 - -15 canFam2.chr30:218_delete 10 hg18.chr1:230 60520 60529 + panTro2.chrUn:230 9727701 9727710 + canFam2.chr30:218 3356833 3356834 - +#block indel_type indel_length ingroup1 ingroup1_start ingroup1_end ingroup1_alignSize ingroup1_orient ingroup2 ingroup2_start ingroup2_end ingroup2_alignSize ingroup2_orient outgroup outgroup_start outgroup_end outgroup_alignSize outgroup_orient +12 hg18.chr1_insert 1 hg18.chr1 57213 57213 802 + panTro2.chrUn 9724391 9724392 801 + canFam2.chr30 3360548 3360549 805 - +2 hg18.chr1_delete 4 hg18.chr1 48103 48104 1380 + panTro2.chrUn 9713713 9713716 1384 + canFam2.chr30 3367755 3367758 1415 - +9 hg18.chr1_delete 35 hg18.chr1 55389 55390 367 + panTro2.chrUn 9721584 9721618 402 + canFam2.chr30 3361661 3361695 382 - +14 hg18.chr1_delete 1 hg18.chr1 58770 58771 1628 + panTro2.chrUn 9725949 9725949 1631 + canFam2.chr30 3357056 3357056 1623 - +14 hg18.chr1_delete 4 hg18.chr1 58770 58771 1628 + panTro2.chrUn 9725950 9725953 1631 + canFam2.chr30 3357058 3357061 1623 - +1 panTro2.chrUn_insert 2 hg18.chr1 46857 46858 143 + panTro2.chrUn 9712465 9712466 145 + canFam2.chr30 3368190 3368191 145 - +8 panTro2.chrUn_insert 4 hg18.chr1 54744 54745 582 + panTro2.chrUn 9720936 9720939 585 + canFam2.chr30 3362635 3362636 600 - +10 panTro2.chrUn_insert 35 hg18.chr1 55984 55985 272 + panTro2.chrUn 9721981 9722015 307 + canFam2.chr30 3361105 3361106 265 - +8 panTro2.chrUn_delete 1 hg18.chr1 54979 54979 582 + panTro2.chrUn 9721173 9721174 585 + canFam2.chr30 3362890 3362890 600 - +14 panTro2.chrUn_delete 1 hg18.chr1 59015 59015 1628 + panTro2.chrUn 9726197 9726198 1631 + canFam2.chr30 3357328 3357328 1623 - +14 panTro2.chrUn_delete 1 hg18.chr1 60211 60211 1628 + panTro2.chrUn 9727392 9727393 1631 + canFam2.chr30 3358490 3358490 1623 - +1 canFam2.chr30_insert 4 hg18.chr1 46796 46797 143 + panTro2.chrUn 9712403 9712404 145 + canFam2.chr30 3368127 3368130 145 - +2 canFam2.chr30_insert 8 
hg18.chr1 47138 47139 1380 + panTro2.chrUn 9712747 9712748 1384 + canFam2.chr30 3366730 3366737 1415 - +2 canFam2.chr30_insert 3 hg18.chr1 47263 47264 1380 + panTro2.chrUn 9712872 9712873 1384 + canFam2.chr30 3366862 3366864 1415 - +2 canFam2.chr30_insert 1 hg18.chr1 47293 47294 1380 + panTro2.chrUn 9712902 9712903 1384 + canFam2.chr30 3366895 3366895 1415 - +2 canFam2.chr30_insert 1 hg18.chr1 47312 47313 1380 + panTro2.chrUn 9712921 9712922 1384 + canFam2.chr30 3366915 3366915 1415 - +2 canFam2.chr30_insert 7 hg18.chr1 47440 47441 1380 + panTro2.chrUn 9713049 9713050 1384 + canFam2.chr30 3367044 3367050 1415 - +2 canFam2.chr30_insert 1 hg18.chr1 47528 47529 1380 + panTro2.chrUn 9713137 9713138 1384 + canFam2.chr30 3367138 3367138 1415 - +2 canFam2.chr30_insert 10 hg18.chr1 47546 47547 1380 + panTro2.chrUn 9713155 9713156 1384 + canFam2.chr30 3367157 3367166 1415 - +2 canFam2.chr30_insert 4 hg18.chr1 47562 47563 1380 + panTro2.chrUn 9713171 9713172 1384 + canFam2.chr30 3367183 3367186 1415 - +2 canFam2.chr30_insert 1 hg18.chr1 47648 47649 1380 + panTro2.chrUn 9713257 9713258 1384 + canFam2.chr30 3367273 3367273 1415 - +2 canFam2.chr30_insert 3 hg18.chr1 47672 47673 1380 + panTro2.chrUn 9713281 9713282 1384 + canFam2.chr30 3367298 3367300 1415 - +2 canFam2.chr30_insert 5 hg18.chr1 47734 47735 1380 + panTro2.chrUn 9713343 9713344 1384 + canFam2.chr30 3367361 3367365 1415 - +2 canFam2.chr30_insert 2 hg18.chr1 47759 47760 1380 + panTro2.chrUn 9713368 9713369 1384 + canFam2.chr30 3367391 3367392 1415 - +2 canFam2.chr30_insert 2 hg18.chr1 47835 47836 1380 + panTro2.chrUn 9713444 9713445 1384 + canFam2.chr30 3367469 3367470 1415 - +2 canFam2.chr30_insert 1 hg18.chr1 48017 48018 1380 + panTro2.chrUn 9713626 9713627 1384 + canFam2.chr30 3367653 3367653 1415 - +2 canFam2.chr30_insert 1 hg18.chr1 48035 48036 1380 + panTro2.chrUn 9713644 9713645 1384 + canFam2.chr30 3367672 3367672 1415 - +2 canFam2.chr30_insert 4 hg18.chr1 48091 48092 1380 + panTro2.chrUn 9713700 9713701 1384 
+ canFam2.chr30 3367729 3367732 1415 - +2 canFam2.chr30_insert 10 hg18.chr1 48103 48104 1380 + panTro2.chrUn 9713712 9713713 1384 + canFam2.chr30 3367745 3367754 1415 - +2 canFam2.chr30_insert 1 hg18.chr1 48232 48233 1380 + panTro2.chrUn 9713845 9713846 1384 + canFam2.chr30 3367870 3367870 1415 - +3 canFam2.chr30_insert 15 hg18.chr1 48379 48380 455 + panTro2.chrUn 9714289 9714290 455 + canFam2.chr30 3366038 3366052 483 - +3 canFam2.chr30_insert 21 hg18.chr1 48426 48427 455 + panTro2.chrUn 9714336 9714337 455 + canFam2.chr30 3366096 3366116 483 - +3 canFam2.chr30_insert 1 hg18.chr1 48523 48524 455 + panTro2.chrUn 9714433 9714434 455 + canFam2.chr30 3366213 3366213 483 - +4 canFam2.chr30_insert 8 hg18.chr1 52535 52536 183 + panTro2.chrUn 9718438 9718439 183 + canFam2.chr30 3364655 3364662 193 - +4 canFam2.chr30_insert 1 hg18.chr1 52555 52556 183 + panTro2.chrUn 9718458 9718459 183 + canFam2.chr30 3364683 3364683 193 - +4 canFam2.chr30_insert 1 hg18.chr1 52588 52589 183 + panTro2.chrUn 9718491 9718492 183 + canFam2.chr30 3364717 3364717 193 - +4 canFam2.chr30_insert 4 hg18.chr1 52691 52692 183 + panTro2.chrUn 9718594 9718595 183 + canFam2.chr30 3364817 3364820 193 - +5 canFam2.chr30_insert 3 hg18.chr1 52709 52710 115 + panTro2.chrUn 9718612 9718613 115 + canFam2.chr30 3364525 3364527 118 - +5 canFam2.chr30_insert 1 hg18.chr1 52754 52755 115 + panTro2.chrUn 9718657 9718658 115 + canFam2.chr30 3364571 3364571 118 - +5 canFam2.chr30_insert 4 hg18.chr1 52763 52764 115 + panTro2.chrUn 9718666 9718667 115 + canFam2.chr30 3364581 3364584 118 - +6 canFam2.chr30_insert 1 hg18.chr1 53848 53849 1040 + panTro2.chrUn 9719751 9719752 1040 + canFam2.chr30 3364511 3364511 1041 - +7 canFam2.chr30_insert 4 hg18.chr1 53869 53870 335 + panTro2.chrUn 9719772 9719773 335 + canFam2.chr30 3363136 3363139 356 - +7 canFam2.chr30_insert 1 hg18.chr1 53899 53900 335 + panTro2.chrUn 9719802 9719803 335 + canFam2.chr30 3363168 3363168 356 - +7 canFam2.chr30_insert 1 hg18.chr1 53921 53922 335 + 
panTro2.chrUn 9719824 9719825 335 + canFam2.chr30 3363191 3363191 356 - +7 canFam2.chr30_insert 8 hg18.chr1 53938 53939 335 + panTro2.chrUn 9719841 9719842 335 + canFam2.chr30 3363209 3363216 356 - +7 canFam2.chr30_insert 7 hg18.chr1 53970 53971 335 + panTro2.chrUn 9719873 9719874 335 + canFam2.chr30 3363249 3363255 356 - +7 canFam2.chr30_insert 5 hg18.chr1 54101 54102 335 + panTro2.chrUn 9720004 9720005 335 + canFam2.chr30 3363383 3363387 356 - +7 canFam2.chr30_insert 1 hg18.chr1 54164 54165 335 + panTro2.chrUn 9720067 9720068 335 + canFam2.chr30 3363451 3363451 356 - +8 canFam2.chr30_insert 1 hg18.chr1 54579 54580 582 + panTro2.chrUn 9720770 9720771 585 + canFam2.chr30 3362472 3362472 600 - +8 canFam2.chr30_insert 1 hg18.chr1 54654 54655 582 + panTro2.chrUn 9720845 9720846 585 + canFam2.chr30 3362547 3362547 600 - +8 canFam2.chr30_insert 1 hg18.chr1 54691 54692 582 + panTro2.chrUn 9720882 9720883 585 + canFam2.chr30 3362585 3362585 600 - +8 canFam2.chr30_insert 18 hg18.chr1 54775 54776 582 + panTro2.chrUn 9720970 9720971 585 + canFam2.chr30 3362667 3362684 600 - +8 canFam2.chr30_insert 1 hg18.chr1 54799 54800 582 + panTro2.chrUn 9720994 9720995 585 + canFam2.chr30 3362709 3362709 600 - +8 canFam2.chr30_insert 4 hg18.chr1 54944 54945 582 + panTro2.chrUn 9721139 9721140 585 + canFam2.chr30 3362851 3362854 600 - +8 canFam2.chr30_insert 1 hg18.chr1 54969 54970 582 + panTro2.chrUn 9721164 9721165 585 + canFam2.chr30 3362880 3362880 600 - +9 canFam2.chr30_insert 1 hg18.chr1 55360 55361 367 + panTro2.chrUn 9721554 9721555 402 + canFam2.chr30 3361632 3361632 382 - +9 canFam2.chr30_insert 1 hg18.chr1 55378 55379 367 + panTro2.chrUn 9721572 9721573 402 + canFam2.chr30 3361651 3361651 382 - +9 canFam2.chr30_insert 1 hg18.chr1 55396 55397 367 + panTro2.chrUn 9721625 9721626 402 + canFam2.chr30 3361703 3361703 382 - +9 canFam2.chr30_insert 2 hg18.chr1 55444 55445 367 + panTro2.chrUn 9721673 9721674 402 + canFam2.chr30 3361752 3361753 382 - +10 canFam2.chr30_insert 1 hg18.chr1 
55776 55777 272 + panTro2.chrUn 9721772 9721773 307 + canFam2.chr30 3360902 3360902 265 - +10 canFam2.chr30_insert 5 hg18.chr1 55864 55865 272 + panTro2.chrUn 9721860 9721861 307 + canFam2.chr30 3360991 3360995 265 - +10 canFam2.chr30_insert 2 hg18.chr1 55969 55970 272 + panTro2.chrUn 9721965 9721966 307 + canFam2.chr30 3361087 3361088 265 - +10 canFam2.chr30_insert 2 hg18.chr1 55980 55981 272 + panTro2.chrUn 9721976 9721977 307 + canFam2.chr30 3361100 3361101 265 - +12 canFam2.chr30_insert 33 hg18.chr1 56722 56723 802 + panTro2.chrUn 9723901 9723902 801 + canFam2.chr30 3360035 3360067 805 - +12 canFam2.chr30_insert 1 hg18.chr1 56746 56747 802 + panTro2.chrUn 9723925 9723926 801 + canFam2.chr30 3360092 3360092 805 - +12 canFam2.chr30_insert 1 hg18.chr1 56836 56837 802 + panTro2.chrUn 9724015 9724016 801 + canFam2.chr30 3360183 3360183 805 - +12 canFam2.chr30_insert 1 hg18.chr1 56864 56865 802 + panTro2.chrUn 9724043 9724044 801 + canFam2.chr30 3360212 3360212 805 - +12 canFam2.chr30_insert 1 hg18.chr1 56894 56895 802 + panTro2.chrUn 9724073 9724074 801 + canFam2.chr30 3360243 3360243 805 - +12 canFam2.chr30_insert 2 hg18.chr1 57161 57162 802 + panTro2.chrUn 9724340 9724341 801 + canFam2.chr30 3360496 3360497 805 - +12 canFam2.chr30_insert 1 hg18.chr1 57352 57353 802 + panTro2.chrUn 9724530 9724531 801 + canFam2.chr30 3360680 3360680 805 - +13 canFam2.chr30_insert 2 hg18.chr1 58529 58530 176 + panTro2.chrUn 9725707 9725708 176 + canFam2.chr30 3359565 3359566 148 - +14 canFam2.chr30_insert 1 hg18.chr1 58700 58701 1628 + panTro2.chrUn 9725878 9725879 1631 + canFam2.chr30 3356984 3356984 1623 - +14 canFam2.chr30_insert 1 hg18.chr1 58709 58710 1628 + panTro2.chrUn 9725887 9725888 1631 + canFam2.chr30 3356994 3356994 1623 - +14 canFam2.chr30_insert 1 hg18.chr1 58770 58771 1628 + panTro2.chrUn 9725949 9725950 1631 + canFam2.chr30 3357057 3357057 1623 - +14 canFam2.chr30_insert 1 hg18.chr1 58772 58773 1628 + panTro2.chrUn 9725955 9725956 1631 + canFam2.chr30 3357064 
3357064 1623 - +14 canFam2.chr30_insert 15 hg18.chr1 58800 58801 1628 + panTro2.chrUn 9725983 9725984 1631 + canFam2.chr30 3357093 3357107 1623 - +14 canFam2.chr30_insert 9 hg18.chr1 58808 58809 1628 + panTro2.chrUn 9725991 9725992 1631 + canFam2.chr30 3357116 3357124 1623 - +14 canFam2.chr30_insert 1 hg18.chr1 60032 60033 1628 + panTro2.chrUn 9727214 9727215 1631 + canFam2.chr30 3358319 3358319 1623 - +14 canFam2.chr30_insert 4 hg18.chr1 60148 60149 1628 + panTro2.chrUn 9727330 9727331 1631 + canFam2.chr30 3358426 3358429 1623 - +15 canFam2.chr30_insert 1 hg18.chr1 60326 60327 230 + panTro2.chrUn 9727507 9727508 230 + canFam2.chr30 3356643 3356643 218 - +15 canFam2.chr30_insert 2 hg18.chr1 60353 60354 230 + panTro2.chrUn 9727534 9727535 230 + canFam2.chr30 3356671 3356672 218 - +15 canFam2.chr30_insert 1 hg18.chr1 60391 60392 230 + panTro2.chrUn 9727572 9727573 230 + canFam2.chr30 3356711 3356711 218 - +15 canFam2.chr30_insert 1 hg18.chr1 60466 60467 230 + panTro2.chrUn 9727647 9727648 230 + canFam2.chr30 3356783 3356783 218 - +15 canFam2.chr30_insert 5 hg18.chr1 60476 60477 230 + panTro2.chrUn 9727657 9727658 230 + canFam2.chr30 3356794 3356798 218 - +1 canFam2.chr30_delete 1 hg18.chr1 46857 46857 143 + panTro2.chrUn 9712464 9712464 145 + canFam2.chr30 3368190 3368191 145 - +1 canFam2.chr30_delete 1 hg18.chr1 46900 46900 143 + panTro2.chrUn 9712509 9712509 145 + canFam2.chr30 3368232 3368233 145 - +2 canFam2.chr30_delete 7 hg18.chr1 47080 47086 1380 + panTro2.chrUn 9712689 9712695 1384 + canFam2.chr30 3366677 3366678 1415 - +2 canFam2.chr30_delete 1 hg18.chr1 47168 47168 1380 + panTro2.chrUn 9712777 9712777 1384 + canFam2.chr30 3366766 3366767 1415 - +2 canFam2.chr30_delete 1 hg18.chr1 47482 47482 1380 + panTro2.chrUn 9713091 9713091 1384 + canFam2.chr30 3367091 3367092 1415 - +2 canFam2.chr30_delete 2 hg18.chr1 47703 47704 1380 + panTro2.chrUn 9713312 9713313 1384 + canFam2.chr30 3367330 3367331 1415 - +2 canFam2.chr30_delete 15 hg18.chr1 48139 48153 1380 + 
panTro2.chrUn 9713752 9713766 1384 + canFam2.chr30 3367793 3367794 1415 - +2 canFam2.chr30_delete 1 hg18.chr1 48160 48160 1380 + panTro2.chrUn 9713773 9713773 1384 + canFam2.chr30 3367799 3367800 1415 - +2 canFam2.chr30_delete 1 hg18.chr1 48174 48174 1380 + panTro2.chrUn 9713787 9713787 1384 + canFam2.chr30 3367812 3367813 1415 - +2 canFam2.chr30_delete 1 hg18.chr1 48197 48197 1380 + panTro2.chrUn 9713810 9713810 1384 + canFam2.chr30 3367834 3367835 1415 - +2 canFam2.chr30_delete 5 hg18.chr1 48280 48284 1380 + panTro2.chrUn 9713893 9713897 1384 + canFam2.chr30 3367917 3367918 1415 - +3 canFam2.chr30_delete 4 hg18.chr1 48344 48347 455 + panTro2.chrUn 9714254 9714257 455 + canFam2.chr30 3366005 3366006 483 - +3 canFam2.chr30_delete 4 hg18.chr1 48410 48413 455 + panTro2.chrUn 9714320 9714323 455 + canFam2.chr30 3366082 3366083 483 - +3 canFam2.chr30_delete 1 hg18.chr1 48470 48470 455 + panTro2.chrUn 9714380 9714380 455 + canFam2.chr30 3366159 3366160 483 - +4 canFam2.chr30_delete 4 hg18.chr1 52639 52642 183 + panTro2.chrUn 9718542 9718545 183 + canFam2.chr30 3364767 3364768 193 - +5 canFam2.chr30_delete 3 hg18.chr1 52707 52709 115 + panTro2.chrUn 9718610 9718612 115 + canFam2.chr30 3364524 3364525 118 - +5 canFam2.chr30_delete 2 hg18.chr1 52743 52744 115 + panTro2.chrUn 9718646 9718647 115 + canFam2.chr30 3364560 3364561 118 - +7 canFam2.chr30_delete 2 hg18.chr1 53879 53880 335 + panTro2.chrUn 9719782 9719783 335 + canFam2.chr30 3363148 3363149 356 - +7 canFam2.chr30_delete 4 hg18.chr1 54052 54055 335 + panTro2.chrUn 9719955 9719958 335 + canFam2.chr30 3363336 3363337 356 - +8 canFam2.chr30_delete 1 hg18.chr1 54634 54634 582 + panTro2.chrUn 9720825 9720825 585 + canFam2.chr30 3362526 3362527 600 - +8 canFam2.chr30_delete 3 hg18.chr1 54710 54712 582 + panTro2.chrUn 9720901 9720903 585 + canFam2.chr30 3362603 3362604 600 - +8 canFam2.chr30_delete 4 hg18.chr1 54876 54879 582 + panTro2.chrUn 9721071 9721074 585 + canFam2.chr30 3362785 3362786 600 - +8 canFam2.chr30_delete 
1 hg18.chr1 54984 54984 582 + panTro2.chrUn 9721178 9721178 585 + canFam2.chr30 3362894 3362895 600 - +9 canFam2.chr30_delete 1 hg18.chr1 55131 55131 367 + panTro2.chrUn 9721325 9721325 402 + canFam2.chr30 3361420 3361421 382 - +9 canFam2.chr30_delete 6 hg18.chr1 55213 55218 367 + panTro2.chrUn 9721407 9721412 402 + canFam2.chr30 3361501 3361502 382 - +9 canFam2.chr30_delete 1 hg18.chr1 55308 55308 367 + panTro2.chrUn 9721502 9721502 402 + canFam2.chr30 3361590 3361591 382 - +9 canFam2.chr30_delete 4 hg18.chr1 55324 55327 367 + panTro2.chrUn 9721518 9721521 402 + canFam2.chr30 3361605 3361606 382 - +9 canFam2.chr30_delete 7 hg18.chr1 55340 55346 367 + panTro2.chrUn 9721534 9721540 402 + canFam2.chr30 3361617 3361618 382 - +9 canFam2.chr30_delete 2 hg18.chr1 55386 55387 367 + panTro2.chrUn 9721580 9721581 402 + canFam2.chr30 3361658 3361659 382 - +9 canFam2.chr30_delete 4 hg18.chr1 55457 55460 367 + panTro2.chrUn 9721686 9721689 402 + canFam2.chr30 3361765 3361766 382 - +10 canFam2.chr30_delete 11 hg18.chr1 55893 55903 272 + panTro2.chrUn 9721889 9721899 307 + canFam2.chr30 3361023 3361024 265 - +10 canFam2.chr30_delete 1 hg18.chr1 55924 55924 272 + panTro2.chrUn 9721920 9721920 307 + canFam2.chr30 3361043 3361044 265 - +10 canFam2.chr30_delete 1 hg18.chr1 55939 55939 272 + panTro2.chrUn 9721935 9721935 307 + canFam2.chr30 3361057 3361058 265 - +10 canFam2.chr30_delete 1 hg18.chr1 55959 55959 272 + panTro2.chrUn 9721955 9721955 307 + canFam2.chr30 3361076 3361077 265 - +10 canFam2.chr30_delete 3 hg18.chr1 56001 56003 272 + panTro2.chrUn 9722032 9722034 307 + canFam2.chr30 3361121 3361122 265 - +12 canFam2.chr30_delete 6 hg18.chr1 56611 56616 802 + panTro2.chrUn 9723790 9723795 801 + canFam2.chr30 3359929 3359930 805 - +12 canFam2.chr30_delete 1 hg18.chr1 56696 56696 802 + panTro2.chrUn 9723875 9723875 801 + canFam2.chr30 3360008 3360009 805 - +12 canFam2.chr30_delete 7 hg18.chr1 56911 56917 802 + panTro2.chrUn 9724090 9724096 801 + canFam2.chr30 3360259 3360260 805 
- +12 canFam2.chr30_delete 3 hg18.chr1 56991 56993 802 + panTro2.chrUn 9724170 9724172 801 + canFam2.chr30 3360332 3360333 805 - +12 canFam2.chr30_delete 5 hg18.chr1 57110 57114 802 + panTro2.chrUn 9724289 9724293 801 + canFam2.chr30 3360448 3360449 805 - +12 canFam2.chr30_delete 1 hg18.chr1 57217 57217 802 + panTro2.chrUn 9724395 9724395 801 + canFam2.chr30 3360551 3360552 805 - +12 canFam2.chr30_delete 5 hg18.chr1 57247 57251 802 + panTro2.chrUn 9724425 9724429 801 + canFam2.chr30 3360580 3360581 805 - +12 canFam2.chr30_delete 2 hg18.chr1 57310 57311 802 + panTro2.chrUn 9724488 9724489 801 + canFam2.chr30 3360638 3360639 805 - +12 canFam2.chr30_delete 2 hg18.chr1 57379 57380 802 + panTro2.chrUn 9724557 9724558 801 + canFam2.chr30 3360706 3360707 805 - +12 canFam2.chr30_delete 4 hg18.chr1 57387 57390 802 + panTro2.chrUn 9724565 9724568 801 + canFam2.chr30 3360712 3360713 805 - +13 canFam2.chr30_delete 3 hg18.chr1 58409 58411 176 + panTro2.chrUn 9725587 9725589 176 + canFam2.chr30 3359468 3359469 148 - +13 canFam2.chr30_delete 1 hg18.chr1 58434 58434 176 + panTro2.chrUn 9725612 9725612 176 + canFam2.chr30 3359490 3359491 148 - +13 canFam2.chr30_delete 2 hg18.chr1 58446 58447 176 + panTro2.chrUn 9725624 9725625 176 + canFam2.chr30 3359501 3359502 148 - +13 canFam2.chr30_delete 8 hg18.chr1 58470 58477 176 + panTro2.chrUn 9725648 9725655 176 + canFam2.chr30 3359523 3359524 148 - +13 canFam2.chr30_delete 5 hg18.chr1 58488 58492 176 + panTro2.chrUn 9725666 9725670 176 + canFam2.chr30 3359533 3359534 148 - +13 canFam2.chr30_delete 6 hg18.chr1 58510 58515 176 + panTro2.chrUn 9725688 9725693 176 + canFam2.chr30 3359550 3359551 148 - +13 canFam2.chr30_delete 5 hg18.chr1 58565 58569 176 + panTro2.chrUn 9725743 9725747 176 + canFam2.chr30 3359601 3359602 148 - +14 canFam2.chr30_delete 1 hg18.chr1 58669 58669 1628 + panTro2.chrUn 9725847 9725847 1631 + canFam2.chr30 3356952 3356953 1623 - +14 canFam2.chr30_delete 3 hg18.chr1 58849 58851 1628 + panTro2.chrUn 9726032 9726034 
1631 + canFam2.chr30 3357164 3357165 1623 - +14 canFam2.chr30_delete 10 hg18.chr1 59342 59351 1628 + panTro2.chrUn 9726524 9726533 1631 + canFam2.chr30 3357654 3357655 1623 - +14 canFam2.chr30_delete 1 hg18.chr1 59561 59561 1628 + panTro2.chrUn 9726743 9726743 1631 + canFam2.chr30 3357863 3357864 1623 - +14 canFam2.chr30_delete 11 hg18.chr1 59887 59897 1628 + panTro2.chrUn 9727069 9727079 1631 + canFam2.chr30 3358188 3358189 1623 - +14 canFam2.chr30_delete 5 hg18.chr1 59908 59912 1628 + panTro2.chrUn 9727090 9727094 1631 + canFam2.chr30 3358198 3358199 1623 - +14 canFam2.chr30_delete 6 hg18.chr1 60052 60057 1628 + panTro2.chrUn 9727234 9727239 1631 + canFam2.chr30 3358338 3358339 1623 - +14 canFam2.chr30_delete 4 hg18.chr1 60111 60114 1628 + panTro2.chrUn 9727293 9727296 1631 + canFam2.chr30 3358391 3358392 1623 - +14 canFam2.chr30_delete 2 hg18.chr1 60183 60184 1628 + panTro2.chrUn 9727365 9727366 1631 + canFam2.chr30 3358463 3358464 1623 - +15 canFam2.chr30_delete 4 hg18.chr1 60400 60403 230 + panTro2.chrUn 9727581 9727584 230 + canFam2.chr30 3356719 3356720 218 - +15 canFam2.chr30_delete 8 hg18.chr1 60506 60513 230 + panTro2.chrUn 9727687 9727694 230 + canFam2.chr30 3356827 3356828 218 - +15 canFam2.chr30_delete 10 hg18.chr1 60520 60529 230 + panTro2.chrUn 9727701 9727710 230 + canFam2.chr30 3356833 3356834 218 - diff -r aae4754d6828 -r 675ad84ae008 tools/regVariation/getIndelRates_3way.py --- a/tools/regVariation/getIndelRates_3way.py Mon Sep 22 10:36:34 2008 -0400 +++ b/tools/regVariation/getIndelRates_3way.py Mon Sep 22 12:02:35 2008 -0400 @@ -1,56 +1,56 @@ #!/usr/bin/env python #Guruprasad Ananda -import sys, os, tempfile, string +from galaxy import eggs +import pkg_resources +pkg_resources.require( "bx-python" ) -assert sys.version_info[:2] >= ( 2, 4 ) +import sys, os, tempfile +import traceback +import fileinput +from warnings import warn -fout = open(sys.argv[2],'w') -winsize = int(sys.argv[3]) -species_ind = int(sys.argv[4]) +from 
galaxy.tools.util.galaxyops import * +from bx.intervals.io import * + +from bx.intervals.operations import quicksect def stop_err(msg): sys.stderr.write(msg) sys.exit() + +def counter(node, start, end, sort_col): + global full, blk_len, blk_list + if node.start < start: + if node.right: + counter(node.right, start, end, sort_col) + elif start <= node.start <= end and start <= node.end <= end: + full += 1 + if node.other[0] not in blk_list: + blk_list.append(node.other[0]) + blk_len += int(node.other[sort_col+2]) + if node.left and node.left.maxend > start: + counter(node.left, start, end, sort_col) + if node.right: + counter(node.right, start, end, sort_col) + elif node.start > end: + if node.left: + counter(node.left, start, end, sort_col) + -def rate_estimator(win, blk_lines, wstart, wend, wspecies): - inserts = 0.0 - deletes = 0.0 - ilengths = {} #dict containing lengths of blocks(without gaps) having insertion in wspecies - dlengths = {} #dict containing lengths of blocks(without gaps) having deletion in wspecies - prev_bnum = -1 - for bline in blk_lines: - items = bline.split('\t') - bnum = int(items[0]) - bevent = items[1] - if not(bevent.startswith(wspecies)): - continue - if bevent.endswith('insert'): - inserts += 1 - #Add lengths only if the insert belongs to a new alignment block - if not(ilengths.has_key(bnum)): - ilengths[bnum] = int(items[species_ind].split(':')[1]) - #prev_bnum = bnum - elif bevent.endswith('delete'): - deletes += 1 - #Add lengths only if the delete belongs to a new alignment block - if not(dlengths.has_key(bnum)): - dlengths[bnum] = int(items[species_ind].split(':')[1]) - #prev_bnum = bnum +infile = sys.argv[1] +fout = open(sys.argv[2],'w') +int_file = sys.argv[3] +if int_file != "None": #User has specified an interval file try: - total_ilength = sum(ilengths.values()) - irate = inserts/total_ilength + fint = open(int_file, 'r') + dbkey_i = sys.argv[4] + chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] ) 
except: - irate = 0 - try: - total_dlength = sum(dlengths.values()) - drate = deletes/total_dlength - except: - drate = 0 - print >>fout, "%s\t%s\t%s\t%s\t%.2e\t%.2e" %(win, wspecies, wstart, wend, irate , drate) - + stop_err("Unable to open input Interval file") + def main(): - infile = sys.argv[1] + for i, line in enumerate( file ( infile )): line = line.rstrip('\r\n') if len( line )>0 and not line.startswith( '#' ): @@ -58,71 +58,192 @@ break if i == 30: break # Hopefully we'll never get here... - - if len( elems ) != 15: + + if len( elems ) != 18: stop_err( "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly." ) - wspecies = elems[species_ind].split(':')[0].split('.')[0] - fin = open(infile, 'r') - skipped = 0 - blk=0 - win=0 - linestr="" - sorted_infile = tempfile.NamedTemporaryFile() - cmdline = "sort -n -k"+str(species_ind+2)+" -o "+sorted_infile.name+" "+infile - try: - os.system(cmdline) - except: - stop_err("Encountered error while sorting the input file.") - - print >>fout, "#Window\tSpecies\tWindow_Start\tWindow_End\tInsertion_Rate\tDeletion_Rate" - - for line in sorted_infile.readlines(): - line = line.strip("\r\n") - if not(line) or line == "": - continue + for i, line in enumerate( file ( infile )): + line = line.rstrip('\r\n') elems = line.split('\t') try: assert int(elems[0]) - assert len(elems) == 15 - except Exception, eon: + assert len(elems) == 18 + if int_file != "None": + if dbkey_i not in elems[3] and dbkey_i not in elems[8] and dbkey_i not in elems[13]: + stop_err("The species build corresponding to your interval file is not present in the Indel file.") + if dbkey_i in elems[3]: + sort_col = 4 + elif dbkey_i in elems[8]: + sort_col = 9 + elif dbkey_i in elems[13]: + sort_col = 14 + else: + species = [] + species.append( elems[3].split('.')[0] ) + species.append( elems[8].split('.')[0] ) + species.append( elems[13].split('.')[0] 
) + sort_col = 0 #Based on block numbers + break + except: continue - if not(elems[1].startswith(wspecies)): #Event doesn't belong to the selected species + + fin = open(infile, 'r') + skipped = 0 + + if int_file == "None": + sorted_infile = tempfile.NamedTemporaryFile() + cmdline = "sort -n -k"+str(1)+" -o "+sorted_infile.name+" "+infile + try: + os.system(cmdline) + except: + stop_err("Encountered error while sorting the input file.") + print >>fout, "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate" %(species[0],species[1],species[2],species[0],species[1],species[2]) + prev_bnum = -1 + sorted_infile.seek(0) + for line in sorted_infile.readlines(): + line = line.rstrip('\r\n') + elems = line.split('\t') + try: + assert int(elems[0]) + assert len(elems) == 18 + new_bnum = int(elems[0]) + if new_bnum != prev_bnum: + if prev_bnum != -1: + irate = [] + drate = [] + for i,elem in enumerate(inserts): + try: + irate.append(str("%.2e" %(inserts[i]/blen[i]))) + except: + irate.append('0') + try: + drate.append(str("%.2e" %(deletes[i]/blen[i]))) + except: + drate.append('0') + print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate)) + inserts = [0.0, 0.0, 0.0] + deletes = [0.0, 0.0, 0.0] + blen = [] + blen.append( int(elems[6]) ) + blen.append( int(elems[11]) ) + blen.append( int(elems[16]) ) + line_sp = elems[1].split('.')[0] + sp_ind = species.index(line_sp) + if elems[1].endswith('insert'): + inserts[sp_ind] += 1 + elif elems[1].endswith('delete'): + deletes[sp_ind] += 1 + prev_bnum = new_bnum + except Exception, ei: + #print >>sys.stderr, ei + continue + irate = [] + drate = [] + for i,elem in enumerate(inserts): + try: + irate.append(str("%.2e" %(inserts[i]/blen[i]))) + except: + irate.append('0') + try: + drate.append(str("%.2e" %(deletes[i]/blen[i]))) + except: + drate.append('0') + print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate)) + sys.exit() + + + inf = open(infile, 'r') + start_met = 
False + end_met = False + sp_file = tempfile.NamedTemporaryFile() + for n, line in enumerate(inf): + line = line.rstrip('\r\n') + elems = line.split('\t') + try: + assert int(elems[0]) + assert len(elems) == 18 + if dbkey_i not in elems[1]: + if not(start_met): + continue + else: + sp_end = n + break + else: + print >>sp_file, line + if not(start_met): + start_met = True + sp_start = n + except: continue - - try: - assert wstart - except NameError: - wstart = int(elems[species_ind+1]) - int(elems[species_ind+1])%winsize + 1 - wend = wstart + winsize - lstart = int(elems[species_ind + 1]) - - if lstart in range(wstart,wend+1): - linestr += line.strip() - linestr += "\n" - else: - try: - win += 1 - blk_lines = linestr.strip().split("\n") - rate_estimator(str(win), blk_lines, str(wstart), str(wend), wspecies) - linestr = "" - except: - skipped += 1 - pass - linestr=line.strip()+"\n" - wstart = int(elems[species_ind+1]) - int(elems[species_ind+1])%winsize + 1 - wend = wstart + winsize - if linestr != "": - try: - win += 1 - blk_lines = linestr.strip().split("\n") - rate_estimator(str(win), blk_lines, str(wstart), str(wend), wspecies) - except: - skipped += 1 + + try: + assert sp_end + except: + sp_end = n+1 + + sp_file.seek(0) + win = NiceReaderWrapper( fileinput.FileInput( int_file ), + chrom_col=chr_col_i, + start_col=start_col_i, + end_col=end_col_i, + strand_col=strand_col_i, + fix_strand=True) + + indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ), + chrom_col=1, + start_col=sort_col, + end_col=sort_col+1, + strand_col=-1, + fix_strand=True) + + indelTree = quicksect.IntervalTree() + for item in indel: + if type( item ) is GenomicInterval: + indelTree.insert( item, indel.linenum, item.fields ) + result=[] + + global full, blk_len, blk_list + for interval in win: + if type( interval ) is Header: pass - if skipped: - print "Skipped %s windows as invalid." 
%(skipped) + if type( interval ) is Comment: + pass + elif type( interval ) == GenomicInterval: + chrom = interval.chrom + start = int(interval.start) + end = int(interval.end) + if start > end: + warn( "Interval start after end!" ) + ins_chr = "%s.%s_insert" %(dbkey_i,chrom) + del_chr = "%s.%s_delete" %(dbkey_i,chrom) + irate = 0 + drate = 0 + if ins_chr not in indelTree.chroms and del_chr not in indelTree.chroms: + pass + else: + if ins_chr in indelTree.chroms: + full = 0.0 + blk_len = 0 + blk_list = [] + root = indelTree.chroms[ins_chr] #root node for the chrom insertion tree + counter(root, start, end, sort_col) + if blk_len: + irate = full/blk_len + + if del_chr in indelTree.chroms: + full = 0.0 + blk_len = 0 + blk_list = [] + root = indelTree.chroms[del_chr] #root node for the chrom insertion tree + counter(root, start, end, sort_col) + if blk_len: + drate = full/blk_len + + interval.fields.append(str("%.2e" %irate)) + interval.fields.append(str("%.2e" %drate)) + print >>fout, "\t".join(interval.fields) + fout.flush() + if __name__ == "__main__": - main() - \ No newline at end of file + main() \ No newline at end of file diff -r aae4754d6828 -r 675ad84ae008 tools/regVariation/getIndelRates_3way.xml --- a/tools/regVariation/getIndelRates_3way.xml Mon Sep 22 10:36:34 2008 -0400 +++ b/tools/regVariation/getIndelRates_3way.xml Mon Sep 22 12:02:35 2008 -0400 @@ -1,30 +1,30 @@ -<tool id="getIndelRates_3way" name="Estimate Indel Rates" version="1.0.0"> +<tool id="indelRates_3way" name="Estimate Indel Rates" version="1.0.0"> <description> for 3-way alignments</description> <command interpreter="python"> - getIndelRates_3way.py $input1 $out_file1 $winsize $species + getIndelRates_3way.py $input1 $out_file1 + #if $region.type == "align" + "None" + #else + $region.input2 $input2_dbkey $input2_chromCol,$input2_startCol,$input2_endCol,$input2_strandCol + #end if </command> <inputs> <page> - <param format="tabular" name="input1" type="data" label="Select data"/> - <param 
name="winsize" size="10" type="integer" value="1000" label="Estimate rates in windows of size" /> - <param name="species" type="select" label="and corresponding to co-ordinates of" multiple="false"> - <option value="3">Species 1 (Ingroup 1)</option> - <option value="7">Species 2 (Ingroup 2)</option> - <option value="11">Species 3 (Outgroup)</option> - </param> -  + </page> </inputs> <outputs> @@ -34,8 +34,7 @@ <tests> <test> <param name="input1" value="indels_3way.tabular"/> - <param name="winsize" value="1000"/> - <param name="species" value="11"/> + <param name="type" value="align"/> <output name="out_file1" file="indelrates_3way.tabular"/> </test> </tests> @@ -54,8 +53,8 @@ **Note** -Any block/s not containing exactly 3 species will be omitted. - +This tool only works on the output of the 'Estimate Indel Rates for 3-way alignments' tool. + </help> diff -r aae4754d6828 -r 675ad84ae008 tools/regVariation/getIndels_3way.xml --- a/tools/regVariation/getIndels_3way.xml Mon Sep 22 10:36:34 2008 -0400 +++ b/tools/regVariation/getIndels_3way.xml Mon Sep 22 12:02:35 2008 -0400 @@ -1,4 +1,4 @@ -<tool id="getIndels_3way" name="Fetch Indels" version="1.0.1"> +<tool id="indels_3way" name="Fetch Indels" version="1.0.1"> <description> from 3-way alignments</description> <command interpreter="perl"> parseMAF_smallIndels.pl $input1 $out_file1 $outgroup diff -r aae4754d6828 -r 675ad84ae008 tools/regVariation/parseMAF_smallIndels.pl --- a/tools/regVariation/parseMAF_smallIndels.pl Mon Sep 22 10:36:34 2008 -0400 +++ b/tools/regVariation/parseMAF_smallIndels.pl Mon Sep 22 12:02:35 2008 -0400 @@ -58,7 +58,7 @@ #print OFILE "#align\tingroup1\tingroup1_coord\tingroup1_orient\tingroup2\tingroup2_coord\tingroup2_orient\toutgroup\toutgroup_coord\toutgroup_orient\tindel_type\n"; #print OFILE2 "# small indels summary, parsed from MAF 3-way alignment file, coords are translated from (-) to (+) if necessary\n"; -print OFILE2 
"#block\tindel_type\tindel_length\tingroup1\tingroup1_start\tingroup1_end\tingroup1_orient\tingroup2\tingroup2_start\tingroup2_end\tingroup2_orient\toutgroup\toutgroup_start\toutgroup_end\toutgroup_orient\n"; +print OFILE2 "#block\tindel_type\tindel_length\tingroup1\tingroup1_start\tingroup1_end\tingroup1_alignSize\tingroup1_orient\tingroup2\tingroup2_start\tingroup2_end\tingroup2_alignSize\tingroup2_orient\toutgroup\toutgroup_start\toutgroup_end\toutgroup_alignSize\toutgroup_orient\n"; # main body of program while ($record = get_next_record($fh) ){ @@ -348,7 +348,8 @@ && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/) && (substr($sequence3,$position,1) !~ m/[-*\#$?^@]/)){ $ABC = join("",($ABC,"X")); - $indelType = $seq1."_delete"; + my @s = split(/:/, $seq1); + $indelType = $s[0]."_delete"; #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); @@ -361,7 +362,8 @@ && (substr($sequence2,$position,1) eq "-") && (substr($sequence3,$position,1) !~ m/[-*\$?^]/)){ $ABC = join("",($ABC,"Y")); - $indelType = $seq2."_delete"; + my @s = split(/:/, $seq2); + $indelType = $s[0]."_delete"; #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); push (@indels,$indel_line); @@ -375,7 +377,8 @@ && (substr($sequence2,$position,1) eq "-") && (substr($sequence3,$position,1) eq "-")){ $ABC = join("",($ABC,"Z")); - $indelType = $seq1."_insert"; + my @s = split(/:/, $seq1); + $indelType = $s[0]."_insert"; #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); push 
(@indels,$indel_line); @@ -387,7 +390,8 @@ && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/) && (substr($sequence3,$position,1) eq "-")){ $ABC = join("",($ABC,"W")); - $indelType = $seq2."_insert"; + my @s = split(/:/, $seq2); + $indelType = $s[0]."_insert"; #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); push (@indels,$indel_line); @@ -399,7 +403,8 @@ && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/) && (substr($sequence3,$position,1) eq "-")){ $ABC = join("",($ABC,"S")); - $indelType = $seq3."_delete"; + my @s = split(/:/, $seq3); + $indelType = $s[0]."_delete"; #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); push (@indels,$indel_line); @@ -411,7 +416,8 @@ && (substr($sequence2,$position,1) eq "-") && (substr($sequence3,$position,1) !~ m/[-*\#$?^@]/)){ $ABC = join("",($ABC,"T")); - $indelType = $seq3."_insert"; + my @s = split(/:/, $seq3); + $indelType = $s[0]."_insert"; #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); push (@indels,$indel_line); @@ -622,6 +628,9 @@ my $event_line = $_; my @events = split(/\t/, $event_line); my $event_type = $events[10]; + my @name_align1 = split(/:/, $events[1]); + my @name_align2 = split(/:/, $events[4]); + my @name_align3 = split(/:/, $events[7]); my $seq1_event_start = my $seq1_event_end = my $seq2_event_start = my $seq2_event_end = my $seq3_event_start = my $seq3_event_end = 0; my $final_event_line = ""; # seq1_insert @@ -634,7 +643,7 @@ $seq2_event_end = ($events[5]); 
$seq3_event_start = ($events[8]-1); $seq3_event_end = ($events[8]); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$events[1],$seq1_event_start,$seq1_event_end,$events[3],$events[4],$seq2_event_start,$seq2_event_end,$events[6],$events[7],$seq3_event_start,$seq3_event_end,$events[9])); + $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); } # seq1_delete elsif ($event_type =~ m/$ingroup1/ && $event_type =~ m/delete/){ @@ -646,7 +655,7 @@ $seq2_event_end = ($events[5]+$events[11]-1); $seq3_event_start = ($events[8]); $seq3_event_end = ($events[8]+$events[11]-1); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$events[1],$seq1_event_start,$seq1_event_end,$events[3],$events[4],$seq2_event_start,$seq2_event_end,$events[6],$events[7],$seq3_event_start,$seq3_event_end,$events[9])); + $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); } # seq2_insert elsif ($event_type =~ m/$ingroup2/ && $event_type =~ m/insert/){ @@ -658,7 +667,7 @@ $seq2_event_end = ($events[5]+$events[11]-1); $seq3_event_start = ($events[8]-1); $seq3_event_end = ($events[8]); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$events[1],$seq1_event_start,$seq1_event_end,$events[3],$events[4],$seq2_event_start,$seq2_event_end,$events[6],$events[7],$seq3_event_start,$seq3_event_end,$events[9])); + $final_event_line = 
join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); } # seq2_delete elsif ($event_type =~ m/$ingroup2/ && $event_type =~ m/delete/){ @@ -670,7 +679,7 @@ $seq2_event_end = ($events[5]); $seq3_event_start = ($events[8]); $seq3_event_end = ($events[8]+$events[11]-1); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$events[1],$seq1_event_start,$seq1_event_end,$events[3],$events[4],$seq2_event_start,$seq2_event_end,$events[6],$events[7],$seq3_event_start,$seq3_event_end,$events[9])); + $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); } # start testing w/seq3_insert elsif ($event_type =~ m/$outgroup/ && $event_type =~ m/insert/){ @@ -682,7 +691,7 @@ $seq2_event_end = ($events[5]); $seq3_event_start = ($events[8]); $seq3_event_end = ($events[8]+$events[11]-1); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$events[1],$seq1_event_start,$seq1_event_end,$events[3],$events[4],$seq2_event_start,$seq2_event_end,$events[6],$events[7],$seq3_event_start,$seq3_event_end,$events[9])); + $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); } # seq3_delete elsif ($event_type =~ m/$outgroup/ && $event_type =~ m/delete/){ @@ -694,7 +703,7 @@ $seq2_event_end = ($events[5]+$events[11]-1); $seq3_event_start = ($events[8]-1); 
$seq3_event_end = ($events[8]); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$events[1],$seq1_event_start,$seq1_event_end,$events[3],$events[4],$seq2_event_start,$seq2_event_end,$events[6],$events[7],$seq3_event_start,$seq3_event_end,$events[9])); + $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); }

1 0

[hg] galaxy 1526: Convert metadata unicode strings to ascii ( pr...
by greg＠scofield.bx.psu.edu 24 Sep '08

24 Sep '08

details: http://www.bx.psu.edu/hg/galaxy/rev/38e533287df4 changeset: 1526:38e533287df4 user: Greg Von Kuster <greg(a)bx.psu.edu> date: Wed Sep 24 11:14:44 2008 -0400 description: Convert metadata unicode strings to ascii ( previous code doesn't seem to do anything, am I missing something? ). Add no_value metadata attribute to columns and column_types metadata. Fix for column_maker and filter tools. 4 file(s) affected in this change: lib/galaxy/datatypes/metadata.py lib/galaxy/datatypes/tabular.py tools/stats/column_maker.py tools/stats/filtering.py diffs (79 lines): diff -r 675ad84ae008 -r 38e533287df4 lib/galaxy/datatypes/metadata.py --- a/lib/galaxy/datatypes/metadata.py Mon Sep 22 12:02:35 2008 -0400 +++ b/lib/galaxy/datatypes/metadata.py Wed Sep 24 11:14:44 2008 -0400 @@ -60,10 +60,10 @@ if isinstance( value, ListType ): for i, elem in enumerate( value ): if type ( elem ) == unicode: - value[i] = str( elem ) + value[i] = elem.decode( 'ascii' ) elif isinstance ( value, basestring ): if type( value ) == unicode: - value = str( value ) + value = value.decode( 'ascii' ) self.value = value self.context = context self.display = True @@ -76,7 +76,7 @@ @classmethod def marshal( cls, value ): ''' - This method should/can be overridden to convert the incomming + This method should/can be overridden to convert the incoming value to whatever type it is supposed to be. 
''' return value @@ -273,7 +273,6 @@ class ColumnTypesParameter( MetadataParameter ): def __init__( self, spec, value, context ): MetadataParameter.__init__( self, spec, value, context ) - def __str__(self): return ",".join( map( str, self.value ) ) diff -r 675ad84ae008 -r 38e533287df4 lib/galaxy/datatypes/tabular.py --- a/lib/galaxy/datatypes/tabular.py Mon Sep 22 12:02:35 2008 -0400 +++ b/lib/galaxy/datatypes/tabular.py Wed Sep 24 11:14:44 2008 -0400 @@ -19,8 +19,8 @@ """Tab delimited data""" """Add metadata elements""" - MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=False ) - MetadataElement( name="column_types", default=[], desc="Column types", param=metadata.ColumnTypesParameter, readonly=True, visible=False ) + MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=False, no_value=0 ) + MetadataElement( name="column_types", default=[], desc="Column types", param=metadata.ColumnTypesParameter, readonly=True, visible=False, no_value=[] ) def init_meta( self, dataset, copy_from=None ): data.Text.init_meta( self, dataset, copy_from=copy_from ) diff -r 675ad84ae008 -r 38e533287df4 tools/stats/column_maker.py --- a/tools/stats/column_maker.py Mon Sep 22 12:02:35 2008 -0400 +++ b/tools/stats/column_maker.py Wed Sep 24 11:14:44 2008 -0400 @@ -19,12 +19,10 @@ round = sys.argv[4] try: in_columns = int( sys.argv[5] ) - # in_column_types is passed as a string that looks something like: - # "[u'str', u'int', u'int', u'str', u'int', u'str']" - in_column_types = sys.argv[6].strip( '[' ).strip( ']' ).replace( 'u', '' ).replace( "'", '' ).split( ',' ) + in_column_types = sys.argv[6].split( ',' ) except: stop_err( "Data does not appear to be tabular. This tool can only be used with tab-delimited data." 
) - + # Unescape if input has been escaped mapped_str = { '__lt__': '<', diff -r 675ad84ae008 -r 38e533287df4 tools/stats/filtering.py --- a/tools/stats/filtering.py Mon Sep 22 12:02:35 2008 -0400 +++ b/tools/stats/filtering.py Wed Sep 24 11:14:44 2008 -0400 @@ -26,9 +26,7 @@ cond_text = sys.argv[3] try: in_columns = int( sys.argv[4] ) - # in_column_types is passed as a string that looks something like: - # "[u'str', u'int', u'int', u'str', u'int', u'str']" - in_column_types = sys.argv[5].strip( '[' ).strip( ']' ).replace( 'u', '' ).replace( "'", '' ).split( ',' ) + in_column_types = sys.argv[5].split( ',' ) except: stop_err( "Data does not appear to be tabular. This tool can only be used with tab-delimited data." )

1 0

[svn] [2773] Added locally modified sputnik source to svn
by nate＠bx.psu.edu 24 Sep '08

24 Sep '08

Revision: 2773 Author: nate Date: 2008-09-24 10:26:07 -0400 (Wed, 24 Sep 2008) Log Message: ----------- Added locally modified sputnik source to svn Added Paths: ----------- dependencies/sputnik/ dependencies/sputnik/README dependencies/sputnik/sputnik.c Added: dependencies/sputnik/README =================================================================== --- dependencies/sputnik/README (rev 0) +++ dependencies/sputnik/README 2008-09-24 14:26:07 UTC (rev 2773) @@ -0,0 +1,12 @@ +Sputnik's original source is available from: + +http://espressosoftware.com/pages/sputnik.jsp + +The version available from bx.psu.edu contains modifications to include +mononucleotide microsatellites in the output. + +Build with: + +gcc -g -o bx-sputnik sputnik.c + +And ensure that it can be found in your Galaxy user's path. Added: dependencies/sputnik/sputnik.c =================================================================== --- dependencies/sputnik/sputnik.c (rev 0) +++ dependencies/sputnik/sputnik.c 2008-09-24 14:26:07 UTC (rev 2773) @@ -0,0 +1,599 @@ +/* #define DEBUG_SPUTNIK 1 */ + + +/* + find repeats in fasta format seq file + allows for indels, returns score. + + beta version. caveat emptor. + + chrisa 29-Jul-94 + + chris abajian + University of Washington + Dept. of Molecular Biotechnology FJ-20 + Fluke Hall, Mason Road + Seattle WA 98195 +*/ + +#include <stdio.h> +#include <fcntl.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <sys/types.h> + +/* trivial defs */ +#ifndef True +#define True 1 +#endif +#ifndef False +#define False 0 +#endif + +typedef int Boolean; + +/* size of buffer for reads. 
*/ +#define BUF_SIZE 1024*10 /* 10K */ +/* max size of description line (begins with ">") */ +#define MAX_DESCRIPTION_LEN 1024 +/* max sequence length */ +#define MAX_SEQUENCE_LEN 1024*800 /* 800K */ +/* max number of sequence chars dumped to line */ +#define MAX_OUT_LINE_CHARS 60 + +/* for debugging only */ +#define MAX_ERRCODES 1024 + +/* search params and definitions */ +#define MIN_UNIT_LENGTH 1 /* start search with dinucleotide repeats */ +/* will search for di, tri, tetra ... <n>nucleotide repeats up to + this value for n */ +#define MAX_UNIT_LENGTH 5 /* up to and including pentanucleotides */ +/* this is the point score for each exact match */ +#define EXACT_MATCH_POINTS 1 +/* this is the point score for a mismatch, insertion or deletion */ +#define ERROR_MATCH_POINTS -6 +/* this is the minimum score required to be considered a match */ +#define MATCH_MIN_SCORE 8 +/* this is the low score at which we stop trying */ +#define MATCH_FAIL_SCORE -1 +/* this is the max recursion depth we try to recover errors */ +#define MAX_RECURSION 5 + + +char *repeatName[MAX_UNIT_LENGTH+1] = +{ + "***ERROR***", /* bad programmer! no latte! */ + "mononucleotide", + "dinucleotide", + "trinucleotide", + "tetranucleotide", + "pentanucleotide" +}; + + +char readBuf[BUF_SIZE]; +Boolean endOfFile; +int curBufLen; +int curBufPos; +int fd; +Boolean havePutBack; +char putBack; + +/* struct for indiv sequence in a file */ +typedef struct ss +{ + char descStr[MAX_DESCRIPTION_LEN]; + char seqStr[MAX_SEQUENCE_LEN]; + unsigned int seqLen; +} SeqStruct, *SeqStructPtr; + + +/* + * this structure describes the current state of a comparison. + * it gets passed down to recursive calls of the find repeat + * call so it can know when to bail out of an unsuccessful + * search, or return the size/state of a successful hit, etc. + */ +typedef struct ms +{ + int curPos; /* putative pattern starts here */ + int testPos; /* start testing here */ + int testLen; /* di, tri, tetra, etc. 
*/ + int testCtr; /* # chars in testLen already tested. mod counter */ + int curScore; /* current score */ + int missense; /* keep track of ins, del, err */ + int insertions; + int deletions; + int depth; /* how deep is recursion for this match */ + char errCodes[MAX_ERRCODES]; +} MatchStruct, *MatchStructPtr; +/* a utility macro to copy one testStruct to another */ +#define copyMSPtr(dest,source) memcpy((char *)dest,(char *)source,sizeof(MatchStruct)) +/* a utility macro to increment the modular testCtr */ +#define bumpTestCtr(msp) (msp)->testCtr++; if ((msp)->testCtr==(msp)->testLen) (msp)->testCtr=0; + + +/* + ************************************************************ + * these routines are used to read and parse the fasta format + * sequence file + ************************************************************ + */ + +void fillBuf() +{ + size_t result; + + result = read(fd, (void *)readBuf, BUF_SIZE); + if (result == -1) + { + fprintf(stderr,"error reading file! errno = %d\n",errno); + exit(1); + } + else if (result == 0) + endOfFile = True; + else + { + curBufLen = result; + curBufPos = 0; + } +} /* readBuf */ + + +/* returns True on success */ +Boolean getChar(char *achar) +{ + if (havePutBack) + { + *achar = putBack; + havePutBack = False; + return(True); + } + + if (curBufPos == curBufLen) + fillBuf(); + + if (endOfFile) + return (False); + + *achar = readBuf[curBufPos++]; + return (True); +} + + +void putCharBack(char c) +{ + havePutBack = True; + putBack = c; +} + + +void openFile(char *fn) +{ + /* open the specified file */ + fd = open(fn, O_RDONLY); + if (fd == -1) + { + fprintf(stderr,"unable to open file %s\n", fn); + exit(1); + } +} + +/* should call this once for each file read */ +void initBuffer() +{ + /* initialize length and pointer */ + curBufPos = 0; + curBufLen = 0; + havePutBack = False; + endOfFile = False; +} + +void addCharToLine(char c, char *line, int *lineLen) +{ + if (*lineLen < MAX_DESCRIPTION_LEN) + line[(*lineLen)++] = c; + else + 
fprintf(stderr,"warning: description line truncated\n"); +} + + +/* + ********************************************************************* + * these routines are (more) specific to reading the fasta file format + ********************************************************************* + */ + + +/* + * pick up a non-blank line from the file, presumably description. + * truncates all leading blanks and/or blank lines + */ +Boolean getNonBlankLine(char *line) +{ + Boolean stop, nonBlank; + char c; + int lineLen; + + lineLen = 0; + stop = False; + nonBlank = False; /* will be set by any non whitespace char */ + while ((! endOfFile) && (! stop)) + if (getChar(&c)) + if (c == '\n') + stop = nonBlank; /* stop if have anything. don't save eol char. */ + else + if (nonBlank) + /* add it to line no matter what */ + addCharToLine(c,line,&lineLen); + else if ((c != ' ') && (c != '\t')) + { + /* only non whitespace will start the line */ + nonBlank = True; + addCharToLine(c,line,&lineLen); + } +} + + +/* load the sequence struct with comment line and bases */ +SeqStructPtr getSeq(char *fname) +{ + SeqStructPtr newSeqP; + Boolean endOfSeq; + char c; + + if (endOfFile) return ((SeqStructPtr )0); /* bombproofing */ + + /* malloc a new seq */ + if (! (newSeqP = (SeqStructPtr )malloc(sizeof(SeqStruct)) ) ) + { + fprintf(stderr,"unable to malloc() memory for sequence.\n"); + exit(1); + } + /* clear mem */ + memset( (void *)newSeqP, '\0', sizeof(SeqStruct)); + + /* pick up description line */ + if (! getNonBlankLine(newSeqP->descStr) ) + { + free(newSeqP); + return ((SeqStructPtr )0); + } + + /* did it start correctly ? 
*/ + if (newSeqP->descStr[0] != '>') + { + fprintf(stderr,"format error in input file: missing '>'\n"); + exit(1); + } + + endOfSeq = False; + while ((!endOfFile) && (!endOfSeq)) + { + if (getChar(&c)) + { + if (c == '>') + { + /* hit new sequence */ + endOfSeq = True; + putCharBack(c); + } + else if (((c >= 'A') && (c <= 'Z')) || + ((c >= 'a') && (c <= 'z')) || (c == '-'))/* bogus test, chris */ + /* have nucleotide */ + newSeqP->seqStr[newSeqP->seqLen++] = toupper(c); + else if ((c != '\n') && (c != ' ') && (c != '\t') && (c != '#') && (c != '$') && (c != '*') && (c != '?') && (c != '^')) + { + /* wierd shit in file. bail. */ + fprintf(stderr,">bad char in sequence, %c\n",c); + exit(1); + } + } + } + + if (! newSeqP->seqLen) + { + fprintf(stderr,"? Null sequence encountered in file %s (ignored)\n",fname); + fprintf(stderr," %s\n", newSeqP->descStr); + free(newSeqP); + return ((SeqStructPtr )0); + } + + return(newSeqP); +} /* getSeq */ + + +/* for debugging. dump entire seq to stdout. */ +#ifdef DEBUG_SPUTNIK +void dumpSeq(SeqStructPtr seqP) +{ + int i, charsOnLine; + + fprintf(stdout,"%s\n", seqP->descStr); + fprintf(stdout,"Sequence (length = %d):\n", seqP->seqLen); + i = 0; + charsOnLine = 0; + while (i < seqP->seqLen) + { + if (charsOnLine == MAX_OUT_LINE_CHARS) + { + fprintf(stdout,"\n"); + charsOnLine = 1; + } + else + charsOnLine++; + fprintf(stdout,"%c", seqP->seqStr[i++]); + } + fprintf(stdout,"\n"); +} /* dumpSeq */ +#endif /* DEBUG_SPUTNIK */ + +/* dump the matched seq & stats to stdout */ +void dumpMatch(SeqStructPtr seqP, + MatchStructPtr matchP, + Boolean anyMatchThisSeq) +{ + int i, charsOnLine; + + if (! 
anyMatchThisSeq) + fprintf(stdout,"%s\n", seqP->descStr); + + fprintf(stdout,"%s %d : %d -- length %d score %d\n", + repeatName[matchP->testLen], + matchP->curPos+1, + matchP->testPos, + matchP->testPos - matchP->curPos, + matchP->curScore); + +#ifdef DEBUG_SPUTNIK + fprintf(stdout,"mis = %d, del = %d, ins = %d\n", + matchP->missense, + matchP->deletions, + matchP->insertions); +#endif + + i = matchP->curPos; + charsOnLine = 0; + while (i < matchP->testPos) + { + if (charsOnLine == MAX_OUT_LINE_CHARS) + { + fprintf(stdout,"\n"); + charsOnLine = 1; + } + else + charsOnLine++; + fprintf(stdout,"%c", seqP->seqStr[i++]); + } + fprintf(stdout,"\n"); + +#ifdef DEBUG_SPUTNIK + i = 0; + charsOnLine = 0; + while (i < (matchP->testPos - matchP->curPos)) + { + if (charsOnLine == MAX_OUT_LINE_CHARS) + { + fprintf(stdout,"\n"); + charsOnLine = 1; + } + else + charsOnLine++; + if (matchP->errCodes[i] == '\0') + fprintf(stdout," "); + else + fprintf(stdout,"%c", matchP->errCodes[i]); + i++; + } + fprintf(stdout,"\n"); +#endif +} /* dumpMatch */ + + +Boolean testForNRepeat(SeqStructPtr seqP, + MatchStructPtr matchP) +{ + MatchStruct curMatch, recover, bestSoFar, bestOfABadLot; + + /* save matchP in case we fail altogether. */ + copyMSPtr(&curMatch, matchP); + /* keep track of the best score and return that if over thresh. */ + copyMSPtr(&bestSoFar, matchP); + + while ( (curMatch.testPos < seqP->seqLen) /* anything to test */ + && (curMatch.curScore > MATCH_FAIL_SCORE) ) /* above fail threshold */ + { + /* test a base */ + if (seqP->seqStr[curMatch.curPos+curMatch.testCtr] + == seqP->seqStr[curMatch.testPos]) + { + /* we matched. this is easy. 
*/ + curMatch.curScore += EXACT_MATCH_POINTS; /* score your points */ + curMatch.testPos++; /* advance the downstream test position */ + bumpTestCtr(&curMatch); /* advance pos in the (presumed) repeating seq */ + } + else if ((seqP->seqStr[curMatch.testPos] == 'N') || (seqP->seqStr[curMatch.testPos] == '-')) + { + /* don't call it wrong, but no credit either */ + curMatch.testPos++; /* advance the downstream test position */ + bumpTestCtr(&curMatch); /* advance pos in the (presumed) repeating seq */ + } + else + { + /* no match. take the score penalty, but keep going (maybe). */ + curMatch.curScore += ERROR_MATCH_POINTS; + curMatch.testPos++; /* advance the downstream test position */ + bumpTestCtr(&curMatch); /* advance pos in seq */ + /* is the score too bad to continue, or are we + already too deep? */ + if ( (curMatch.curScore > MATCH_FAIL_SCORE) + && (curMatch.depth < MAX_RECURSION) ) + { + /* try simple missense */ + copyMSPtr(&recover,&curMatch); + if ((recover.testPos - recover.curPos) < MAX_ERRCODES) + recover.errCodes[recover.testPos - recover.curPos -1] = 'M'; + recover.missense++; + recover.depth++; + (void )testForNRepeat(seqP,&recover); + copyMSPtr(&bestOfABadLot,&recover); + + /* try deletion */ + copyMSPtr(&recover,&curMatch); + if ((recover.testPos - recover.curPos) < MAX_ERRCODES) + recover.errCodes[recover.testPos - recover.curPos -1] = 'D'; + recover.testPos--; /* DON'T advance downstream */ + recover.deletions++; + recover.depth++; + (void )testForNRepeat(seqP,&recover); + if (recover.curScore > bestOfABadLot.curScore) + copyMSPtr(&bestOfABadLot,&recover); + + /* try insertion */ + copyMSPtr(&recover,&curMatch); + if ((recover.testPos - recover.curPos) < MAX_ERRCODES) + recover.errCodes[recover.testPos - recover.curPos -1] = 'I'; + /* RETEST for this base in the repeating seq */ + if (recover.testCtr == 0) + recover.testCtr = recover.testLen - 1; + else + recover.testCtr--; + recover.insertions++; + recover.depth++; + (void 
)testForNRepeat(seqP,&recover); + if (recover.curScore > bestOfABadLot.curScore) + copyMSPtr(&bestOfABadLot,&recover); + + /* take the best of a bad lot */ + bestOfABadLot.depth--; /* dec recursion counter */ + copyMSPtr(&curMatch, &bestOfABadLot); + } /* it was worth carrying on */ + } /* no match, found best of bad lot */ + + /* whatever happened, the best we could do is now in matchP */ + if (curMatch.curScore > bestSoFar.curScore) + copyMSPtr(&bestSoFar, &curMatch); + + } /* while loop to test a single base */ + + /* for whatever reason, we've stopped searching for more of this + putative repeat. if there were any matches that passed + the global threshold, return the best of them. note that this + has the effect of NOT advancing the pointer(s) if nothing + rang the bell. remember that we will test the same position + for ntide repeats of several different lengths. */ + if (bestSoFar.curScore > MATCH_MIN_SCORE) + { + copyMSPtr(matchP, &bestSoFar); + return(True); + } + return(False); /* the whole thing was a waste of time */ +} /* testForNRepeat */ + + +/* + * returns True if the sequence we want to look for repeats of is + * + * a) all the same base (i.e. 'AAA' or 'GG'). This filters out + * single nucleotide repeats + * + * b) conains 'N'. we search against these, but don't use them + * as wildcards. + */ +Boolean ignoreSeq(SeqStructPtr seqP, + MatchStructPtr matchP) +{ + int i; + + /* firstly, never search for any pattern that contains N */ + for (i = 0; i < matchP->testLen; i++) + if ((seqP->seqStr[matchP->curPos+i] == 'N') || (seqP->seqStr[matchP->curPos+i] == '-')) + return(True); + + /* now test for mononucleotide repeat. other tests may get + added, in which case this one will beed to be changed. 
*/ + for (i = 1; i < matchP->testLen; i++) + if (seqP->seqStr[matchP->curPos] != seqP->seqStr[matchP->curPos+i]) + return(False); /* they're not all the same */ + return (False); /* they ARE all same:changed by Guru to allow mononucleotide repeats */ +} + + +void findRepeats(SeqStructPtr seqP) +{ + int curPos; + Boolean anyMatchThisSeq, matchAtThisPos; + MatchStruct match; + + memset( (char *)&match, 0, sizeof(MatchStruct) ); /* clear match struct */ + + anyMatchThisSeq = False; /* avoid dumping description more than once. */ + /* loop on all positions in the sequence. note that a match + will advance curPos past all matching chars to the first + unmatched char. */ + while ( match.curPos <= seqP->seqLen) + { + /* now loop on all the different lengths of repeats we're + looking for (i.e. di, tri, tetra nucleotides. if we + find a match at a shorter repeat length, forego testing + for longer lengths. */ + match.testLen = MIN_UNIT_LENGTH; + matchAtThisPos = False; + while ((match.testLen <= MAX_UNIT_LENGTH) && (!matchAtThisPos)) + { + /* initialize the state of the match */ + match.curScore = 0; /* no points yet */ + match.testCtr = 0; /* no chars tested yet */ + match.testPos = match.curPos + match.testLen; + match.insertions = 0; + match.deletions = 0; + match.missense = 0; + /* there are some things we don't want to test for */ + if (! ignoreSeq(seqP,&match)) + matchAtThisPos = testForNRepeat(seqP, &match); + else + matchAtThisPos = False; + if (! matchAtThisPos) match.testLen++; + } + + if (matchAtThisPos) + { + dumpMatch(seqP,&match,anyMatchThisSeq); + anyMatchThisSeq |= matchAtThisPos; + match.curPos = match.testPos; + } + else + match.curPos++; /* no, so advance to next base. */ + } +} + + +main(int argc, char* argv[]) +{ + SeqStructPtr seqP; + int count; + + if (argc != 2) + { + fprintf(stderr,"Usage: %s <fasta format sequence file name>\n", argv[0]); + exit(1); + } + + openFile(argv[1]); + + initBuffer(); + + count = 0; + while (! 
endOfFile) + if (seqP = getSeq(argv[1])) + { +#ifdef DEBUG_SPUTNIK + fprintf(stdout,"processing sequence %d\n", count++); +#endif + /* dumpSeq(seqP); */ + findRepeats(seqP); + free((void *)seqP); + } +}

1 0

[hg] galaxy 1518: Add a wrapper for metadata inside of DatasetFilenameWrapper to allow proper string substitution in commandline and templates.
by greg＠scofield.bx.psu.edu 22 Sep '08

22 Sep '08

details: http://www.bx.psu.edu/hg/galaxy/rev/0f735b21dc12 changeset: 1518:0f735b21dc12 user: Dan Blankenberg <dan(a)bx.psu.edu> date: Thu Sep 18 16:48:29 2008 -0400 description: Add a wrapper for metadata inside of DatasetFilenameWrapper to allow proper string substitution in commandline and templates. 2 file(s) affected in this change: lib/galaxy/datatypes/metadata.py lib/galaxy/tools/__init__.py diffs (56 lines): diff -r 1d326855ba89 -r 0f735b21dc12 lib/galaxy/datatypes/metadata.py --- a/lib/galaxy/datatypes/metadata.py Thu Sep 18 15:41:23 2008 -0400 +++ b/lib/galaxy/datatypes/metadata.py Thu Sep 18 16:48:29 2008 -0400 @@ -211,6 +211,9 @@ elif not isinstance(value, list): MetadataParameter.__setattr__(self, name, [value]) + def __iter__( self ): + return iter( self.value ) + def __str__(self): if self.value in [None, []]: return str(self.spec.no_value) diff -r 1d326855ba89 -r 0f735b21dc12 lib/galaxy/tools/__init__.py --- a/lib/galaxy/tools/__init__.py Thu Sep 18 15:41:23 2008 -0400 +++ b/lib/galaxy/tools/__init__.py Thu Sep 18 16:48:29 2008 -0400 @@ -1177,6 +1177,31 @@ Wraps a dataset so that __str__ returns the filename, but all other attributes are accessible. """ + + class MetadataWrapper: + """ + Wraps a Metadata Collection to return MetadataParameters wrapped according to the metadata spec. + Methods implemented to match behavior of a Metadata Collection. 
+ """ + def __init__( self, metadata ): + self.metadata = metadata + def __getattr__( self, name ): + rval = self.metadata.get( name, None ) + if name in self.metadata.spec: + rval = self.metadata.spec[name].wrap( rval, self.metadata.parent ) + return rval + def __nonzero__( self ): + return self.metadata.__nonzero__() + def __iter__( self ): + return self.metadata.__iter__() + def get( self, key, default=None ): + try: + return getattr( self, key ) + except: + return default + def items( self ): + return iter( [ ( k, self.get( k ) ) for k, v in self.metadata.items() ] ) + def __init__( self, dataset, datatypes_registry = None, tool = None, name = None ): if not dataset: try: @@ -1187,6 +1212,7 @@ self.dataset = NoneDataset( datatypes_registry = datatypes_registry, ext = ext ) else: self.dataset = dataset + self.metadata = self.MetadataWrapper( dataset.metadata ) def __str__( self ): return self.dataset.file_name def __getattr__( self, key ):

1 0

[hg] galaxy 1519: Update GMAJ tool interface.
by greg＠scofield.bx.psu.edu 22 Sep '08

22 Sep '08

details: http://www.bx.psu.edu/hg/galaxy/rev/b2a9827178e2 changeset: 1519:b2a9827178e2 user: Dan Blankenberg <dan(a)bx.psu.edu> date: Fri Sep 19 12:27:20 2008 -0400 description: Update GMAJ tool interface. 1 file(s) affected in this change: tools/visualization/GMAJ.xml diffs (117 lines): diff -r 0f735b21dc12 -r b2a9827178e2 tools/visualization/GMAJ.xml --- a/tools/visualization/GMAJ.xml Thu Sep 18 16:48:29 2008 -0400 +++ b/tools/visualization/GMAJ.xml Fri Sep 19 12:27:20 2008 -0400 @@ -3,7 +3,10 @@ <command interpreter="python">GMAJ.py $out_file1 $maf_input $gmaj_file $filenames_file</command> <inputs> <param name="maf_input" type="data" format="maf" label="Alignment File" optional="False"/> - <param name="refseq" label="Reference Sequence" value="" type="text" help="Leave empty to allow interactive selection."/> + <param name="refseq" label="Reference Sequence" type="select"> + <option value="first" selected="true">First sequence in each block</option> + <option value="any">Any sequence</option> + </param> <repeat name="annotations" title="Annotations"> <conditional name="annotation_style"> <param name="style" type="select" label="Annotation Style" help="If your data is not in a style similar to what is available from Galaxy (and the UCSC table browser), choose 'Basic'."> @@ -11,7 +14,7 @@ <option value="basic">Basic</option> </param> <when value="galaxy"> - <param name="species" type="select" label="Species of Annotation" multiple="False"> + <param name="species" type="select" label="Species" multiple="False"> <options> <filter type="data_meta" ref="maf_input" key="species" /> </options> @@ -21,7 +24,6 @@ <param name="underlays_file" type="data" format="bed,gff" label="Underlays File" optional="True"/> <param name="repeats_file" type="data" format="bed,gff" label="Repeats File" optional="True"/> <param name="links_file" type="data" format="bed,gff" label="Links File" optional="True"/> - <param name="offset" label="Offset" value="0" type="integer"/> </when> <when 
value="basic"> <param name="seq_name" label="Full Sequence Name" value="" type="text"> @@ -44,6 +46,7 @@ <option name="Skipping unsupported paragraph (maf_paragraph)" value="maf_paragraph"/> <option name="Skipping all reconstruction scores: no species specified (recon_noseq)" value="recon_noseq"/> <option name="Skipping reconstruction scores in blocks with missing row (recon_missing)" value="recon_missing"/> + <option name="The first row in some blocks is not the specified reference sequence (refseq_not_first)" value="refseq_not_first"/> <option name="Skipping extra MAF File (unused_maf)" value="unused_maf"/> </option> <option name="Annotation Files" value="annotations"> @@ -71,12 +74,15 @@ </option> <option name="Red Flags" value="red"> <option name="Sequence name in annotation file does not match name in MAF (seqname_mismatch)" value="seqname_mismatch"/> - <option name="BED Start or end < 0 (bed_coord)" value="bed_coord"/> - <option name="GFF Start or end < 1 (gff_coord)" value="gff_coord"/> + <option name="BED start or end < 0 (bed_coord)" value="bed_coord"/> + <option name="GFF start or end < 1 (gff_coord)" value="gff_coord"/> <option name="Missing item name for URL substitution (url_subst)" value="url_subst"/> </option> </option> <option name="Miscellaneous" value="miscellaneous"> + <option name="No refseq specified; assuming 'first' (default_refseq)" value="default_refseq"/> + <option name="One or more bundle entries are not used in parameters file(unused_entry)" value="unused_entry"/> + <option name="Skipping blocks for export where reference sequence is hidden or all gaps (export_skip)" value="export_skip"/> <option name="Possible parse error: token ends with an escaped quote (escaped_quote)" value="escaped_quote"/> <option name="Draggable panel dividers will not be sticky (no_sticky)" value="no_sticky"/> </option> @@ -89,11 +95,7 @@ title = "Galaxy: $maf_input.name" alignfile = input.maf -#if $refseq.value: refseq = $refseq -#else: -refseq = any -#end if 
tabext = .bed .gff .gtf #if $nowarn.value: nowarn = $nowarn @@ -102,36 +104,35 @@ #set $seq_count = 0 #for $annotation_count, $annotation in $enumerate( $annotations ): #if $annotation.annotation_style.style == "galaxy": -#if $maf_input.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.metadata.species_chromosomes and $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]: -#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]] -#set $aliases = [ " %s" % $chrom for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]] +#if $maf_input.dataset.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.dataset.metadata.species_chromosomes and $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]: +#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]] #else: #set $seq_names = [$annotation.annotation_style['species']] -#set $aliases = [""] #end if #else: #set $seq_names = [$annotation.annotation_style['seq_name']] -#set $aliases = [""] #end if -#for $seq_name, $alias in $zip( $seq_names, $aliases ): +#for $seq_name in $seq_names: seq ${seq_count}: seqname = $seq_name #if $annotation.annotation_style['exons_file'].dataset: -exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension}$alias +exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension} #end if #if $annotation.annotation_style['repeats_file'].dataset: -repeats = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}$alias +repeats = 
${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension} #end if #if $annotation.annotation_style['links_file'].dataset: -links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}$alias +links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension} #end if #if $annotation.annotation_style['underlays_file'].dataset: -underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}$alias +underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension} #end if #if $annotation.annotation_style['highlights_file'].dataset: -highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}$alias +highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension} #end if +#if $annotation.annotation_style.style == "basic": offset = $annotation.annotation_style['offset'] +#end if #set $seq_count = $seq_count + 1 #end for

1 0

[hg] galaxy 1520: Fix a bug in shrimp_wrapper and add a tool for splitting paired-end reads.
by greg＠scofield.bx.psu.edu 22 Sep '08

22 Sep '08

details: http://www.bx.psu.edu/hg/galaxy/rev/9ef55e79068b changeset: 1520:9ef55e79068b user: wychung date: Fri Sep 19 12:02:13 2008 -0400 description: Fix a bug in shrimp_wrapper and add a tool for splitting paired-end reads. Update datatype/fastqsolexa so the number of sequences is correct. 7 file(s) affected in this change: lib/galaxy/datatypes/sequence.py test-data/split_paired_reads_test1.fastq test-data/split_paired_reads_test1.out1 tool_conf.xml.sample tools/metag_tools/shrimp_wrapper.py tools/metag_tools/split_paired_reads.py tools/metag_tools/split_paired_reads.xml diffs (216 lines): diff -r 0f735b21dc12 -r 9ef55e79068b lib/galaxy/datatypes/sequence.py --- a/lib/galaxy/datatypes/sequence.py Thu Sep 18 16:48:29 2008 -0400 +++ b/lib/galaxy/datatypes/sequence.py Fri Sep 19 12:02:13 2008 -0400 @@ -98,8 +98,8 @@ dataset.peek = data.get_file_peek( dataset.file_name ) count = size = 0 bases_regexp = re.compile("^[NGTAC]*$") - for line in file( dataset.file_name ): - if line and line[0] == "@": + for i, line in enumerate(file( dataset.file_name )): + if line and line[0] == "@" and i % 4 == 0: count += 1 elif bases_regexp.match(line): line = line.strip() diff -r 0f735b21dc12 -r 9ef55e79068b test-data/split_paired_reads_test1.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/split_paired_reads_test1.fastq Fri Sep 19 12:02:13 2008 -0400 @@ -0,0 +1,21 @@ +@HWI-EAS91_1_30788AAXX:7:21:1542:1758 +GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA ++HWI-EAS91_1_30788AAXX:7:21:1542:1758 +hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR +@HWI-EAS91_1_30788AAXX:7:22:1621:462 +ATAATGGCTATTATTGTGGGGGGGATGATGCTGGAAACTAGCCCCAATATCAATCCTATATCAAATCTCACC ++HWI-EAS91_1_30788AAXX:7:22:1621:462 +hhhhhhhhhhhhQAhh@hhhhNhhhfhMbCIScC?hhJhhhhChhhJhhhRhhKhePhc\KhhV\KhXhJhh +@HWI-EAS91_1_30788AAXX:7:45:408:807 +TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTTATGAGTGCTAGGATCAGGATGGAGAGGATTAGGGCT ++HWI-EAS91_1_30788AAXX:7:45:408:807 
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hhhZh`hhhhhRXhhYh +@HWI-EAS91_1_30788AAXX:7:49:654:1439 +CTAACTCTATTTATTGTATTTCAACTAAAAATCTCATAGGTTTATTGATAGTTGTGTTGTTGGTGTAAATGG ++HWI-EAS91_1_30788AAXX:7:49:654:1439 +hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhdhh_hG\XhU@ +@HWI-EAS91_1_30788AAXX:7:64:947:234 +TATCAAAAAAGAATATAATCTGAATCAACACTACAACCTATTAGTGTGTAGAATAGGAAGTAGAGGCCTGCG ++HWI-EAS91_1_30788AAXX:7:64:947:234 +hhhhhhhhhhhhhhhhhhhhhhhRhhehhahhhhhJhhhhhhhh^hPhWfhhhhThWUhhfhh_hhNIVPUd + diff -r 0f735b21dc12 -r 9ef55e79068b test-data/split_paired_reads_test1.out1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/split_paired_reads_test1.out1 Fri Sep 19 12:02:13 2008 -0400 @@ -0,0 +1,20 @@ +@HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 +GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC ++HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 +hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh +@HWI-EAS91_1_30788AAXX:7:22:1621:462/1 +ATAATGGCTATTATTGTGGGGGGGATGATGCTGGAA ++HWI-EAS91_1_30788AAXX:7:22:1621:462/1 +hhhhhhhhhhhhQAhh@hhhhNhhhfhMbCIScC?h +@HWI-EAS91_1_30788AAXX:7:45:408:807/1 +TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTT ++HWI-EAS91_1_30788AAXX:7:45:408:807/1 +hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh +@HWI-EAS91_1_30788AAXX:7:49:654:1439/1 +CTAACTCTATTTATTGTATTTCAACTAAAAATCTCA ++HWI-EAS91_1_30788AAXX:7:49:654:1439/1 +hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh +@HWI-EAS91_1_30788AAXX:7:64:947:234/1 +TATCAAAAAAGAATATAATCTGAATCAACACTACAA ++HWI-EAS91_1_30788AAXX:7:64:947:234/1 +hhhhhhhhhhhhhhhhhhhhhhhRhhehhahhhhhJ diff -r 0f735b21dc12 -r 9ef55e79068b tool_conf.xml.sample --- a/tool_conf.xml.sample Thu Sep 18 16:48:29 2008 -0400 +++ b/tool_conf.xml.sample Fri Sep 19 12:02:13 2008 -0400 @@ -274,6 +274,7 @@ <tool file="metag_tools/short_reads_figure_high_quality_length.xml" /> <tool file="metag_tools/short_reads_trim_seq.xml" /> <tool file="metag_tools/blat_coverage_report.xml" /> + <tool file="metag_tools/split_paired_reads.xml" /> </section> <section name="Short Read Mapping" 
id="solexa_tools"> <tool file="metag_tools/shrimp_wrapper.xml" /> diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/shrimp_wrapper.py --- a/tools/metag_tools/shrimp_wrapper.py Thu Sep 18 16:48:29 2008 -0400 +++ b/tools/metag_tools/shrimp_wrapper.py Fri Sep 19 12:02:13 2008 -0400 @@ -162,6 +162,7 @@ readname, endindex = line[1:].split('/') else: score = line + if score: # the last one if hits.has_key(readname): if len(hits[readname]) == hit_per_read: @@ -182,8 +183,9 @@ match_count = 0 if hit_per_read == 1: - matches = [ hits[readkey]['1'] ] - match_count = 1 + if len(hits[readkey]['1']) == 1: + matches = [ hits[readkey]['1'] ] + match_count = 1 else: end1_data = hits[readkey]['1'] end2_data = hits[readkey]['2'] @@ -591,6 +593,7 @@ if os.path.exists(query_qual_end2): os.remove(query_qual_end2) if os.path.exists(shrimp_log): os.remove(shrimp_log) + if __name__ == '__main__': __main__() diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/split_paired_reads.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/metag_tools/split_paired_reads.py Fri Sep 19 12:02:13 2008 -0400 @@ -0,0 +1,46 @@ +#! 
/usr/bin/python + +""" +Split Solexa paired end reads +""" + +import os, sys + +if __name__ == '__main__': + + infile = sys.argv[1] + outfile_end1 = open(sys.argv[2], 'w') + outfile_end2 = open(sys.argv[3], 'w') + + for i, line in enumerate(file(infile)): + line = line.rstrip() + if not line or line.startswith('#'): continue + + end1 = '' + end2 = '' + + line_index = i % 4 + + if line_index == 0: + end1 = line + '/1' + end2 = line + '/2' + + elif line_index == 1: + seq_len = len(line)/2 + end1 = line[0:seq_len] + end2 = line[seq_len:] + + elif line_index == 2: + end1 = line + '/1' + end2 = line + '/2' + + else: + qual_len = len(line)/2 + end1 = line[0:qual_len] + end2 = line[qual_len:] + + outfile_end1.write('%s\n' %(end1)) + outfile_end2.write('%s\n' %(end2)) + + outfile_end1.close() + outfile_end2.close() \ No newline at end of file diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/split_paired_reads.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/metag_tools/split_paired_reads.xml Fri Sep 19 12:02:13 2008 -0400 @@ -0,0 +1,56 @@ +<tool id="split_paired_reads" name="Split" version="1.0.0"> + <description>paired-end reads into two ends</description> + <command interpreter="python"> + split_paired_reads.py $input $output1 $output2 + </command> + <inputs> + <param name="input" type="data" format="fastqsolexa" label="Your paired-end file" /> + </inputs> + <outputs> + <data name="output1" format="fastqsolexa"/> + <data name="output2" format="fastqsolexa"/> + </outputs> + <tests> + <test> + <param name="input" value="split_paired_reads_test1.fastq" ftype="fastqsolexa" /> + <output name="output1" file="split_paired_reads_test1.out1" fype="fastqsolexa" /> + </test> + </tests> +<help> + +**What it does** + +This tool splits a single paired-end file in half and returns two files with each ends. 
+ +----- + +**Input formats** + +A multiple-fastq file, for example:: + + @HWI-EAS91_1_30788AAXX:7:21:1542:1758 + GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA + +HWI-EAS91_1_30788AAXX:7:21:1542:1758 + hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR + + +----- + +**Outputs** + +One end:: + + @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 + GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC + +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 + hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh + +The other end:: + + @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 + GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA + +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 + hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR + +</help> +</tool>

1 0

[hg] galaxy 1521: Merge with b2a9827178e28d93e2a978f64033a556a72b4c51
by greg＠scofield.bx.psu.edu 22 Sep '08

22 Sep '08

details: http://www.bx.psu.edu/hg/galaxy/rev/618210a97e62 changeset: 1521:618210a97e62 user: wychung date: Fri Sep 19 12:34:51 2008 -0400 description: Merge with b2a9827178e28d93e2a978f64033a556a72b4c51 0 file(s) affected in this change: diffs (117 lines): diff -r 9ef55e79068b -r 618210a97e62 tools/visualization/GMAJ.xml --- a/tools/visualization/GMAJ.xml Fri Sep 19 12:02:13 2008 -0400 +++ b/tools/visualization/GMAJ.xml Fri Sep 19 12:34:51 2008 -0400 @@ -3,7 +3,10 @@ <command interpreter="python">GMAJ.py $out_file1 $maf_input $gmaj_file $filenames_file</command> <inputs> <param name="maf_input" type="data" format="maf" label="Alignment File" optional="False"/> - <param name="refseq" label="Reference Sequence" value="" type="text" help="Leave empty to allow interactive selection."/> + <param name="refseq" label="Reference Sequence" type="select"> + <option value="first" selected="true">First sequence in each block</option> + <option value="any">Any sequence</option> + </param> <repeat name="annotations" title="Annotations"> <conditional name="annotation_style"> <param name="style" type="select" label="Annotation Style" help="If your data is not in a style similar to what is available from Galaxy (and the UCSC table browser), choose 'Basic'."> @@ -11,7 +14,7 @@ <option value="basic">Basic</option> </param> <when value="galaxy"> - <param name="species" type="select" label="Species of Annotation" multiple="False"> + <param name="species" type="select" label="Species" multiple="False"> <options> <filter type="data_meta" ref="maf_input" key="species" /> </options> @@ -21,7 +24,6 @@ <param name="underlays_file" type="data" format="bed,gff" label="Underlays File" optional="True"/> <param name="repeats_file" type="data" format="bed,gff" label="Repeats File" optional="True"/> <param name="links_file" type="data" format="bed,gff" label="Links File" optional="True"/> - <param name="offset" label="Offset" value="0" type="integer"/> </when> <when value="basic"> <param 
name="seq_name" label="Full Sequence Name" value="" type="text"> @@ -44,6 +46,7 @@ <option name="Skipping unsupported paragraph (maf_paragraph)" value="maf_paragraph"/> <option name="Skipping all reconstruction scores: no species specified (recon_noseq)" value="recon_noseq"/> <option name="Skipping reconstruction scores in blocks with missing row (recon_missing)" value="recon_missing"/> + <option name="The first row in some blocks is not the specified reference sequence (refseq_not_first)" value="refseq_not_first"/> <option name="Skipping extra MAF File (unused_maf)" value="unused_maf"/> </option> <option name="Annotation Files" value="annotations"> @@ -71,12 +74,15 @@ </option> <option name="Red Flags" value="red"> <option name="Sequence name in annotation file does not match name in MAF (seqname_mismatch)" value="seqname_mismatch"/> - <option name="BED Start or end < 0 (bed_coord)" value="bed_coord"/> - <option name="GFF Start or end < 1 (gff_coord)" value="gff_coord"/> + <option name="BED start or end < 0 (bed_coord)" value="bed_coord"/> + <option name="GFF start or end < 1 (gff_coord)" value="gff_coord"/> <option name="Missing item name for URL substitution (url_subst)" value="url_subst"/> </option> </option> <option name="Miscellaneous" value="miscellaneous"> + <option name="No refseq specified; assuming 'first' (default_refseq)" value="default_refseq"/> + <option name="One or more bundle entries are not used in parameters file(unused_entry)" value="unused_entry"/> + <option name="Skipping blocks for export where reference sequence is hidden or all gaps (export_skip)" value="export_skip"/> <option name="Possible parse error: token ends with an escaped quote (escaped_quote)" value="escaped_quote"/> <option name="Draggable panel dividers will not be sticky (no_sticky)" value="no_sticky"/> </option> @@ -89,11 +95,7 @@ title = "Galaxy: $maf_input.name" alignfile = input.maf -#if $refseq.value: refseq = $refseq -#else: -refseq = any -#end if tabext = .bed .gff .gtf 
#if $nowarn.value: nowarn = $nowarn @@ -102,36 +104,35 @@ #set $seq_count = 0 #for $annotation_count, $annotation in $enumerate( $annotations ): #if $annotation.annotation_style.style == "galaxy": -#if $maf_input.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.metadata.species_chromosomes and $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]: -#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]] -#set $aliases = [ " %s" % $chrom for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]] +#if $maf_input.dataset.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.dataset.metadata.species_chromosomes and $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]: +#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]] #else: #set $seq_names = [$annotation.annotation_style['species']] -#set $aliases = [""] #end if #else: #set $seq_names = [$annotation.annotation_style['seq_name']] -#set $aliases = [""] #end if -#for $seq_name, $alias in $zip( $seq_names, $aliases ): +#for $seq_name in $seq_names: seq ${seq_count}: seqname = $seq_name #if $annotation.annotation_style['exons_file'].dataset: -exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension}$alias +exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension} #end if #if $annotation.annotation_style['repeats_file'].dataset: -repeats = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}$alias +repeats = 
${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension} #end if #if $annotation.annotation_style['links_file'].dataset: -links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}$alias +links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension} #end if #if $annotation.annotation_style['underlays_file'].dataset: -underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}$alias +underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension} #end if #if $annotation.annotation_style['highlights_file'].dataset: -highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}$alias +highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension} #end if +#if $annotation.annotation_style.style == "basic": offset = $annotation.annotation_style['offset'] +#end if #set $seq_count = $seq_count + 1 #end for

1 0

[hg] galaxy 1522: Adding a new set of tools to perform multiple...
by greg＠scofield.bx.psu.edu 22 Sep '08

22 Sep '08

details: http://www.bx.psu.edu/hg/galaxy/rev/05974294cbf1 changeset: 1522:05974294cbf1 user: guru date: Sat Sep 20 18:14:24 2008 -0400 description: Adding a new set of toolss to perform multiple linear regression analysis. 9 file(s) affected in this change: test-data/rcve_out.dat test-data/reg_inp.tab tool_conf.xml.sample tools/regVariation/best_regression_subsets.py tools/regVariation/best_regression_subsets.xml tools/regVariation/linear_regression.py tools/regVariation/linear_regression.xml tools/regVariation/rcve.py tools/regVariation/rcve.xml diffs (700 lines): diff -r 618210a97e62 -r 05974294cbf1 test-data/rcve_out.dat --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rcve_out.dat Sat Sep 20 18:14:24 2008 -0400 @@ -0,0 +1,8 @@ +#Model R-sq RCVE_Terms RCVE_Value +2 3 4 0.3997 - - +3 4 0.3319 2 0.1697 +2 4 0.2974 3 0.2561 +2 3 0.3985 4 0.0031 +4 0.1226 2 3 0.6934 +3 0.2733 2 4 0.3164 +2 0.2972 3 4 0.2564 diff -r 618210a97e62 -r 05974294cbf1 test-data/reg_inp.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/reg_inp.tab Sat Sep 20 18:14:24 2008 -0400 @@ -0,0 +1,100 @@ +2.04 2.01 1070 5 +2.56 3.40 1254 6 +3.75 3.68 1466 6 +1.10 1.54 706 4 +3.00 3.32 1160 5 +0.05 0.33 756 3 +1.38 0.36 1058 2 +1.50 1.97 1008 7 +1.38 2.03 1104 4 +4.01 2.05 1200 7 +1.50 2.13 896 7 +1.29 1.34 848 3 +1.90 1.51 958 5 +3.11 3.12 1246 6 +1.92 2.14 1106 4 +0.81 2.60 790 5 +1.01 1.90 954 4 +3.66 3.06 1500 6 +2.00 1.60 1046 5 +2.05 1.96 1054 4 +2.60 1.96 1198 6 +2.55 1.56 940 3 +0.38 1.60 456 6 +2.48 1.92 1150 7 +2.74 3.09 636 6 +1.77 0.78 744 5 +1.61 2.12 644 5 +0.99 1.85 842 3 +1.62 1.78 852 5 +2.03 1.03 1170 3 +3.50 3.44 1034 10 +3.18 2.42 1202 5 +2.39 1.74 1018 5 +1.48 1.89 1180 5 +1.54 1.43 952 3 +1.57 1.64 1038 4 +2.46 2.69 1090 6 +2.42 1.79 694 5 +2.11 2.72 1096 6 +2.04 2.15 1114 5 +1.68 2.22 1256 6 +1.64 1.55 1208 5 +2.41 2.34 820 6 +2.10 2.92 1222 4 +1.40 2.10 1120 5 +2.03 1.64 886 4 +1.99 2.83 1126 7 +2.24 1.76 1158 4 +0.45 1.81 676 6 +2.31 2.68 1214 7 
+2.41 2.55 1136 6 +2.56 2.70 1264 6 +2.50 1.66 1116 3 +2.92 2.23 1292 4 +2.35 2.01 604 5 +2.82 1.24 854 6 +1.80 1.95 814 6 +1.29 1.73 778 3 +1.68 1.08 800 2 +3.44 3.46 1424 7 +1.90 3.01 950 6 +2.06 0.54 1056 3 +3.30 3.20 956 8 +1.80 1.50 1352 5 +2.00 1.71 852 5 +1.68 1.99 1168 5 +1.94 2.76 970 6 +0.97 1.56 776 4 +1.12 1.78 854 6 +1.31 1.32 1232 5 +1.68 0.87 1140 6 +3.09 1.75 1084 4 +1.87 1.41 954 2 +2.00 2.77 1000 4 +2.39 1.78 1084 4 +1.50 1.34 1058 4 +1.82 1.52 816 5 +1.80 2.97 1146 7 +2.01 1.75 1000 6 +1.88 1.64 856 4 +1.64 1.80 798 4 +2.42 3.37 1324 6 +0.22 1.15 704 6 +2.31 1.72 1222 5 +0.95 2.27 948 6 +1.99 2.85 1182 8 +1.86 2.21 1000 6 +1.79 1.94 910 6 +3.02 4.25 1374 9 +1.85 1.83 1014 6 +1.98 2.75 1420 7 +2.15 1.71 400 6 +1.46 2.20 998 7 +2.29 2.13 776 6 +2.39 2.38 1134 7 +1.80 1.64 772 4 +2.64 1.87 1304 6 +2.08 2.53 1212 4 +0.70 1.78 818 6 +0.89 1.20 864 2 \ No newline at end of file diff -r 618210a97e62 -r 05974294cbf1 tool_conf.xml.sample --- a/tool_conf.xml.sample Fri Sep 19 12:34:51 2008 -0400 +++ b/tool_conf.xml.sample Sat Sep 20 18:14:24 2008 -0400 @@ -128,6 +128,11 @@ <tool file="regVariation/getIndels_2way.xml" /> <tool file="regVariation/getIndels_3way.xml" /> <tool file="regVariation/getIndelRates_3way.xml" /> + </section> + <section name="Multiple regression" id="multReg"> + <tool file="regVariation/linear_regression.xml" /> + <tool file="regVariation/best_regression_subsets.xml" /> + <tool file="regVariation/rcve.xml" /> </section> <section name="Evolution: HyPhy" id="hyphy"> <tool file="hyphy/hyphy_branch_lengths_wrapper.xml" /> diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/best_regression_subsets.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/regVariation/best_regression_subsets.py Sat Sep 20 18:14:24 2008 -0400 @@ -0,0 +1,90 @@ +#!/usr/bin/env python + +from galaxy import eggs + +import sys, string +from rpy import * +import numpy + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + +infile = sys.argv[1] +y_col = 
int(sys.argv[2])-1 +x_cols = sys.argv[3].split(',') +outfile = sys.argv[4] +outfile2 = sys.argv[5] +print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1) +fout = open(outfile,'w') + +for i, line in enumerate( file ( infile )): + line = line.rstrip('\r\n') + if len( line )>0 and not line.startswith( '#' ): + elems = line.split( '\t' ) + break + if i == 30: + break # Hopefully we'll never get here... + +if len( elems )<1: + stop_err( "The data in your input dataset is either missing or not formatted properly." ) + +y_vals = [] +x_vals = [] + +for k,col in enumerate(x_cols): + x_cols[k] = int(col)-1 + x_vals.append([]) + +NA = 'NA' +for ind,line in enumerate( file( infile )): + if line and not line.startswith( '#' ): + try: + fields = line.split("\t") + try: + yval = float(fields[y_col]) + except Exception, ey: + yval = r('NA') + y_vals.append(yval) + for k,col in enumerate(x_cols): + try: + xval = float(fields[col]) + except Exception, ex: + xval = r('NA') + x_vals[k].append(xval) + except: + pass + +response_term = "" + +x_vals1 = numpy.asarray(x_vals).transpose() + +dat= r.list(x=array(x_vals1), y=y_vals) + +r.library("leaps") + +set_default_mode(NO_CONVERSION) +try: + leaps = r.regsubsets(r("y ~ x"), data= r.na_exclude(dat)) +except RException, rex: + stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.") +set_default_mode(BASIC_CONVERSION) + +summary = r.summary(leaps) +tot = len(x_vals) +pattern = "[" +for i in range(tot): + pattern = pattern + 'c' + str(int(x_cols[int(i)]) + 1) + ' ' +pattern = pattern.strip() + ']' +print >>fout, "#Vars\t%s\tR-sq\tAdj. 
R-sq\tC-p\tbic" %(pattern) +for ind,item in enumerate(summary['outmat']): + print >>fout, "%s\t%s\t%s\t%s\t%s\t%s" %(str(item).count('*'), item, summary['rsq'][ind], summary['adjr2'][ind], summary['cp'][ind], summary['bic'][ind]) + + +r.pdf( outfile2, 8, 8 ) +r.plot(leaps, scale="Cp", main="Best subsets using Cp Criterion") +r.plot(leaps, scale="r2", main="Best subsets using R-sq Criterion") +r.plot(leaps, scale="adjr2", main="Best subsets using Adjusted R-sq Criterion") +r.plot(leaps, scale="bic", main="Best subsets using bic Criterion") + +r.dev_off() diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/best_regression_subsets.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/regVariation/best_regression_subsets.xml Sat Sep 20 18:14:24 2008 -0400 @@ -0,0 +1,64 @@ +<tool id="BestSubsetsRegression1" name="Perform Best-subsets Regression"> + <description> </description> + <command interpreter="python"> + best_regression_subsets.py + $input1 + $response_col + $predictor_cols + $out_file1 + $out_file2 + 1>/dev/null + 2>/dev/null + </command> + <inputs> + <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/> + <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" /> + <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" /> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input1" /> + <data format="pdf" name="out_file2" /> + </outputs> + <requirements> + <requirement type="python-module">rpy</requirement> + </requirements> + <tests> +  + </tests> + <help> + +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Edit Queries->Convert characters* + +----- + +.. class:: infomark + +**What it does** + +This tool uses the 'regsubsets' function from R statistical package for regression subset selection. 
It outputs two files, one containing a table with the best subsets and the corresponding summary statistics, and the other containing the graphical representation of the results. + +----- + +.. class:: warningmark + +**Note** + +- This tool currently treats all predictor and response variables as continuous variables. + +- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis. + +- The 6 columns in the output are described below: + + - Column 1 (Vars): denotes the number of variables in the model + - Column 2 ([c2 c3 c4...]): represents a list of the user-selected predictor variables (full model). An asterix denotes the presence of the corresponding predictor variable in the selected model. + - Column 3 (R-sq): the fraction of variance explained by the model + - Column 4 (Adj. R-sq): the above R-squared statistic adjusted, penalizing for higher number of predictors (p) + - Column 5 (Cp): Mallow's Cp statistics + - Column 6 (bic): Bayesian Information Criterion. + + + </help> +</tool> diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/linear_regression.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/regVariation/linear_regression.py Sat Sep 20 18:14:24 2008 -0400 @@ -0,0 +1,117 @@ +#!/usr/bin/env python + +from galaxy import eggs +import sys, string +from rpy import * +import numpy + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + +infile = sys.argv[1] +y_col = int(sys.argv[2])-1 +x_cols = sys.argv[3].split(',') +outfile = sys.argv[4] +outfile2 = sys.argv[5] + +print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1) +fout = open(outfile,'w') + +for i, line in enumerate( file ( infile )): + line = line.rstrip('\r\n') + if len( line )>0 and not line.startswith( '#' ): + elems = line.split( '\t' ) + break + if i == 30: + break # Hopefully we'll never get here... + +if len( elems )<1: + stop_err( "The data in your input dataset is either missing or not formatted properly." 
) + +y_vals = [] +x_vals = [] + +for k,col in enumerate(x_cols): + x_cols[k] = int(col)-1 + x_vals.append([]) + +NA = 'NA' +for ind,line in enumerate( file( infile )): + if line and not line.startswith( '#' ): + try: + fields = line.split("\t") + try: + yval = float(fields[y_col]) + except: + yval = r('NA') + y_vals.append(yval) + for k,col in enumerate(x_cols): + try: + xval = float(fields[col]) + except: + xval = r('NA') + x_vals[k].append(xval) + except: + pass + +x_vals1 = numpy.asarray(x_vals).transpose() + +dat= r.list(x=array(x_vals1), y=y_vals) + +set_default_mode(NO_CONVERSION) +try: + linear_model = r.lm(r("y ~ x"), data = r.na_exclude(dat)) +except RException, rex: + stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.") +set_default_mode(BASIC_CONVERSION) + +coeffs=linear_model.as_py()['coefficients'] +yintercept= coeffs['(Intercept)'] +print >>fout, "Y-intercept\t%s" %(yintercept) +summary = r.summary(linear_model) + +co = summary.get('coefficients', 'NA') +""" +if len(co) != len(x_vals)+1: + stop_err("Stopped performing linear regression on the input data, since one of the predictor columns contains only non-numeric or invalid values.") +""" +print >>fout, "p-value (Y-intercept)\t%s" %(co[0][3]) + +if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable + try: + slope = coeffs['x'] + except: + slope = 'NA' + try: + pval = co[1][3] + except: + pval = 'NA' + print >>fout, "Slope (c%d)\t%s" %(x_cols[0]+1,slope) + print >>fout, "p-value (c%d)\t%s" %(x_cols[0]+1,pval) +else: #Multiple regression case with >1 predictors + ind=1 + while ind < len(coeffs.keys()): + print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,coeffs['x'+str(ind)]) + try: + pval = co[ind][3] + except: + pval = 'NA' + print >>fout, "p-value (c%d)\t%s" %(x_cols[ind-1]+1,pval) + ind+=1 + +print >>fout, "R-squared\t%s" %(summary.get('r.squared','NA')) +print 
>>fout, "Adjusted R-squared\t%s" %(summary.get('adj.r.squared','NA')) +print >>fout, "F-statistic\t%s" %(summary.get('fstatistic','NA')) +print >>fout, "Sigma\t%s" %(summary.get('sigma','NA')) + +r.pdf( outfile2, 8, 8 ) +if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable + sub_title = "Slope = %s; Y-int = %s" %(slope,yintercept) + r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression") + r.abline(a=yintercept, b=slope, col="red") +else: + r.pairs(dat, main="Scatterplot Matrix", col="blue") + +r.plot(linear_model) +r.dev_off() diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/linear_regression.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/regVariation/linear_regression.xml Sat Sep 20 18:14:24 2008 -0400 @@ -0,0 +1,62 @@ +<tool id="LinearRegression1" name="Perform Linear Regression"> + <description> </description> + <command interpreter="python"> + linear_regression.py + $input1 + $response_col + $predictor_cols + $out_file1 + $out_file2 + 1>/dev/null + </command> + <inputs> + <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/> + <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" /> + <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" /> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input1" /> + <data format="pdf" name="out_file2" /> + </outputs> + <requirements> + <requirement type="python-module">rpy</requirement> + </requirements> + <tests> +  + </tests> + <help> + + +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Edit Queries->Convert characters* + +----- + +.. class:: infomark + +**What it does** + +This tool uses the 'lm' function from R statistical package to perform linear regression on the input data. 
It outputs two files, one containing the summary statistics of the performed regression, and the other containing diagnostic plots to check whether model assumptions are satisfied. + +----- + +.. class:: warningmark + +**Note** + +- This tool currently treats all predictor and response variables as continuous variables. + +- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis. + +- The summary statistics in the output are described below: + + - sigma: the square root of the estimated variance of the random error (standard error of the residiuals) + - R-squared: the fraction of variance explained by the model + - Adjusted R-squared: the above R-squared statistic adjusted, penalizing for the number of the predictors (p) + - p-value: p-value for the t-test of the null hypothesis that the corresponding slope is equal to zero against the two-sided alternative. + + + </help> +</tool> diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/rcve.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/regVariation/rcve.py Sat Sep 20 18:14:24 2008 -0400 @@ -0,0 +1,143 @@ +#!/usr/bin/env python + +from galaxy import eggs + +import sys, string +from rpy import * +import numpy + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + +def sscombs(s): + if len(s) == 1: + return [s] + else: + ssc = sscombs(s[1:]) + return [s[0]] + [s[0]+comb for comb in ssc] + ssc + + +infile = sys.argv[1] +y_col = int(sys.argv[2])-1 +x_cols = sys.argv[3].split(',') +outfile = sys.argv[4] + +print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1) +fout = open(outfile,'w') + +for i, line in enumerate( file ( infile )): + line = line.rstrip('\r\n') + if len( line )>0 and not line.startswith( '#' ): + elems = line.split( '\t' ) + break + if i == 30: + break # Hopefully we'll never get here... + +if len( elems )<1: + stop_err( "The data in your input dataset is either missing or not formatted properly." 
) + +y_vals = [] +x_vals = [] + +for k,col in enumerate(x_cols): + x_cols[k] = int(col)-1 + x_vals.append([]) + """ + try: + float( elems[x_cols[k]] ) + except: + try: + msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." %( col, elems[x_cols[k]] ) + except: + msg = "This operation cannot be performed on non-numeric data." + stop_err( msg ) + """ +NA = 'NA' +for ind,line in enumerate( file( infile )): + if line and not line.startswith( '#' ): + try: + fields = line.split("\t") + try: + yval = float(fields[y_col]) + except Exception, ey: + yval = r('NA') + #print >>sys.stderr, "ey = %s" %ey + y_vals.append(yval) + for k,col in enumerate(x_cols): + try: + xval = float(fields[col]) + except Exception, ex: + xval = r('NA') + #print >>sys.stderr, "ex = %s" %ex + x_vals[k].append(xval) + except: + pass + +x_vals1 = numpy.asarray(x_vals).transpose() +dat= r.list(x=array(x_vals1), y=y_vals) + +set_default_mode(NO_CONVERSION) +try: + full = r.lm(r("y ~ x"), data= r.na_exclude(dat)) #full model includes all the predictor variables specified by the user +except RException, rex: + stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.") +set_default_mode(BASIC_CONVERSION) + +summary = r.summary(full) +fullr2 = summary.get('r.squared','NA') + +if fullr2 == 'NA': + stop_error("Error in linear regression") + +if len(x_vals) < 10: + s = "" + for ch in range(len(x_vals)): + s += str(ch) +else: + stop_err("This tool only works with less than 10 predictors.") + +print >>fout, "#Model\tR-sq\tRCVE_Terms\tRCVE_Value" +all_combos = sorted(sscombs(s), key=len) +all_combos.reverse() +for j,cols in enumerate(all_combos): + #if len(cols) == len(s): #Same as the full model above + # continue + if len(cols) == 1: + x_vals1 = x_vals[int(cols)] + else: + x_v = [] + for col in cols: + x_v.append(x_vals[int(col)]) + x_vals1 = numpy.asarray(x_v).transpose() + dat= 
r.list(x=array(x_vals1), y=y_vals) + set_default_mode(NO_CONVERSION) + red = r.lm(r("y ~ x"), data= dat) #Reduced model + set_default_mode(BASIC_CONVERSION) + summary = r.summary(red) + redr2 = summary.get('r.squared','NA') + try: + rcve = (float(fullr2)-float(redr2))/float(fullr2) + except: + rcve = 'NA' + col_str = "" + for col in cols: + col_str = col_str + str(int(x_cols[int(col)]) + 1) + " " + col_str.strip() + rcve_col_str = "" + for col in s: + if col not in cols: + rcve_col_str = rcve_col_str + str(int(x_cols[int(col)]) + 1) + " " + rcve_col_str.strip() + if len(cols) == len(s): #full model + rcve_col_str = "-" + rcve = "-" + try: + redr2 = "%.4f" %(float(redr2)) + except: + pass + try: + rcve = "%.4f" %(float(rcve)) + except: + pass + print >>fout, "%s\t%s\t%s\t%s" %(col_str,redr2,rcve_col_str,rcve) diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/rcve.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/regVariation/rcve.xml Sat Sep 20 18:14:24 2008 -0400 @@ -0,0 +1,68 @@ +<tool id="rcve1" name="Compute RCVE" version="1.0.0"> + <description> </description> + <command interpreter="python"> + rcve.py + $input1 + $response_col + $predictor_cols + $out_file1 + 1>/dev/null + </command> + <inputs> + <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/> + <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" /> + <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" /> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input1" /> + </outputs> + <requirements> + <requirement type="python-module">rpy</requirement> + </requirements> + <tests> +  + <test> + <param name="input1" value="reg_inp.tab"/> + <param name="response_col" value="1"/> + <param name="predictor_cols" value="2,3,4"/> + <output name="out_file1" file="rcve_out.dat"/> + </test> + + </tests> + <help> + +.. 
class:: infomark + +**TIP:** If your data is not TAB delimited, use *Edit Queries->Convert characters* + +----- + +.. class:: infomark + +**What it does** + +This tool computes the RCVE (Relative Contribution to Variance) for all possible variable subsets using the following formula: + +**RCVE(i) = [R-sq (full: 1,2,..,i..,p-1) - R-sq(without i: 1,2,...,p-1)] / R-sq (full: 1,2,..,i..,p-1)**, +which denotes the case where the 'i'th predictor is dropped. + + +In general, +**RCVE(X+) = [R-sq (full: {X,X+}) - R-sq(reduced: {X})] / R-sq (full: {X,X+})**, +where, + +- {X,X+} denotes the set of all predictors, +- X+ is the set of predictors for which we compute RCVE (and therefore drop from the full model to obtain a reduced one), +- {X} is the set of the predictors that are left in the reduced model after excluding {X+} + + +The 4 columns in the output are described below: + +- Column 1 (Model): denotes the variables present in the model ({X}) +- Column 2 (R-sq): denotes the R-squared value corresponding to the model in Column 1 +- Column 3 (RCVE_Terms): denotes the variable/s for which RCVE is computed ({X+}). These are the variables that are absent in the reduced model in Column 1. A '-' in this column indicates that the model in Column 1 is the Full model. +- Column 4 (RCVE): denotes the RCVE value corresponding to the variable/s in Column 3. A '-' in this column indicates that the model in Column 1 is the Full model. + + + </help> +</tool>

1 0