[hg] galaxy 3568: Missed the actual tool files for new join in l...

16 Apr 2010

details:   http://www.bx.psu.edu/hg/galaxy/rev/5bbd895570d4
changeset: 3568:5bbd895570d4
user:      Kelly Vincent <kpvincent@bx.psu.edu>
date:      Fri Mar 26 12:37:20 2010 -0400
description:
Missed the actual tool files for new join in last commit...

diffstat:

 tools/new_operations/column_join.py  |  175 +++++++++++++++++++++++++++++++++++
 tools/new_operations/column_join.xml |  150 ++++++++++++++++++++++++++++++
 2 files changed, 325 insertions(+), 0 deletions(-)

diffs (333 lines):

diff -r a9e25e05ae8d -r 5bbd895570d4 tools/new_operations/column_join.py

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/column_join.py	Fri Mar 26 12:37:20 2010 -0400
@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+
+"""
+This tool takes two or more tab-delimited text files as input and joins them into one file, matching lines on their left-most 'hinge' columns and writing the selected data columns of each input side by side. Lines that lack a numeric second column are left out of the join.
+
+usage: %prog output input1 input2 hinge column1[,column2[,column3[,...]]] [other_input1 [other_input2 [other_input3 ...]]]
+    output: the output pileup
+    input1: the pileup file to start with
+    input2: the second pileup file to join
+    hinge: the number of left-most columns to be used for matching
+    columns: the columns that should appear in the output
+    other_inputs: the other input files to join
+
+"""
+
+import os, re, sys, tempfile
+
+def stop_err( msg ):  # fatal-error helper: print msg on stderr and end the process
+    sys.stderr.write( msg )
+    sys.exit( 1 )  # non-zero status; a bare sys.exit() would report success (0) to the caller
+
+def ref_compare( ref1, ref2 ):
+    """
+    Compares items like 'chr10' and 'chrM' or 'scaffold2' and 'scaffold10' so that the leading
+    non-digit part is handled as text and the digits right after it as a number; returns 1, 0 or -1
+    """
+    pat = re.compile( '(?P<text>\D*)(?P<number>\d+)?' )  # anything after the first digit run is ignored
+    r1 = pat.match( ref1 )
+    r2 = pat.match( ref2 )
+    if not r2:  # defensive: both parts of the pattern are optional, so a failed match is not expected
+        return 1
+    elif not r1:
+        return -1
+    text1, num1 = r1.groupdict()[ 'text' ].strip(), r1.groupdict()[ 'number' ]
+    text2, num2 = r2.groupdict()[ 'text' ].strip(), r2.groupdict()[ 'number' ]
+    if text2 == '' and ( num2 == '' or num2 is None ):  # ref2 has no text and no number
+        return 1  # checked first, so this is also the result when ref1 is empty as well
+    elif text1 == '' and ( num1 == '' or num1 is None ):  # ref1 has no text and no number
+        return -1
+    if text1 > text2:  # text parts are compared as plain strings
+        return 1
+    elif text1 == text2:
+        if not ( num1 is None or num2 is None ):  # convert only when both have a number
+            num1 = int( num1 )
+            num2 = int( num2 )
+        if num1 > num2:  # NOTE(review): with a missing number this compares None with a digit string,
+            return 1  # which relies on Python 2 mixed-type ordering - confirm before porting
+        elif num1 == num2:
+            return 0
+        elif num1 < num2:
+            return -1
+    elif text1 < text2:
+        return -1
+
+def ref_pos_sort( infile, outfile ):
+    """Given input file name, sorts logically (reference via ref_compare, then numeric position)
+    into provided output file name. Lines without two tab-separated columns or without a numeric
+    position are left out; for a repeated reference/position pair only the last line is kept."""
+    ref_locs = {}  # reference -> { position: byte offset of the line in infile }
+    bad_lines = []
+    fin = open( infile, 'rb' )
+    line_start = fin.tell()  # offset taken before each read, so it is exact for any line ending
+    line = fin.readline()
+    while line:  # readline() returns '' only at end of file, so blank lines do not stop the scan
+        try:
+            ref_seq, ref_loc = line.split( '\t' )[:2]
+            pos = long( ref_loc )
+            ref_locs.setdefault( ref_seq, {} )[ pos ] = line_start
+        except ValueError:  # fewer than two columns (unpacking) or non-numeric position
+            bad_lines.append( line )
+        line_start = fin.tell()
+        line = fin.readline()
+    fin.close()
+    refs = ref_locs.keys()
+    refs.sort( ref_compare )
+    fin = open( infile, 'rb' )
+    fout = open( outfile, 'wb' )
+    for ref in refs:
+        locs = ref_locs[ ref ].keys()
+        locs.sort()
+        for loc in locs:
+            fin.seek( ref_locs[ ref ][ loc ] )
+            # a final input line without a newline must not run into the line written after it
+            fout.write( fin.readline().rstrip( '\n' ) + '\n' )
+    fout.close()
+    fin.close()
+
+def min_chr_pos( chr_pos ):
+    """Given a list of 'ref<TAB>pos' hinge strings (one per input file, blank for a file with no
+    current line), identifies the 'smallest' one, from left to right: references are ordered with
+    ref_compare and positions numerically. Returns ( 'ref<TAB>pos', index in chr_pos ); when every
+    entry is blank, returns ( '', len( chr_pos ) )."""
+    min_loc = len( chr_pos )
+    min_ref_pos = []
+    for loc, c_pos in enumerate( chr_pos ):
+        if not c_pos.strip():
+            continue
+        ref, pos = c_pos.split( '\t' )[:2]
+        pos = int( pos )
+        if min_ref_pos:
+            ref_comp = ref_compare( ref, min_ref_pos[0] )
+        # strict comparisons keep the left-most entry when two files hold the same hinge
+        if not min_ref_pos or ref_comp < 0 or ( ref_comp == 0 and pos < min_ref_pos[1] ):
+            # take ref and pos from the same entry so the returned hinge matches entry min_loc
+            min_ref_pos = [ ref, pos ]
+            min_loc = loc
+    if not min_ref_pos:
+        # no non-blank entry: there is nothing to format, so return an empty hinge and the
+        # out-of-range index len( chr_pos ) rather than raising TypeError in the line below
+        return '', min_loc
+    return '%s\t%s' % tuple( min_ref_pos ), min_loc
+
+def __main__():
+    """Join the input files side by side on their first `hinge` columns and write the result to output.
+    Command line: output input1 input2 hinge column1[,column2[,...]] [other_input1 [other_input2 ...]]
+    Every input is first sorted into a temporary file, then the sorted files are merged row by row."""
+    output, input1, input2 = sys.argv[1:4]
+    hinge = int( sys.argv[4] )
+    cols = [ int( c ) for c in sys.argv[5].split( ',' ) ]
+    inputs = sys.argv[6:]
+    if len( cols ) <= 2:  # explicit check: an assert would be skipped under python -O
+        sys.exit( 'You need to select at least one column in addition to the first two' )
+    # data (non-hinge) columns written for each file; their count is also the padding width
+    data_cols = [ col for col in cols if col > hinge ]
+    # make sure all files are sorted in same way, ascending
+    tmp_input_files = []
+    for in_file in [ input1, input2 ] + inputs:
+        # mkstemp creates the file itself, so the name cannot be taken by another process first
+        tmp_fd, tmp_file_name = tempfile.mkstemp()
+        os.close( tmp_fd )
+        ref_pos_sort( in_file, tmp_file_name )
+        tmp_input_files.append( open( tmp_file_name, 'rb' ) )
+    # cycle through files, getting smallest line of all files one at a time
+    # also have to keep track of vertical position of extra columns
+    fout = open( output, 'w' )
+    old_current_chr_pos = ''
+    first_line = True
+    current_lines = [ f.readline() for f in tmp_input_files ]
+    last_lines = ''.join( current_lines ).strip()
+    last_loc = -1
+    while last_lines:
+        # get the "minimum" hinge, which should come first, and the file location in list
+        current_chr_pos, loc = min_chr_pos( [ '\t'.join( line.split( '\t' )[ :hinge ] ) for line in current_lines ] )
+        # first output empty columns for vertical alignment (account for "missing" files)
+        if current_chr_pos != old_current_chr_pos:
+            last_loc = -1
+        if loc - last_loc > 1:
+            current_data = [ '' ] * ( ( loc - last_loc - 1 ) * len( data_cols ) )
+        else:
+            current_data = []
+        # now output actual data
+        split_line = current_lines[ loc ].strip().split( '\t' )
+        if ''.join( split_line ):
+            current_data.extend( [ split_line[ col - 1 ] for col in data_cols ] )
+            current_lines[ loc ] = tmp_input_files[ loc ].readline()
+            if current_chr_pos == old_current_chr_pos:
+                fout.write( '\t%s' % '\t'.join( current_data ) )
+            else:
+                if not first_line:
+                    fout.write( '\n' )
+                fout.write( '%s\t%s' % ( current_chr_pos, '\t'.join( current_data ) ) )
+                first_line = False
+        old_current_chr_pos = current_chr_pos
+        if last_lines == ''.join( current_lines ).strip():
+            break
+        last_lines = ''.join( current_lines ).strip()
+        last_loc = loc
+    fout.write( '\n' )
+    fout.close()
+    for f in tmp_input_files:
+        f.close()
+        os.unlink( f.name )
+#    sys.stderr.write('******************\n'+file(fout.name, 'r').read()+'\n******************\n')
+
+if __name__ == "__main__" : __main__()
diff -r a9e25e05ae8d -r 5bbd895570d4 tools/new_operations/column_join.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/column_join.xml	Fri Mar 26 12:37:20 2010 -0400
@@ -0,0 +1,150 @@
+<tool id="column_join" name="Column Join" version="1.0.0">
+  <description></description>
+  <command interpreter="python">
+    column_join.py
+        $output
+        $input1
+        $input2
+        $hinge
+        $columns
+        #for $f in $file_chooser:
+            ${f.input}
+        #end for
+  </command>
+  <inputs>
+    <param name="input1" type="data" format="tabular" label="Choose the first file for the join" />
+    <param name="columns" type="data_column" data_ref="input1" multiple="true" numerical="false" label="Include these columns" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" />
+    <param name="hinge" type="data_column" data_ref="input1" multiple="false" numerical="false" label="Use this column and the columns to its left as the 'hinge' (matching data for each join)" help="All columns to left of selected column (plus selected column) will be used. Select 2 for pileup" />
+    <param name="input2" type="data" format="tabular" label="Choose the second file for the join" />
+    <repeat name="file_chooser" title="Additional Input">
+      <param name="input" label="Additional input file" type="data" format="tabular" />
+    </repeat>
+  </inputs>
+  <outputs>
+    <data name="output" format="pileup" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="column_join_in1.pileup" ftype="pileup" />
+      <param name="columns" value="1,2,3,4,5,7" />
+      <param name="hinge" value="1,2" />
+      <param name="input2" value="column_join_in2.pileup" ftype="pileup" />
+      <param name="input" value="column_join_in3.pileup" ftype="pileup" />
+      <output name="output" file="column_join_out1.pileup" ftype="pileup" />
+    </test>
+    <test>
+      <param name="input1" value="column_join_in4.pileup" ftype="pileup" />
+      <param name="columns" value="1,2,3,4" />
+      <param name="hinge" value="1,2" />
+      <param name="input2" value="column_join_in5.pileup" ftype="pileup" />
+      <param name="input" value="column_join_in6.pileup" ftype="pileup" />
+      <output name="output" file="column_join_out2.pileup" ftype="pileup" />
+    </test>
+    <test>
+      <param name="input1" value="column_join_in7.pileup" ftype="pileup" />
+      <param name="columns" value="1,2,3,4,5" />
+      <param name="hinge" value="1,2" />
+      <param name="input2" value="column_join_in8.pileup" ftype="pileup" />
+      <param name="input" value="column_join_in9.pileup" ftype="pileup" />
+      <output name="output" file="column_join_out3.pileup" ftype="pileup" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool allows you to join several files with the same column structure into one file, removing certain columns if necessary. The user needs to select a 'hinge', which is the number of left-most columns to match on. They also need to select the columns to include in the join, which should include the hinge columns, too.
+
+-----
+
+**Example**
+
+Given the following files::
+
+  FILE 1
+  chr2    1    T    6    .C...,     I$$III
+  chr2    2    G    6    ..N..,     III@II
+  chr2    3    C    7    ..C...,    I$IIIII
+  chr2    4    G    7    .G....,    I#IIIII
+  chr2    5    G    7    ...N..,    IIII#BI
+  chr2    6    A    7    ..T...,    I$IDIII
+  chr1    1    C    1    ^:.        I
+  chr1    2    G    2    .^:.       $I
+  chr1    3    A    2    ..         I%
+  chr1    4    C    2    ..         I$
+  chr1    5    T    3    ..^:.      I#I
+  chr1    6    G    3    ..^:,      I#I
+
+  FILE 2
+  chr1    3    T    1    ^:.        I
+  chr1    4    G    2    .^:.       $I
+  chr1    5    T    2    ..         I%
+  chr1    6    C    3    ..^:.      III
+  chr1    7    G    3    ..^:.      I#I
+  chr1    8    T    4    ...^:,     I#II
+  chr2    77   C    6    .G...,     I$$III
+  chr2    78   G    6    ..N..,     III@II
+  chr2    79   T    7    ..N...,    I$IIIII
+  chr2    80   C    7    .G....,    I#IIIII
+  chr2    81   G    7    ...A..,    IIII#BI
+  chr2    82   A    8    ...G...,   I$IDIIII
+  chr2    83   T    8    .A.....N   IIIIIIII
+  chr2    84   A    9    ......T.   I$IIIIIII
+
+  FILE 3
+  chr1    1    A    1    .          I
+  chr1    2    T    2    G.         I$
+  chr1    3    C    2    .,         I@
+  chr1    4    C    3    ..N        III
+  chr1    42   C    5    ...N^:.    III@I
+  chr1    43   C    5    .N..^:.    IIIII
+  chr1    44   T    5    .A..,      IA@II
+  chr1    45   A    6    .N...^:.   IIIII$
+  chr1    46   G    6    .GN..^:.   I@IIII
+  chr1    47   A    7    ....^:..,  IIIII$I
+  chr2    73   T    5    .N..,      II$II
+  chr2    74   A    5    ....,      IIIII
+  chr2    75   T    5    ....,      IIIII
+  chr2    76   T    5    ....,      IIIII
+  chr2    77   C    5    ....,      IIIBI
+  chr2    78   T    5    ....,      IDIII
+
+To join on columns 3 and 4 combining on columns 1 and 2, columns 1-4 should be selected for the 'Include these columns' option, and column 2 selected for the 'hinge'. With these settings, the following would be output::
+
+  chr1    1    C    1              A    1
+  chr1    2    G    2              T    2
+  chr1    3    A    2    T    1    C    2
+  chr1    4    C    2    G    2    C    3
+  chr1    5    T    3    T    2
+  chr1    6    G    3    C    3
+  chr1    7              G    3
+  chr1    8              T    4
+  chr1    42                       C    5
+  chr1    43                       C    5
+  chr1    44                       T    5
+  chr1    45                       A    6
+  chr1    46                       G    6
+  chr1    47                       A    7
+  chr2    1    T    6
+  chr2    2    G    6
+  chr2    3    C    7
+  chr2    4    G    7
+  chr2    5    G    7
+  chr2    6    A    7
+  chr2    73                       T    5
+  chr2    74                       A    5
+  chr2    75                       T    5
+  chr2    76                       T    5
+  chr2    77             C    6    C    5
+  chr2    78             G    6    T    5
+  chr2    79             T    7
+  chr2    80             C    7
+  chr2    81             G    7
+  chr2    82             A    8
+  chr2    83             T    8
+  chr2    84             A    9
+
+
+  </help>
+</tool>
+
+

    

Greg Von Kuster

tags

participants (1)