galaxy-dist commit 1f467b4962cc: Corrected bug in column_join that resulted in some items incorrectly being listed on more than one line

29 Jun 2010

# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User Kelly Vincent <kpvincent@bx.psu.edu>
# Date 1277392524 14400
# Node ID 1f467b4962cc6b01cbdc829407e08169683ce8ce
# Parent  8adc2157e02a8b144697147b5e5a64833f0d1964
Corrected bug in column_join that resulted in some items incorrectly being listed on more than one line

--- a/tools/new_operations/column_join.py
+++ b/tools/new_operations/column_join.py
@@ -3,14 +3,14 @@
 """
 This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties. The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
 
-usage: %prog output input1 input2 column1[,column2[,column3[,...]]] hinge1[,hinge2[,hinge3[,...]]] [other_input1 [other_input2 [other_input3 ...]]]
-    output: the output pileup
-    input1: the pileup file to start with
-    input2: the second pileup file to join
-    hinge: the columns to be used for matching
-    columns: the columns that should appear in the output
+usage: %prog -o output -1 input1 -2 input2 -c column1[,column2[,column3[,...]]] -g hinge1[,hinge2[,hinge3[,...]]] -f <fill_options_file> [other_input1 [other_input2 [other_input3 ...]]]
+    -o, output=0: the output pileup
+    -1, input1=1: the pileup file to start with
+    -2, input2=2: the second pileup file to join
+    -g, hinge=h: the columns to be used for matching
+    -c, columns=c: the columns that should appear in the output
+    -f, fill_options_file=f: the file specifying the fill value to use
     other_inputs: the other input files to join
-
 """
 
 import optparse, os, re, struct, sys, tempfile
@@ -31,63 +31,87 @@ def stop_err( msg ):
     sys.stderr.write( msg )
     sys.exit()
 
+def split_nums( text ):
+    """
+    Splits a string into pieces of numbers and non-numbers, like 'abc23B3' --> [ 'abc', 23, 'B', 3 ]
+    """
+    split_t = []
+    c = ''
+    n = ''
+    for ch in text:
+        try:
+            v = int( ch )
+            n += ch
+            if c:
+                split_t.append( ''.join( c ) )
+                c = ''
+        except ValueError:
+            c += ch
+            if n:
+                split_t.append( int( ''.join( n ) ) )
+                n = ''
+    if c:
+        split_t.append( ''.join( c ) )
+    if n:
+        split_t.append( int( ''.join( n ) ) )
+    return split_t
+
 def hinge_compare( hinge1, hinge2 ):
     """
     Compares items like 'chr10' and 'chrM' or 'scaffold2' and scaffold10' so that
     first part handled as text but last part as number
     """
-    pat = re.compile( '(?P<text>\D*)(?P<number>\d+)?' )
     split_hinge1 = hinge1.split( '\t' )
     split_hinge2 = hinge2.split( '\t' )
-    for i in range( len( split_hinge1 ) ):
-        if split_hinge1[ i ] == split_hinge2[ i ]:
+    # quick check if either hinge is empty
+    if not ''.join( split_hinge2 ):
+        if ''.join( split_hinge1 ):
+            return 1
+        elif not ''.join( split_hinge1 ):
+            return 0
+    else:
+        if not ''.join( split_hinge1 ):
+            return -1
+    # go through all parts of the hinges and compare
+    for i, sh1 in enumerate( split_hinge1 ):
+        # if these hinge segments are the same, just move on to the next ones
+        if sh1 == split_hinge2[ i ]:
             continue
-        try:
-            if int( split_hinge1[ i ] ) > int( split_hinge2[ i ] ):
+        # check all parts of each hinge
+        h1 = split_nums( sh1 )
+        h2 = split_nums( split_hinge2[ i ] )
+        for j, h in enumerate( h1 ):
+            # if second hinge has no more parts, first is considered larger
+            if j > 0 and len( h2 ) <= j:
                 return 1
-            else:
-                return -1
-        except ValueError:
-            try:
-                if float( split_hinge1[ i ] ) > float( split_hinge2[ i ] ):
+            # if these two parts are the same, move on to next
+            if h == h2[ j ]:
+                continue
+            # do actual comparison, depending on whether letter or number
+            if type( h ) == int:
+                if type( h2[ j ] ) == int:
+                    if h > h2[ j ]:
+                        return 1
+                    elif h < h2[ j ]:
+                        return -1
+                # numbers are less than letters
+                elif type( h2[ j ] ) == str:
+                    return -1
+            elif type( h ) == str:
+                if type( h2[ j ] ) == str:
+                    if h > h2[ j ]:
+                        return 1
+                    elif h < h2[ j ]:
+                        return -1
+                # numbers are less than letters
+                elif type( h2[ j ] ) == int:
                     return 1
-                else:
-                    return -1
-            except ValueError:
-                return ref_compare( split_hinge1[ i ], split_hinge2[ i ])
-    return 0
-
-def ref_compare( ref1, ref2 ):
-    """
-    Compares items like 'chr10' and 'chrM' or 'scaffold2' and scaffold10' so that
-    first part handled as text but last part as number
-    """
-    pat = re.compile( '(?P<text>\D*)(?P<number>\d+)?' )
-    r1 = pat.match( ref1 )
-    r2 = pat.match( ref2 )
-    if not r2:
+    # if all else has failed, just do basic string comparison
+    if hinge1 > hinge2:
         return 1
-    elif not r1:
-        return -1
-    text1, num1 = r1.groupdict()[ 'text' ].strip(), r1.groupdict()[ 'number' ]
-    text2, num2 = r2.groupdict()[ 'text' ].strip(), r2.groupdict()[ 'number' ]
-    if text2 == '' and ( num2 == '' or num2 is None ):
-        return 1
-    elif text1 == '' and ( num1 == '' or num1 is None ):
-        return -1
-    if text1 > text2:
-        return 1
-    elif text1 == text2:
-        if not ( num1 is None or num2 is None ):
-            num1 = int( num1 )
-            num2 = int( num2 )
-        if num1 > num2:
-            return 1
-        elif num1 == num2:
-            return 0
-        elif num1 < num2:
-            return -1
-    elif text1 < text2:
+    elif hinge1 == hinge2:
+        return 0
+    elif hinge1 < hinge2:
         return -1
 
 def hinge_sort( infile, outfile, hinge ):
@@ -119,49 +143,18 @@ def hinge_sort( infile, outfile, hinge )
     fout.close()
     fin.close()
 
-def min_chr_pos( chr_pos ):
-    """Given line and hinge, identifies the 'smallest' one, from left to right"""
-    if len( chr_pos ) == 0 and ''.join( chr_pos ):
-        return ''
-    min_loc = len( chr_pos )
-    min_hinge = []
-    loc = 0
-    for c_pos in chr_pos:
-        if c_pos.strip():
-            split_c = c_pos.split( '\t' )
-            
-            
-            ref, pos = c_pos.split( '\t' )[:2]
-            pos = int( pos )
-            if not min_hinge:
-                min_hinge = split_c
-                min_loc = loc
-            else:
-                ref_comp = ref_compare( ref, min_ref_pos[0] )
-                if ref_comp < 0:
-                    min_ref_pos = [ ref, pos ]
-                    min_loc = loc
-                elif ref_comp == 0 and pos < min_ref_pos[1]:
-                    min_ref_pos[1] = pos
-                    min_loc = loc
-        loc += 1
-    return '%s\t%s' % tuple( min_ref_pos ), min_loc
-
 def __main__():
     parser = optparse.OptionParser()
-    parser.add_option( '', '--output', dest='output', help='' )
-    parser.add_option( '', '--input1', dest='input1', help='' )
-    parser.add_option( '', '--input2', dest='input2', help='' )
-    parser.add_option( '', '--hinge', dest='hinge', help='' )
-    parser.add_option( '', '--columns', dest='columns', help='' )
-    parser.add_option( '', '--fill_options_file', dest='fill_options_file', default=None, help='' )
+    parser.add_option( '-o', '--output', dest='output', help='The name of the output file' )
+    parser.add_option( '-1', '--input1', dest='input1', help='The name of the first input file' )
+    parser.add_option( '-2', '--input2', dest='input2', help='The name of the second input file' )
+    parser.add_option( '-g', '--hinge', dest='hinge', help='The "hinge" to use (the value to compare)' )
+    parser.add_option( '-c', '--columns', dest='columns', help='The columns to include in the output file' )
+    parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', default=None, help='The file specifying the fill value to use' )
     (options, args) = parser.parse_args()
-    output = options.output
-    input1 = options.input1
-    input2 = options.input2
     hinge = int( options.hinge )
     cols = [ int( c ) for c in str( options.columns ).split( ',' ) if int( c ) > hinge ]
-    inputs = [ input1, input2 ]
+    inputs = [ options.input1, options.input2 ]
     if options.fill_options_file == "None":
         inputs.extend( args )
     else:
@@ -201,7 +194,7 @@ def __main__():
         tmp_input_files.append( tmp_file )
     # cycle through files, getting smallest line of all files one at a time
     # also have to keep track of vertical position of extra columns
-    fout = file( output, 'w' )
+    fout = file( options.output, 'w' )
     old_current = ''
     first_line = True
     current_lines = [ f.readline() for f in tmp_input_files ]
@@ -272,5 +265,6 @@ def __main__():
     fout.close()
     for f in tmp_input_files:
         os.unlink( f.name )
+    file('/afs/bx.psu.edu/user/kpvincent/galaxy-commit/actual_out', 'w').write(file(fout.name,'r').read())
 
 if __name__ == "__main__" : __main__()

--- /dev/null
+++ b/test-data/column_join_in10.pileup
@@ -0,0 +1,26 @@
+0610009D07Rik	2	1.41	1.41	-0.24	12/12	2	1
+1110002N22Rik	2	1.70	1.70	-0.06	10/12	2	1
+1110008L16Rik	3	1.73	1.73	-0.54	12/12	2	1
+1110054O05Rik	1	1.55	1.55	1.14	5/12	1	1
+Actg1	2	4.24	4.24	2.36	4/12	2	1
+Actl6a	2	1.55	1.55	1.00	10/12	1	1
+Actn1	1	3.46	3.46	3.17	1/12	1	1
+Actn4	1	3.46	3.46	3.17	1/12	1	1
+Bnc2	1	2.00	2.00	1.67	3/12	1	1
+Bub3	2	1.89	1.89	1.02	9/12	2	1
+Cad	4	4.90	4.90	3.09	2/12	1	1
+Calm1;Calm3;Calm2	2	2.83	2.83	2.57	3/12	1	1
+E130012A19Rik	2	5.66	5.66	1.50	3/12	2	1
+E2f6	2	3.39	3.39	1.80	5/12	2	1
+Gm12620	1	3.46	3.46	3.17	1/12	1	1
+Gm13092;LOC677017	1	1.15	1.15	0.29	9/12	1	1
+Gm14173;Rpl37a;Gm4149	1	3.00	3.00	1.37	4/12	2	1
+Gm14393;2210418O10Rik;Gm14296;Gm14401;RP23-330D3.5	1	3.46	3.46	3.17	1/12	1	1
+Gm189	1	1.20	1.20	0.16	10/12	2	1
+Sfrs7	1	1.71	1.71	0.18	7/12	2	1
+Sin3a	1	1.71	1.71	-0.12	7/12	2	1
+Ski	1	2.45	2.45	2.13	2/12	1	1
+Skil	1	2.00	2.00	1.03	3/12	1	1
+Tubb2c	1	2.00	2.00	1.67	3/12	1	1
+Tubb2c-ps1	1	12.00	12.00	3.17	1/12	2	1
+Zscan4f	2	1.70	1.70	1.00	10/12	2	1

--- /dev/null
+++ b/test-data/column_join_in12.pileup
@@ -0,0 +1,36 @@
+0610009D07Rik	4	2.00	2.00	2.54	12/12	2	1
+0610010K14Rik	1	1.41	1.41	0.96	6/12	1	1
+1110002N22Rik	2	1.70	1.70	-0.06	10/12	2	1
+1110008L16Rik	6	2.45	2.45	0.68	12/12	2	1
+1110037F02Rik	1	2.45	2.45	2.13	2/12	1	1
+1190005F20Rik	4	2.18	2.18	0.28	11/12	2	1
+Acot8	1	2.00	2.00	0.07	6/12	2	1
+Acta1	2	1.54	1.54	0.63	11/12	2	1
+Actb	2	1.89	1.89	1.35	9/12	2	1
+Actl6b	1	6.00	6.00	2.13	2/12	2	1
+Bend3	1	1.15	1.33	-0.51	9/12	1	1.33
+Bend5	1	3.46	3.46	3.17	1/12	1	1
+Brip1	2	1.73	1.73	0.58	8/12	1	1
+Btf3;Gm3531	1	4.00	4.00	1.67	3/12	2	1
+Bub3	1	1.33	1.33	-0.09	9/12	2	1
+C130039O16Rik	1	2.45	2.45	2.13	2/12	1	1
+C1d	1	1.73	1.73	0.87	4/12	1	1
+Caprin1	2	2.42	2.42	0.51	7/12	2	1
+Cbx3	2	1.54	1.54	0.75	11/12	2	1
+Eed	1	1.10	1.10	-0.47	10/12	1	1
+Efha1	1	3.46	3.46	3.17	1/12	1	1
+Exosc1	3	1.73	1.73	1.29	12/12	2	1
+Exosc10	25	5.00	5.00	1.03	12/12	2	1
+Gm189	2	1.70	1.70	2.12	10/12	2	1
+Gm3200	1	2.45	2.45	2.13	2/12	1	1
+Gm9855;Tdg	2	1.70	1.70	1.37	10/12	2	1
+Sfrs11	4	2.00	2.00	2.54	12/12	2	1
+Sfrs12	2	5.66	5.66	2.57	3/12	2	1
+Sin3a	1	1.31	1.31	-0.12	7/12	1	1
+Sirt7	1	2.00	2.00	1.67	3/12	1	1
+Skiv2l2	34	5.83	5.83	0.68	12/12	2	1
+Tubb2b	3	1.73	1.73	-0.10	12/12	2	1
+Tubb4	1	1.15	1.15	0.29	9/12	1	1
+Zscan4-ps2	1	12.00	12.00	3.17	1/12	2	1
+Zscan4e	2	2.83	2.83	2.12	6/12	2	1
+Zscan4f	2	1.70	1.70	1.00	10/12	2	1

--- a/tools/new_operations/column_join.xml
+++ b/tools/new_operations/column_join.xml
@@ -92,7 +92,7 @@ import simplejson
       <param name="input" value="column_join_in6.pileup" ftype="pileup" /><output name="output" file="column_join_out2.pileup" ftype="tabular" /></test>
-<!--  This test is failing for an unclear reason (the column values do not get 
+<!--  This test is failing for an unclear reason (the column values do not get
       passed into the script), but passes in the browser
     <test><param name="input1" value="column_join_in7.pileup" ftype="tabular" />
@@ -106,7 +106,17 @@ import simplejson
       <param name="input" value="column_join_in9.pileup" ftype="tabular" /><output name="output" file="column_join_out3.pileup" ftype="tabular" /></test>
---></tests>
+-->
+    <test>
+      <param name="input1" value="column_join_in10.pileup" ftype="pileup" />
+      <param name="hinge" value="1" />
+      <param name="columns" value="2,7" />
+      <param name="fill_empty_columns_switch" value="no_fill" />
+      <param name="input2" value="column_join_in11.pileup" ftype="pileup" />
+      <param name="input" value="column_join_in12.pileup" ftype="pileup" />
+      <output name="output" file="column_join_out4.pileup" ftype="tabular" />
+    </test>
+    </tests><help>
 **What it does**
 
@@ -204,5 +214,3 @@ To join on columns 3 and 4 combining on 
 
   </help></tool>
-
-

--- /dev/null
+++ b/test-data/column_join_in11.pileup
@@ -0,0 +1,36 @@
+0610009D07Rik	3	1.73	1.73	1.15	12/12	2	1
+0610010K14Rik	1	1.41	1.41	0.96	6/12	1	1
+1110002N22Rik	3	2.08	2.08	0.74	10/12	2	1
+1110008L16Rik	1	1.00	1.00	-1.35	12/12	1	1
+Acta1	2	1.54	1.54	0.63	11/12	2	1
+Actb	1	1.33	1.33	0.00	9/12	2	1
+Actg1	1	3.00	3.00	0.87	4/12	2	1
+Actl6a	1	1.10	1.10	-0.33	10/12	1	1
+Actl6b	1	2.45	2.45	2.13	2/12	1	1
+Bnc2	1	2.00	2.00	1.67	3/12	1	1
+Bptf	1	3.46	3.46	3.17	1/12	1	1
+Brip1	1	1.22	1.22	-0.19	8/12	1	1
+Brms1l	1	12.00	12.00	3.17	1/12	2	1
+Btf3;Gm3531	1	2.00	2.00	1.67	3/12	1	1
+Bub3	3	2.00	2.00	2.13	9/12	1	1
+C330007P06Rik	1	2.45	2.45	2.13	2/12	1	1
+Cad	1	2.45	2.45	0.50	2/12	1	1
+Calm1;Calm3;Calm2	1	2.00	2.00	1.03	3/12	1	1
+Cbx1	2	3.39	3.39	2.24	5/12	2	1
+E2f6	1	2.40	2.40	0.53	5/12	2	1
+Eed	1	1.20	1.20	-0.47	10/12	2	1
+Gm10079	2	1.41	1.41	0.16	12/12	1	1
+Gm11230	2	1.48	1.48	1.21	11/12	1	1
+Gm13072;Trmt112	1	3.46	3.46	3.17	1/12	1	1
+Gm13092;LOC677017	1	1.33	1.33	0.29	9/12	2	1
+Gm14231	1	1.31	1.31	0.51	7/12	1	1
+Gm14456;Tpt1	1	2.00	2.00	1.67	3/12	1	1
+Gm15501;Rps8	1	1.55	1.55	1.14	5/12	1	1
+Gm189	1	1.20	1.20	0.16	10/12	2	1
+Sfrs11	3	1.73	1.73	1.15	12/12	2	1
+Sin3a	4	3.43	3.43	1.93	7/12	2	1
+Sirt7	1	2.00	2.00	1.67	3/12	1	1
+Skiv2l2	12	3.46	3.46	-0.72	12/12	2	1
+Tubb2b	4	2.00	2.00	0.49	12/12	2	1
+Zscan4e	1	1.41	1.41	0.63	6/12	1	1
+Zscan4f	2	1.70	1.70	1.00	10/12	2	1

--- /dev/null
+++ b/test-data/column_join_out4.pileup
@@ -0,0 +1,65 @@
+0610009D07Rik	2	2	3	2	4	2
+0610010K14Rik			1	1	1	1
+1110002N22Rik	2	2	3	2	2	2
+1110008L16Rik	3	2	1	1	6	2
+1110037F02Rik					1	1
+1110054O05Rik	1	1				
+1190005F20Rik					4	2
+Acot8					1	2
+Acta1			2	2	2	2
+Actb			1	2	2	2
+Actg1	2	2	1	2		
+Actl6a	2	1	1	1		
+Actl6b			1	1	1	2
+Actn1	1	1				
+Actn4	1	1				
+Bend3					1	1
+Bend5					1	1
+Bnc2	1	1	1	1		
+Bptf			1	1		
+Brip1			1	1	2	1
+Brms1l			1	2		
+Btf3;Gm3531			1	1	1	2
+Bub3	2	2	3	1	1	2
+C1d					1	1
+C130039O16Rik					1	1
+C330007P06Rik			1	1		
+Cad	4	1	1	1		
+Calm1;Calm3;Calm2	2	1	1	1		
+Caprin1					2	2
+Cbx1			2	2		
+Cbx3					2	2
+E2f6	2	2	1	2		
+E130012A19Rik	2	2				
+Eed			1	2	1	1
+Efha1					1	1
+Exosc1					3	2
+Exosc10					25	2
+Gm189	1	2	1	2	2	2
+Gm3200					1	1
+Gm9855;Tdg					2	2
+Gm10079			2	1		
+Gm11230			2	1		
+Gm12620	1	1				
+Gm13072;Trmt112			1	1		
+Gm13092;LOC677017	1	1	1	2		
+Gm14173;Rpl37a;Gm4149	1	2				
+Gm14231			1	1		
+Gm14393;2210418O10Rik;Gm14296;Gm14401;RP23-330D3.5	1	1				
+Gm14456;Tpt1			1	1		
+Gm15501;Rps8			1	1		
+Sfrs7	1	2				
+Sfrs11			3	2	4	2
+Sfrs12					2	2
+Sin3a	1	2	4	2	1	1
+Sirt7			1	1	1	1
+Ski	1	1				
+Skil	1	1				
+Skiv2l2			12	2	34	2
+Tubb2b			4	2	3	2
+Tubb2c	1	1				
+Tubb2c-ps1	1	2				
+Tubb4					1	1
+Zscan4-ps2					1	2
+Zscan4e			1	1	2	2
+Zscan4f	2	2	2	2	2	2

    

commits-noreply@bitbucket.org

tags

participants (1)