commit/galaxy-central: kanwei: Grouping tool:

3 Jun 2011

1 new changeset in galaxy-central:

http://bitbucket.org/galaxy/galaxy-central/changeset/7ee052a1f4f2/
changeset:   7ee052a1f4f2
branches:    
user:        kanwei
date:        2011-06-03 22:08:26
summary:     Grouping tool:
- Converted to use numpy, which is included with Galaxy, instead of rpy, which is not.
- Don't simply skip lines if a numeric parameter encounters non-numeric data; return a descriptive error with the line contents instead
- General refactoring
- Added new comprehensive test, disabled until test framework supports multiple repeat inputs
affected #:  3 files (3.3 KB)

--- a/tools/stats/grouping.py	Fri Jun 03 15:36:59 2011 -0400
+++ b/tools/stats/grouping.py	Fri Jun 03 16:08:26 2011 -0400
@@ -1,73 +1,58 @@
 #!/usr/bin/env python
 # Guruprasad Ananda
 # Refactored 2011, Kanwei Li
+# Refactored to use numpy instead of rpy
 """
 This tool provides the SQL "group by" functionality.
 """
-import sys, string, re, commands, tempfile, random
-from rpy import *
+import sys, commands, tempfile, random
+import pkg_resources 
+pkg_resources.require( "numpy" )
+import numpy
+
 from itertools import groupby
 
 def stop_err(msg):
     sys.stderr.write(msg)
     sys.exit()
 
+def mode(data):
+    counts = {}
+    for x in data:
+        counts[x] = counts.get(x,0) + 1
+    maxcount = max(counts.values())
+    modelist = []
+    for x in counts:
+        if counts[x] == maxcount:
+            modelist.append( str(x) )
+    return ','.join(modelist)
+    
 def main():
     inputfile = sys.argv[2]
     ignorecase = int(sys.argv[4])
     ops = []
     cols = []
-    rounds = []
-    elems = []
+    round_val = []
+    data_ary = []
     
     for var in sys.argv[5:]:
-        ops.append(var.split()[0])
-        cols.append(var.split()[1])
-        rounds.append(var.split()[2])
-    
-    if 'Mode' in ops:
-        try:
-            r.library('prettyR')
-        except:
-            stop_err('R package prettyR could not be loaded. Please make sure it is installed.')
-    
+        op, col, do_round = var.split()
+        ops.append(op)
+        cols.append(col)
+        round_val.append(do_round)
     """
     At this point, ops, cols and rounds will look something like this:
     ops:  ['mean', 'min', 'c']
     cols: ['1', '3', '4']
-    rounds: ['no', 'yes' 'no']
+    round_val: ['no', 'yes' 'no']
     """
-    
-    for i, line in enumerate( file ( inputfile )):
-        line = line.rstrip('\r\n')
-        if len( line )>0 and not line.startswith( '#' ):
-            elems = line.split( '\t' )
-            break
-        if i == 30:
-            break # Hopefully we'll never get here...
-    
-    if len( elems )<1:
-        stop_err( "The data in your input dataset is either missing or not formatted properly." )
-    
+
     try:
         group_col = int( sys.argv[3] )-1
     except:
         stop_err( "Group column not specified." )
     
     str_ops = ['c', 'length', 'unique', 'random', 'cuniq', 'Mode'] #ops that can handle string/non-numeric inputs
-    for k, col in enumerate(cols):
-        col = int(col)-1
-        if ops[k] not in str_ops:
-            # We'll get here only if the user didn't choose 'Concatenate' or 'Count' or 'Count Distinct' or 'pick randomly', which are the
-            # only aggregation functions that can be used on columns containing strings.
-            try:
-                float( elems[col] )
-            except:
-                try:
-                    msg = "Operation '%s' cannot be performed on non-numeric column %d containing value '%s'." %( ops[k], col+1, elems[col] )
-                except:
-                    msg = "Operation '%s' cannot be performed on non-numeric data." %ops[k]
-                stop_err( msg )
     
     tmpfile = tempfile.NamedTemporaryFile()
     
@@ -83,7 +68,7 @@
         case = ''
         if ignorecase == 1:
             case = '-f' 
-        command_line = "sort -t '	' " + case + " -k" + str(group_col+1) +"," + str(group_col+1) + " -o " + tmpfile.name + " " + inputfile
+        command_line = "sort -t '	' %s -k%s,%s -o %s %s" % (case, group_col+1, group_col+1, tmpfile.name, inputfile)
     except Exception, exc:
         stop_err( 'Initialization error -> %s' %str(exc) )
     
@@ -92,21 +77,16 @@
     if error_code != 0:
         stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout ))
         
-    prev_item = None
-    skipped_lines = 0
-    first_invalid_line = None
-    invalid_value = ''
-    invalid_column = 0
     fout = open(sys.argv[1], "w")
     
     def is_new_item(line):
         item = line.strip().split("\t")[group_col]
         if ignorecase == 1:
-            item = item.lower()
+            return item.lower()
         return item
         
     for key, line_list in groupby(tmpfile, key=is_new_item):
-        op_vals = [ [] for op in cols ]
+        op_vals = [ [] for op in ops ]
         out_str = key
         multiple_modes = False
         mode_index = None
@@ -115,86 +95,65 @@
             fields = line.strip().split("\t")
             for i, col in enumerate(cols):
                 col = int(col)-1 # cXX from galaxy is 1-based
-                val = fields[col].strip()
-                # Before appending the current value, make sure it is numeric if the
-                # operation for the column requires it.
-                if ops[i] not in str_ops:
-                    try:
-                        float(val)
-                    except ValueError:
-                        skipped_lines += 1
-                        if first_invalid_line is None:
-                            first_invalid_line = i+1
-                            invalid_value = fields[col]
-                            invalid_column = col+1
-                        break
+                try:
+                    val = fields[col].strip()
+                    op_vals[i].append(val)
+                except IndexError:
+                    sys.stderr.write( 'Could not access the value for column %s on line: "%s". Make sure file is tab-delimited.\n' % (col+1, line) )
+                    sys.exit( 1 )
                 
-                op_vals[i].append(val)
+        # Generate string for each op for this group
+        for i, op in enumerate( ops ):
+            data = op_vals[i]
+            rval = ""
+            if op == "mode":
+                rval = mode( data )
+            elif op == "length":
+                rval = len( data )
+            elif op == "random":
+                rval = random.choice(data)
+            elif op in ['cat', 'cat_uniq']:
+                if op == 'cat_uniq':
+                    data = numpy.unique(data)
+                rval = ','.join(data)
+            elif op == "unique":
+                rval = len( numpy.unique(data) )
+            else:
+                # some kind of numpy fn
+                try:
+                    data = map(float, data)
+                except ValueError:
+                    sys.stderr.write( "Operation %s expected number values but got %s instead.\n" % (op, data) )
+                    sys.exit( 1 )
+                rval = getattr(numpy, op)( data )
+                if round_val[i] == 'yes':
+                    rval = round(rval)
+                else:
+                    rval = '%g' % rval
+                        
+            out_str += "\t%s" % rval
         
-        for i, op in enumerate( ops ):
-            if op == 'cuniq':
-                rfunc = "r.c"
-            else:
-                rfunc = "r." + op 
-            if op not in str_ops:
-                for j, elem in enumerate( op_vals[i] ):
-                    op_vals[i][j] = float( elem )
-                rout = eval( rfunc )( op_vals[i] )
-                if rounds[i] == 'yes':
-                    rout = round(float(rout))
-                else:
-                    rout = '%g' %(float(rout))
-            else:
-                if op != 'random':
-                    rout = eval( rfunc )( op_vals[i] )
-                else:
-                    try:
-                        rand_index = random.randint(0,len(op_vals[i])-1)  #if the two inputs to randint are equal, it seems to throw a ValueError. This can't be reproduced with the python interpreter in its interactive mode. 
-                    except:
-                        rand_index = 0
-                    rout = op_vals[i][rand_index]
-            
-            if op == 'Mode' and rout == '>1 mode':
-                multiple_modes = True
-                mode_index = i
-            if op == 'unique':
-                rfunc = "r.length" 
-                rout = eval( rfunc )( rout )
-            if op in ['c', 'cuniq']:
-                if isinstance(rout, list):
-                    if op == 'cuniq':
-                        rout = set(rout)
-                    out_str += "\t" + ','.join(rout)
-                else:
-                    out_str += "\t" + str(rout)
-            else:
-                out_str += "\t" + str(rout)
-        if multiple_modes and mode_index != None:
-            out_str_list = out_str.split('\t')
-            for val in op_vals[mode_index]:
-                out_str = '\t'.join(out_str_list[:mode_index+1]) + '\t' + str(val) + '\t' + '\t'.join(out_str_list[mode_index+2:])
-                fout.write(out_str.rstrip('\t') + "\n")
-        else:
-            fout.write(out_str + "\n")
+        fout.write(out_str + "\n")
     
     # Generate a useful info message.
     msg = "--Group by c%d: " %(group_col+1)
     for i, op in enumerate(ops):
-        if op == 'c':
+        if op == 'cat':
             op = 'concat'
+        elif op == 'cat_uniq':
+            op = 'concat_distinct'
         elif op == 'length':
             op = 'count'
         elif op == 'unique':
             op = 'count_distinct'
         elif op == 'random':
             op = 'randomly_pick'
-        elif op == 'cuniq':
-            op = 'concat_distinct'
+        
         msg += op + "[c" + cols[i] + "] "
-    if skipped_lines > 0:
-        msg+= "--skipped %d invalid lines starting with line %d.  Value '%s' in column %d is not numeric." % ( skipped_lines, first_invalid_line, invalid_value, invalid_column )
     
     print msg
+    fout.close()
+    tmpfile.close()
 
 if __name__ == "__main__":
     main()


--- a/tools/stats/grouping.xml	Fri Jun 03 15:36:59 2011 -0400
+++ b/tools/stats/grouping.xml	Fri Jun 03 16:08:26 2011 -0400
@@ -1,4 +1,4 @@
-<tool id="Grouping1" name="Group" version="1.9.4">
+<tool id="Grouping1" name="Group" version="2.0.0"><description>data by a column and perform aggregate operation on other columns.</description><command interpreter="python">
     grouping.py 
@@ -22,16 +22,16 @@
       <param name="optype" type="select" label="Type"><option value="mean">Mean</option><option value="median">Median</option>
-        <option value="Mode">Mode</option>
+        <option value="mode">Mode</option><option value="max">Maximum</option><option value="min">Minimum</option><option value="sum">Sum</option><option value="length">Count</option><option value="unique">Count Distinct</option>
-        <option value="c">Concatenate</option>
-        <option value="cuniq">Concatenate Distinct</option>
+        <option value="cat">Concatenate</option>
+        <option value="cat_uniq">Concatenate Distinct</option><option value="random">Randomly pick</option>
-        <option value="sd">Standard deviation</option>
+        <option value="std">Standard deviation</option></param><param name="opcol" label="On column" type="data_column" data_ref="input1" /><param name="opround" type="select" label="Round result to nearest integer?">
@@ -44,7 +44,7 @@
     <data format="tabular" name="out_file1" /></outputs><requirements>
-    <requirement type="python-module">rpy</requirement>
+    <requirement type="python-module">numpy</requirement></requirements><tests><!-- Test valid data -->
@@ -57,8 +57,16 @@
       <param name="opround" value="no"/><output name="out_file1" file="groupby_out1.dat"/></test>
-    
-    <!-- Test data with an invalid value in a column -->
+    <!-- Long case but test framework doesn't allow yet
+    <test>
+      <param name="input1" value="1.bed"/>
+      <param name="groupcol" value="1"/>
+      <param name="ignorecase" value="false"/>
+      <param name="operations" value='[{"opcol": "2", "__index__": 0, "optype": "mean", "opround": "no"}, {"opcol": "2", "__index__": 1, "optype": "median", "opround": "no"}, {"opcol": "6", "__index__": 2, "optype": "mode", "opround": "no"}, {"opcol": "2", "__index__": 3, "optype": "max", "opround": "no"}, {"opcol": "2", "__index__": 4, "optype": "min", "opround": "no"}, {"opcol": "2", "__index__": 5, "optype": "sum", "opround": "no"}, {"opcol": "1", "__index__": 6, "optype": "length", "opround": "no"}, {"opcol": "1", "__index__": 7, "optype": "unique", "opround": "no"}, {"opcol": "1", "__index__": 8, "optype": "cat", "opround": "no"}, {"opcol": "6", "__index__": 9, "optype": "cat_uniq", "opround": "no"}, {"opcol": "2", "__index__": 10, "optype": "random", "opround": "no"}, {"opcol": "2", "__index__": 11, "optype": "std", "opround": "no"}]'/>
+      <output name="out_file1" file="groupby_out3.tabular"/>
+    </test>
+    -->
+    <!-- Test data with an invalid value in a column. Can't do it because test framework doesn't allow testing of errors
     <test><param name="input1" value="1.tabular"/><param name="groupcol" value="1"/>
@@ -68,6 +76,7 @@
       <param name="opround" value="no"/><output name="out_file1" file="groupby_out2.dat"/></test>
+     --></tests><help>
 
@@ -79,9 +88,7 @@
 
 **Syntax**
 
-This tool allows you to group the input dataset by a particular column and perform aggregate functions like Mean, Median, Mode, Sum, Max, Min, Count, Random draw and Concatenate on other columns. 
-
-- All invalid, blank and comment lines are skipped when performing the aggregate functions.  The number of skipped lines is displayed in the resulting history item.
+This tool allows you to group the input dataset by a particular column and perform aggregate functions: Mean, Median, Mode, Sum, Max, Min, Count, Randomly pick, and Concatenate on any column.
 
 - If multiple modes are present, all are reported.

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving
this because the notification service is enabled for the address this
email was sent to.

    

Bitbucket

tags

participants (1)