[galaxy-dev] [hg] galaxy 3489: Add the ability for tests to have a lines_diff...

8 Mar 2010

details:   http://www.bx.psu.edu/hg/galaxy/rev/3563fade2c86
changeset: 3489:3563fade2c86
user:      Dan Blankenberg <dan@bx.psu.edu>
date:      Fri Mar 05 17:21:16 2010 -0500
description:
Add the ability for tests to have a lines_diff attribute set for standard output datasets that will allow the specified number of lines to vary between test-data and tool run results; a 'change' to a line equates to a difference of 2 lines between files (i.e. 1 line added, 1 line removed). Add the ability for test comparisons to use single or multiline regular expressions, this is accessed by adding the attribute compare= "match_re" or "match_re_multiline" (respectively); the standard diff method can be explicitly declared as "diff", or is used at the default when not specified.

diffstat:

 lib/galaxy/tools/__init__.py    |    7 +-
 lib/galaxy/tools/test.py        |    4 +-
 test/base/twilltestcase.py      |  134 ++++++++++++++++++++++++++++-----------
 test/functional/test_toolbox.py |    4 +-
 4 files changed, 106 insertions(+), 43 deletions(-)

diffs (225 lines):

diff -r 45cfcab6aed7 -r 3563fade2c86 lib/galaxy/tools/__init__.py

--- a/lib/galaxy/tools/__init__.py	Fri Mar 05 12:55:51 2010 -0500
+++ b/lib/galaxy/tools/__init__.py	Fri Mar 05 17:21:16 2010 -0500
@@ -532,8 +532,11 @@
                     file = attrib.pop( 'file', None )
                     if file is None:
                         raise Exception( "Test output does not have a 'file'")
-                    sort = util.string_as_bool( attrib.pop( 'sort', False ) )
-                    test.add_output( name, file, sort )
+                    attributes = Bunch()
+                    attributes.compare = attrib.pop( 'compare', 'diff' ).lower() #method of comparison
+                    attributes.lines_diff = int( attrib.pop( 'lines_diff', '0' ) ) # allow a few lines (dates etc) to vary in logs
+                    attributes.sort = util.string_as_bool( attrib.pop( 'sort', False ) )
+                    test.add_output( name, file, attributes )
             except Exception, e:
                 test.error = True
                 test.exception = e
diff -r 45cfcab6aed7 -r 3563fade2c86 lib/galaxy/tools/test.py
--- a/lib/galaxy/tools/test.py	Fri Mar 05 12:55:51 2010 -0500
+++ b/lib/galaxy/tools/test.py	Fri Mar 05 17:21:16 2010 -0500
@@ -37,8 +37,8 @@
         except Exception, e:
             log.debug( "Error in add_param for %s: %s" % ( name, e ) )
         self.inputs.append( ( name, value, extra ) )
-    def add_output( self, name, file, sort ):
-        self.outputs.append( ( name, file, sort ) )
+    def add_output( self, name, file, extra ):
+        self.outputs.append( ( name, file, extra ) )
     def __expand_grouping_for_data_input( self, name, value, extra, grouping_name, grouping_value ):
         # Currently handles grouping.Conditional and grouping.Repeat
         if isinstance( grouping_value, grouping.Conditional  ):
diff -r 45cfcab6aed7 -r 3563fade2c86 test/base/twilltestcase.py
--- a/test/base/twilltestcase.py	Fri Mar 05 12:55:51 2010 -0500
+++ b/test/base/twilltestcase.py	Fri Mar 05 17:21:16 2010 -0500
@@ -36,14 +36,21 @@
         #self.set_history()
 
     # Functions associated with files
-    def files_diff( self, file1, file2, sort=False ):
+    def files_diff( self, file1, file2, attributes=None ):
         """Checks the contents of 2 files for differences"""
+        def get_lines_diff( diff ):
+            count = 0
+            for line in diff:
+                if ( line.startswith( '+' ) and not line.startswith( '+++' ) ) or ( line.startswith( '-' ) and not line.startswith( '---' ) ):
+                    count += 1
+            return count
         if not filecmp.cmp( file1, file2 ):
             files_differ = False
             local_file = open( file1, 'U' ).readlines()
             history_data = open( file2, 'U' ).readlines()
-            if sort:
+            if attributes and attributes.sort:
                 history_data.sort()
+            ##Why even bother with the check loop below, why not just use the diff output? This seems wasteful.
             if len( local_file ) == len( history_data ):
                 for i in range( len( history_data ) ):
                     if local_file[i].rstrip( '\r\n' ) != history_data[i].rstrip( '\r\n' ):
@@ -52,37 +59,79 @@
             else:
                 files_differ = True
             if files_differ:
-                diff = difflib.unified_diff( local_file, history_data, "local_file", "history_data" )
-                diff_slice = list( islice( diff, 40 ) )        
-                if file1.endswith( '.pdf' ) or file2.endswith( '.pdf' ):
-                    # PDF files contain creation dates, modification dates, ids and descriptions that change with each
-                    # new file, so we need to handle these differences.  As long as the rest of the PDF file does
-                    # not differ we're ok.
-                    valid_diff_strs = [ 'description', 'createdate', 'creationdate', 'moddate', 'id' ]
-                    valid_diff = False
-                    for line in diff_slice:
-                        # Make sure to lower case strings before checking.
-                        line = line.lower()
-                        # Diff lines will always start with a + or - character, but handle special cases: '--- local_file \n', '+++ history_data \n'
-                        if ( line.startswith( '+' ) or line.startswith( '-' ) ) and line.find( 'local_file' ) < 0 and line.find( 'history_data' ) < 0:
-                            for vdf in valid_diff_strs:
-                                if line.find( vdf ) < 0:
-                                    valid_diff = False
-                                else:
-                                    valid_diff = True
-                                    # Stop checking as soon as we know we have a valid difference
-                                    break
-                            if not valid_diff:
-                                # Print out diff_slice so we can see what failed
-                                print "###### diff_slice ######"
-                                raise AssertionError( "".join( diff_slice ) )
-                                break
+                if attributes and attributes.lines_diff:
+                    allowed_diff_count = attributes.lines_diff
                 else:
-                    for line in diff_slice:
-                        for char in line:
-                            if ord( char ) > 128:
-                                raise AssertionError( "Binary data detected, not displaying diff" )
-                    raise AssertionError( "".join( diff_slice ) )
+                    allowed_diff_count = 0
+                diff = list( difflib.unified_diff( local_file, history_data, "local_file", "history_data" ) )
+                diff_lines = get_lines_diff( diff )
+                if diff_lines > allowed_diff_count:
+                    diff_slice = diff[0:40]
+                    #FIXME: This pdf stuff is rather special cased and has not been updated to consider lines_diff 
+                    #due to unknown desired behavior when used in conjunction with a non-zero lines_diff
+                    #PDF forgiveness can probably be handled better by not special casing by __extension__ here
+                    #and instead using lines_diff or a regular expression matching
+                    #or by creating and using a specialized pdf comparison function
+                    if file1.endswith( '.pdf' ) or file2.endswith( '.pdf' ):
+                        # PDF files contain creation dates, modification dates, ids and descriptions that change with each
+                        # new file, so we need to handle these differences.  As long as the rest of the PDF file does
+                        # not differ we're ok.
+                        valid_diff_strs = [ 'description', 'createdate', 'creationdate', 'moddate', 'id' ]
+                        valid_diff = False
+                        for line in diff_slice:
+                            # Make sure to lower case strings before checking.
+                            line = line.lower()
+                            # Diff lines will always start with a + or - character, but handle special cases: '--- local_file \n', '+++ history_data \n'
+                            if ( line.startswith( '+' ) or line.startswith( '-' ) ) and line.find( 'local_file' ) < 0 and line.find( 'history_data' ) < 0:
+                                for vdf in valid_diff_strs:
+                                    if line.find( vdf ) < 0:
+                                        valid_diff = False
+                                    else:
+                                        valid_diff = True
+                                        # Stop checking as soon as we know we have a valid difference
+                                        break
+                                if not valid_diff:
+                                    # Print out diff_slice so we can see what failed
+                                    print "###### diff_slice ######"
+                                    raise AssertionError( "".join( diff_slice ) )
+                    else:
+                        for line in diff_slice:
+                            for char in line:
+                                if ord( char ) > 128:
+                                    raise AssertionError( "Binary data detected, not displaying diff" )
+                        raise AssertionError( "".join( diff_slice )  )
+
+    def files_re_match( self, file1, file2, attributes=None ):
+        """Checks the contents of 2 files for differences using re.match"""
+        local_file = open( file1, 'U' ).readlines() #regex file
+        history_data = open( file2, 'U' ).readlines()
+        assert len( local_file ) == len( history_data ), 'Data File and Regular Expression File contain a different number of lines (%s != %s)' % ( len( local_file ), len( history_data ) )
+        if attributes and attributes.sort:
+            history_data.sort()
+        if attributes and attributes.lines_diff:
+            lines_diff = attributes.lines_diff
+        else:
+            lines_diff = 0
+        line_diff_count = 0
+        diffs = []
+        for i in range( len( history_data ) ):
+            if not re.match( local_file[i].rstrip( '\r\n' ), history_data[i].rstrip( '\r\n' ) ):
+                line_diff_count += 1
+                diffs.append( 'Regular Expression: %s\nData file         : %s' % ( local_file[i].rstrip( '\r\n' ),  history_data[i].rstrip( '\r\n' ) ) )
+            if line_diff_count > lines_diff:
+                raise AssertionError, "Regular expression did not match data file (allowed variants=%i):\n%s" % ( lines_diff, "".join( diffs ) )
+
+    def files_re_match_multiline( self, file1, file2, attributes=None ):
+        """Checks the contents of 2 files for differences using re.match in multiline mode"""
+        local_file = open( file1, 'U' ).read() #regex file
+        if attributes and attributes.sort:
+            history_data = open( file2, 'U' ).readlines()
+            history_data.sort()
+            history_data = ''.join( history_data )
+        else:
+            history_data = open( file2, 'U' ).read()
+        #lines_diff not applicable to multiline matching
+        assert re.match( local_file, history_data, re.MULTILINE ), "Multiline Regular expression did not match data file"
 
     def get_filename( self, filename ):
         full = os.path.join( self.file_dir, filename)
@@ -541,7 +590,7 @@
             hid = elem.get('hid')
             hids.append(hid)
         return hids
-    def verify_dataset_correctness( self, filename, hid=None, wait=True, maxseconds=120, sort=False ):
+    def verify_dataset_correctness( self, filename, hid=None, wait=True, maxseconds=120, attributes=None ):
         """Verifies that the attributes and contents of a history item meet expectations"""
         if wait:
             self.wait( maxseconds=maxseconds ) #wait for job to finish
@@ -576,13 +625,24 @@
             data = self.last_page()
             file( temp_name, 'wb' ).write(data)
             try:
-                self.files_diff( local_name, temp_name, sort=sort )
+                if attributes and 'compare' in attributes:
+                    compare = attributes.compare
+                else:
+                    compare = 'diff'
+                if compare == 'diff':
+                    self.files_diff( local_name, temp_name, attributes=attributes )
+                elif compare == 're_match':
+                    self.files_re_match( local_name, temp_name, attributes=attributes )
+                elif compare == 're_match_multiline':
+                    self.files_re_match_multiline( local_name, temp_name, attributes=attributes )
+                else:
+                    raise Exception, 'Unimplemented Compare type: %s' % compare
             except AssertionError, err:
-                os.remove(temp_name)
-                errmsg = 'History item %s different than expected, difference:\n' % hid
+                errmsg = 'History item %s different than expected, difference (using %s):\n' % ( hid, compare )
                 errmsg += str( err )
                 raise AssertionError( errmsg )
-            os.remove(temp_name)
+            finally:
+                os.remove( temp_name )
     def verify_composite_datatype_file_content( self, file_name, hda_id ):
         local_name = self.get_filename( file_name )
         temp_name = self.get_filename( 'temp_%s' % file_name )
diff -r 45cfcab6aed7 -r 3563fade2c86 test/functional/test_toolbox.py
--- a/test/functional/test_toolbox.py	Fri Mar 05 12:55:51 2010 -0500
+++ b/test/functional/test_toolbox.py	Fri Mar 05 17:21:16 2010 -0500
@@ -66,13 +66,13 @@
         self.assertTrue( data_list )
         elem_index = 0 - len( testdef.outputs )
         for output_tuple in testdef.outputs:
-            name, file, sort = output_tuple
+            name, outfile, attributes = output_tuple
             # Get the correct hid
             elem = data_list[ elem_index ]
             self.assertTrue( elem is not None )
             elem_hid = elem.get( 'hid' )
             elem_index += 1
-            self.verify_dataset_correctness( file, hid=elem_hid, maxseconds=testdef.maxseconds, sort=sort )
+            self.verify_dataset_correctness( outfile, hid=elem_hid, maxseconds=testdef.maxseconds, attributes=attributes )
         self.delete_history( id=self.security.encode_id( latest_history.id ) )
 
     def __expand_grouping( self, tool_inputs, declared_inputs, prefix='' ):

    

[galaxy-dev] [hg] galaxy 3489: Add the ability for tests to have a lines_diff...

Greg Von Kuster