details: http://www.bx.psu.edu/hg/galaxy/rev/3563fade2c86 changeset: 3489:3563fade2c86 user: Dan Blankenberg <dan@bx.psu.edu> date: Fri Mar 05 17:21:16 2010 -0500 description: Add the ability for tests to have a lines_diff attribute set for standard output datasets that will allow the specified number of lines to vary between test-data and tool run results; a 'change' to a line equates to a difference of 2 lines between files (i.e. 1 line added, 1 line removed). Add the ability for test comparisons to use single or multiline regular expressions, this is accessed by adding the attribute compare= "match_re" or "match_re_multiline" (respectively); the standard diff method can be explicitly declared as "diff", or is used at the default when not specified. diffstat: lib/galaxy/tools/__init__.py | 7 +- lib/galaxy/tools/test.py | 4 +- test/base/twilltestcase.py | 134 ++++++++++++++++++++++++++++----------- test/functional/test_toolbox.py | 4 +- 4 files changed, 106 insertions(+), 43 deletions(-) diffs (225 lines): diff -r 45cfcab6aed7 -r 3563fade2c86 lib/galaxy/tools/__init__.py --- a/lib/galaxy/tools/__init__.py Fri Mar 05 12:55:51 2010 -0500 +++ b/lib/galaxy/tools/__init__.py Fri Mar 05 17:21:16 2010 -0500 @@ -532,8 +532,11 @@ file = attrib.pop( 'file', None ) if file is None: raise Exception( "Test output does not have a 'file'") - sort = util.string_as_bool( attrib.pop( 'sort', False ) ) - test.add_output( name, file, sort ) + attributes = Bunch() + attributes.compare = attrib.pop( 'compare', 'diff' ).lower() #method of comparison + attributes.lines_diff = int( attrib.pop( 'lines_diff', '0' ) ) # allow a few lines (dates etc) to vary in logs + attributes.sort = util.string_as_bool( attrib.pop( 'sort', False ) ) + test.add_output( name, file, attributes ) except Exception, e: test.error = True test.exception = e diff -r 45cfcab6aed7 -r 3563fade2c86 lib/galaxy/tools/test.py --- a/lib/galaxy/tools/test.py Fri Mar 05 12:55:51 2010 -0500 +++ b/lib/galaxy/tools/test.py Fri Mar 05 17:21:16 2010 -0500 @@ -37,8 +37,8 @@ except Exception, e: log.debug( "Error in add_param for %s: %s" % ( name, e ) ) self.inputs.append( ( name, value, extra ) ) - def add_output( self, name, file, sort ): - self.outputs.append( ( name, file, sort ) ) + def add_output( self, name, file, extra ): + self.outputs.append( ( name, file, extra ) ) def __expand_grouping_for_data_input( self, name, value, extra, grouping_name, grouping_value ): # Currently handles grouping.Conditional and grouping.Repeat if isinstance( grouping_value, grouping.Conditional ): diff -r 45cfcab6aed7 -r 3563fade2c86 test/base/twilltestcase.py --- a/test/base/twilltestcase.py Fri Mar 05 12:55:51 2010 -0500 +++ b/test/base/twilltestcase.py Fri Mar 05 17:21:16 2010 -0500 @@ -36,14 +36,21 @@ #self.set_history() # Functions associated with files - def files_diff( self, file1, file2, sort=False ): + def files_diff( self, file1, file2, attributes=None ): """Checks the contents of 2 files for differences""" + def get_lines_diff( diff ): + count = 0 + for line in diff: + if ( line.startswith( '+' ) and not line.startswith( '+++' ) ) or ( line.startswith( '-' ) and not line.startswith( '---' ) ): + count += 1 + return count if not filecmp.cmp( file1, file2 ): files_differ = False local_file = open( file1, 'U' ).readlines() history_data = open( file2, 'U' ).readlines() - if sort: + if attributes and attributes.sort: history_data.sort() + ##Why even bother with the check loop below, why not just use the diff output? This seems wasteful. if len( local_file ) == len( history_data ): for i in range( len( history_data ) ): if local_file[i].rstrip( '\r\n' ) != history_data[i].rstrip( '\r\n' ): @@ -52,37 +59,79 @@ else: files_differ = True if files_differ: - diff = difflib.unified_diff( local_file, history_data, "local_file", "history_data" ) - diff_slice = list( islice( diff, 40 ) ) - if file1.endswith( '.pdf' ) or file2.endswith( '.pdf' ): - # PDF files contain creation dates, modification dates, ids and descriptions that change with each - # new file, so we need to handle these differences. As long as the rest of the PDF file does - # not differ we're ok. - valid_diff_strs = [ 'description', 'createdate', 'creationdate', 'moddate', 'id' ] - valid_diff = False - for line in diff_slice: - # Make sure to lower case strings before checking. - line = line.lower() - # Diff lines will always start with a + or - character, but handle special cases: '--- local_file \n', '+++ history_data \n' - if ( line.startswith( '+' ) or line.startswith( '-' ) ) and line.find( 'local_file' ) < 0 and line.find( 'history_data' ) < 0: - for vdf in valid_diff_strs: - if line.find( vdf ) < 0: - valid_diff = False - else: - valid_diff = True - # Stop checking as soon as we know we have a valid difference - break - if not valid_diff: - # Print out diff_slice so we can see what failed - print "###### diff_slice ######" - raise AssertionError( "".join( diff_slice ) ) - break + if attributes and attributes.lines_diff: + allowed_diff_count = attributes.lines_diff else: - for line in diff_slice: - for char in line: - if ord( char ) > 128: - raise AssertionError( "Binary data detected, not displaying diff" ) - raise AssertionError( "".join( diff_slice ) ) + allowed_diff_count = 0 + diff = list( difflib.unified_diff( local_file, history_data, "local_file", "history_data" ) ) + diff_lines = get_lines_diff( diff ) + if diff_lines > allowed_diff_count: + diff_slice = diff[0:40] + #FIXME: This pdf stuff is rather special cased and has not been updated to consider lines_diff + #due to unknown desired behavior when used in conjunction with a non-zero lines_diff + #PDF forgiveness can probably be handled better by not special casing by __extension__ here + #and instead using lines_diff or a regular expression matching + #or by creating and using a specialized pdf comparison function + if file1.endswith( '.pdf' ) or file2.endswith( '.pdf' ): + # PDF files contain creation dates, modification dates, ids and descriptions that change with each + # new file, so we need to handle these differences. As long as the rest of the PDF file does + # not differ we're ok. + valid_diff_strs = [ 'description', 'createdate', 'creationdate', 'moddate', 'id' ] + valid_diff = False + for line in diff_slice: + # Make sure to lower case strings before checking. + line = line.lower() + # Diff lines will always start with a + or - character, but handle special cases: '--- local_file \n', '+++ history_data \n' + if ( line.startswith( '+' ) or line.startswith( '-' ) ) and line.find( 'local_file' ) < 0 and line.find( 'history_data' ) < 0: + for vdf in valid_diff_strs: + if line.find( vdf ) < 0: + valid_diff = False + else: + valid_diff = True + # Stop checking as soon as we know we have a valid difference + break + if not valid_diff: + # Print out diff_slice so we can see what failed + print "###### diff_slice ######" + raise AssertionError( "".join( diff_slice ) ) + else: + for line in diff_slice: + for char in line: + if ord( char ) > 128: + raise AssertionError( "Binary data detected, not displaying diff" ) + raise AssertionError( "".join( diff_slice ) ) + + def files_re_match( self, file1, file2, attributes=None ): + """Checks the contents of 2 files for differences using re.match""" + local_file = open( file1, 'U' ).readlines() #regex file + history_data = open( file2, 'U' ).readlines() + assert len( local_file ) == len( history_data ), 'Data File and Regular Expression File contain a different number of lines (%s != %s)' % ( len( local_file ), len( history_data ) ) + if attributes and attributes.sort: + history_data.sort() + if attributes and attributes.lines_diff: + lines_diff = attributes.lines_diff + else: + lines_diff = 0 + line_diff_count = 0 + diffs = [] + for i in range( len( history_data ) ): + if not re.match( local_file[i].rstrip( '\r\n' ), history_data[i].rstrip( '\r\n' ) ): + line_diff_count += 1 + diffs.append( 'Regular Expression: %s\nData file : %s' % ( local_file[i].rstrip( '\r\n' ), history_data[i].rstrip( '\r\n' ) ) ) + if line_diff_count > lines_diff: + raise AssertionError, "Regular expression did not match data file (allowed variants=%i):\n%s" % ( lines_diff, "".join( diffs ) ) + + def files_re_match_multiline( self, file1, file2, attributes=None ): + """Checks the contents of 2 files for differences using re.match in multiline mode""" + local_file = open( file1, 'U' ).read() #regex file + if attributes and attributes.sort: + history_data = open( file2, 'U' ).readlines() + history_data.sort() + history_data = ''.join( history_data ) + else: + history_data = open( file2, 'U' ).read() + #lines_diff not applicable to multiline matching + assert re.match( local_file, history_data, re.MULTILINE ), "Multiline Regular expression did not match data file" def get_filename( self, filename ): full = os.path.join( self.file_dir, filename) @@ -541,7 +590,7 @@ hid = elem.get('hid') hids.append(hid) return hids - def verify_dataset_correctness( self, filename, hid=None, wait=True, maxseconds=120, sort=False ): + def verify_dataset_correctness( self, filename, hid=None, wait=True, maxseconds=120, attributes=None ): """Verifies that the attributes and contents of a history item meet expectations""" if wait: self.wait( maxseconds=maxseconds ) #wait for job to finish @@ -576,13 +625,24 @@ data = self.last_page() file( temp_name, 'wb' ).write(data) try: - self.files_diff( local_name, temp_name, sort=sort ) + if attributes and 'compare' in attributes: + compare = attributes.compare + else: + compare = 'diff' + if compare == 'diff': + self.files_diff( local_name, temp_name, attributes=attributes ) + elif compare == 're_match': + self.files_re_match( local_name, temp_name, attributes=attributes ) + elif compare == 're_match_multiline': + self.files_re_match_multiline( local_name, temp_name, attributes=attributes ) + else: + raise Exception, 'Unimplemented Compare type: %s' % compare except AssertionError, err: - os.remove(temp_name) - errmsg = 'History item %s different than expected, difference:\n' % hid + errmsg = 'History item %s different than expected, difference (using %s):\n' % ( hid, compare ) errmsg += str( err ) raise AssertionError( errmsg ) - os.remove(temp_name) + finally: + os.remove( temp_name ) def verify_composite_datatype_file_content( self, file_name, hda_id ): local_name = self.get_filename( file_name ) temp_name = self.get_filename( 'temp_%s' % file_name ) diff -r 45cfcab6aed7 -r 3563fade2c86 test/functional/test_toolbox.py --- a/test/functional/test_toolbox.py Fri Mar 05 12:55:51 2010 -0500 +++ b/test/functional/test_toolbox.py Fri Mar 05 17:21:16 2010 -0500 @@ -66,13 +66,13 @@ self.assertTrue( data_list ) elem_index = 0 - len( testdef.outputs ) for output_tuple in testdef.outputs: - name, file, sort = output_tuple + name, outfile, attributes = output_tuple # Get the correct hid elem = data_list[ elem_index ] self.assertTrue( elem is not None ) elem_hid = elem.get( 'hid' ) elem_index += 1 - self.verify_dataset_correctness( file, hid=elem_hid, maxseconds=testdef.maxseconds, sort=sort ) + self.verify_dataset_correctness( outfile, hid=elem_hid, maxseconds=testdef.maxseconds, attributes=attributes ) self.delete_history( id=self.security.encode_id( latest_history.id ) ) def __expand_grouping( self, tool_inputs, declared_inputs, prefix='' ):