[hg] galaxy 3114: Move line count processing from set_peek() methods to set_meta() methods
details:   http://www.bx.psu.edu/hg/galaxy/rev/f4654abcec1e
changeset: 3114:f4654abcec1e
user:      Greg Von Kuster <greg@bx.psu.edu>
date:      Fri Nov 20 11:03:41 2009 -0500
description:
Move line count processing from set_peek() methods to set_meta() methods for data types since set_meta can run externally and it does line by line processing in some cases. Added new MetadataElements to the Data, Text, Sequence and Maf classes to handle setting dataset.blurb from stored metadata. Cleaned up a lot of code in the process.
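In brief, the pattern applied throughout the diff below: the expensive line-by-line pass moves into set_meta(), which can run in an external process, while set_peek() only formats what set_meta() stored. A standalone Python 2 sketch of that split (the FakeDataset/TextLike stand-ins are illustrative, not Galaxy classes):

class FakeDataset( object ):
    # Minimal stand-in for a Galaxy dataset, just enough for the sketch.
    def __init__( self, file_name ):
        self.file_name = file_name
        self.metadata = {}
        self.blurb = None

class TextLike( object ):
    def set_meta( self, dataset ):
        # The line-by-line pass now lives here, so it can run externally
        # (e.g. on a cluster node) instead of inside the web process.
        data_lines = 0
        for line in open( dataset.file_name ):
            line = line.strip()
            if line and not line.startswith( '#' ):
                data_lines += 1
        dataset.metadata[ 'data_lines' ] = data_lines
    def set_peek( self, dataset, is_multi_byte=False ):
        # Cheap: build the blurb from stored metadata, with no file scan.
        # The is_multi_byte flag replaces the old set_multi_byte_peek().
        if dataset.metadata.get( 'data_lines' ):
            dataset.blurb = "%s lines" % dataset.metadata[ 'data_lines' ]
        else:
            dataset.blurb = "? lines"  # metadata was never set
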
diffstat:

 lib/galaxy/datatypes/binary.py                                 |   14 +-
 lib/galaxy/datatypes/data.py                                   |   55 ++--
 lib/galaxy/datatypes/genetics.py                               |    4 +-
 lib/galaxy/datatypes/images.py                                 |    8 +-
 lib/galaxy/datatypes/interval.py                               |   15 +-
 lib/galaxy/datatypes/metadata.py                               |    5 +-
 lib/galaxy/datatypes/qualityscore.py                           |   54 +----
 lib/galaxy/datatypes/sequence.py                               |  148 ++++----
 lib/galaxy/datatypes/tabular.py                                |   93 ++++----
 lib/galaxy/datatypes/xml.py                                    |    4 +-
 lib/galaxy/jobs/__init__.py                                    |    4 +-
 lib/galaxy/jobs/runners/local.py                               |    5 +-
 lib/galaxy/model/__init__.py                                   |    8 +-
 lib/galaxy/model/migrate/versions/0005_cleanup_datasets_fix.py |    8 +-
 test/functional/test_get_data.py                               |    6 +-
 test/functional/test_history_functions.py                      |    2 +-

 16 files changed, 220 insertions(+), 213 deletions(-)

diffs (965 lines):

diff -r 5c46679d0755 -r f4654abcec1e lib/galaxy/datatypes/binary.py
--- a/lib/galaxy/datatypes/binary.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/lib/galaxy/datatypes/binary.py  Fri Nov 20 11:03:41 2009 -0500
@@ -18,7 +18,7 @@
 class Binary( data.Data ):
     """Binary data"""
-    def set_peek( self, dataset ):
+    def set_peek( self, dataset, is_multi_byte=False ):
         """Set the peek and blurb text"""
         if not dataset.dataset.purged:
             dataset.peek = 'binary data'
@@ -30,7 +30,7 @@
 class Ab1( Binary ):
     """Class describing an ab1 binary sequence file"""
     file_ext = "ab1"
-    def set_peek( self, dataset ):
+    def set_peek( self, dataset, is_multi_byte=False ):
         if not dataset.dataset.purged:
             export_url = "/history_add_to?" + urlencode( {'history_id':dataset.history_id,'ext':'ab1','name':'ab1 sequence','info':'Sequence file','dbkey':dataset.dbkey} )
             dataset.peek = "Binary ab1 sequence file"
@@ -71,7 +71,7 @@
             if os.path.exists( tmpf1bai ):
                 os.remove( tmpf1bai )
         dataset.metadata.bam_index = index_file
-    def set_peek( self, dataset ):
+    def set_peek( self, dataset, is_multi_byte=False ):
         if not dataset.dataset.purged:
             export_url = "/history_add_to?" + urlencode( {'history_id':dataset.history_id,'ext':'bam','name':'bam alignments','info':'Alignments file','dbkey':dataset.dbkey} )
             dataset.peek = "Binary bam alignments file"
@@ -91,7 +91,7 @@
 class Binseq( Binary ):
     """Class describing a zip archive of binary sequence files"""
     file_ext = "binseq.zip"
-    def set_peek( self, dataset ):
+    def set_peek( self, dataset, is_multi_byte=False ):
         if not dataset.dataset.purged:
             zip_file = zipfile.ZipFile( dataset.file_name, "r" )
             num_files = len( zip_file.namelist() )
@@ -112,7 +112,7 @@
 class Scf( Binary ):
     """Class describing an scf binary sequence file"""
     file_ext = "scf"
-    def set_peek( self, dataset ):
+    def set_peek( self, dataset, is_multi_byte=False ):
         if not dataset.dataset.purged:
             export_url = "/history_add_to?" + urlencode( {'history_id':dataset.history_id,'ext':'scf','name':'scf sequence','info':'Sequence file','dbkey':dataset.dbkey} )
             dataset.peek = "Binary scf sequence file"
@@ -139,9 +139,9 @@
             if binascii.b2a_hex( header ) == binascii.hexlify( '.sff' ):
                 return True
             return False
-        except Exception, e:
+        except:
             return False
-    def set_peek( self, dataset ):
+    def set_peek( self, dataset, is_multi_byte=False ):
         if not dataset.dataset.purged:
             export_url = "/history_add_to?" + urlencode( {'history_id':dataset.history_id,'ext':'sff','name':'sff file','info':'sff file','dbkey':dataset.dbkey} )
             dataset.peek = "Binary sff file"
diff -r 5c46679d0755 -r f4654abcec1e lib/galaxy/datatypes/data.py
--- a/lib/galaxy/datatypes/data.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/lib/galaxy/datatypes/data.py  Fri Nov 20 11:03:41 2009 -0500
@@ -95,9 +95,6 @@
     def set_meta( self, dataset, overwrite = True, **kwd ):
         """Unimplemented method, allows guessing of metadata from contents of file"""
         return True
-    def set_readonly_meta( self, dataset ):
-        """Unimplemented method, resets the readonly metadata values"""
-        return True
     def missing_meta( self, dataset, check = [], skip = [] ):
         """
         Checks for empty metadata values, Returns True if non-optional metadata is missing
@@ -114,7 +111,7 @@
             if not value:
                 return True
         return False
-    def set_peek( self, dataset ):
+    def set_peek( self, dataset, is_multi_byte=False ):
         """Set the peek and blurb text"""
         if not dataset.dataset.purged:
             dataset.peek = ''
@@ -312,6 +309,11 @@
         return False

 class Text( Data ):
+    file_ext = 'txt'
+
+    """Add metadata elements"""
+    MetadataElement( name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0 )
+
     def write_from_stream(self, dataset, stream):
         """Writes data from a stream"""
         # write it twice for now
@@ -322,7 +324,6 @@
                 break
             os.write(fd, chunk)
         os.close(fd)
-
         # rewrite the file with unix newlines
         fp = open(dataset.file_name, 'wt')
         for line in file(temp_name, "U"):
@@ -344,23 +345,29 @@
     def get_mime(self):
         """Returns the mime type of the datatype"""
         return 'text/plain'
-    def set_peek( self, dataset, line_count=None ):
+    def set_meta( self, dataset, **kwd ):
+        """
+        Set the number of lines of data in dataset,
+        skipping all blank lines and comments.
+        """
+        data_lines = 0
+        for line in file( dataset.file_name ):
+            line = line.strip()
+            if line and not line.startswith( '#' ):
+                data_lines += 1
+        dataset.metadata.data_lines = data_lines
+    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
         if not dataset.dataset.purged:
             # The file must exist on disk for the get_file_peek() method
-            dataset.peek = get_file_peek( dataset.file_name )
+            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
             if line_count is None:
-                dataset.blurb = "%s lines" % util.commaify( str( get_line_count( dataset.file_name ) ) )
-            else:
-                dataset.blurb = "%s lines" % util.commaify( str( line_count ) )
-        else:
-            dataset.peek = 'file does not exist'
-            dataset.blurb = 'file purged from disk'
-    def set_multi_byte_peek( self, dataset, line_count=None ):
-        if not dataset.dataset.purged:
-            # The file must exist on disk for the get_file_peek() method
-            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=True )
-            if line_count is None:
-                dataset.blurb = "%s lines" % util.commaify( str( get_line_count( dataset.file_name ) ) )
+                # See if line_count is stored in the metadata
+                if dataset.metadata.data_lines:
+                    dataset.blurb = "%s lines" % util.commaify( str( dataset.metadata.data_lines ) )
+                else:
+                    # Number of lines is not known ( this should not happen ), and auto-detect is
+                    # needed to set metadata
+                    dataset.blurb = "? lines"
             else:
                 dataset.blurb = "%s lines" % util.commaify( str( line_count ) )
         else:
@@ -370,7 +377,7 @@
 class Txtseq( Data ):
     """Class describing a zip archive of text sequence files"""
     file_ext = "txtseq.zip"
-    def set_peek( self, dataset ):
+    def set_peek( self, dataset, is_multi_byte=False ):
         if not dataset.dataset.purged:
             zip_file = zipfile.ZipFile( dataset.file_name, "r" )
             num_files = len( zip_file.namelist() )
@@ -459,11 +466,3 @@
     else:
         text = unicode( '\n'.join( lines ), 'utf-8' )
     return text
-def get_line_count(file_name):
-    """Returns the number of lines in a file that are neither null nor comments"""
-    count = 0
-    for line in file(file_name):
-        line = line.strip()
-        if line and line[0] != '#':
-            count += 1
-    return count
diff -r 5c46679d0755 -r f4654abcec1e lib/galaxy/datatypes/genetics.py
--- a/lib/galaxy/datatypes/genetics.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/lib/galaxy/datatypes/genetics.py  Fri Nov 20 11:03:41 2009 -0500
@@ -358,7 +358,7 @@
     """
     file_ext="snpmatrix"

-    def set_peek( self, dataset ):
+    def set_peek( self, dataset, is_multi_byte=False ):
         if not dataset.dataset.purged:
             dataset.peek = "Binary RGenetics file"
             dataset.blurb = data.nice_size( dataset.get_size() )
@@ -597,7 +597,7 @@
         else:
             p = []
         return '\n'.join(p)
-    def set_peek( self, dataset ):
+    def set_peek( self, dataset, is_multi_byte=False ):
         """
         expects a .pheno file in the extra_files_dir - ugh
         note that R is wierd and does not include the row.name in
diff -r 5c46679d0755 -r f4654abcec1e lib/galaxy/datatypes/images.py
--- a/lib/galaxy/datatypes/images.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/lib/galaxy/datatypes/images.py  Fri Nov 20 11:03:41 2009 -0500
@@ -15,7 +15,7 @@
 class Image( data.Data ):
     """Class describing an image"""
-    def set_peek( self, dataset ):
+    def set_peek( self, dataset, is_multi_byte=False ):
         if not dataset.dataset.purged:
             dataset.peek = 'Image in %s format' % dataset.extension
             dataset.blurb = data.nice_size( dataset.get_size() )
@@ -51,7 +51,7 @@
     """Class describing a GMAJ Applet"""
     file_ext = "gmaj.zip"
     copy_safe_peek = False
-    def set_peek( self, dataset ):
+    def set_peek( self, dataset, is_multi_byte=False ):
         if not dataset.dataset.purged:
             if hasattr( dataset, 'history_id' ):
                 params = {
@@ -102,7 +102,7 @@
 class Html( data.Text ):
     """Class describing an html file"""
     file_ext = "html"
-    def set_peek( self, dataset ):
+    def set_peek( self, dataset, is_multi_byte=False ):
         if not dataset.dataset.purged:
             dataset.peek = "HTML file"
             dataset.blurb = data.nice_size( dataset.get_size() )
@@ -136,7 +136,7 @@
     """Class describing a LAJ Applet"""
     file_ext = "laj"
     copy_safe_peek = False
-    def set_peek( self, dataset ):
+    def set_peek( self, dataset, is_multi_byte=False ):
         if not dataset.dataset.purged:
             if hasattr( dataset, 'history_id' ):
                 params = {
diff -r 5c46679d0755 -r f4654abcec1e lib/galaxy/datatypes/interval.py
--- a/lib/galaxy/datatypes/interval.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/lib/galaxy/datatypes/interval.py  Fri Nov 20 11:03:41 2009 -0500
@@ -58,12 +58,18 @@
     def init_meta( self, dataset, copy_from=None ):
         Tabular.init_meta( self, dataset, copy_from=copy_from )
-    def set_peek( self, dataset, line_count=None ):
+    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
         """Set the peek and blurb text"""
         if not dataset.dataset.purged:
-            dataset.peek = data.get_file_peek( dataset.file_name )
+            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
             if line_count is None:
-                dataset.blurb = "%s regions" % util.commaify( str( data.get_line_count( dataset.file_name ) ) )
+                # See if line_count is stored in the metadata
+                if dataset.metadata.data_lines:
+                    dataset.blurb = "%s regions" % util.commaify( str( dataset.metadata.data_lines ) )
+                else:
+                    # Number of lines is not known ( this should not happen ), and auto-detect is
+                    # needed to set metadata
+                    dataset.blurb = "? regions"
             else:
                 dataset.blurb = "%s regions" % util.commaify( str( line_count ) )
         else:
@@ -884,9 +890,6 @@
         """Initialize interval datatype, by adding UCSC display app"""
         Tabular.__init__(self, **kwd)
         self.add_display_app ( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
-    def set_readonly_meta( self, dataset, skip=1, **kwd ):
-        """Resets the values of readonly metadata elements."""
-        Tabular.set_readonly_meta( self, dataset, skip = skip, **kwd )
     def set_meta( self, dataset, overwrite = True, **kwd ):
         Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 1 )
     def display_peek( self, dataset ):
diff -r 5c46679d0755 -r f4654abcec1e lib/galaxy/datatypes/metadata.py
--- a/lib/galaxy/datatypes/metadata.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/lib/galaxy/datatypes/metadata.py  Fri Nov 20 11:03:41 2009 -0500
@@ -89,7 +89,10 @@
         return bool( self.parent._metadata.get( name, False ) )
     def get_html_by_name( self, name, **kwd ):
         if name in self.spec:
-            return self.spec[name].param.get_html( value=getattr( self, name ), context=self, **kwd )
+            rval = self.spec[name].param.get_html( value=getattr( self, name ), context=self, **kwd )
+            if rval is None:
+                return self.spec[name].no_value
+            return rval
     def make_dict_copy( self, to_copy ):
         """Makes a deep copy of input iterable to_copy according to self.spec"""
         rval = {}
diff -r 5c46679d0755 -r f4654abcec1e lib/galaxy/datatypes/qualityscore.py
--- a/lib/galaxy/datatypes/qualityscore.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/lib/galaxy/datatypes/qualityscore.py  Fri Nov 20 11:03:41 2009 -0500
@@ -14,23 +14,6 @@
     until we know more about quality score formats
     """
     file_ext = "qualsolid"
-
-    def set_peek( self, dataset, line_count=None ):
-        if not dataset.dataset.purged:
-            dataset.peek = data.get_file_peek( dataset.file_name )
-            if line_count is None:
-                dataset.blurb = data.nice_size( dataset.get_size() )
-            else:
-                dataset.blurb = "%s lines, SOLiD Quality score file" % util.commaify( str( line_count ) )
-        else:
-            dataset.peek = 'file does not exist'
-            dataset.blurb = 'file purged from disk'
-
-    def display_peek(self, dataset):
-        try:
-            return dataset.peek
-        except:
-            return "SOLiD Quality score file (%s)" % ( data.nice_size( dataset.get_size() ) )

     def sniff( self, filename ):
         """
@@ -70,6 +53,7 @@
                     return True
                 else:
                     break #we found a non-empty line, but it's not a header
+            fh.close()
         except:
             pass
         return False
@@ -79,23 +63,6 @@
     until we know more about quality score formats
     """
     file_ext = "qual454"
-
-    def set_peek( self, dataset, line_count=None ):
-        if not dataset.dataset.purged:
-            dataset.peek = data.get_file_peek( dataset.file_name )
-            if line_count is None:
-                dataset.blurb = data.nice_size( dataset.get_size() )
-            else:
-                dataset.blurb = "%s lines, 454 Quality score file" % util.commaify( str( line_count ) )
-        else:
-            dataset.peek = 'file does not exist'
-            dataset.blurb = 'file purged from disk'
-
-    def display_peek(self, dataset):
-        try:
-            return dataset.peek
-        except:
-            return "454 Quality score file (%s)" % ( data.nice_size( dataset.get_size() ) )

     def sniff( self, filename ):
         """
@@ -125,6 +92,7 @@
                     return True
                 else:
                     break #we found a non-empty line, but it's not a header
+            fh.close()
         except:
             pass
         return False
@@ -134,22 +102,4 @@
     until we know more about quality score formats
     """
     file_ext = "qualsolexa"
-
-    def set_peek( self, dataset, line_count=None ):
-        if not dataset.dataset.purged:
-            dataset.peek = data.get_file_peek( dataset.file_name )
-            if line_count is None:
-                dataset.blurb = data.nice_size( dataset.get_size() )
-            else:
-                dataset.blurb = "%s lines, Solexa Quality score file" % util.commaify( str( line_count ) )
-        else:
-            dataset.peek = 'file does not exist'
-            dataset.blurb = 'file purged from disk'
-
-    def display_peek(self, dataset):
-        try:
-            return dataset.peek
-        except:
-            return "Solexa Quality score file (%s)" % ( data.nice_size( dataset.get_size() ) )
-
\ No newline at end of file
diff -r 5c46679d0755 -r f4654abcec1e lib/galaxy/datatypes/sequence.py
--- a/lib/galaxy/datatypes/sequence.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/lib/galaxy/datatypes/sequence.py  Fri Nov 20 11:03:41 2009 -0500
@@ -17,11 +17,40 @@

 class Sequence( data.Text ):
     """Class describing a sequence"""
-    def set_readonly_meta( self, dataset ):
-        """Resets the values of readonly metadata elements."""
-        pass

-class Alignment( Sequence ):
+    """Add metadata elements"""
+    MetadataElement( name="sequences", default=0, desc="Number of sequences", readonly=True, visible=False, optional=True, no_value=0 )
+
+    def set_meta( self, dataset, **kwd ):
+        """
+        Set the number of sequences and the number of data lines in dataset.
+        """
+        data_lines = 0
+        sequences = 0
+        for line in file( dataset.file_name ):
+            line = line.strip()
+            if line and line.startswith( '#' ):
+                # We don't count comment lines for sequence data types
+                continue
+            if line and line.startswith( '>' ):
+                sequences += 1
+                data_lines +=1
+            else:
+                data_lines += 1
+        dataset.metadata.data_lines = data_lines
+        dataset.metadata.sequences = sequences
+    def set_peek( self, dataset, is_multi_byte=False ):
+        if not dataset.dataset.purged:
+            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
+            if dataset.metadata.sequences:
+                dataset.blurb = "%s sequences" % util.commaify( str( dataset.metadata.sequences ) )
+            else:
+                dataset.blurb = data.nice_size( dataset.get_size() )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+
+class Alignment( data.Text ):
     """Class describing an alignment"""

     """Add metadata elements"""
@@ -29,16 +58,9 @@

 class Fasta( Sequence ):
     """Class representing a FASTA sequence"""
+
     file_ext = "fasta"

-    def set_peek( self, dataset ):
-        if not dataset.dataset.purged:
-            dataset.peek = data.get_file_peek( dataset.file_name )
-            dataset.blurb = data.nice_size( dataset.get_size() )
-        else:
-            dataset.peek = 'file does not exist'
-            dataset.blurb = 'file purged from disk'
-
     def sniff( self, filename ):
         """
         Determines whether the file is in fasta format
@@ -82,6 +104,7 @@
                         return True
                     else:
                         break #we found a non-empty line, but its not a fasta header
+            fh.close()
         except:
             pass
         return False
@@ -89,14 +112,6 @@
 class csFasta( Sequence ):
     """ Class representing the SOLID Color-Space sequence ( csfasta ) """
     file_ext = "csfasta"
-
-    def set_peek( self, dataset ):
-        if not dataset.dataset.purged:
-            dataset.peek = data.get_file_peek( dataset.file_name )
-            dataset.blurb = data.nice_size( dataset.get_size() )
-        else:
-            dataset.peek = 'file does not exist'
-            dataset.blurb = 'file purged from disk'

     def sniff( self, filename ):
         """
@@ -130,6 +145,7 @@
                     return True
                 else:
                     break #we found a non-empty line, but it's not a header
+            fh.close()
         except:
             pass
         return False
@@ -137,15 +153,26 @@
 class Fastq ( Sequence ):
     """Class representing a generic FASTQ sequence"""
     file_ext = "fastq"
-
-    def set_peek( self, dataset ):
-        if not dataset.dataset.purged:
-            dataset.peek = data.get_file_peek( dataset.file_name )
-            dataset.blurb = data.nice_size( dataset.get_size() )
-        else:
-            dataset.peek = 'file does not exist'
-            dataset.blurb = 'file purged from disk'
-
+
+    def set_meta( self, dataset, **kwd ):
+        """
+        Set the number of sequences and the number of data lines
+        in dataset.
+        """
+        data_lines = 0
+        sequences = 0
+        for line in file( dataset.file_name ):
+            line = line.strip()
+            if line and line.startswith( '#' ):
+                # We don't count comment lines for sequence data types
+                continue
+            if line and line.startswith( '@' ):
+                sequences += 1
+                data_lines +=1
+            else:
+                data_lines += 1
+        dataset.metadata.data_lines = data_lines
+        dataset.metadata.sequences = sequences
     def sniff ( self, filename ):
         """
         Determines whether the file is in generic fastq format
@@ -178,13 +205,13 @@
     """Class representing a FASTQ sequence ( the Sanger variant )"""
     file_ext = "fastqsanger"

-
 try:
     from galaxy import eggs
     import pkg_resources; pkg_resources.require( "bx-python" )
     import bx.align.maf
 except:
     pass
+
 #trying to import maf_utilities here throws an ImportError due to a circular import between jobs and tools:
 #from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes
 #Traceback (most recent call last):
@@ -223,12 +250,15 @@
     species = []
     species_chromosomes = {}
     indexes = bx.interval_index_file.Indexes()
+    blocks = 0
     try:
         maf_reader = bx.align.maf.Reader( open( filename ) )
         while True:
             pos = maf_reader.file.tell()
             block = maf_reader.next()
-            if block is None: break
+            if block is None:
+                break
+            blocks += 1
             for c in block.components:
                 spec = c.src
                 chrom = None
@@ -255,29 +285,30 @@
     except Exception, e:
         #most likely a bad MAF
         log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) )
-        return ( None, [], {} )
-    return ( indexes, species, species_chromosomes )
+        return ( None, [], {}, 0 )
+    return ( indexes, species, species_chromosomes, blocks )

 class Maf( Alignment ):
     """Class describing a Maf alignment"""
     file_ext = "maf"

     #Readonly and optional, users can't unset it, but if it is not set, we are generally ok; if required use a metadata validator in the tool definition
+    MetadataElement( name="blocks", default=0, desc="Number of blocks", readonly=True, optional=True, visible=False, no_value=0 )
     MetadataElement( name="species_chromosomes", desc="Species Chromosomes", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )
     MetadataElement( name="maf_index", desc="MAF Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )

     def init_meta( self, dataset, copy_from=None ):
         Alignment.init_meta( self, dataset, copy_from=copy_from )
-
     def set_meta( self, dataset, overwrite = True, **kwd ):
         """
         Parses and sets species, chromosomes, index from MAF file.
         """
         #these metadata values are not accessable by users, always overwrite
-        indexes, species, species_chromosomes = COPIED_build_maf_index_species_chromosomes( dataset.file_name )
-        if indexes is None: return #this is not a MAF file
-
+        indexes, species, species_chromosomes, blocks = COPIED_build_maf_index_species_chromosomes( dataset.file_name )
+        if indexes is None:
+            return #this is not a MAF file
         dataset.metadata.species = species
+        dataset.metadata.blocks = blocks
         #only overwrite the contents if our newly determined chromosomes don't match stored
         chrom_file = dataset.metadata.species_chromosomes
         compare_chroms = {}
@@ -303,17 +334,27 @@
             open( chrom_file.file_name, 'wb' ).write( tmp_file.read() )
             dataset.metadata.species_chromosomes = chrom_file
             tmp_file.close()
-
         index_file = dataset.metadata.maf_index
         if not index_file:
             index_file = dataset.metadata.spec['maf_index'].param.new_file( dataset = dataset )
         indexes.write( open( index_file.file_name, 'w' ) )
         dataset.metadata.maf_index = index_file
-
+    def set_peek( self, dataset, is_multi_byte=False ):
+        if not dataset.dataset.purged:
+            # The file must exist on disk for the get_file_peek() method
+            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
+            if dataset.metadata.blocks:
+                dataset.blurb = "%s blocks" % util.commaify( str( dataset.metadata.blocks ) )
+            else:
+                # Number of blocks is not known ( this should not happen ), and auto-detect is
+                # needed to set metadata
+                dataset.blurb = "? blocks"
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
     def display_peek( self, dataset ):
         """Returns formated html of peek"""
         return self.make_html_table( dataset )
-
     def make_html_table( self, dataset, skipchars=[] ):
         """Create HTML table, used for displaying peek"""
         out = ['<table cellspacing="0" cellpadding="3">']
@@ -336,7 +377,6 @@
         except Exception, exc:
             out = "Can't create peek %s" % exc
         return out
-
     def sniff( self, filename ):
         """
         Determines wether the file is in maf format
@@ -368,8 +408,13 @@
     except:
         return False

-class Axt( Sequence ):
+class Axt( data.Text ):
     """Class describing an axt alignment"""
+
+    # gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
+    # here simply for backward compatibility ( although it is still in the datatypes registry ).  Subclassing
+    # from data.Text eliminates managing metadata elements inherited from the Alignemnt class.
+
     file_ext = "axt"

     def sniff( self, filename ):
@@ -377,10 +422,16 @@
         Determines whether the file is in axt format

         axt alignment files are produced from Blastz, an alignment tool available from Webb Miller's lab
-        at Penn State University.  Each alignment block in an axt file contains three lines: a summary
-        line and 2 sequence lines.  Blocks are separated from one another by blank lines.
+        at Penn State University.

-        The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields:
+        Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.
+        Blocks are separated from one another by blank lines.
+
+        The summary line contains chromosomal position and size information about the alignment. It
+        consists of 9 required fields.
+
+        The sequence lines contain the sequence of the primary assembly (line 2) and aligning assembly
+        (line 3) with inserts.  Repeats are indicated by lower-case letters.

         For complete details see http://genome.ucsc.edu/goldenPath/help/axt.html
@@ -409,10 +460,15 @@
         else:
             return True

-class Lav( Sequence ):
+class Lav( data.Text ):
     """Class describing a LAV alignment"""
+
     file_ext = "lav"

+    # gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
+    # here simply for backward compatibility ( although it is still in the datatypes registry ).  Subclassing
+    # from data.Text eliminates managing metadata elements inherited from the Alignemnt class.
+
     def sniff( self, filename ):
         """
         Determines whether the file is in lav format
diff -r 5c46679d0755 -r f4654abcec1e lib/galaxy/datatypes/tabular.py
--- a/lib/galaxy/datatypes/tabular.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/lib/galaxy/datatypes/tabular.py  Fri Nov 20 11:03:41 2009 -0500
@@ -19,14 +19,12 @@
     """Tab delimited data"""

     """Add metadata elements"""
+    MetadataElement( name="comment_lines", default=0, desc="Number of comment lines", readonly=False, optional=True, no_value=0 )
     MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=False, no_value=0 )
     MetadataElement( name="column_types", default=[], desc="Column types", param=metadata.ColumnTypesParameter, readonly=True, visible=False, no_value=[] )

     def init_meta( self, dataset, copy_from=None ):
         data.Text.init_meta( self, dataset, copy_from=copy_from )
-    def set_readonly_meta( self, dataset, skip=None, **kwd ):
-        """Resets the values of readonly metadata elements."""
-        Tabular.set_meta( self, dataset, overwrite = True, skip = skip )
     def set_meta( self, dataset, overwrite = True, skip = None, **kwd ):
         """
         Tries to determine the number of columns as well as those columns
@@ -35,15 +33,19 @@
         their data type classes are responsible to determine how many invalid
         comment lines should be skipped. Using None for skip will cause skip
         to be zero, but the first line will be processed as a header.
+
+        Items of interest:
+        1. We treat 'overwrite' as always True (we always want to set tabular metadata when called).
+        2. If a tabular file has no data, it will have one column of type 'str'.
+        3. We used to check only the first 100 lines when setting metadata and this class's
+           set_peek() method read the entire file to determine the number of lines in the file.
+           Since metadata can now be processed on cluster nodes, we've merged the line count portion
+           of the set_peek() processing here, and we now check the entire contents of the file.
         """
-        #we treat 'overwrite' as always True (we always want to set tabular metadata when called)
-        #if a tabular file has no data, it will have one column of type str
-
-        num_check_lines = 100 #we will only check up to this many lines into the file
-        requested_skip = skip #store original skip value to check with later
+        # Store original skip value to check with later
+        requested_skip = skip
         if skip is None:
             skip = 0
-
         column_type_set_order = [ 'int', 'float', 'list', 'str' ] #Order to set column types in
         default_column_type = column_type_set_order[-1] # Default column type is lowest in list
         column_type_compare_order = list( column_type_set_order ) #Order to compare column types
@@ -89,49 +91,47 @@
             if is_column_type[column_type]( column_text ):
                 return column_type
             return None
-
+        data_lines = 0
+        comment_lines = 0
         column_types = []
         first_line_column_types = [default_column_type] # default value is one column of type str
         if dataset.has_data():
             #NOTE: if skip > num_check_lines, we won't detect any metadata, and will use default
             for i, line in enumerate( file ( dataset.file_name ) ):
-                line = line.rstrip('\r\n')
+                line = line.rstrip( '\r\n' )
                 if i < skip or not line or line.startswith( '#' ):
-                    continue
-
-                fields = line.split( '\t' )
-                for field_count, field in enumerate( fields ):
-                    if field_count >= len( column_types ): #found a previously unknown column, we append None
-                        column_types.append( None )
-                    column_type = guess_column_type( field )
-                    if type_overrules_type( column_type, column_types[field_count] ):
-                        column_types[field_count] = column_type
-
-                if i == 0 and requested_skip is None:
-                    #this is our first line, people seem to like to upload files that have a header line, but do not start with '#' (i.e. all column types would then most likely be detected as str)
-                    #we will assume that the first line is always a header (this was previous behavior - it was always skipped) when the requested skip is None
-                    #we only use the data from the first line if we have no other data for a column
-                    #this is far from perfect, as:
-                    #1,2,3	1.1	2.2	qwerty
-                    #0	0	1,2,3
-                    #will detect as
-                    #"column_types": ["int", "int", "float", "list"]
-                    #instead of:
-                    #"column_types": ["list", "float", "float", "str"] *** would seem to be the 'Truth' by manual observation that the first line should be included as data
-                    #old method would have detected as:
-                    #"column_types": ["int", "int", "str", "list"]
-                    first_line_column_types = column_types
-                    column_types = [ None for col in first_line_column_types ]
-                elif i > num_check_lines:
-                    # We exceeded our max check lines
-                    break
-
+                    # We'll call blank lines comments
+                    comment_lines += 1
+                else:
+                    data_lines += 1
+                    fields = line.split( '\t' )
+                    for field_count, field in enumerate( fields ):
+                        if field_count >= len( column_types ): #found a previously unknown column, we append None
+                            column_types.append( None )
+                        column_type = guess_column_type( field )
+                        if type_overrules_type( column_type, column_types[field_count] ):
+                            column_types[field_count] = column_type
+                    if i == 0 and requested_skip is None:
+                        # This is our first line, people seem to like to upload files that have a header line, but do not
+                        # start with '#' (i.e. all column types would then most likely be detected as str).  We will assume
+                        # that the first line is always a header (this was previous behavior - it was always skipped).  When
+                        # the requested skip is None, we only use the data from the first line if we have no other data for
+                        # a column.  This is far from perfect, as
+                        # 1,2,3	1.1	2.2	qwerty
+                        # 0	0	1,2,3
+                        # will be detected as
+                        # "column_types": ["int", "int", "float", "list"]
+                        # instead of
+                        # "column_types": ["list", "float", "float", "str"] *** would seem to be the 'Truth' by manual
+                        # observation that the first line should be included as data.  The old method would have detected as
+                        # "column_types": ["int", "int", "str", "list"]
+                        first_line_column_types = column_types
+                        column_types = [ None for col in first_line_column_types ]
            #we error on the larger number of columns
            #first we pad our column_types by using data from first line
            if len( first_line_column_types ) > len( column_types ):
                for column_type in first_line_column_types[len( column_types ):]:
                    column_types.append( column_type )
-
            #Now we fill any unknown (None) column_types with data from first line
            for i in range( len( column_types ) ):
                if column_types[i] is None:
@@ -139,10 +139,11 @@
                         column_types[i] = default_column_type
                     else:
                         column_types[i] = first_line_column_types[i]
-
+        # Set the discovered metadata values for the dataset
+        dataset.metadata.data_lines = data_lines
+        dataset.metadata.comment_lines = comment_lines
         dataset.metadata.column_types = column_types
         dataset.metadata.columns = len( column_types )
-
     def make_html_table( self, dataset, skipchars=[] ):
         """Create HTML table, used for displaying peek"""
         out = ['<table cellspacing="0" cellpadding="3">']
@@ -202,6 +203,10 @@
             out.append( '<tr><td>' )
             out.append( '%s</td></tr>' % escape( comments.pop(0) ) )
         return "".join( out )
+    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
+        data.Text.set_peek( self, dataset, line_count=line_count, is_multi_byte=is_multi_byte )
+        if dataset.metadata.comment_lines:
+            dataset.blurb = "%s, %s comments" % ( dataset.blurb, util.commaify( str( dataset.metadata.comment_lines ) ) )
     def display_peek( self, dataset ):
         """Returns formatted html of peek"""
         return self.make_html_table( dataset )
@@ -219,7 +224,6 @@
                   'Superorder', 'Order', 'Suborder', 'Superfamily', 'Family', 'Subfamily', 'Tribe', 'Subtribe',
                   'Genus', 'Subgenus', 'Species', 'Subspecies' ]
-
     def make_html_table( self, dataset, skipchars=[] ):
         """Create HTML table, used for displaying peek"""
         out = ['<table cellspacing="0" cellpadding="3">']
@@ -317,6 +321,7 @@
                     count += 1
                     if count == 5:
                         return True
+            fh.close()
             if count < 5 and count > 0:
                 return True
         except:
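A standalone sketch of the tabular bookkeeping above, with a commaify() that mirrors the galaxy.util helper the blurbs use (the function names here are illustrative, not Galaxy code):

import re

def commaify( amount ):
    # '1234567' -> '1,234,567'; mirrors galaxy.util.commaify.
    new = re.sub( r"^(-?\d+)(\d{3})", r"\g<1>,\g<2>", amount )
    if new == amount:
        return new
    return commaify( new )

def count_tabular( path, skip=0 ):
    # As in Tabular.set_meta() above: skipped, blank and '#' lines all
    # count as comments; everything else is a data line.
    data_lines = comment_lines = 0
    for i, line in enumerate( open( path ) ):
        line = line.rstrip( '\r\n' )
        if i < skip or not line or line.startswith( '#' ):
            comment_lines += 1
        else:
            data_lines += 1
    return data_lines, comment_lines

def tabular_blurb( data_lines, comment_lines ):
    # Matches Tabular.set_peek(): base line count plus an optional
    # comment count, e.g. "12,345 lines, 10 comments".
    blurb = "%s lines" % commaify( str( data_lines ) )
    if comment_lines:
        blurb = "%s, %s comments" % ( blurb, commaify( str( comment_lines ) ) )
    return blurb
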
diff -r 5c46679d0755 -r f4654abcec1e lib/galaxy/datatypes/xml.py
--- a/lib/galaxy/datatypes/xml.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/lib/galaxy/datatypes/xml.py  Fri Nov 20 11:03:41 2009 -0500
@@ -10,10 +10,10 @@
 class BlastXml( data.Text ):
     """NCBI Blast XML Output data"""
     file_ext = "blastxml"
-    def set_peek( self, dataset ):
+    def set_peek( self, dataset, is_multi_byte=False ):
         """Set the peek and blurb text"""
         if not dataset.dataset.purged:
-            dataset.peek = data.get_file_peek( dataset.file_name )
+            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
             dataset.blurb = 'NCBI Blast XML data'
         else:
             dataset.peek = 'file does not exist'
diff -r 5c46679d0755 -r f4654abcec1e lib/galaxy/jobs/__init__.py
--- a/lib/galaxy/jobs/__init__.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/lib/galaxy/jobs/__init__.py  Fri Nov 20 11:03:41 2009 -0500
@@ -549,12 +549,12 @@
                 try:
                     assert context.get( 'line_count', None ) is not None
                     if ( not dataset.datatype.composite_type and dataset.dataset.is_multi_byte() ) or self.tool.is_multi_byte:
-                        dataset.set_multi_byte_peek( line_count=context['line_count'] )
+                        dataset.set_peek( line_count=context['line_count'], is_multi_byte=True )
                     else:
                         dataset.set_peek( line_count=context['line_count'] )
                 except:
                     if ( not dataset.datatype.composite_type and dataset.dataset.is_multi_byte() ) or self.tool.is_multi_byte:
-                        dataset.set_multi_byte_peek()
+                        dataset.set_peek( is_multi_byte=True )
                     else:
                         dataset.set_peek()
                 try:
diff -r 5c46679d0755 -r f4654abcec1e lib/galaxy/jobs/runners/local.py
--- a/lib/galaxy/jobs/runners/local.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/lib/galaxy/jobs/runners/local.py  Fri Nov 20 11:03:41 2009 -0500
@@ -101,10 +101,9 @@
                 job_wrapper.fail( "failure running job", exception=True )
                 log.exception("failure running job %d" % job_wrapper.job_id)
                 return
-
         #run the metadata setting script here
-        #this is terminatable when output dataset/job is deleted
-        #so that long running set_meta()s can be cancelled without having to reboot the server
+        #this is terminate-able when output dataset/job is deleted
+        #so that long running set_meta()s can be canceled without having to reboot the server
         if job_wrapper.get_state() not in [ model.Job.states.ERROR, model.Job.states.DELETED ] and self.app.config.set_metadata_externally and job_wrapper.output_paths:
             external_metadata_script = job_wrapper.setup_external_metadata( output_fnames = job_wrapper.get_output_fnames(),
                                                                             set_extension = True,
diff -r 5c46679d0755 -r f4654abcec1e lib/galaxy/model/__init__.py
--- a/lib/galaxy/model/__init__.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/lib/galaxy/model/__init__.py  Fri Nov 20 11:03:41 2009 -0500
@@ -537,17 +537,13 @@
     def is_multi_byte( self ):
         """Data consists of multi-byte characters"""
         return self.dataset.is_multi_byte()
-    def set_peek( self ):
-        return self.datatype.set_peek( self )
-    def set_multi_byte_peek( self ):
-        return self.datatype.set_multi_byte_peek( self )
+    def set_peek( self, is_multi_byte=False ):
+        return self.datatype.set_peek( self, is_multi_byte=is_multi_byte )
     def init_meta( self, copy_from=None ):
         return self.datatype.init_meta( self, copy_from=copy_from )
     def set_meta( self, **kwd ):
         self.clear_associated_files( metadata_safe = True )
         return self.datatype.set_meta( self, **kwd )
-    def set_readonly_meta( self, **kwd ):
-        return self.datatype.set_readonly_meta( self, **kwd )
     def missing_meta( self, **kwd ):
         return self.datatype.missing_meta( self, **kwd )
     def as_display_type( self, type, **kwd ):
diff -r 5c46679d0755 -r f4654abcec1e lib/galaxy/model/migrate/versions/0005_cleanup_datasets_fix.py
--- a/lib/galaxy/model/migrate/versions/0005_cleanup_datasets_fix.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/lib/galaxy/model/migrate/versions/0005_cleanup_datasets_fix.py  Fri Nov 20 11:03:41 2009 -0500
@@ -227,17 +227,13 @@
     def get_mime( self ):
         """Returns the mime type of the data"""
         return datatypes_registry.get_mimetype_by_extension( self.extension.lower() )
-    def set_peek( self ):
-        return self.datatype.set_peek( self )
-    def set_multi_byte_peek( self ):
-        return self.datatype.set_multi_byte_peek( self )
+    def set_peek( self, is_multi_byte=False ):
+        return self.datatype.set_peek( self, is_multi_byte=is_multi_byte )
     def init_meta( self, copy_from=None ):
         return self.datatype.init_meta( self, copy_from=copy_from )
     def set_meta( self, **kwd ):
         self.clear_associated_files( metadata_safe = True )
         return self.datatype.set_meta( self, **kwd )
-    def set_readonly_meta( self, **kwd ):
-        return self.datatype.set_readonly_meta( self, **kwd )
     def missing_meta( self, **kwd ):
         return self.datatype.missing_meta( self, **kwd )
     def as_display_type( self, type, **kwd ):
diff -r 5c46679d0755 -r f4654abcec1e test/functional/test_get_data.py
--- a/test/functional/test_get_data.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/test/functional/test_get_data.py  Fri Nov 20 11:03:41 2009 -0500
@@ -294,7 +294,7 @@
                     .first()
         assert hda is not None, "Problem retrieving hda from database"
         self.verify_dataset_correctness( 'qualscores.qualsolid', hid=str( hda.hid ) )
-        self.check_history_for_string( '2.5 Kb, format: <span class="qualsolid">qualsolid</span>, database: \? Info: uploaded file' )
+        self.check_history_for_string( '48 lines, format: <span class="qualsolid">qualsolid</span>, database: \? Info: uploaded file' )
         self.check_metadata_for_string( 'Change data type value="qualsolid" selected="yes">qualsolid' )
         self.delete_history( id=self.security.encode_id( history.id ) )
     def test_0090_upload_file( self ):
@@ -312,7 +312,7 @@
                     .first()
         assert hda is not None, "Problem retrieving hda from database"
         self.verify_dataset_correctness( 'qualscores.qual454', hid=str( hda.hid ) )
-        self.check_history_for_string( '5.6 Kb, format: <span class="qual454">qual454</span>, database: \?' )
+        self.check_history_for_string( '49 lines, format: <span class="qual454">qual454</span>, database: \?' )
         self.check_metadata_for_string( 'Change data type value="qual454" selected="yes">qual454' )
         self.delete_history( id=self.security.encode_id( history.id ) )
     def test_0095_upload_file( self ):
@@ -483,7 +483,7 @@
                     .first()
         assert hda is not None, "Problem retrieving hda from database"
         self.verify_dataset_correctness( 'shrimp_cs_test1.csfasta', hid=str( hda.hid ) )
-        self.check_history_for_string( '162.6 Kb, format: <span class="csfasta">csfasta</span>, <td>>2_14_26_F3,-1282216.0</td>' )
+        self.check_history_for_string( '2,500 sequences, format: <span class="csfasta">csfasta</span>, <td>>2_14_26_F3,-1282216.0</td>' )
         self.check_metadata_for_string( 'value="shrimp_cs_test1.csfasta" value="\?" Change data type value="csfasta" selected="yes"' )
         self.delete_history( id=self.security.encode_id( history.id ) )
     def test_0140_upload_file( self ):
diff -r 5c46679d0755 -r f4654abcec1e test/functional/test_history_functions.py
--- a/test/functional/test_history_functions.py  Fri Nov 20 08:46:49 2009 -0500
+++ b/test/functional/test_history_functions.py  Fri Nov 20 11:03:41 2009 -0500
@@ -737,7 +737,7 @@
                                deleted_history_ids=deleted_history_ids )
         sa_session.refresh( history6 )
         if len( history6.datasets ) != 2:
-            raise AssertionError, "Copying hda1 to the current history failed"
+            raise AssertionError, "Copying hda1 to the current history failed, history 6 has %d datasets, but should have 2" % len( history6.datasets )
         # Test copying 1 hda to another history
         self.new_history( name=urllib.quote( 'copy history items - 2' ) )
         history7 = sa_session.query( galaxy.model.History ) \
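Condensed from the jobs/__init__.py hunk above, the caller side after the change; the helper name here is hypothetical, and the real code wraps the line_count lookup in try/except around an assert:

def set_dataset_peek( dataset, tool, context ):
    # One set_peek() entry point with an is_multi_byte flag replaces the
    # old set_peek()/set_multi_byte_peek() pair.
    is_multi_byte = ( not dataset.datatype.composite_type and dataset.dataset.is_multi_byte() ) \
                    or tool.is_multi_byte
    line_count = context.get( 'line_count', None )
    if line_count is not None:
        dataset.set_peek( line_count=line_count, is_multi_byte=is_multi_byte )
    else:
        dataset.set_peek( is_multi_byte=is_multi_byte )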