[galaxy-commits] galaxy-dist commit f94f1f2fa4be: Fixes for generating viewports.

23 Jul 2010

# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User Dan Blankenberg <dan@bx.psu.edu>
# Date 1279901436 14400
# Node ID f94f1f2fa4be20f8f90ffa9482a754f69256f449
# Parent  cecc290755014c02aa21906d433a3d39f2d657d5
Fixes for generating viewports.

Now, a maximum of 1MB is read from a line to determine the viewport; if a line greater than 10MB is encountered, then no viewport will be generated.

Additional fixes for certain cases when viewport was not determined properly for WIG tracks and GFF.

--- a/lib/galaxy/datatypes/interval.py
+++ b/lib/galaxy/datatypes/interval.py
@@ -38,6 +38,10 @@ for key, value in alias_spec.items():
     for elem in value:
         alias_helper[elem] = key
 
+#Constants for configuring viewport generation
+VIEWPORT_READLINE_BUFFER_SIZE = 1048576 #1MB
+VIEWPORT_MAX_READS_PER_LINE = 10 # If a line is greater than VIEWPORT_MAX_READS_PER_LINE * VIEWPORT_READLINE_BUFFER_SIZE bytes in size, then we will not generate a viewport for that dataset
+
 class Interval( Tabular ):
     """Tab delimited data containing interval information"""
     file_ext = "interval"
@@ -146,33 +150,55 @@ class Interval( Tabular ):
                 and dataset.metadata.endCol
         except:
             return False
-    def get_estimated_display_viewport( self, dataset ):
+    def get_estimated_display_viewport( self, dataset, chrom_col = None, start_col = None, end_col = None ):
         """Return a chrom, start, stop tuple for viewing a file."""
+        viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines
+        max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines
         if self.displayable( dataset ):
             try:
-                c, s, e = dataset.metadata.chromCol, dataset.metadata.startCol, dataset.metadata.endCol
-                c, s, e = int(c)-1, int(s)-1, int(e)-1
-                try:
-                    skipme = int(dataset.metadata.comment_lines)
-                except:
-                    skipme = 0
-                peek = []
-                for idx, line in enumerate(file(dataset.file_name)):
-                    if line[0] != '#':
-                        peek.append( line.rstrip( '\n\r' ).split() )
-                        if idx > 100 and idx > skipme: # viewport should have at least 100 features
-                            break
-                chr, start, stop = peek[skipme][c], int( peek[skipme][s] ), int( peek[skipme][e] )
-                for p in peek[(skipme+1):]:
-                    if p[0] == chr:
-                        start = min( start, int( p[s] ) )
-                        stop  = max( stop, int( p[e] ) )
-            except Exception, exc:
-                log.exception( str(exc) )
-                return ( None, None, None )
-            return (chr, str( start ), str( stop )) 
-        else:
-            return ( None, None, None )
+                chrom = None 
+                start = sys.maxint 
+                end = 0
+                if chrom_col is None:
+                    chrom_col = int( dataset.metadata.chromCol ) - 1
+                if start_col is None:
+                    start_col = int( dataset.metadata.startCol ) - 1
+                if end_col is None:
+                    end_col = int( dataset.metadata.endCol ) - 1
+                max_col = max( chrom_col, start_col, end_col )
+                fh = open( dataset.file_name )
+                while True:
+                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
+                    if not line: break #EOF
+                    if not line.startswith( '#' ):
+                        try:
+                            fields = line.rstrip().split( '\t' )
+                            if len( fields ) > max_col:
+                                if chrom is None or chrom == fields[ chrom_col ]:
+                                    start = min( start, int( fields[ start_col ] ) )
+                                    end = max( end, int( fields[ end_col ] ) )
+                                    chrom = fields[ chrom_col ] #set chrom last, in case start and end are not integers
+                                viewport_feature_count -= 1
+                        except Exception:
+                            #most likely a non-integer field has been encountered for start / stop
+                            continue
+                    #make sure we are at the next new line
+                    readline_count = VIEWPORT_MAX_READS_PER_LINE
+                    while line.rstrip( '\n\r' ) == line:
+                        assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
+                        line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
+                        if not line: break #EOF
+                        readline_count -= 1
+                    max_line_count -= 1
+                    if not viewport_feature_count or not max_line_count:
+                        #exceeded viewport or total line count to check
+                        break
+                if chrom is not None:
+                    return ( chrom, str( start ), str( end ) ) #Necessary to return strings?
+            except Exception, e:
+                #unexpected error, possibly missing metadata
+                log.exception( str( e ) )
+        return ( None, None, None ) #could not determine viewport
     def as_ucsc_display_file( self, dataset, **kwd ):
         """Returns file contents with only the bed data"""
         fd, temp_name = tempfile.mkstemp()
@@ -336,49 +362,11 @@ class BedGraph( Interval ):
         """
         return open( dataset.file_name )
         
-    def get_estimated_display_viewport( self, dataset ):
+    def get_estimated_display_viewport( self, dataset, chrom_col = 0, start_col = 1, end_col = 2 ):
         """
             Set viewport based on dataset's first 100 lines.
         """
-        if self.displayable( dataset ):
-            try:
-                # Set seqid, start, stop.
-                seqid = None
-                start = 2147483647  # Maximum value of a signed 32 bit integer ( 2**31 - 1 )
-                stop = 0
-                for i, line in enumerate( file( dataset.file_name ) ):
-                    line = line.rstrip( '\r\n' )
-                    if not line:
-                        continue
-                    elems = line.split('\t')
-                    if len( elems ) == 4:
-                        # Update seq id, start, end.
-                        if not seqid:
-                            # We can only set the viewport for a single chromosome
-                            seqid = elems[0]
-                        if seqid == elems[0]:
-                            # Make sure we have not spanned chromosomes
-                            start = min( start, int( elems[1] ) )
-                            stop = max( stop, int( elems[2] ) )
-                        else:
-                            # We've spanned a chromosome
-                            break
-                    else:
-                        continue
-                    # Only look through 100 lines.
-                    if i > 100:
-                        break
-                        
-                # Set valid values for start, stop if necessary.
-                if start == 2147483647:
-                    start = 0
-                if stop == 0:
-                    stop = 1
-                return ( seqid, str( start ), str( stop ) )
-            except Exception, exc:
-                log.exception( str( exc ) )
-                return ( None, None, None )
-        return ( None, None, None )
+        return Interval.get_estimated_display_viewport( self, dataset, chrom_col = chrom_col, start_col = start_col, end_col = end_col )
 
 class Bed( Interval ):
     """Tab delimited data in BED format"""
@@ -644,61 +632,75 @@ class Gff( Tabular, _RemoteCallMixin ):
         Return a chrom, start, stop tuple for viewing a file.  There are slight differences between gff 2 and gff 3
         formats.  This function should correctly handle both...
         """
+        viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines
+        max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines
         if self.displayable( dataset ):
             try:
-                seqid = ''
-                start = 2147483647  # Maximum value of a signed 32 bit integer ( 2**31 - 1 )
+                seqid = None 
+                start = sys.maxint 
                 stop = 0
-                for i, line in enumerate( file( dataset.file_name ) ):
-                    line = line.rstrip( '\r\n' )
-                    if not line:
-                        continue
-                    if line.startswith( '##sequence-region' ): # ##sequence-region IV 6000000 6030000
-                        elems = line.split()
-                        if len( elems ) > 3:
-                            # line looks like:
-                            # ##sequence-region   ctg123 1 1497228 
-                            seqid = elems[1] # IV
-                            start = elems[2] # 6000000
-                            stop = elems[3] # 6030000
-                            break
-                        elif len( elems ) == 2 and elems[1].find( '..' ) > 0:
-                            # line looks like this:
-                            # ##sequence-region X:120000..140000
-                            elems = elems[1].split( ':' )
-                            seqid = elems[0]
-                            start = elems[1].split( '..' )[0]
-                            stop = elems[1].split( '..' )[1]
-                            break
-                        else:
-                            log.exception( "line (%s) uses an unsupported ##sequence-region definition." % str( line ) )
-                            break
-                    # Allow UCSC style browser and track info in the GFF file
-                    if line.startswith("browser position"):
-                        pos_info = line.split()[-1]
-                        seqid, startend = pos_info.split(":")
-                        start, end = startend.split("-")
+                fh = open( dataset.file_name )
+                while True:
+                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
+                    if not line: break #EOF
+                    try:
+                        if line.startswith( '##sequence-region' ): # ##sequence-region IV 6000000 6030000
+                            elems = line.rstrip( '\n\r' ).split()
+                            if len( elems ) > 3:
+                                # line looks like:
+                                # ##sequence-region   ctg123 1 1497228 
+                                seqid = elems[1] # IV
+                                start = int( elems[2] )# 6000000
+                                stop = int( elems[3] ) # 6030000
+                                break #use location declared in file
+                            elif len( elems ) == 2 and elems[1].find( '..' ) > 0:
+                                # line looks like this:
+                                # ##sequence-region X:120000..140000
+                                elems = elems[1].split( ':' )
+                                seqid = elems[0]
+                                start = int( elems[1].split( '..' )[0] )
+                                stop = int( elems[1].split( '..' )[1] )
+                                break #use location declared in file
+                            else:
+                                log.exception( "line (%s) uses an unsupported ##sequence-region definition." % str( line ) )
+                                #break #no break, if bad definition, we try another method
+                        elif line.startswith("browser position"):
+                            # Allow UCSC style browser and track info in the GFF file
+                            pos_info = line.split()[-1]
+                            seqid, startend = pos_info.split(":")
+                            start, stop = map( int, startend.split("-") )
+                            break #use location declared in file
+                        elif True not in map( line.startswith, ( '#', 'track', 'browser' ) ):# line.startswith() does not accept iterator in python2.4
+                            viewport_feature_count -= 1
+                            elems = line.rstrip( '\n\r' ).split( '\t' )
+                            if len( elems ) > 3:
+                                if not seqid:
+                                    # We can only set the viewport for a single chromosome
+                                    seqid = elems[0]
+                                if seqid == elems[0]:
+                                    # Make sure we have not spanned chromosomes
+                                    start = min( start, int( elems[3] ) )
+                                    stop = max( stop, int( elems[4] ) )
+                    except:
+                        #most likely start/stop is not an int or not enough fields
+                        pass
+                    #make sure we are at the next new line
+                    readline_count = VIEWPORT_MAX_READS_PER_LINE
+                    while line.rstrip( '\n\r' ) == line:
+                        assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
+                        line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
+                        if not line: break #EOF
+                        readline_count -= 1
+                    max_line_count -= 1
+                    if not viewport_feature_count or not max_line_count:
+                        #exceeded viewport or total line count to check
                         break
-                    if not line.startswith(('#', 'track', 'browser')) :
-                        elems = line.split( '\t' )
-                        if not seqid:
-                            # We can only set the viewport for a single chromosome
-                            seqid = elems[0]
-                        if seqid == elems[0]:
-                            # Make sure we have not spanned chromosomes
-                            start = min( start, int( elems[3] ) )
-                            stop = max( stop, int( elems[4] ) )
-                        else:
-                            # We've spanned a chromosome
-                            break
-                    if i > 10:
-                        break
+                if seqid is not None:
+                    return ( seqid, str( start ), str( stop ) ) #Necessary to return strings?
             except Exception, e:
+                #unexpected error
                 log.exception( str( e ) )
-                return ( None, None, None )
-            return ( seqid, str( start ), str( stop ) )
-        else:
-            return ( None, None, None )
+        return ( None, None, None ) #could not determine viewport
     def ucsc_links( self, dataset, type, app, base_url ):
         ret_val = []
         seqid, start, stop = self.get_estimated_display_viewport( dataset )
@@ -963,44 +965,67 @@ class Wiggle( Tabular, _RemoteCallMixin 
         Tabular.__init__( self, **kwd )
         self.add_display_app( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
         self.add_display_app( 'gbrowse', 'display in Gbrowse', 'as_gbrowse_display_file', 'gbrowse_links' )
-
     def get_estimated_display_viewport( self, dataset ):
+        """Return a chrom, start, stop tuple for viewing a file."""
+        viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines
+        max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines
         if self.displayable( dataset ):
-            num_check_lines = 100 # only check up to this many non empty lines
-            vstart = None
-            vend = 0
-            vwig_chr = '?'
-            value = None
-            for i, line in enumerate( file( dataset.file_name ) ):
-                line = line.rstrip( '\r\n' )
-                if line:
-                    if line.startswith( "browser" ):
-                        chr_info = line.split()[-1]
-                        wig_chr, coords = chr_info.split( ":" )
-                        start, end = coords.split( "-" )
-                        value = ( wig_chr, start, end )
+            try:
+                chrom = None 
+                start = sys.maxint 
+                end = 0
+                span = 1
+                step = None
+                fh = open( dataset.file_name )
+                while True:
+                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
+                    if not line: break #EOF
+                    try:
+                        if line.startswith( "browser" ):
+                            chr_info = line.rstrip( '\n\r' ).split()[-1]
+                            chrom, coords = chr_info.split( ":" )
+                            start, end = map( int, coords.split( "-" ) )
+                            break # use the browser line
+                        # variableStep chrom=chr20
+                        if line and ( line.lower().startswith( "variablestep" ) or line.lower().startswith( "fixedstep" ) ):
+                            if chrom is not None: break #different chrom or different section of the chrom
+                            chrom = line.rstrip( '\n\r' ).split("chrom=")[1].split()[0]
+                            if 'span=' in line:
+                                span = int( line.rstrip( '\n\r' ).split("span=")[1].split()[0] )
+                            if 'step=' in line:
+                                step = int( line.rstrip( '\n\r' ).split("step=")[1].split()[0] )
+                                start = int( line.rstrip( '\n\r' ).split("start=")[1].split()[0] )
+                        else:
+                            fields = line.rstrip( '\n\r' ).split()
+                            if fields:
+                                if step is not None:
+                                    if not end:
+                                        end = start + span
+                                    else:
+                                        end += step
+                                else:
+                                    start = min( int( fields[0] ), start )
+                                    end = max( end, int( fields[0] ) + span )
+                                viewport_feature_count -= 1
+                    except:
+                        pass
+                    #make sure we are at the next new line
+                    readline_count = VIEWPORT_MAX_READS_PER_LINE
+                    while line.rstrip( '\n\r' ) == line:
+                        assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
+                        line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
+                        if not line: break #EOF
+                        readline_count -= 1
+                    max_line_count -= 1
+                    if not viewport_feature_count or not max_line_count:
+                        #exceeded viewport or total line count to check
                         break
-                    # variableStep chrom=chr20
-                    if line and (line.lower().startswith( "variablestep" ) or line.lower().startswith( "fixedstep" )):
-                        c = line.split("chr")[-1]
-                        c = c.split()[0]
-                        vwig_chr = 'chr%s' % c
-                    else:
-                        try:
-                            offset = line.split()[0]
-                            offset = int(offset)
-                            vend = max(vend,offset)
-                            if not vstart:
-                                vstart = offset # first
-                        except:
-                            pass                    
-                if i > num_check_lines:
-                    break
-            if value == None:
-                value = (vwig_chr, vstart, vend)
-            return value
-        else:
-            return ( None, None, None )
+                if chrom is not None:
+                    return ( chrom, str( start ), str( end ) ) #Necessary to return strings?
+            except Exception, e:
+                #unexpected error
+                log.exception( str( e ) )
+        return ( None, None, None ) #could not determine viewport
     def gbrowse_links( self, dataset, type, app, base_url ):
         ret_val = []
         chrom, start, stop = self.get_estimated_display_viewport( dataset )
@@ -1119,44 +1144,61 @@ class CustomTrack ( Tabular ):
     def display_peek( self, dataset ):
         """Returns formated html of peek"""
         return Tabular.make_html_table( self, dataset, skipchars=['track', '#'] )
-    def get_estimated_display_viewport( self, dataset ):
+    def get_estimated_display_viewport( self, dataset, chrom_col = None, start_col = None, end_col = None ):
+        """Return a chrom, start, stop tuple for viewing a file."""
+        #FIXME: only BED and WIG custom tracks are currently supported
+        #As per previously existing behavior, viewport will only be over the first intervals 
+        max_line_count = 100 # maximum number of lines to check; includes comment lines
+        variable_step_wig = False
+        chrom = None
+        span = 1
         if self.displayable( dataset ):
             try:
-                wiggle_format = False
-                for line in open(dataset.file_name):
-                    if (line.startswith("chr") or line.startswith("scaffold")):  
-                        line = line.rstrip( '\n\r' )
-                        start = line.split("\t")[1].replace(",","")   
-                        end = line.split("\t")[2].replace(",","")
-    
-                        if int(start) < int(end):
-                            value = ( line.split("\t")[0], start, end )
-                        else:
-                            value = ( line.split("\t")[0], end, start )
-    
+                fh = open( dataset.file_name )
+                while True:
+                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
+                    if not line: break #EOF
+                    if not line.startswith( '#' ):
+                        try:
+                            if variable_step_wig:
+                                fields = line.rstrip().split()
+                                if len( fields ) == 2:
+                                    start = int( fields[ 0 ] )
+                                    return ( chrom, str( start ), str( start + span ) )
+                            elif line and ( line.lower().startswith( "variablestep" ) or line.lower().startswith( "fixedstep" ) ):
+                                chrom = line.rstrip( '\n\r' ).split("chrom=")[1].split()[0]
+                                if 'span=' in line:
+                                    span = int( line.rstrip( '\n\r' ).split("span=")[1].split()[0] )
+                                if 'start=' in line:
+                                    start = int( line.rstrip( '\n\r' ).split("start=")[1].split()[0] )
+                                    return ( chrom, str( start ), str( start + span )  )
+                                else:
+                                    variable_step_wig = True 
+                            else:
+                                fields = line.rstrip().split( '\t' )
+                                if len( fields ) >= 3:
+                                    chrom = fields[ 0 ]
+                                    start = int( fields[ 1 ] )
+                                    end = int( fields[ 2 ] )
+                                    return ( chrom, str( start ), str( end ) )
+                        except Exception:
+                            #most likely a non-integer field has been encountered for start / stop
+                            continue
+                    #make sure we are at the next new line
+                    readline_count = VIEWPORT_MAX_READS_PER_LINE
+                    while line.rstrip( '\n\r' ) == line:
+                        assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
+                        line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
+                        if not line: break #EOF
+                        readline_count -= 1
+                    max_line_count -= 1
+                    if not max_line_count:
+                        #exceeded viewport or total line count to check
                         break
-    
-                    elif (line.startswith('variableStep')):
-                        # wiggle format
-                        wiggle_format = True
-                        wig_chr = line.split()[1].split('=')[1]
-                        if not wig_chr.startswith("chr"):
-                            value = ('', '', '')
-                            break
-                    elif wiggle_format:
-                        # wiggle format
-                        if line.split("\t")[0].isdigit():
-                            start = line.split("\t")[0]
-                            end = str(int(start) + 1)
-                            value = (wig_chr, start, end)
-                        else:
-                            value = (wig_chr, '', '')
-                        break       
-                return value #returns the co-ordinates of the 1st track/dataset
-            except:
-                return ( None, None, None )
-        else:
-            return ( None, None, None )
+            except Exception, e:
+                #unexpected error
+                log.exception( str( e ) )
+        return ( None, None, None ) #could not determine viewport
     def ucsc_links( self, dataset, type, app, base_url ):
         ret_val = []
         chrom, start, stop = self.get_estimated_display_viewport(dataset)

    

[galaxy-commits] galaxy-dist commit f94f1f2fa4be: Fixes for generating viewports.

commits-noreply＠bitbucket.org