commit/galaxy-central: greg: Move duplicate data type checker methods from sniff and upload into a new ~/datatypes/checkers.py.

3 Jun 2011

1 new changeset in galaxy-central:

http://bitbucket.org/galaxy/galaxy-central/changeset/c2bc73b017cb/
changeset:   c2bc73b017cb
branches:    
user:        greg
date:        2011-06-03 21:36:59
summary:     Move duplicate data type checker methods from sniff and upload into a new ~/datatypes/checkers.py.
affected #:  3 files (4.4 KB)

--- a/lib/galaxy/datatypes/sniff.py	Fri Jun 03 15:18:31 2011 -0400
+++ b/lib/galaxy/datatypes/sniff.py	Fri Jun 03 15:36:59 2011 -0400
@@ -4,6 +4,7 @@
 import logging, sys, os, csv, tempfile, shutil, re, zipfile, gzip
 import registry
 from galaxy import util
+from galaxy.datatypes.checkers import *
 from galaxy.datatypes.binary import unsniffable_binary_formats
 
 log = logging.getLogger(__name__)
@@ -319,59 +320,6 @@
         return 'tabular'    #default tabular data type file extension
     return 'txt'            #default text data type file extension
 
-
-#Methods Used below can be used to upload new datasets into Galaxy. Currently used by the data_source.py script/tools.
-#These should be further abstracted and merged with upload.py script/tool functionality.
-def is_gzip( filename ):
-    temp = open( filename, "U" )
-    magic_check = temp.read( 2 )
-    temp.close()
-    if magic_check != util.gzip_magic:
-        return False
-    return True
-
-
-def is_binary( filename ):
-    is_binary = False
-    temp = open( filename, "U" )
-    chars_read = 0
-    for chars in temp:
-        for char in chars:
-            chars_read += 1
-            if ord( char ) > 128:
-                is_binary = True
-                break
-            if chars_read > 100:
-                break
-        if chars_read > 100:
-            break
-    temp.close()
-    return is_binary
-
-def is_html( temp_name, chunk=None ):
-    if chunk is None:
-        temp = open(temp_name, "U")
-    else:
-        temp = chunk
-    regexp1 = re.compile( "<A\s+[^>]*HREF[^>]+>", re.I )
-    regexp2 = re.compile( "<IFRAME[^>]*>", re.I )
-    regexp3 = re.compile( "<FRAMESET[^>]*>", re.I )
-    regexp4 = re.compile( "<META[^>]*>", re.I )
-    regexp5 = re.compile( "<SCRIPT[^>]*>", re.I )
-    lineno = 0
-    for line in temp:
-        lineno += 1
-        matches = regexp1.search( line ) or regexp2.search( line ) or regexp3.search( line ) or regexp4.search( line ) or regexp5.search( line )
-        if matches:
-            if chunk is None:
-                temp.close()
-            return True
-        if lineno > 100:
-            break
-    if chunk is None:
-        temp.close()
-    return False
-
 def handle_compressed_file( filename, datatypes_registry, ext = 'auto' ):
     CHUNK_SIZE = 2**20 # 1Mb
     is_compressed = False
@@ -429,10 +377,10 @@
     if ext in AUTO_DETECT_EXTENSIONS:
         ext = guess_ext( filename, sniff_order = datatypes_registry.sniff_order, is_multi_byte=is_multi_byte )
     
-    if is_binary( filename ):
+    if check_binary( filename ):
         if ext not in unsniffable_binary_formats and not datatypes_registry.get_datatype_by_extension( ext ).sniff( filename ):
             raise InappropriateDatasetContentError, 'The binary uploaded file contains inappropriate content.'
-    elif is_html( filename ):
+    elif check_html( filename ):
         raise InappropriateDatasetContentError, 'The uploaded file contains inappropriate HTML content.'
     return ext
 
@@ -449,4 +397,3 @@
 if __name__ == '__main__':
     import doctest, sys
     doctest.testmod(sys.modules[__name__])
-    


--- a/tools/data_source/upload.py	Fri Jun 03 15:18:31 2011 -0400
+++ b/tools/data_source/upload.py	Fri Jun 03 15:36:59 2011 -0400
@@ -8,6 +8,7 @@
 from galaxy import eggs
 # need to import model before sniff to resolve a circular import dependency
 import galaxy.model
+from galaxy.datatypes.checkers import *
 from galaxy.datatypes import sniff
 from galaxy.datatypes.binary import *
 from galaxy.datatypes.images import Pdf
@@ -48,104 +49,20 @@
         return [safe_dict(x) for x in d]
     else:
         return d
-def check_html( temp_name, chunk=None ):
-    if chunk is None:
-        temp = open(temp_name, "U")
-    else:
-        temp = chunk
-    regexp1 = re.compile( "<A\s+[^>]*HREF[^>]+>", re.I )
-    regexp2 = re.compile( "<IFRAME[^>]*>", re.I )
-    regexp3 = re.compile( "<FRAMESET[^>]*>", re.I )
-    regexp4 = re.compile( "<META[^>]*>", re.I )
-    regexp5 = re.compile( "<SCRIPT[^>]*>", re.I )
-    lineno = 0
-    for line in temp:
-        lineno += 1
-        matches = regexp1.search( line ) or regexp2.search( line ) or regexp3.search( line ) or regexp4.search( line ) or regexp5.search( line )
-        if matches:
-            if chunk is None:
-                temp.close()
-            return True
-        if lineno > 100:
-            break
-    if chunk is None:
-        temp.close()
-    return False
-def check_binary( temp_name ):
-    is_binary = False
-    temp = open( temp_name, "U" )
-    chars_read = 0
-    for chars in temp:
-        for char in chars:
-            chars_read += 1
-            if ord( char ) > 128:
-                is_binary = True
-                break
-            if chars_read > 100:
-                break
-        if chars_read > 100:
-            break
-    temp.close()
-    return is_binary
-def check_bam( temp_name ):
-    return Bam().sniff( temp_name )
-def check_sff( temp_name ):
-    return Sff().sniff( temp_name )
-def check_pdf( temp_name ):
-    return Pdf().sniff( temp_name )
-def check_bigwig( temp_name ):
-    return BigWig().sniff( temp_name )
-def check_bigbed( temp_name ):
-    return BigBed().sniff( temp_name )
-def check_gzip( temp_name ):
-    # This method returns a tuple of booleans representing ( is_gzipped, is_valid )
-    # Make sure we have a gzipped file
-    try:
-        temp = open( temp_name, "U" )
-        magic_check = temp.read( 2 )
-        temp.close()
-        if magic_check != util.gzip_magic:
-            return ( False, False )
-    except:
-        return ( False, False )
-    # We support some binary data types, so check if the compressed binary file is valid
-    # If the file is Bam, it should already have been detected as such, so we'll just check
-    # for sff format.
-    try:
-        header = gzip.open( temp_name ).read(4)
-        if binascii.b2a_hex( header ) == binascii.hexlify( '.sff' ):
-            return ( True, True )
-    except:
-        return( False, False )
-    CHUNK_SIZE = 2**15 # 32Kb
-    gzipped_file = gzip.GzipFile( temp_name, mode='rb' )
-    chunk = gzipped_file.read( CHUNK_SIZE )
-    gzipped_file.close()
-    # See if we have a compressed HTML file
-    if check_html( temp_name, chunk=chunk ):
-        return ( True, False )
-    return ( True, True )
-def check_bz2( temp_name ):
-    try:
-        temp = open( temp_name, "U" )
-        magic_check = temp.read( 3 )
-        temp.close()
-        if magic_check != util.bz2_magic:
-            return ( False, False )
-    except:
-        return( False, False )
-    CHUNK_SIZE = 2**15 # reKb
-    bzipped_file = bz2.BZ2File( temp_name, mode='rb' )
-    chunk = bzipped_file.read( CHUNK_SIZE )
-    bzipped_file.close()
-    # See if we have a compressed HTML file
-    if check_html( temp_name, chunk=chunk ):
-        return ( True, False )
-    return ( True, True )
-def check_zip( temp_name ):
-    if zipfile.is_zipfile( temp_name ):
-        return True
-    return False
+def check_bam( file_path ):
+    return Bam().sniff( file_path )
+
+def check_sff( file_path ):
+    return Sff().sniff( file_path )
+
+def check_pdf( file_path ):
+    return Pdf().sniff( file_path )
+
+def check_bigwig( file_path ):
+    return BigWig().sniff( file_path )
+
+def check_bigbed( file_path ):
+    return BigBed().sniff( file_path )
 def parse_outputs( args ):
     rval = {}
     for arg in args:

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving
this because the commit notification service is enabled and this email
address is listed as a recipient.

    

Bitbucket

tags

participants (1)