[galaxy-commits] galaxy-dist commit 150c8db8dec1: Remove obsolete binseq.zip and txtseq.zip formats, and allow for uploading single files in a zip archive. Adapted from a patch from Pablo Cingolani.

29 Jun 2010

# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User Nate Coraor <nate@bx.psu.edu>
# Date 1277326093 14400
# Node ID 150c8db8dec1f36d42baea62c895f52d983d9e60
# Parent  5cde0b6269e320c2bd769222cbd40f2e8956b7c5
Remove obsolete binseq.zip and txtseq.zip formats, and allow for uploading single files in a zip archive.  Adapted from a patch from Pablo Cingolani.

--- a/tools/metag_tools/short_reads_trim_seq.xml
+++ b/tools/metag_tools/short_reads_trim_seq.xml
@@ -6,8 +6,8 @@
 </command><inputs><page>
-    <param name="input1" type="data" format="fasta,txtseq.zip" label="Reads" />
-    <param name="input2" type="data" format="qualsolexa,qual454,txtseq.zip" label="Quality scores" />
+    <param name="input1" type="data" format="fasta" label="Reads" />
+    <param name="input2" type="data" format="qualsolexa,qual454" label="Quality scores" /><param name="trim" type="integer" size="5" value="20" label="Minimal quality score" help="bases scoring below this value will trigger splitting"/><param name="length" type="integer" size="5" value="100" label="Minimal length of contiguous segment" help="report all high quality segments above this length. Setting this option to '0' will cause the program to return a single longest run of high quality bases per read" /><conditional name="sequencing_method_choice">

--- a/tools/metag_tools/short_reads_trim_seq.py
+++ b/tools/metag_tools/short_reads_trim_seq.py
@@ -5,7 +5,7 @@ input: read file and quality score file
 output: trimmed read file
 """
 
-import os, sys, math, tempfile, zipfile, re
+import os, sys, math, tempfile, re
 
 assert sys.version_info[:2] >= ( 2, 4 )
 
@@ -13,14 +13,6 @@ def stop_err( msg ):
     sys.stderr.write( "%s\n" % msg )
     sys.exit()
 
-def unzip( filename ):
-    zip_file = zipfile.ZipFile( filename, 'r' )
-    tmpfilename = tempfile.NamedTemporaryFile().name
-    for name in zip_file.namelist():
-        file( tmpfilename, 'a' ).write( zip_file.read( name ) )
-    zip_file.close()
-    return tmpfilename
-
 def append_to_outfile( outfile_name, seq_title, segments ):
     segments = segments.split( ',' )
     if len( segments ) > 1:
@@ -91,16 +83,8 @@ def __main__():
     infile_score_name = sys.argv[5].strip()
     arg = sys.argv[6].strip()
 
-    infile_seq_is_zipped = False
-    if zipfile.is_zipfile( infile_seq_name ):
-        infile_seq_is_zipped = True
-        seq_infile_name = unzip( infile_seq_name ) 
-    else: seq_infile_name = infile_seq_name
-    infile_score_is_zipped = False
-    if zipfile.is_zipfile( infile_score_name ):
-        infile_score_is_zipped = True 
-        score_infile_name = unzip(infile_score_name)
-    else: score_infile_name = infile_score_name
+    seq_infile_name = infile_seq_name
+    score_infile_name = infile_score_name
     
 
     # Determine quailty score format: tabular or fasta format within the first 100 lines
@@ -247,10 +231,4 @@ def __main__():
     else:
         stop_err( "Cannot locate sequence file '%s'or score file '%s'." % ( seq_infile_name, score_infile_name ) )    
 
-    # Need to delete temporary files created when we unzipped the input file archives                    
-    if infile_seq_is_zipped and os.path.exists( seq_infile_name ):
-        os.remove( seq_infile_name )
-    if infile_score_is_zipped and os.path.exists( score_infile_name ):
-        os.remove( score_infile_name )
-    
 if __name__ == "__main__": __main__()

--- a/datatypes_conf.xml.sample
+++ b/datatypes_conf.xml.sample
@@ -25,7 +25,6 @@
             <converter file="bed_to_genetrack_converter.xml" target_datatype="genetrack"/></datatype><datatype extension="bed12" type="galaxy.datatypes.interval:Bed12" />
-        <datatype extension="binseq.zip" type="galaxy.datatypes.binary:Binseq" mimetype="application/zip" display_in_upload="true"/><datatype extension="len" type="galaxy.datatypes.chrominfo:ChromInfo" display_in_upload="true"><!-- no converters yet --></datatype>
@@ -97,7 +96,6 @@
         <datatype extension="tabular" type="galaxy.datatypes.tabular:Tabular" display_in_upload="true"/><datatype extension="txt" type="galaxy.datatypes.data:Text" display_in_upload="true"/><datatype extension="blastxml" type="galaxy.datatypes.xml:BlastXml" display_in_upload="true"/>
-        <datatype extension="txtseq.zip" type="galaxy.datatypes.data:Txtseq" mimetype="application/zip" display_in_upload="true"/><datatype extension="velvet" type="galaxy.datatypes.assembly:Velvet" display_in_upload="false"/><datatype extension="wig" type="galaxy.datatypes.interval:Wiggle" display_in_upload="true"><converter file="wiggle_to_array_tree_converter.xml" target_datatype="array_tree"/>

--- a/tools/metag_tools/short_reads_figure_score.xml
+++ b/tools/metag_tools/short_reads_figure_score.xml
@@ -5,7 +5,7 @@
 
 <inputs><page>
-    <param name="input1" type="data" format="qualsolexa, qual454, txtseq.zip" label="Quality score file" help="No dataset? Read tip below"/>
+    <param name="input1" type="data" format="qualsolexa, qual454" label="Quality score file" help="No dataset? Read tip below"/></page></inputs>
 

--- a/tools/data_source/upload.xml
+++ b/tools/data_source/upload.xml
@@ -63,12 +63,6 @@ A binary file compressed in the BGZF for
 
 -----
 
-**Binseq.zip**
-
-A zipped archive consisting of binary sequence files in either 'ab1' or 'scf' format.  All files in this archive must have the same file extension which is one of '.ab1' or '.scf'.  You must manually select this 'File Format' when uploading the file.
-
------
-
 **Bed**
 
 * Tab delimited format (tabular)
@@ -199,12 +193,6 @@ Any data in tab delimited format (tabula
 
 -----
 
-**Txtseq.zip**
-
-A zipped archive consisting of flat text sequence files.  All files in this archive must have the same file extension of '.txt'.  You must manually select this 'File Format' when uploading the file.
-
------
-
 **Wig**
 
 The wiggle format is line-oriented.  Wiggle data is preceded by a track definition line, which adds a number of options for controlling the default display of this track.

--- a/tools/metag_tools/megablast_xml_parser.xml
+++ b/tools/metag_tools/megablast_xml_parser.xml
@@ -18,12 +18,6 @@
 </tests><help>
 
-.. class:: warningmark 
-
-Blast XML output **must** be uploaded to Galaxy in zipped form.
- 
------
-	
 **What it does**
 
 This tool processes the XML output of any NCBI blast tool (if you run your own blast jobs, the XML output can be generated with **-m 7** option).

--- a/tools/metag_tools/short_reads_figure_score.py
+++ b/tools/metag_tools/short_reads_figure_score.py
@@ -8,7 +8,7 @@ boxplot:
 - The smallest/largest value that is not an outlier is connected to the box by with a horizontal line.
 """
 
-import os, sys, math, tempfile, zipfile, re
+import os, sys, math, tempfile, re
 from rpy import *
 
 assert sys.version_info[:2] >= ( 2, 4 )
@@ -17,14 +17,6 @@ def stop_err( msg ):
     sys.stderr.write( "%s\n" % msg )
     sys.exit()
 
-def unzip( filename ):
-    zip_file = zipfile.ZipFile( filename, 'r' )
-    tmpfilename = tempfile.NamedTemporaryFile().name
-    for name in zip_file.namelist():
-        file( tmpfilename, 'a' ).write( zip_file.read( name ) )
-    zip_file.close()
-    return tmpfilename
-
 def merge_to_20_datapoints( score ):
     number_of_points = 20
     read_length = len( score )
@@ -68,12 +60,7 @@ def __main__():
     infile_score_name = sys.argv[1].strip()
     outfile_R_name = sys.argv[2].strip()
 
-    infile_is_zipped = False
-    if zipfile.is_zipfile( infile_score_name ):
-        infile_is_zipped = True
-        infile_name = unzip( infile_score_name )
-    else:
-        infile_name = infile_score_name
+    infile_name = infile_score_name
 
     # Determine tabular or fasta format within the first 100 lines
     seq_method = None
@@ -249,10 +236,6 @@ def __main__():
         r.axis( 1, x_old_range, x_new_range )
     r.dev_off()
 
-    if infile_is_zipped and os.path.exists( infile_name ):
-        # Need to delete temporary file created when we unzipped the infile archive
-        os.remove( infile_name )
-
     if invalid_scores > 0:
         print 'Skipped %d invalid scores. ' % invalid_scores
     if invalid_lines > 0:

--- a/tools/data_source/upload.py
+++ b/tools/data_source/upload.py
@@ -114,24 +114,9 @@ def check_gzip( temp_name ):
         return ( True, False )
     return ( True, True )
 def check_zip( temp_name ):
-    if not zipfile.is_zipfile( temp_name ):
-        return ( False, False, None )
-    zip_file = zipfile.ZipFile( temp_name, "r" )
-    # Make sure the archive consists of valid files.  The current rules are:
-    # 1. Archives can only include .ab1, .scf or .txt files
-    # 2. All file extensions within an archive must be the same
-    name = zip_file.namelist()[0]
-    try:
-        test_ext = name.split( "." )[1].strip().lower()
-    except:
-        return ( True, False, None )
-    if not ( test_ext in unsniffable_binary_formats or test_ext == 'txt' ):
-        return ( True, False, test_ext )
-    for name in zip_file.namelist():
-        ext = name.split( "." )[1].strip().lower()
-        if ext != test_ext:
-            return ( True, False, test_ext )
-    return ( True, True, test_ext )
+    if zipfile.is_zipfile( temp_name ):
+        return True
+    return False
 def parse_outputs( args ):
     rval = {}
     for arg in args:
@@ -142,6 +127,7 @@ def add_file( dataset, json_file, output
     data_type = None
     line_count = None
     converted_path = None
+    stdout = None
 
     if dataset.type == 'url':
         try:
@@ -204,24 +190,29 @@ def add_file( dataset, json_file, output
             data_type = 'gzip'
         if not data_type:
             # See if we have a zip archive
-            is_zipped, is_valid, test_ext = check_zip( dataset.path )
-            if is_zipped and not is_valid:
-                file_err( 'The zipped uploaded file contains inappropriate content', dataset, json_file )
-                return
-            elif is_zipped and is_valid:
-                # Currently, we force specific tools to handle this case.  We also require the user
-                # to manually set the incoming file_type
-                if ( test_ext in unsniffable_binary_formats ) and dataset.file_type != 'binseq.zip':
-                    file_err( "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'", dataset, json_file )
-                    return
-                elif test_ext == 'txt' and dataset.file_type != 'txtseq.zip':
-                    file_err( "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'", dataset, json_file )
-                    return
-                if not ( dataset.file_type == 'binseq.zip' or dataset.file_type == 'txtseq.zip' ):
-                    file_err( "You must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files", dataset, json_file )
-                    return
-                data_type = 'zip'
-                ext = dataset.file_type
+            is_zipped = check_zip( dataset.path )
+            if is_zipped:
+                unzipped = False
+                z = zipfile.ZipFile( dataset.path )
+                for name in z.namelist():
+                    if name.endswith('/'):
+                        continue
+                    if unzipped:
+                        stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
+                        break
+                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ), text=False )
+                    try:
+                        outfile = open( uncompressed, 'wb' )
+                        outfile.write( z.read( name ) )
+                        outfile.close()
+                        shutil.move( uncompressed, dataset.path )
+                        dataset.name = name
+                        unzipped = True
+                    except IOError:
+                        os.close( fd )
+                        os.remove( uncompressed )
+                        file_err( 'Problem decompressing zipped data', dataset, json_file )
+                        return
         if not data_type:
             if check_binary( dataset.path ):
                 # We have a binary dataset, but it is not Bam or Sff
@@ -242,7 +233,7 @@ def add_file( dataset, json_file, output
             if check_html( dataset.path ):
                 file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
                 return
-        if data_type != 'binary' and data_type != 'zip':
+        if data_type != 'binary':
             # don't convert newlines on data we're only going to symlink
             if not dataset.get( 'link_data_only', False ):
                 in_place = True
@@ -278,10 +269,11 @@ def add_file( dataset, json_file, output
     else:
         shutil.move( dataset.path, output_path )
     # Write the job info
+    stdout = stdout or 'uploaded %s file' % data_type
     info = dict( type = 'dataset',
                  dataset_id = dataset.dataset_id,
                  ext = ext,
-                 stdout = 'uploaded %s file' % data_type,
+                 stdout = stdout,
                  name = dataset.name,
                  line_count = line_count )
     json_file.write( to_json_string( info ) + "\n" )

--- a/tools/solid_tools/solid_qual_stats.xml
+++ b/tools/solid_tools/solid_qual_stats.xml
@@ -3,7 +3,7 @@
     <command interpreter="python">solid_qual_stats.py $input $output1</command><inputs>
-        <param format="qualsolid, txtseq.zip" name="input" type="data" label="SOLiD qual file" help="If your dataset doesn't show up in the menu, click the pencil icon next to your dataset and set the datatype to 'qualsolid'" />
+        <param format="qualsolid" name="input" type="data" label="SOLiD qual file" help="If your dataset doesn't show up in the menu, click the pencil icon next to your dataset and set the datatype to 'qualsolid'" /></inputs><outputs><data format="txt" name="output1" metadata_source="input" />

    

[galaxy-commits] galaxy-dist commit 150c8db8dec1: Remove obsolete binseq.zip and txtseq.zip formats, and allow for uploading single files in a zip archive. Adapted from a patch from Pablo Cingolani.

commits-noreply＠bitbucket.org