[galaxy-commits] commit/galaxy-central: 6 new changesets

27 Dec 2014

6 new commits in galaxy-central:

https://bitbucket.org/galaxy/galaxy-central/commits/919dd242fd62/
Changeset:   919dd242fd62
Branch:      arff_datatype
User:        BjoernGruening
Date:        2014-12-16 11:40:29+00:00
Summary:     Created new branch arff_datatype
Affected #:  0 files



https://bitbucket.org/galaxy/galaxy-central/commits/807b1e8a7815/
Changeset:   807b1e8a7815
Branch:      arff_datatype
User:        BjoernGruening
Date:        2014-12-16 12:08:50+00:00
Summary:     Add Arff datatype class with sniffer and metadata.
Affected #:  1 file

diff -r 919dd242fd625a71d26b3935cabfbdb9b48c97b4 -r 807b1e8a7815f1514712146e909a508acc31a93d lib/galaxy/datatypes/text.py

--- a/lib/galaxy/datatypes/text.py
+++ b/lib/galaxy/datatypes/text.py
@@ -6,6 +6,7 @@
 from galaxy.datatypes.data import Text
 from galaxy.datatypes.data import get_file_peek
 from galaxy.datatypes.data import nice_size
+from galaxy.datatypes.metadata import MetadataElement
 from galaxy import util
 
 import tempfile
@@ -155,3 +156,106 @@
                         return True
         return False
 
+
+class Arff( Text ):
+    """
+        An ARFF (Attribute-Relation File Format) file is an ASCII text file that describes a list of instances sharing a set of attributes. 
+        http://weka.wikispaces.com/ARFF
+    """
+    file_ext = "arff"
+
+
+    """Add metadata elements"""
+    MetadataElement( name="comment_lines", default=0, desc="Number of comment lines", readonly=True, optional=True, no_value=0 )
+    MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=True, no_value=0 )
+
+    def set_peek( self, dataset, is_multi_byte=False ):
+        if not dataset.dataset.purged:
+            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
+            dataset.blurb = "Attribute-Relation File Format (ARFF)"
+            dataset.blurb += ", %s comments, %s attributes" % ( dataset.metadata.comment_lines, dataset.metadata.columns )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disc'
+
+    def sniff( self, filename ):
+        """
+            Try to guess the Arff filetype. 
+            An ARFF file contains "@RELATION" and "@ATTRIBUTE" declarations followed by a "@DATA" section.
+        """
+        with open( filename ) as handle:
+            relation_found = False
+            attribute_found = False
+            prefix = ""
+            for line_count, line in enumerate( handle ):
+                if line_count > 1000:
+                    # only investigate the first 1000 lines
+                    return False
+                line = line.strip()
+                if not line:
+                    continue
+
+                start_string = line[:20].upper()
+                if start_string.startswith("@RELATION"):
+                    relation_found = True
+                elif start_string.startswith("@ATTRIBUTE"):
+                    attribute_found = True
+                elif start_string.startswith("@DATA"):
+                    # @DATA should be the last data block
+                    if relation_found and attribute_found:
+                        return True
+        return False
+
+    def set_meta( self, dataset, **kwd ):
+        """
+            Trying to count the comment lines and the number of columns included.
+            A typical ARFF data block looks like this:
+            @DATA
+            5.1,3.5,1.4,0.2,Iris-setosa
+            4.9,3.0,1.4,0.2,Iris-setosa
+        """
+        if dataset.has_data():
+            comment_lines = 0
+            first_real_line = False
+            data_block = False
+            with open( dataset.file_name ) as handle:
+                for line in handle:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    if line.startswith('%') and not first_real_line:
+                        comment_lines += 1
+                    else:
+                        first_real_line = True
+                    if data_block:
+                        if line.startswith('{'):
+                            # Sparse representation
+                            """
+                                @data
+                                0, X, 0, Y, "class A", {5}
+                            or
+                                @data
+                                {1 X, 3 Y, 4 "class A"}, {5}
+                            """
+                            token = line.split('}',1)
+                            first_part = token[0]
+                            last_column = first_part.split(',')[-1].strip()
+                            numeric_value = last_column.split()[0]
+                            column_count = int(numeric_value)
+                            if len(token) > 1:
+                                # we have an additional weight
+                                column_count -= 1
+                        else:
+                            columns = line.strip().split(',')
+                            column_count = len(columns)
+                            if columns[-1].strip().startswith('{'):
+                                # we have an additional weight at the end
+                                column_count -= 1
+
+                        # We have now the column_count and we know the initial comment lines. So we can terminate here.
+                        break
+                    if line[:5].upper() == "@DATA":
+                        data_block = True
+        dataset.metadata.comment_lines = comment_lines
+        dataset.metadata.columns = column_count
+


https://bitbucket.org/galaxy/galaxy-central/commits/a876fb01dba3/
Changeset:   a876fb01dba3
Branch:      arff_datatype
User:        BjoernGruening
Date:        2014-12-16 12:16:00+00:00
Summary:     Add Arff datatype and sniffer to datatypes_conf.xml.
Affected #:  1 file

diff -r 807b1e8a7815f1514712146e909a508acc31a93d -r a876fb01dba3595d68ae99fea8e2070fa3ffad55 config/datatypes_conf.xml.sample
--- a/config/datatypes_conf.xml.sample
+++ b/config/datatypes_conf.xml.sample
@@ -3,6 +3,7 @@
   <registration converters_path="lib/galaxy/datatypes/converters" display_path="display_applications"><datatype extension="ab1" type="galaxy.datatypes.binary:Ab1" mimetype="application/octet-stream" display_in_upload="true" description="A binary sequence file in 'ab1' format with a '.ab1' file extension.  You must manually select this 'File Format' when uploading the file." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Ab1"/><datatype extension="afg" type="galaxy.datatypes.assembly:Amos" display_in_upload="false" />
+    <!--datatype extension="arff" type="galaxy.datatypes.text:Arff" mimetype="text/plain" display_in_upload="True"/--><datatype extension="asn1" type="galaxy.datatypes.data:GenericAsn1" mimetype="text/plain" display_in_upload="true" /><datatype extension="asn1-binary" type="galaxy.datatypes.binary:GenericAsn1Binary" mimetype="application/octet-stream" display_in_upload="true" /><datatype extension="axt" type="galaxy.datatypes.sequence:Axt" display_in_upload="true" description="blastz pairwise alignment format.  Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.  Blocks are separated from one another by blank lines.  The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Axt"/>
@@ -298,6 +299,7 @@
     <sniffer type="galaxy.datatypes.data:Newick"/><sniffer type="galaxy.datatypes.data:Nexus"/><sniffer type="galaxy.datatypes.text:Obo"/>
+    <!--sniffer type="galaxy.datatypes.text.Arff"/--><sniffer type="galaxy.datatypes.text:Ipynb"/><sniffer type="galaxy.datatypes.text:Json"/><sniffer type="galaxy.datatypes.images:Jpg"/>


https://bitbucket.org/galaxy/galaxy-central/commits/d91d8e8beada/
Changeset:   d91d8e8beada
Branch:      arff_datatype
User:        BjoernGruening
Date:        2014-12-23 00:27:36+00:00
Summary:     Activate arff datatype by default
Affected #:  1 file

diff -r a876fb01dba3595d68ae99fea8e2070fa3ffad55 -r d91d8e8beadac2a187645f86c199119064a5ccb0 config/datatypes_conf.xml.sample
--- a/config/datatypes_conf.xml.sample
+++ b/config/datatypes_conf.xml.sample
@@ -3,7 +3,7 @@
   <registration converters_path="lib/galaxy/datatypes/converters" display_path="display_applications"><datatype extension="ab1" type="galaxy.datatypes.binary:Ab1" mimetype="application/octet-stream" display_in_upload="true" description="A binary sequence file in 'ab1' format with a '.ab1' file extension.  You must manually select this 'File Format' when uploading the file." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Ab1"/><datatype extension="afg" type="galaxy.datatypes.assembly:Amos" display_in_upload="false" />
-    <!--datatype extension="arff" type="galaxy.datatypes.text:Arff" mimetype="text/plain" display_in_upload="True"/-->
+    <datatype extension="arff" type="galaxy.datatypes.text:Arff" mimetype="text/plain" display_in_upload="True" /><datatype extension="asn1" type="galaxy.datatypes.data:GenericAsn1" mimetype="text/plain" display_in_upload="true" /><datatype extension="asn1-binary" type="galaxy.datatypes.binary:GenericAsn1Binary" mimetype="application/octet-stream" display_in_upload="true" /><datatype extension="axt" type="galaxy.datatypes.sequence:Axt" display_in_upload="true" description="blastz pairwise alignment format.  Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.  Blocks are separated from one another by blank lines.  The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Axt"/>
@@ -299,7 +299,7 @@
     <sniffer type="galaxy.datatypes.data:Newick"/><sniffer type="galaxy.datatypes.data:Nexus"/><sniffer type="galaxy.datatypes.text:Obo"/>
-    <!--sniffer type="galaxy.datatypes.text.Arff"/-->
+    <sniffer type="galaxy.datatypes.text.Arff"/><sniffer type="galaxy.datatypes.text:Ipynb"/><sniffer type="galaxy.datatypes.text:Json"/><sniffer type="galaxy.datatypes.images:Jpg"/>


https://bitbucket.org/galaxy/galaxy-central/commits/63ec80be64bb/
Changeset:   63ec80be64bb
User:        jmchilton
Date:        2014-12-26 22:33:02+00:00
Summary:     Merge pull request #614.
Affected #:  2 files

diff -r 338a32cc6067d92055324e4b827697051ec99254 -r 63ec80be64bb00414be0f7122a51907114ffb714 config/datatypes_conf.xml.sample
--- a/config/datatypes_conf.xml.sample
+++ b/config/datatypes_conf.xml.sample
@@ -3,6 +3,7 @@
   <registration converters_path="lib/galaxy/datatypes/converters" display_path="display_applications"><datatype extension="ab1" type="galaxy.datatypes.binary:Ab1" mimetype="application/octet-stream" display_in_upload="true" description="A binary sequence file in 'ab1' format with a '.ab1' file extension.  You must manually select this 'File Format' when uploading the file." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Ab1"/><datatype extension="afg" type="galaxy.datatypes.assembly:Amos" display_in_upload="false" />
+    <datatype extension="arff" type="galaxy.datatypes.text:Arff" mimetype="text/plain" display_in_upload="True" /><datatype extension="asn1" type="galaxy.datatypes.data:GenericAsn1" mimetype="text/plain" display_in_upload="true" /><datatype extension="asn1-binary" type="galaxy.datatypes.binary:GenericAsn1Binary" mimetype="application/octet-stream" display_in_upload="true" /><datatype extension="axt" type="galaxy.datatypes.sequence:Axt" display_in_upload="true" description="blastz pairwise alignment format.  Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.  Blocks are separated from one another by blank lines.  The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Axt"/>
@@ -298,6 +299,7 @@
     <sniffer type="galaxy.datatypes.data:Newick"/><sniffer type="galaxy.datatypes.data:Nexus"/><sniffer type="galaxy.datatypes.text:Obo"/>
+    <sniffer type="galaxy.datatypes.text.Arff"/><sniffer type="galaxy.datatypes.text:Ipynb"/><sniffer type="galaxy.datatypes.text:Json"/><sniffer type="galaxy.datatypes.images:Jpg"/>

diff -r 338a32cc6067d92055324e4b827697051ec99254 -r 63ec80be64bb00414be0f7122a51907114ffb714 lib/galaxy/datatypes/text.py
--- a/lib/galaxy/datatypes/text.py
+++ b/lib/galaxy/datatypes/text.py
@@ -6,6 +6,7 @@
 from galaxy.datatypes.data import Text
 from galaxy.datatypes.data import get_file_peek
 from galaxy.datatypes.data import nice_size
+from galaxy.datatypes.metadata import MetadataElement
 from galaxy import util
 
 import tempfile
@@ -155,3 +156,106 @@
                         return True
         return False
 
+
+class Arff( Text ):
+    """
+        An ARFF (Attribute-Relation File Format) file is an ASCII text file that describes a list of instances sharing a set of attributes. 
+        http://weka.wikispaces.com/ARFF
+    """
+    file_ext = "arff"
+
+
+    """Add metadata elements"""
+    MetadataElement( name="comment_lines", default=0, desc="Number of comment lines", readonly=True, optional=True, no_value=0 )
+    MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=True, no_value=0 )
+
+    def set_peek( self, dataset, is_multi_byte=False ):
+        if not dataset.dataset.purged:
+            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
+            dataset.blurb = "Attribute-Relation File Format (ARFF)"
+            dataset.blurb += ", %s comments, %s attributes" % ( dataset.metadata.comment_lines, dataset.metadata.columns )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disc'
+
+    def sniff( self, filename ):
+        """
+            Try to guess the Arff filetype. 
+            An ARFF file contains "@RELATION" and "@ATTRIBUTE" declarations followed by a "@DATA" section.
+        """
+        with open( filename ) as handle:
+            relation_found = False
+            attribute_found = False
+            prefix = ""
+            for line_count, line in enumerate( handle ):
+                if line_count > 1000:
+                    # only investigate the first 1000 lines
+                    return False
+                line = line.strip()
+                if not line:
+                    continue
+
+                start_string = line[:20].upper()
+                if start_string.startswith("@RELATION"):
+                    relation_found = True
+                elif start_string.startswith("@ATTRIBUTE"):
+                    attribute_found = True
+                elif start_string.startswith("@DATA"):
+                    # @DATA should be the last data block
+                    if relation_found and attribute_found:
+                        return True
+        return False
+
+    def set_meta( self, dataset, **kwd ):
+        """
+            Trying to count the comment lines and the number of columns included.
+            A typical ARFF data block looks like this:
+            @DATA
+            5.1,3.5,1.4,0.2,Iris-setosa
+            4.9,3.0,1.4,0.2,Iris-setosa
+        """
+        if dataset.has_data():
+            comment_lines = 0
+            first_real_line = False
+            data_block = False
+            with open( dataset.file_name ) as handle:
+                for line in handle:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    if line.startswith('%') and not first_real_line:
+                        comment_lines += 1
+                    else:
+                        first_real_line = True
+                    if data_block:
+                        if line.startswith('{'):
+                            # Sparse representation
+                            """
+                                @data
+                                0, X, 0, Y, "class A", {5}
+                            or
+                                @data
+                                {1 X, 3 Y, 4 "class A"}, {5}
+                            """
+                            token = line.split('}',1)
+                            first_part = token[0]
+                            last_column = first_part.split(',')[-1].strip()
+                            numeric_value = last_column.split()[0]
+                            column_count = int(numeric_value)
+                            if len(token) > 1:
+                                # we have an additional weight
+                                column_count -= 1
+                        else:
+                            columns = line.strip().split(',')
+                            column_count = len(columns)
+                            if columns[-1].strip().startswith('{'):
+                                # we have an additional weight at the end
+                                column_count -= 1
+
+                        # We have now the column_count and we know the initial comment lines. So we can terminate here.
+                        break
+                    if line[:5].upper() == "@DATA":
+                        data_block = True
+        dataset.metadata.comment_lines = comment_lines
+        dataset.metadata.columns = column_count
+


https://bitbucket.org/galaxy/galaxy-central/commits/3ce3ebfdd734/
Changeset:   3ce3ebfdd734
Branch:      arff_datatype
User:        jmchilton
Date:        2014-12-26 22:34:57+00:00
Summary:     Close arff_datatype.
Affected #:  0 files

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving
this because the notification service is enabled and this address is
listed as a recipient of this email.

    

[galaxy-commits] commit/galaxy-central: 6 new changesets

commits-noreply＠bitbucket.org