6 new commits in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/919dd242fd62/ Changeset: 919dd242fd62 Branch: arff_datatype User: BjoernGruening Date: 2014-12-16 11:40:29+00:00 Summary: Created new branch arff_datatype Affected #: 0 files https://bitbucket.org/galaxy/galaxy-central/commits/807b1e8a7815/ Changeset: 807b1e8a7815 Branch: arff_datatype User: BjoernGruening Date: 2014-12-16 12:08:50+00:00 Summary: Add Arff datatype class with sniffer and metadata. Affected #: 1 file diff -r 919dd242fd625a71d26b3935cabfbdb9b48c97b4 -r 807b1e8a7815f1514712146e909a508acc31a93d lib/galaxy/datatypes/text.py --- a/lib/galaxy/datatypes/text.py +++ b/lib/galaxy/datatypes/text.py @@ -6,6 +6,7 @@ from galaxy.datatypes.data import Text from galaxy.datatypes.data import get_file_peek from galaxy.datatypes.data import nice_size +from galaxy.datatypes.metadata import MetadataElement from galaxy import util import tempfile @@ -155,3 +156,106 @@ return True return False + +class Arff( Text ): + """ + An ARFF (Attribute-Relation File Format) file is an ASCII text file that describes a list of instances sharing a set of attributes. + http://weka.wikispaces.com/ARFF + """ + file_ext = "arff" + + + """Add metadata elements""" + MetadataElement( name="comment_lines", default=0, desc="Number of comment lines", readonly=True, optional=True, no_value=0 ) + MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=True, no_value=0 ) + + def set_peek( self, dataset, is_multi_byte=False ): + if not dataset.dataset.purged: + dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) + dataset.blurb = "Attribute-Relation File Format (ARFF)" + dataset.blurb += ", %s comments, %s attributes" % ( dataset.metadata.comment_lines, dataset.metadata.columns ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disc' + + def sniff( self, filename ): + """ + Try to guess the Arff filetype. + It usually starts with a "format-version:" string and has several stanzas which starts with "id:". + """ + with open( filename ) as handle: + relation_found = False + attribute_found = False + prefix = "" + for line_count, line in enumerate( handle ): + if line_count > 1000: + # only investigate the first 1000 lines + return False + line = line.strip() + if not line: + continue + + start_string = line[:20].upper() + if start_string.startswith("@RELATION"): + relation_found = True + elif start_string.startswith("@ATTRIBUTE"): + attribute_found = True + elif start_string.startswith("@DATA"): + # @DATA should be the last data block + if relation_found and attribute_found: + return True + return False + + def set_meta( self, dataset, **kwd ): + """ + Trying to count the comment lines and the number of columns included. + A typical ARFF data block looks like this: + @DATA + 5.1,3.5,1.4,0.2,Iris-setosa + 4.9,3.0,1.4,0.2,Iris-setosa + """ + if dataset.has_data(): + comment_lines = 0 + first_real_line = False + data_block = False + with open( dataset.file_name ) as handle: + for line in handle: + line = line.strip() + if not line: + continue + if line.startswith('%') and not first_real_line: + comment_lines += 1 + else: + first_real_line = True + if data_block: + if line.startswith('{'): + # Sparse representation + """ + @data + 0, X, 0, Y, "class A", {5} + or + @data + {1 X, 3 Y, 4 "class A"}, {5} + """ + token = line.split('}',1) + first_part = token[0] + last_column = first_part.split(',')[-1].strip() + numeric_value = last_column.split()[0] + column_count = int(numeric_value) + if len(token) > 1: + # we have an additional weight + column_count -= 1 + else: + columns = line.strip().split(',') + column_count = len(columns) + if columns[-1].strip().startswith('{'): + # we have an additional weight at the end + column_count -= 1 + + # We have now the column_count and we know the initial comment lines. So we can terminate here. + break + if line[:5].upper() == "@DATA": + data_block = True + dataset.metadata.comment_lines = comment_lines + dataset.metadata.columns = column_count + https://bitbucket.org/galaxy/galaxy-central/commits/a876fb01dba3/ Changeset: a876fb01dba3 Branch: arff_datatype User: BjoernGruening Date: 2014-12-16 12:16:00+00:00 Summary: Add Arff datatype and sniffer to datatypes_conf.xml. Affected #: 1 file diff -r 807b1e8a7815f1514712146e909a508acc31a93d -r a876fb01dba3595d68ae99fea8e2070fa3ffad55 config/datatypes_conf.xml.sample --- a/config/datatypes_conf.xml.sample +++ b/config/datatypes_conf.xml.sample @@ -3,6 +3,7 @@ <registration converters_path="lib/galaxy/datatypes/converters" display_path="display_applications"><datatype extension="ab1" type="galaxy.datatypes.binary:Ab1" mimetype="application/octet-stream" display_in_upload="true" description="A binary sequence file in 'ab1' format with a '.ab1' file extension. You must manually select this 'File Format' when uploading the file." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Ab1"/><datatype extension="afg" type="galaxy.datatypes.assembly:Amos" display_in_upload="false" /> + <!--datatype extension="arff" type="galaxy.datatypes.text:Arff" mimetype="text/plain" display_in_upload="True"/--><datatype extension="asn1" type="galaxy.datatypes.data:GenericAsn1" mimetype="text/plain" display_in_upload="true" /><datatype extension="asn1-binary" type="galaxy.datatypes.binary:GenericAsn1Binary" mimetype="application/octet-stream" display_in_upload="true" /><datatype extension="axt" type="galaxy.datatypes.sequence:Axt" display_in_upload="true" description="blastz pairwise alignment format. Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Axt"/> @@ -298,6 +299,7 @@ <sniffer type="galaxy.datatypes.data:Newick"/><sniffer type="galaxy.datatypes.data:Nexus"/><sniffer type="galaxy.datatypes.text:Obo"/> + <!--sniffer type="galaxy.datatypes.text.Arff"/--><sniffer type="galaxy.datatypes.text:Ipynb"/><sniffer type="galaxy.datatypes.text:Json"/><sniffer type="galaxy.datatypes.images:Jpg"/> https://bitbucket.org/galaxy/galaxy-central/commits/d91d8e8beada/ Changeset: d91d8e8beada Branch: arff_datatype User: BjoernGruening Date: 2014-12-23 00:27:36+00:00 Summary: Activate arff datatype by default Affected #: 1 file diff -r a876fb01dba3595d68ae99fea8e2070fa3ffad55 -r d91d8e8beadac2a187645f86c199119064a5ccb0 config/datatypes_conf.xml.sample --- a/config/datatypes_conf.xml.sample +++ b/config/datatypes_conf.xml.sample @@ -3,7 +3,7 @@ <registration converters_path="lib/galaxy/datatypes/converters" display_path="display_applications"><datatype extension="ab1" type="galaxy.datatypes.binary:Ab1" mimetype="application/octet-stream" display_in_upload="true" description="A binary sequence file in 'ab1' format with a '.ab1' file extension. You must manually select this 'File Format' when uploading the file." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Ab1"/><datatype extension="afg" type="galaxy.datatypes.assembly:Amos" display_in_upload="false" /> - <!--datatype extension="arff" type="galaxy.datatypes.text:Arff" mimetype="text/plain" display_in_upload="True"/--> + <datatype extension="arff" type="galaxy.datatypes.text:Arff" mimetype="text/plain" display_in_upload="True" /><datatype extension="asn1" type="galaxy.datatypes.data:GenericAsn1" mimetype="text/plain" display_in_upload="true" /><datatype extension="asn1-binary" type="galaxy.datatypes.binary:GenericAsn1Binary" mimetype="application/octet-stream" display_in_upload="true" /><datatype extension="axt" type="galaxy.datatypes.sequence:Axt" display_in_upload="true" description="blastz pairwise alignment format. Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Axt"/> @@ -299,7 +299,7 @@ <sniffer type="galaxy.datatypes.data:Newick"/><sniffer type="galaxy.datatypes.data:Nexus"/><sniffer type="galaxy.datatypes.text:Obo"/> - <!--sniffer type="galaxy.datatypes.text.Arff"/--> + <sniffer type="galaxy.datatypes.text.Arff"/><sniffer type="galaxy.datatypes.text:Ipynb"/><sniffer type="galaxy.datatypes.text:Json"/><sniffer type="galaxy.datatypes.images:Jpg"/> https://bitbucket.org/galaxy/galaxy-central/commits/63ec80be64bb/ Changeset: 63ec80be64bb User: jmchilton Date: 2014-12-26 22:33:02+00:00 Summary: Merge pull request #614. Affected #: 2 files diff -r 338a32cc6067d92055324e4b827697051ec99254 -r 63ec80be64bb00414be0f7122a51907114ffb714 config/datatypes_conf.xml.sample --- a/config/datatypes_conf.xml.sample +++ b/config/datatypes_conf.xml.sample @@ -3,6 +3,7 @@ <registration converters_path="lib/galaxy/datatypes/converters" display_path="display_applications"><datatype extension="ab1" type="galaxy.datatypes.binary:Ab1" mimetype="application/octet-stream" display_in_upload="true" description="A binary sequence file in 'ab1' format with a '.ab1' file extension. You must manually select this 'File Format' when uploading the file." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Ab1"/><datatype extension="afg" type="galaxy.datatypes.assembly:Amos" display_in_upload="false" /> + <datatype extension="arff" type="galaxy.datatypes.text:Arff" mimetype="text/plain" display_in_upload="True" /><datatype extension="asn1" type="galaxy.datatypes.data:GenericAsn1" mimetype="text/plain" display_in_upload="true" /><datatype extension="asn1-binary" type="galaxy.datatypes.binary:GenericAsn1Binary" mimetype="application/octet-stream" display_in_upload="true" /><datatype extension="axt" type="galaxy.datatypes.sequence:Axt" display_in_upload="true" description="blastz pairwise alignment format. Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Axt"/> @@ -298,6 +299,7 @@ <sniffer type="galaxy.datatypes.data:Newick"/><sniffer type="galaxy.datatypes.data:Nexus"/><sniffer type="galaxy.datatypes.text:Obo"/> + <sniffer type="galaxy.datatypes.text.Arff"/><sniffer type="galaxy.datatypes.text:Ipynb"/><sniffer type="galaxy.datatypes.text:Json"/><sniffer type="galaxy.datatypes.images:Jpg"/> diff -r 338a32cc6067d92055324e4b827697051ec99254 -r 63ec80be64bb00414be0f7122a51907114ffb714 lib/galaxy/datatypes/text.py --- a/lib/galaxy/datatypes/text.py +++ b/lib/galaxy/datatypes/text.py @@ -6,6 +6,7 @@ from galaxy.datatypes.data import Text from galaxy.datatypes.data import get_file_peek from galaxy.datatypes.data import nice_size +from galaxy.datatypes.metadata import MetadataElement from galaxy import util import tempfile @@ -155,3 +156,106 @@ return True return False + +class Arff( Text ): + """ + An ARFF (Attribute-Relation File Format) file is an ASCII text file that describes a list of instances sharing a set of attributes. + http://weka.wikispaces.com/ARFF + """ + file_ext = "arff" + + + """Add metadata elements""" + MetadataElement( name="comment_lines", default=0, desc="Number of comment lines", readonly=True, optional=True, no_value=0 ) + MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=True, no_value=0 ) + + def set_peek( self, dataset, is_multi_byte=False ): + if not dataset.dataset.purged: + dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) + dataset.blurb = "Attribute-Relation File Format (ARFF)" + dataset.blurb += ", %s comments, %s attributes" % ( dataset.metadata.comment_lines, dataset.metadata.columns ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disc' + + def sniff( self, filename ): + """ + Try to guess the Arff filetype. + It usually starts with a "format-version:" string and has several stanzas which starts with "id:". + """ + with open( filename ) as handle: + relation_found = False + attribute_found = False + prefix = "" + for line_count, line in enumerate( handle ): + if line_count > 1000: + # only investigate the first 1000 lines + return False + line = line.strip() + if not line: + continue + + start_string = line[:20].upper() + if start_string.startswith("@RELATION"): + relation_found = True + elif start_string.startswith("@ATTRIBUTE"): + attribute_found = True + elif start_string.startswith("@DATA"): + # @DATA should be the last data block + if relation_found and attribute_found: + return True + return False + + def set_meta( self, dataset, **kwd ): + """ + Trying to count the comment lines and the number of columns included. + A typical ARFF data block looks like this: + @DATA + 5.1,3.5,1.4,0.2,Iris-setosa + 4.9,3.0,1.4,0.2,Iris-setosa + """ + if dataset.has_data(): + comment_lines = 0 + first_real_line = False + data_block = False + with open( dataset.file_name ) as handle: + for line in handle: + line = line.strip() + if not line: + continue + if line.startswith('%') and not first_real_line: + comment_lines += 1 + else: + first_real_line = True + if data_block: + if line.startswith('{'): + # Sparse representation + """ + @data + 0, X, 0, Y, "class A", {5} + or + @data + {1 X, 3 Y, 4 "class A"}, {5} + """ + token = line.split('}',1) + first_part = token[0] + last_column = first_part.split(',')[-1].strip() + numeric_value = last_column.split()[0] + column_count = int(numeric_value) + if len(token) > 1: + # we have an additional weight + column_count -= 1 + else: + columns = line.strip().split(',') + column_count = len(columns) + if columns[-1].strip().startswith('{'): + # we have an additional weight at the end + column_count -= 1 + + # We have now the column_count and we know the initial comment lines. So we can terminate here. + break + if line[:5].upper() == "@DATA": + data_block = True + dataset.metadata.comment_lines = comment_lines + dataset.metadata.columns = column_count + https://bitbucket.org/galaxy/galaxy-central/commits/3ce3ebfdd734/ Changeset: 3ce3ebfdd734 Branch: arff_datatype User: jmchilton Date: 2014-12-26 22:34:57+00:00 Summary: Close arff_datatype. Affected #: 0 files Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.