galaxy-dist commit 7ca26b1cc9b6: Added a sample loc file listing all fasta files and a script to generate the file given a base genome directory.

20 Nov 2010

# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User Kelly Vincent <kpvincent@bx.psu.edu>
# Date 1289194982 18000
# Node ID 7ca26b1cc9b6fd67364a6c6af6005ceecaa3f45b
# Parent  535d276c92bcc35e563c4ce5a2351f562f451132
Added a sample loc file listing all fasta files and a script to generate the file given a base genome directory.

--- /dev/null
+++ b/tool-data/all_fasta.loc.sample
@@ -0,0 +1,18 @@
+#This file lists the locations and dbkeys of all the fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build). The script extract_fasta.py will generate the file
+#all_fasta.loc. This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_build_id><dbkey><display_name><file_path>
+#
+#So, all_fasta.loc could look something like this:
+#
+#apiMel3     apiMel3   Honeybee (Apis mellifera): apiMel3     /path/to/genome/apiMel3/apiMel3.fa
+#hg19canon   hg19      Human (Homo sapiens): hg19 Canonical   /path/to/genome/hg19/hg19canon.fa
+#hg19full    hg19      Human (Homo sapiens): hg19 Full        /path/to/genome/hg19/hg19full.fa
+#
+#Your all_fasta.loc file should contain an entry for each individual
+#fasta file. So there will be multiple fasta files for each build,
+#such as with hg19 above.
+#

--- /dev/null
+++ b/scripts/loc_files/create_all_fasta_loc.py
@@ -0,0 +1,300 @@
+import optparse, os, sys
+import elementtree.ElementTree as etree
+
+"""
+Generates a loc file containing names of all the fasta files that match the
+name of the genome subdirectory they're in.
+Assumptions:
+    - fasta files should be named the same as the genome subdirectory they're
+      in, with the possible addition of a recognized variant (canon, full, etc.)
+    - for "variants" (like full, canon[ical], chrM, etc.) the naming needs to be
+      consistent and specific:
+        - <genome_name><variant>, like hg19canon, hg19full, or hg19chrM
+Normal usage:
+create_all_fasta_loc.py -f unmatching_fasta.txt -i seq
+
+usage: %prog [options]
+   -d, --data-table-xml=d: The name of the data table configuration file to get format of loc file
+   -t, --data-table=t: The name of the data table listed in the data table XML file
+   -g, --genome-dir=g: Genome directory to look in
+   -e, --exemptions=e: Comma-separated list of genome dir subdirectories to not look in
+   -i, --inspect-dirs=i: Comma-separated list of subdirectories inside genome dirs to look in (default is all)
+   -x, --fasta-exts=x: Comma-separated list of all fasta extensions to list
+   -s, --loc-sample=s: The name of the sample loc file (to copy text into top of output loc file)
+   -f, --unmatching-fasta=f: Name of file to output non-matching fasta files to, if included
+   -v, --variants=v: Comma-separated list of recognized variants of fasta file names
+   -a, --append=a: Append to existing all_fasta.loc file rather than create new
+   -p, --sample-text=p: Copy over text from all_fasta.loc.sample file (false if set to append)
+"""
+
+DEFAULT_TOOL_DATA_TABLE_CONF = 'tool_data_table_conf.xml'
+DEFAULT_ALL_FASTA_LOC_BASE = 'all_fasta'
+DEFAULT_BASE_GENOME_DIR = '/afs/bx.psu.edu/depot/data/genome'
+EXEMPTIONS = 'bin,tmp,lengths,equCab2_chrM,microbes'
+INSPECT_DIR = None
+FASTA_EXTS = '.fa,.fasta,.fna'
+VARIANTS = 'chrM,chr21,full,canon,female,male,haps,nohaps'
+
+VARIANT_EXCLUSIONS = ':full'
+
+DBKEY_DESCRIPTION_MAP = { 'AaegL1': 'Mosquito (Aedes aegypti): AaegL1',
+                          'AgamP3': 'Mosquito (Anopheles gambiae): AgamP3',
+                          'anoCar1': 'Lizard (Anolis carolinensis): anoCar1',
+                          'anoGam1': 'Mosquito (Anopheles gambiae): anoGam1',
+                          'apiMel1': 'Honeybee (Apis mellifera): apiMel1',
+                          'apiMel2': 'Honeybee (Apis mellifera): apiMel2',
+                          'apiMel3': 'Honeybee (Apis mellifera): apiMel3',
+                          'Arabidopsis_thaliana_TAIR9': '',
+                          'borEut13': 'Boreoeutherian: borEut13',
+                          'bosTau2': 'Cow (Bos taurus): bosTau2',
+                          'bosTau3': 'Cow (Bos taurus): bosTau3',
+                          'bosTau4': 'Cow (Bos taurus): bosTau4',
+                          'bosTauMd3': 'Cow (Bos taurus): bosTauMd3',
+                          'calJac1': 'Marmoset (Callithrix jacchus): calJac1',
+                          'canFam1': 'Dog (Canis lupus familiaris): canFam1',
+                          'canFam2': 'Dog (Canis lupus familiaris): canFam2',
+                          'cavPor3': 'Guinea Pig (Cavia porcellus): cavPor3',
+                          'ce2': 'Caenorhabditis elegans: ce2',
+                          'ce4': 'Caenorhabditis elegans: ce4',
+                          'ce5': 'Caenorhabditis elegans: ce5',
+                          'ce6': 'Caenorhabditis elegans: ce6',
+                          'CpipJ1': 'Mosquito (Culex quinquefasciatus): CpipJ1',
+                          'danRer2': 'Zebrafish (Danio rerio): danRer2',
+                          'danRer3': 'Zebrafish (Danio rerio): danRer3',
+                          'danRer4': 'Zebrafish (Danio rerio): danRer4',
+                          'danRer5': 'Zebrafish (Danio rerio): danRer5',
+                          'danRer6': 'Zebrafish (Danio rerio): danRer6',
+                          'dm1': 'Fruit Fly (Drosophila melanogaster): dm1',
+                          'dm2': 'Fruit Fly (Drosophila melanogaster): dm2',
+                          'dm3': 'Fruit Fly (Drosophila melanogaster): dm3',
+                          'dm4': 'Fruit Fly (Drosophila melanogaster): dm',
+                          'dp3': 'Fruit Fly (Drosophila pseudoobscura): dp3',
+                          'dp4': 'Fruit Fly (Drosophila pseudoobscura): dp4',
+                          'droAna1': 'Fruit Fly (Drosophila ananassae): droAna1',
+                          'droAna2': 'Fruit Fly (Drosophila ananassae): droAna2',
+                          'droAna3': 'Fruit Fly (Drosophila ananassae): droAna3',
+                          'droEre1': 'Fruit Fly (Drosophila erecta): droEre1',
+                          'droEre2': 'Fruit Fly (Drosophila erecta): droEre2',
+                          'droGri1': 'Fruit Fly (Drosophila grimshawi): droGri1',
+                          'droGri2': 'Fruit Fly (Drosophila grimshawi): droGri2',
+                          'droMoj1': 'Fruit Fly (Drosophila mojavensis): droMoj1',
+                          'droMoj2': 'Fruit Fly (Drosophila mojavensis): droMoj2',
+                          'droMoj3': 'Fruit Fly (Drosophila mojavensis): droMoj3',
+                          'droPer1': 'Fruit Fly (Drosophila persimilis): droPer1',
+                          'droSec1': 'Fruit Fly (Drosophila sechellia): droSec1',
+                          'droSim1': 'Fruit Fly (Drosophila simulans): droSim1',
+                          'droVir1': 'Fruit Fly (Drosophila virilis): droVir1',
+                          'droVir2': 'Fruit Fly (Drosophila virilis): droVir2',
+                          'droVir3': 'Fruit Fly (Drosophila virilis): droVir3',
+                          'droYak1': 'Fruit Fly (Drosophila yakuba): droYak1',
+                          'droYak2': 'Fruit Fly (Drosophila yakuba): droYak2',
+                          'echTel1': 'Tenrec (Echinops telfairi): echTel1',
+                          'equCab1': 'Horse (Equus caballus): equCab1',
+                          'equCab2': 'Horse (Equus caballus): equCab2',
+                          'eriEur1': 'Hedgehog (Erinaceus europaeus): eriEur1',
+                          'felCat3': 'Cat (Felis catus): felCat3',
+                          'fr1': 'Fugu (Takifugu rubripes): fr1',
+                          'fr2': 'Fugu (Takifugu rubripes): fr2',
+                          'galGal2': 'Chicken (Gallus gallus): galGal2',
+                          'galGal3': 'Chicken (Gallus gallus): galGal3',
+                          'gasAcu1': 'Stickleback (Gasterosteus aculeatus): gasAcu1',
+                          'hg16': 'Human (Homo sapiens): hg16',
+                          'hg17': 'Human (Homo sapiens): hg17',
+                          'hg18': 'Human (Homo sapiens): hg18',
+                          'hg19': 'Human (Homo sapiens): hg19',
+                          'IscaW1': 'Deer Tick (Ixodes scapularis): IscaW1',
+                          'lMaj5': 'Leishmania major: lMaj5',
+                          'mm5': 'Mouse (Mus musculus): mm5',
+                          'mm6': 'Mouse (Mus musculus): mm6',
+                          'mm7': 'Mouse (Mus musculus): mm7',
+                          'mm8': 'Mouse (Mus musculus): mm8',
+                          'mm9': 'Mouse (Mus musculus): mm9',
+                          'monDom4': 'Opossum (Monodelphis domestica): monDom4',
+                          'monDom5': 'Opossum (Monodelphis domestica): monDom5',
+                          'ornAna1': 'Platypus (Ornithorhynchus anatinus): ornAna1',
+                          'oryCun1': 'Rabbit (Oryctolagus cuniculus): oryCun1',
+                          'oryLat1': 'Medaka (Oryzias latipes): oryLat1',
+                          'oryLat2': 'Medaka (Oryzias latipes): oryLat2',
+                          'oryza_sativa_japonica_nipponbare_IRGSP4.0': 'Rice (Oryza sativa L. ssp. japonica var. Nipponbare): IRGSP4.0',
+                          'otoGar1': 'Bushbaby (Otolemur garnetti): otoGar1',
+                          'panTro1': 'Chimpanzee (Pan troglodytes): panTro1',
+                          'panTro2': 'Chimpanzee (Pan troglodytes): panTro2',
+                          'petMar1': 'Lamprey (Petromyzon marinus): petMar1',
+                          'phiX': 'phiX174 (AF176034)',
+                          'PhumU1': 'Head Louse (Pediculus humanus): PhumU1',
+                          'ponAbe2': 'Orangutan (Pongo pygmaeus abelii): ponAbe2',
+                          'pUC18': 'pUC18 (L09136)',
+                          'rheMac2': 'Rhesus Macaque (Macaca mulatta): rheMac2',
+                          'rn3': 'Rat (Rattus norvegicus): rn3',
+                          'rn4': 'Rat (Rattus norvegicus): rn4',
+                          'sacCer1': 'Yeast (Saccharomyces cerevisiae): sacCer1',
+                          'sacCer2': 'Yeast (Saccharomyces cerevisiae): sacCer2',
+                          'sorAra1': 'Common Shrew (Sorex araneus): sorAra1',
+                          'Sscrofa9.58': 'Pig (Sus scrofa): Sscrofa9.58',
+                          'strPur2': 'Purple Sea Urchin (Strongylocentrotus purpuratus): strPur2',
+                          'susScr2': 'Pig (Sus scrofa): susScr2',
+                          'taeGut1': 'Zebra Finch (Taeniopygia guttata): taeGut1',
+                          'tetNig1': 'Tetraodon (Tetraodon nigroviridis): tetNig1',
+                          'tetNig2': 'Tetraodon (Tetraodon nigroviridis): tetNig2',
+                          'tupBel1': 'Tree Shrew (Tupaia belangeri): tupBel1',
+                          'venter1': 'Human (J. Craig Venter): venter1',
+                          'xenTro2': 'Frog (Xenopus tropicalis): xenTro2'
+                        }
+
+VARIANT_MAP = { 'canon': 'Canonical',
+               'full': 'Full',
+               'female': 'Female',
+               'male': 'Male'
+              }
+
+# alphabetize ignoring case
+def caseless_compare( a, b ):
+    au = a.upper()
+    bu = b.upper()
+    if au > bu:
+        return 1
+    elif au == bu:
+        return 0
+    elif au < bu:
+        return -1
+
+def __main__():
+    # command line variables
+    parser = optparse.OptionParser()
+    parser.add_option( '-d', '--data-table-xml', dest='data_table_xml', type='string', default=DEFAULT_TOOL_DATA_TABLE_CONF, help='The name of the data table configuration file to get format of loc file' )
+    parser.add_option( '-t', '--data-table', dest='data_table_name', type='string', default=DEFAULT_ALL_FASTA_LOC_BASE, help='The name of the data table listed in the data table XML file' )
+    parser.add_option( '-g', '--genome_dir', dest='genome_dir', type='string', default=DEFAULT_BASE_GENOME_DIR, help='Genome directory to look in' )
+    parser.add_option( '-e', '--exemptions', dest='exemptions', type='string', default=EXEMPTIONS, help='Comma-separated list of subdirectories in genome dir to not look in' )
+    parser.add_option( '-i', '--inspect-dir', dest='inspect_dir', type='string', default=INSPECT_DIR, help='Comma-separated list of subdirectories inside genome dirs to look in (default is all)' )
+    parser.add_option( '-x', '--fasta_exts', dest='fasta_exts', type='string', default=FASTA_EXTS, help='Comma-separated list of all fasta extensions to list' )
+    parser.add_option( '-s', '--loc-sample', dest='loc_sample_name', type='string', help='The name of the sample loc file (to copy text into top of output loc file)' )
+    parser.add_option( '-f', '--unmatching-fasta', dest='unmatching_fasta', type='string', default=None, help='Name of file to output non-matching fasta files to' )
+    parser.add_option( '-v', '--variants', dest='variants', type='string', default=VARIANTS, help='Comma-separated list of recognized variants of fasta file names' )
+    parser.add_option( '-n', '--variant-exclusions', dest='variant_exclusions', type='string', default=VARIANT_EXCLUSIONS, help="List of files to exclude because they're duplicated by a variants; of the format: '<variant_to_keep_1>:<variant_to_remove_1>[,<variant_to_remove_2>[,...]][;<variant_to_keep_2>:<variant_to_remove_1>[,<variant_to_remove_2>[,...]]]'; default ':(full)' (if non-variant version present (like 'hg19'), full version (like 'hg19full') will be thrown out)" )
+    parser.add_option( '-a', '--append', dest='append', action='store_true', default=False, help='Append to existing all_fasta.loc file rather than create new' )
+    parser.add_option( '-p', '--sample-text', dest='sample_text', action='store_true', default='True', help='Copy over text from all_fasta.loc.sample file (false if set to append)' )
+    (options, args) = parser.parse_args()
+
+    exemptions = [ e.strip() for e in options.exemptions.split( ',' ) ]
+    fasta_exts = [ x.strip() for x in  options.fasta_exts.split( ',' ) ]
+    variants = [ v.strip() for v in options.variants.split( ',' ) ]
+    variant_exclusions = {}
+    try:
+        for ve in options.variant_exclusions.split( ';' ):
+            v, e = ve.split( ':' )
+            variant_exclusions[ v ] = e.split( ',' )
+    except:
+        sys.stderr.write( 'Problem parsing the variant exclusion parameter (-n/--variant-exclusion). Make sure it follows the expected format\n' )
+        sys.exit( 1 )
+    if options.append:
+        sample_text = False
+    else:
+        sample_text = options.sample_text
+
+    # all paths to look in
+    if options.inspect_dir:
+        paths_to_look_in = [ os.path.join( options.genome_dir, '%s', id ) for id in options.inspect_dir.split( ',' ) ]
+    else:
+        paths_to_look_in = os.path.join( options.genome_dir, '%s' )
+
+    # say what we're looking in
+    print '\nLooking in:\n\t%s' % '\n\t'.join( [ p % '<build_name>' for p in paths_to_look_in ] )
+    poss_names = [ '<build_name>%s' % v for v in variants ]
+    print 'for files that are named %s' % ', '.join( poss_names[:-1] ),
+    if len( poss_names ) > 1:
+        print 'or %s' % poss_names[-1],
+    if len( options.fasta_exts ) == 1:
+        print 'with the extension %s.' % ', '.join( fasta_exts[:-1] )
+    else:
+        print 'with the extension %s or %s.' % ( ', '.join( fasta_exts[:-1] ), fasta_exts[-1] )
+    print '\nSkipping the following:\n\t%s' % '\n\t'.join( exemptions )
+
+    # get column names
+    col_values = []
+    loc_path = None
+    tree = etree.parse( options.data_table_xml )
+    tables = tree.getroot()
+    for table in tables.getiterator():
+        name = table.attrib.get( 'name' )
+        if name == options.data_table_name:
+            cols = None
+            for node in table.getiterator():
+                if node.tag == 'columns':
+                    cols = node.text
+                elif node.tag == 'file':
+                    loc_path = node.attrib.get( 'path' )
+            if cols:
+                col_values = [ col.strip() for col in cols.split( ',' ) ]
+    if not col_values or not loc_path:
+        stop_err( 'No columns can be found for this data table (%s) in %s' % ( options.data_table, options.data_table_xml ) )
+
+    # get all fasta paths under genome directory
+    fasta_locs = {}
+    unmatching_fasta_paths = []
+    genome_subdirs = [ dr for dr in os.listdir( options.genome_dir ) if dr not in exemptions ]
+    for genome_subdir in genome_subdirs:
+        possible_names = [ genome_subdir ]
+        possible_names.extend( [ '%s%s' % ( genome_subdir, v ) for v in variants ] )
+        # get paths to all fasta files
+        for path_to_look_in in paths_to_look_in:
+            for dirpath, dirnames, filenames in os.walk( path_to_look_in % genome_subdir ):
+                for fn in filenames:
+                    ext = os.path.splitext( fn )[-1]
+                    fasta_base = os.path.splitext( fn )[0]
+                    if ext in fasta_exts:
+                        if fasta_base in possible_names:
+                            if fasta_base == genome_subdir:
+                                name = DBKEY_DESCRIPTION_MAP[ genome_subdir ]
+                            else:
+                                try:
+                                    name = '%s %s' % ( DBKEY_DESCRIPTION_MAP[ genome_subdir ], VARIANT_MAP[ fasta_base.replace( genome_subdir, '' ) ] )
+                                except KeyError:
+                                    name = '%s %s' % ( DBKEY_DESCRIPTION_MAP[ genome_subdir ], fasta_base.replace( genome_subdir, '' ) )
+                            fasta_locs[ fasta_base ] = { 'value': fasta_base, 'dbkey': genome_subdir, 'name': name, 'path': os.path.join( dirpath, fn ) }
+                        else:
+                            unmatching_fasta_paths.append( os.path.join( dirpath, fn ) )
+        # remove redundant fasta files
+        if variant_exclusions.keys():
+            for k in variant_exclusions.keys():
+                leave_in = '%s%s' % ( genome_subdir, k )
+                if fasta_locs.has_key( leave_in ):
+                    to_remove = [ '%s%s' % ( genome_subdir, k ) for k in variant_exclusions[ k ] ]
+                    for tr in to_remove:
+                        if fasta_locs.has_key( tr ):
+                            del fasta_locs[ tr ]
+
+    # output results
+    print '\nThere were %s fasta files found that were not included because they did not have the expected file names.' % len( unmatching_fasta_paths )
+    print '%s fasta files were found and listed.\n' % len( fasta_locs.keys() )
+
+    # output unmatching fasta files
+    if options.unmatching_fasta and unmatching_fasta_paths:
+        open( options.unmatching_fasta, 'wb' ).write( '%s\n' % '\n'.join( unmatching_fasta_paths ) )
+
+    # output loc file
+    if options.append:
+        all_fasta_loc = open( loc_path, 'ab' )
+    else:
+        all_fasta_loc = open( loc_path, 'wb' )
+    # put sample loc file text at top of file if appropriate
+    if sample_text:
+        if options.loc_sample_name:
+            all_fasta_loc.write( '%s\n' % open( options.loc_sample_name, 'rb' ).read().strip() )
+        else:
+            all_fasta_loc.write( '%s\n' % open( '%s.sample' % loc_path, 'rb' ).read().strip() )
+    # output list of fasta files in alphabetical order
+    fasta_bases = fasta_locs.keys()
+    fasta_bases.sort( caseless_compare )
+    for fb in fasta_bases:
+        out_line = []
+        for col in col_values:
+            try:
+                out_line.append( fasta_locs[ fb ][ col ] )
+            except KeyError:
+                stop_err( 'Unexpected column (%s) encountered' % col )
+        if out_line:
+            all_fasta_loc.write( '%s\n' % '\t'.join( out_line ) )
+    # close up output loc file
+    all_fasta_loc.close()
+
+if __name__=='__main__': __main__()

    

commits-noreply＠bitbucket.org

tags

participants (1)