Revision: 2773 Author: nate Date: 2008-09-24 10:26:07 -0400 (Wed, 24 Sep 2008) Log Message: ----------- Added locally modified sputnik source to svn Added Paths: ----------- dependencies/sputnik/ dependencies/sputnik/README dependencies/sputnik/sputnik.c Added: dependencies/sputnik/README =================================================================== --- dependencies/sputnik/README (rev 0) +++ dependencies/sputnik/README 2008-09-24 14:26:07 UTC (rev 2773) @@ -0,0 +1,12 @@ +Sputnik's original source is available from: + +http://espressosoftware.com/pages/sputnik.jsp + +The version available from bx.psu.edu contains modifications to include +mononucleotide microsatellites in the output. + +Build with: + +gcc -g -o bx-sputnik sputnik.c + +And ensure that it can be found in your Galaxy user's path. Added: dependencies/sputnik/sputnik.c =================================================================== --- dependencies/sputnik/sputnik.c (rev 0) +++ dependencies/sputnik/sputnik.c 2008-09-24 14:26:07 UTC (rev 2773) @@ -0,0 +1,599 @@ +/* #define DEBUG_SPUTNIK 1 */ + + +/* + find repeats in fasta format seq file + allows for indels, returns score. + + beta version. caveat emptor. + + chrisa 29-Jul-94 + + chris abajian + University of Washington + Dept. of Molecular Biotechnology FJ-20 + Fluke Hall, Mason Road + Seattle WA 98195 +*/ + +#include <stdio.h> +#include <fcntl.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <sys/types.h> + +/* trivial defs */ +#ifndef True +#define True 1 +#endif +#ifndef False +#define False 0 +#endif + +typedef int Boolean; + +/* size of buffer for reads. */ +#define BUF_SIZE 1024*10 /* 10K */ +/* max size of description line (begins with ">") */ +#define MAX_DESCRIPTION_LEN 1024 +/* max sequence length */ +#define MAX_SEQUENCE_LEN 1024*800 /* 800K */ +/* max number of sequence chars dumped to line */ +#define MAX_OUT_LINE_CHARS 60 + +/* for debugging only */ +#define MAX_ERRCODES 1024 + +/* search params and definitions */ +#define MIN_UNIT_LENGTH 1 /* start search with dinucleotide repeats */ +/* will search for di, tri, tetra ... <n>nucleotide repeats up to + this value for n */ +#define MAX_UNIT_LENGTH 5 /* up to and including pentanucleotides */ +/* this is the point score for each exact match */ +#define EXACT_MATCH_POINTS 1 +/* this is the point score for a mismatch, insertion or deletion */ +#define ERROR_MATCH_POINTS -6 +/* this is the minimum score required to be considered a match */ +#define MATCH_MIN_SCORE 8 +/* this is the low score at which we stop trying */ +#define MATCH_FAIL_SCORE -1 +/* this is the max recursion depth we try to recover errors */ +#define MAX_RECURSION 5 + + +char *repeatName[MAX_UNIT_LENGTH+1] = +{ + "***ERROR***", /* bad programmer! no latte! */ + "mononucleotide", + "dinucleotide", + "trinucleotide", + "tetranucleotide", + "pentanucleotide" +}; + + +char readBuf[BUF_SIZE]; +Boolean endOfFile; +int curBufLen; +int curBufPos; +int fd; +Boolean havePutBack; +char putBack; + +/* struct for indiv sequence in a file */ +typedef struct ss +{ + char descStr[MAX_DESCRIPTION_LEN]; + char seqStr[MAX_SEQUENCE_LEN]; + unsigned int seqLen; +} SeqStruct, *SeqStructPtr; + + +/* + * this structure describes the current state of a comparison. + * it gets passed down to recursive calls of the find repeat + * call so it can know when to bail out of an unsuccessful + * search, or return the size/state of a successful hit, etc. + */ +typedef struct ms +{ + int curPos; /* putative pattern starts here */ + int testPos; /* start testing here */ + int testLen; /* di, tri, tetra, etc. */ + int testCtr; /* # chars in testLen already tested. mod counter */ + int curScore; /* current score */ + int missense; /* keep track of ins, del, err */ + int insertions; + int deletions; + int depth; /* how deep is recursion for this match */ + char errCodes[MAX_ERRCODES]; +} MatchStruct, *MatchStructPtr; +/* a utility macro to copy one testStruct to another */ +#define copyMSPtr(dest,source) memcpy((char *)dest,(char *)source,sizeof(MatchStruct)) +/* a utility macro to increment the modular testCtr */ +#define bumpTestCtr(msp) (msp)->testCtr++; if ((msp)->testCtr==(msp)->testLen) (msp)->testCtr=0; + + +/* + ************************************************************ + * these routines are used to read and parse the fasta format + * sequence file + ************************************************************ + */ + +void fillBuf() +{ + size_t result; + + result = read(fd, (void *)readBuf, BUF_SIZE); + if (result == -1) + { + fprintf(stderr,"error reading file! errno = %d\n",errno); + exit(1); + } + else if (result == 0) + endOfFile = True; + else + { + curBufLen = result; + curBufPos = 0; + } +} /* readBuf */ + + +/* returns True on success */ +Boolean getChar(char *achar) +{ + if (havePutBack) + { + *achar = putBack; + havePutBack = False; + return(True); + } + + if (curBufPos == curBufLen) + fillBuf(); + + if (endOfFile) + return (False); + + *achar = readBuf[curBufPos++]; + return (True); +} + + +void putCharBack(char c) +{ + havePutBack = True; + putBack = c; +} + + +void openFile(char *fn) +{ + /* open the specified file */ + fd = open(fn, O_RDONLY); + if (fd == -1) + { + fprintf(stderr,"unable to open file %s\n", fn); + exit(1); + } +} + +/* should call this once for each file read */ +void initBuffer() +{ + /* initialize length and pointer */ + curBufPos = 0; + curBufLen = 0; + havePutBack = False; + endOfFile = False; +} + +void addCharToLine(char c, char *line, int *lineLen) +{ + if (*lineLen < MAX_DESCRIPTION_LEN) + line[(*lineLen)++] = c; + else + fprintf(stderr,"warning: description line truncated\n"); +} + + +/* + ********************************************************************* + * these routines are (more) specific to reading the fasta file format + ********************************************************************* + */ + + +/* + * pick up a non-blank line from the file, presumably description. + * truncates all leading blanks and/or blank lines + */ +Boolean getNonBlankLine(char *line) +{ + Boolean stop, nonBlank; + char c; + int lineLen; + + lineLen = 0; + stop = False; + nonBlank = False; /* will be set by any non whitespace char */ + while ((! endOfFile) && (! stop)) + if (getChar(&c)) + if (c == '\n') + stop = nonBlank; /* stop if have anything. don't save eol char. */ + else + if (nonBlank) + /* add it to line no matter what */ + addCharToLine(c,line,&lineLen); + else if ((c != ' ') && (c != '\t')) + { + /* only non whitespace will start the line */ + nonBlank = True; + addCharToLine(c,line,&lineLen); + } +} + + +/* load the sequence struct with comment line and bases */ +SeqStructPtr getSeq(char *fname) +{ + SeqStructPtr newSeqP; + Boolean endOfSeq; + char c; + + if (endOfFile) return ((SeqStructPtr )0); /* bombproofing */ + + /* malloc a new seq */ + if (! (newSeqP = (SeqStructPtr )malloc(sizeof(SeqStruct)) ) ) + { + fprintf(stderr,"unable to malloc() memory for sequence.\n"); + exit(1); + } + /* clear mem */ + memset( (void *)newSeqP, '\0', sizeof(SeqStruct)); + + /* pick up description line */ + if (! getNonBlankLine(newSeqP->descStr) ) + { + free(newSeqP); + return ((SeqStructPtr )0); + } + + /* did it start correctly ? */ + if (newSeqP->descStr[0] != '>') + { + fprintf(stderr,"format error in input file: missing '>'\n"); + exit(1); + } + + endOfSeq = False; + while ((!endOfFile) && (!endOfSeq)) + { + if (getChar(&c)) + { + if (c == '>') + { + /* hit new sequence */ + endOfSeq = True; + putCharBack(c); + } + else if (((c >= 'A') && (c <= 'Z')) || + ((c >= 'a') && (c <= 'z')) || (c == '-'))/* bogus test, chris */ + /* have nucleotide */ + newSeqP->seqStr[newSeqP->seqLen++] = toupper(c); + else if ((c != '\n') && (c != ' ') && (c != '\t') && (c != '#') && (c != '$') && (c != '*') && (c != '?') && (c != '^')) + { + /* wierd shit in file. bail. */ + fprintf(stderr,">bad char in sequence, %c\n",c); + exit(1); + } + } + } + + if (! newSeqP->seqLen) + { + fprintf(stderr,"? Null sequence encountered in file %s (ignored)\n",fname); + fprintf(stderr," %s\n", newSeqP->descStr); + free(newSeqP); + return ((SeqStructPtr )0); + } + + return(newSeqP); +} /* getSeq */ + + +/* for debugging. dump entire seq to stdout. */ +#ifdef DEBUG_SPUTNIK +void dumpSeq(SeqStructPtr seqP) +{ + int i, charsOnLine; + + fprintf(stdout,"%s\n", seqP->descStr); + fprintf(stdout,"Sequence (length = %d):\n", seqP->seqLen); + i = 0; + charsOnLine = 0; + while (i < seqP->seqLen) + { + if (charsOnLine == MAX_OUT_LINE_CHARS) + { + fprintf(stdout,"\n"); + charsOnLine = 1; + } + else + charsOnLine++; + fprintf(stdout,"%c", seqP->seqStr[i++]); + } + fprintf(stdout,"\n"); +} /* dumpSeq */ +#endif /* DEBUG_SPUTNIK */ + +/* dump the matched seq & stats to stdout */ +void dumpMatch(SeqStructPtr seqP, + MatchStructPtr matchP, + Boolean anyMatchThisSeq) +{ + int i, charsOnLine; + + if (! anyMatchThisSeq) + fprintf(stdout,"%s\n", seqP->descStr); + + fprintf(stdout,"%s %d : %d -- length %d score %d\n", + repeatName[matchP->testLen], + matchP->curPos+1, + matchP->testPos, + matchP->testPos - matchP->curPos, + matchP->curScore); + +#ifdef DEBUG_SPUTNIK + fprintf(stdout,"mis = %d, del = %d, ins = %d\n", + matchP->missense, + matchP->deletions, + matchP->insertions); +#endif + + i = matchP->curPos; + charsOnLine = 0; + while (i < matchP->testPos) + { + if (charsOnLine == MAX_OUT_LINE_CHARS) + { + fprintf(stdout,"\n"); + charsOnLine = 1; + } + else + charsOnLine++; + fprintf(stdout,"%c", seqP->seqStr[i++]); + } + fprintf(stdout,"\n"); + +#ifdef DEBUG_SPUTNIK + i = 0; + charsOnLine = 0; + while (i < (matchP->testPos - matchP->curPos)) + { + if (charsOnLine == MAX_OUT_LINE_CHARS) + { + fprintf(stdout,"\n"); + charsOnLine = 1; + } + else + charsOnLine++; + if (matchP->errCodes[i] == '\0') + fprintf(stdout," "); + else + fprintf(stdout,"%c", matchP->errCodes[i]); + i++; + } + fprintf(stdout,"\n"); +#endif +} /* dumpMatch */ + + +Boolean testForNRepeat(SeqStructPtr seqP, + MatchStructPtr matchP) +{ + MatchStruct curMatch, recover, bestSoFar, bestOfABadLot; + + /* save matchP in case we fail altogether. */ + copyMSPtr(&curMatch, matchP); + /* keep track of the best score and return that if over thresh. */ + copyMSPtr(&bestSoFar, matchP); + + while ( (curMatch.testPos < seqP->seqLen) /* anything to test */ + && (curMatch.curScore > MATCH_FAIL_SCORE) ) /* above fail threshold */ + { + /* test a base */ + if (seqP->seqStr[curMatch.curPos+curMatch.testCtr] + == seqP->seqStr[curMatch.testPos]) + { + /* we matched. this is easy. */ + curMatch.curScore += EXACT_MATCH_POINTS; /* score your points */ + curMatch.testPos++; /* advance the downstream test position */ + bumpTestCtr(&curMatch); /* advance pos in the (presumed) repeating seq */ + } + else if ((seqP->seqStr[curMatch.testPos] == 'N') || (seqP->seqStr[curMatch.testPos] == '-')) + { + /* don't call it wrong, but no credit either */ + curMatch.testPos++; /* advance the downstream test position */ + bumpTestCtr(&curMatch); /* advance pos in the (presumed) repeating seq */ + } + else + { + /* no match. take the score penalty, but keep going (maybe). */ + curMatch.curScore += ERROR_MATCH_POINTS; + curMatch.testPos++; /* advance the downstream test position */ + bumpTestCtr(&curMatch); /* advance pos in seq */ + /* is the score too bad to continue, or are we + already too deep? */ + if ( (curMatch.curScore > MATCH_FAIL_SCORE) + && (curMatch.depth < MAX_RECURSION) ) + { + /* try simple missense */ + copyMSPtr(&recover,&curMatch); + if ((recover.testPos - recover.curPos) < MAX_ERRCODES) + recover.errCodes[recover.testPos - recover.curPos -1] = 'M'; + recover.missense++; + recover.depth++; + (void )testForNRepeat(seqP,&recover); + copyMSPtr(&bestOfABadLot,&recover); + + /* try deletion */ + copyMSPtr(&recover,&curMatch); + if ((recover.testPos - recover.curPos) < MAX_ERRCODES) + recover.errCodes[recover.testPos - recover.curPos -1] = 'D'; + recover.testPos--; /* DON'T advance downstream */ + recover.deletions++; + recover.depth++; + (void )testForNRepeat(seqP,&recover); + if (recover.curScore > bestOfABadLot.curScore) + copyMSPtr(&bestOfABadLot,&recover); + + /* try insertion */ + copyMSPtr(&recover,&curMatch); + if ((recover.testPos - recover.curPos) < MAX_ERRCODES) + recover.errCodes[recover.testPos - recover.curPos -1] = 'I'; + /* RETEST for this base in the repeating seq */ + if (recover.testCtr == 0) + recover.testCtr = recover.testLen - 1; + else + recover.testCtr--; + recover.insertions++; + recover.depth++; + (void )testForNRepeat(seqP,&recover); + if (recover.curScore > bestOfABadLot.curScore) + copyMSPtr(&bestOfABadLot,&recover); + + /* take the best of a bad lot */ + bestOfABadLot.depth--; /* dec recursion counter */ + copyMSPtr(&curMatch, &bestOfABadLot); + } /* it was worth carrying on */ + } /* no match, found best of bad lot */ + + /* whatever happened, the best we could do is now in matchP */ + if (curMatch.curScore > bestSoFar.curScore) + copyMSPtr(&bestSoFar, &curMatch); + + } /* while loop to test a single base */ + + /* for whatever reason, we've stopped searching for more of this + putative repeat. if there were any matches that passed + the global threshold, return the best of them. note that this + has the effect of NOT advancing the pointer(s) if nothing + rang the bell. remember that we will test the same position + for ntide repeats of several different lengths. */ + if (bestSoFar.curScore > MATCH_MIN_SCORE) + { + copyMSPtr(matchP, &bestSoFar); + return(True); + } + return(False); /* the whole thing was a waste of time */ +} /* testForNRepeat */ + + +/* + * returns True if the sequence we want to look for repeats of is + * + * a) all the same base (i.e. 'AAA' or 'GG'). This filters out + * single nucleotide repeats + * + * b) conains 'N'. we search against these, but don't use them + * as wildcards. + */ +Boolean ignoreSeq(SeqStructPtr seqP, + MatchStructPtr matchP) +{ + int i; + + /* firstly, never search for any pattern that contains N */ + for (i = 0; i < matchP->testLen; i++) + if ((seqP->seqStr[matchP->curPos+i] == 'N') || (seqP->seqStr[matchP->curPos+i] == '-')) + return(True); + + /* now test for mononucleotide repeat. other tests may get + added, in which case this one will beed to be changed. */ + for (i = 1; i < matchP->testLen; i++) + if (seqP->seqStr[matchP->curPos] != seqP->seqStr[matchP->curPos+i]) + return(False); /* they're not all the same */ + return (False); /* they ARE all same:changed by Guru to allow mononucleotide repeats */ +} + + +void findRepeats(SeqStructPtr seqP) +{ + int curPos; + Boolean anyMatchThisSeq, matchAtThisPos; + MatchStruct match; + + memset( (char *)&match, 0, sizeof(MatchStruct) ); /* clear match struct */ + + anyMatchThisSeq = False; /* avoid dumping description more than once. */ + /* loop on all positions in the sequence. note that a match + will advance curPos past all matching chars to the first + unmatched char. */ + while ( match.curPos <= seqP->seqLen) + { + /* now loop on all the different lengths of repeats we're + looking for (i.e. di, tri, tetra nucleotides. if we + find a match at a shorter repeat length, forego testing + for longer lengths. */ + match.testLen = MIN_UNIT_LENGTH; + matchAtThisPos = False; + while ((match.testLen <= MAX_UNIT_LENGTH) && (!matchAtThisPos)) + { + /* initialize the state of the match */ + match.curScore = 0; /* no points yet */ + match.testCtr = 0; /* no chars tested yet */ + match.testPos = match.curPos + match.testLen; + match.insertions = 0; + match.deletions = 0; + match.missense = 0; + /* there are some things we don't want to test for */ + if (! ignoreSeq(seqP,&match)) + matchAtThisPos = testForNRepeat(seqP, &match); + else + matchAtThisPos = False; + if (! matchAtThisPos) match.testLen++; + } + + if (matchAtThisPos) + { + dumpMatch(seqP,&match,anyMatchThisSeq); + anyMatchThisSeq |= matchAtThisPos; + match.curPos = match.testPos; + } + else + match.curPos++; /* no, so advance to next base. */ + } +} + + +main(int argc, char* argv[]) +{ + SeqStructPtr seqP; + int count; + + if (argc != 2) + { + fprintf(stderr,"Usage: %s <fasta format sequence file name>\n", argv[0]); + exit(1); + } + + openFile(argv[1]); + + initBuffer(); + + count = 0; + while (! endOfFile) + if (seqP = getSeq(argv[1])) + { +#ifdef DEBUG_SPUTNIK + fprintf(stdout,"processing sequence %d\n", count++); +#endif + /* dumpSeq(seqP); */ + findRepeats(seqP); + free((void *)seqP); + } +}