Revision: 2773
Author: nate
Date: 2008-09-24 10:26:07 -0400 (Wed, 24 Sep 2008)
Log Message:
-----------
Added locally modified sputnik source to svn
Added Paths:
-----------
dependencies/sputnik/
dependencies/sputnik/README
dependencies/sputnik/sputnik.c
Added: dependencies/sputnik/README
===================================================================
--- dependencies/sputnik/README (rev 0)
+++ dependencies/sputnik/README 2008-09-24 14:26:07 UTC (rev 2773)
@@ -0,0 +1,12 @@
+Sputnik's original source is available from:
+
+http://espressosoftware.com/pages/sputnik.jsp
+
+The version available from bx.psu.edu contains modifications to include
+mononucleotide microsatellites in the output.
+
+Build with:
+
+gcc -g -o bx-sputnik sputnik.c
+
+And ensure that it can be found in your Galaxy user's path.
Added: dependencies/sputnik/sputnik.c
===================================================================
--- dependencies/sputnik/sputnik.c (rev 0)
+++ dependencies/sputnik/sputnik.c 2008-09-24 14:26:07 UTC (rev 2773)
@@ -0,0 +1,599 @@
+/* #define DEBUG_SPUTNIK 1 */
+
+
+/*
+ find repeats in fasta format seq file
+ allows for indels, returns score.
+
+ beta version. caveat emptor.
+
+ chrisa 29-Jul-94
+
+ chris abajian
+ University of Washington
+ Dept. of Molecular Biotechnology FJ-20
+ Fluke Hall, Mason Road
+ Seattle WA 98195
+*/
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+
+/* trivial defs */
+#ifndef True
+#define True 1
+#endif
+#ifndef False
+#define False 0
+#endif
+
+typedef int Boolean;
+
+/* size of buffer for reads. */
+#define BUF_SIZE 1024*10 /* 10K */
+/* max size of description line (begins with ">") */
+#define MAX_DESCRIPTION_LEN 1024
+/* max sequence length */
+#define MAX_SEQUENCE_LEN 1024*800 /* 800K */
+/* max number of sequence chars dumped to line */
+#define MAX_OUT_LINE_CHARS 60
+
+/* for debugging only */
+#define MAX_ERRCODES 1024
+
+/* search params and definitions */
+#define MIN_UNIT_LENGTH 1 /* start search with dinucleotide repeats */
+/* will search for di, tri, tetra ... <n>nucleotide repeats up to
+ this value for n */
+#define MAX_UNIT_LENGTH 5 /* up to and including pentanucleotides */
+/* this is the point score for each exact match */
+#define EXACT_MATCH_POINTS 1
+/* this is the point score for a mismatch, insertion or deletion */
+#define ERROR_MATCH_POINTS -6
+/* this is the minimum score required to be considered a match */
+#define MATCH_MIN_SCORE 8
+/* this is the low score at which we stop trying */
+#define MATCH_FAIL_SCORE -1
+/* this is the max recursion depth we try to recover errors */
+#define MAX_RECURSION 5
+
+
+char *repeatName[MAX_UNIT_LENGTH+1] =
+{
+ "***ERROR***", /* bad programmer! no latte! */
+ "mononucleotide",
+ "dinucleotide",
+ "trinucleotide",
+ "tetranucleotide",
+ "pentanucleotide"
+};
+
+
+char readBuf[BUF_SIZE];
+Boolean endOfFile;
+int curBufLen;
+int curBufPos;
+int fd;
+Boolean havePutBack;
+char putBack;
+
+/* struct for indiv sequence in a file */
+typedef struct ss
+{
+ char descStr[MAX_DESCRIPTION_LEN];
+ char seqStr[MAX_SEQUENCE_LEN];
+ unsigned int seqLen;
+} SeqStruct, *SeqStructPtr;
+
+
+/*
+ * this structure describes the current state of a comparison.
+ * it gets passed down to recursive calls of the find repeat
+ * call so it can know when to bail out of an unsuccessful
+ * search, or return the size/state of a successful hit, etc.
+ */
+typedef struct ms
+{
+ int curPos; /* putative pattern starts here */
+ int testPos; /* start testing here */
+ int testLen; /* di, tri, tetra, etc. */
+ int testCtr; /* # chars in testLen already tested. mod counter */
+ int curScore; /* current score */
+ int missense; /* keep track of ins, del, err */
+ int insertions;
+ int deletions;
+ int depth; /* how deep is recursion for this match */
+ char errCodes[MAX_ERRCODES];
+} MatchStruct, *MatchStructPtr;
+/* a utility macro to copy one testStruct to another */
+#define copyMSPtr(dest,source) memcpy((char *)dest,(char *)source,sizeof(MatchStruct))
+/* a utility macro to increment the modular testCtr */
+#define bumpTestCtr(msp) (msp)->testCtr++; if ((msp)->testCtr==(msp)->testLen) (msp)->testCtr=0;
+
+
+/*
+ ************************************************************
+ * these routines are used to read and parse the fasta format
+ * sequence file
+ ************************************************************
+ */
+
+void fillBuf()
+{
+ size_t result;
+
+ result = read(fd, (void *)readBuf, BUF_SIZE);
+ if (result == -1)
+ {
+ fprintf(stderr,"error reading file! errno = %d\n",errno);
+ exit(1);
+ }
+ else if (result == 0)
+ endOfFile = True;
+ else
+ {
+ curBufLen = result;
+ curBufPos = 0;
+ }
+} /* readBuf */
+
+
+/* returns True on success */
+Boolean getChar(char *achar)
+{
+ if (havePutBack)
+ {
+ *achar = putBack;
+ havePutBack = False;
+ return(True);
+ }
+
+ if (curBufPos == curBufLen)
+ fillBuf();
+
+ if (endOfFile)
+ return (False);
+
+ *achar = readBuf[curBufPos++];
+ return (True);
+}
+
+
+void putCharBack(char c)
+{
+ havePutBack = True;
+ putBack = c;
+}
+
+
+void openFile(char *fn)
+{
+ /* open the specified file */
+ fd = open(fn, O_RDONLY);
+ if (fd == -1)
+ {
+ fprintf(stderr,"unable to open file %s\n", fn);
+ exit(1);
+ }
+}
+
+/* should call this once for each file read */
+void initBuffer()
+{
+ /* initialize length and pointer */
+ curBufPos = 0;
+ curBufLen = 0;
+ havePutBack = False;
+ endOfFile = False;
+}
+
+void addCharToLine(char c, char *line, int *lineLen)
+{
+ if (*lineLen < MAX_DESCRIPTION_LEN)
+ line[(*lineLen)++] = c;
+ else
+ fprintf(stderr,"warning: description line truncated\n");
+}
+
+
+/*
+ *********************************************************************
+ * these routines are (more) specific to reading the fasta file format
+ *********************************************************************
+ */
+
+
+/*
+ * pick up a non-blank line from the file, presumably description.
+ * truncates all leading blanks and/or blank lines
+ */
+Boolean getNonBlankLine(char *line)
+{
+ Boolean stop, nonBlank;
+ char c;
+ int lineLen;
+
+ lineLen = 0;
+ stop = False;
+ nonBlank = False; /* will be set by any non whitespace char */
+ while ((! endOfFile) && (! stop))
+ if (getChar(&c))
+ if (c == '\n')
+ stop = nonBlank; /* stop if have anything. don't save eol char. */
+ else
+ if (nonBlank)
+ /* add it to line no matter what */
+ addCharToLine(c,line,&lineLen);
+ else if ((c != ' ') && (c != '\t'))
+ {
+ /* only non whitespace will start the line */
+ nonBlank = True;
+ addCharToLine(c,line,&lineLen);
+ }
+}
+
+
+/* load the sequence struct with comment line and bases */
+SeqStructPtr getSeq(char *fname)
+{
+ SeqStructPtr newSeqP;
+ Boolean endOfSeq;
+ char c;
+
+ if (endOfFile) return ((SeqStructPtr )0); /* bombproofing */
+
+ /* malloc a new seq */
+ if (! (newSeqP = (SeqStructPtr )malloc(sizeof(SeqStruct)) ) )
+ {
+ fprintf(stderr,"unable to malloc() memory for sequence.\n");
+ exit(1);
+ }
+ /* clear mem */
+ memset( (void *)newSeqP, '\0', sizeof(SeqStruct));
+
+ /* pick up description line */
+ if (! getNonBlankLine(newSeqP->descStr) )
+ {
+ free(newSeqP);
+ return ((SeqStructPtr )0);
+ }
+
+ /* did it start correctly ? */
+ if (newSeqP->descStr[0] != '>')
+ {
+ fprintf(stderr,"format error in input file: missing '>'\n");
+ exit(1);
+ }
+
+ endOfSeq = False;
+ while ((!endOfFile) && (!endOfSeq))
+ {
+ if (getChar(&c))
+ {
+ if (c == '>')
+ {
+ /* hit new sequence */
+ endOfSeq = True;
+ putCharBack(c);
+ }
+ else if (((c >= 'A') && (c <= 'Z')) ||
+ ((c >= 'a') && (c <= 'z')) || (c == '-'))/* bogus test, chris */
+ /* have nucleotide */
+ newSeqP->seqStr[newSeqP->seqLen++] = toupper(c);
+ else if ((c != '\n') && (c != ' ') && (c != '\t') && (c != '#') && (c != '$') && (c != '*') && (c != '?') && (c != '^'))
+ {
+ /* wierd shit in file. bail. */
+ fprintf(stderr,">bad char in sequence, %c\n",c);
+ exit(1);
+ }
+ }
+ }
+
+ if (! newSeqP->seqLen)
+ {
+ fprintf(stderr,"? Null sequence encountered in file %s (ignored)\n",fname);
+ fprintf(stderr," %s\n", newSeqP->descStr);
+ free(newSeqP);
+ return ((SeqStructPtr )0);
+ }
+
+ return(newSeqP);
+} /* getSeq */
+
+
+/* for debugging. dump entire seq to stdout. */
+#ifdef DEBUG_SPUTNIK
+void dumpSeq(SeqStructPtr seqP)
+{
+ int i, charsOnLine;
+
+ fprintf(stdout,"%s\n", seqP->descStr);
+ fprintf(stdout,"Sequence (length = %d):\n", seqP->seqLen);
+ i = 0;
+ charsOnLine = 0;
+ while (i < seqP->seqLen)
+ {
+ if (charsOnLine == MAX_OUT_LINE_CHARS)
+ {
+ fprintf(stdout,"\n");
+ charsOnLine = 1;
+ }
+ else
+ charsOnLine++;
+ fprintf(stdout,"%c", seqP->seqStr[i++]);
+ }
+ fprintf(stdout,"\n");
+} /* dumpSeq */
+#endif /* DEBUG_SPUTNIK */
+
+/* dump the matched seq & stats to stdout */
+void dumpMatch(SeqStructPtr seqP,
+ MatchStructPtr matchP,
+ Boolean anyMatchThisSeq)
+{
+ int i, charsOnLine;
+
+ if (! anyMatchThisSeq)
+ fprintf(stdout,"%s\n", seqP->descStr);
+
+ fprintf(stdout,"%s %d : %d -- length %d score %d\n",
+ repeatName[matchP->testLen],
+ matchP->curPos+1,
+ matchP->testPos,
+ matchP->testPos - matchP->curPos,
+ matchP->curScore);
+
+#ifdef DEBUG_SPUTNIK
+ fprintf(stdout,"mis = %d, del = %d, ins = %d\n",
+ matchP->missense,
+ matchP->deletions,
+ matchP->insertions);
+#endif
+
+ i = matchP->curPos;
+ charsOnLine = 0;
+ while (i < matchP->testPos)
+ {
+ if (charsOnLine == MAX_OUT_LINE_CHARS)
+ {
+ fprintf(stdout,"\n");
+ charsOnLine = 1;
+ }
+ else
+ charsOnLine++;
+ fprintf(stdout,"%c", seqP->seqStr[i++]);
+ }
+ fprintf(stdout,"\n");
+
+#ifdef DEBUG_SPUTNIK
+ i = 0;
+ charsOnLine = 0;
+ while (i < (matchP->testPos - matchP->curPos))
+ {
+ if (charsOnLine == MAX_OUT_LINE_CHARS)
+ {
+ fprintf(stdout,"\n");
+ charsOnLine = 1;
+ }
+ else
+ charsOnLine++;
+ if (matchP->errCodes[i] == '\0')
+ fprintf(stdout," ");
+ else
+ fprintf(stdout,"%c", matchP->errCodes[i]);
+ i++;
+ }
+ fprintf(stdout,"\n");
+#endif
+} /* dumpMatch */
+
+
+Boolean testForNRepeat(SeqStructPtr seqP,
+ MatchStructPtr matchP)
+{
+ MatchStruct curMatch, recover, bestSoFar, bestOfABadLot;
+
+ /* save matchP in case we fail altogether. */
+ copyMSPtr(&curMatch, matchP);
+ /* keep track of the best score and return that if over thresh. */
+ copyMSPtr(&bestSoFar, matchP);
+
+ while ( (curMatch.testPos < seqP->seqLen) /* anything to test */
+ && (curMatch.curScore > MATCH_FAIL_SCORE) ) /* above fail threshold */
+ {
+ /* test a base */
+ if (seqP->seqStr[curMatch.curPos+curMatch.testCtr]
+ == seqP->seqStr[curMatch.testPos])
+ {
+ /* we matched. this is easy. */
+ curMatch.curScore += EXACT_MATCH_POINTS; /* score your points */
+ curMatch.testPos++; /* advance the downstream test position */
+ bumpTestCtr(&curMatch); /* advance pos in the (presumed) repeating seq */
+ }
+ else if ((seqP->seqStr[curMatch.testPos] == 'N') || (seqP->seqStr[curMatch.testPos] == '-'))
+ {
+ /* don't call it wrong, but no credit either */
+ curMatch.testPos++; /* advance the downstream test position */
+ bumpTestCtr(&curMatch); /* advance pos in the (presumed) repeating seq */
+ }
+ else
+ {
+ /* no match. take the score penalty, but keep going (maybe). */
+ curMatch.curScore += ERROR_MATCH_POINTS;
+ curMatch.testPos++; /* advance the downstream test position */
+ bumpTestCtr(&curMatch); /* advance pos in seq */
+ /* is the score too bad to continue, or are we
+ already too deep? */
+ if ( (curMatch.curScore > MATCH_FAIL_SCORE)
+ && (curMatch.depth < MAX_RECURSION) )
+ {
+ /* try simple missense */
+ copyMSPtr(&recover,&curMatch);
+ if ((recover.testPos - recover.curPos) < MAX_ERRCODES)
+ recover.errCodes[recover.testPos - recover.curPos -1] = 'M';
+ recover.missense++;
+ recover.depth++;
+ (void )testForNRepeat(seqP,&recover);
+ copyMSPtr(&bestOfABadLot,&recover);
+
+ /* try deletion */
+ copyMSPtr(&recover,&curMatch);
+ if ((recover.testPos - recover.curPos) < MAX_ERRCODES)
+ recover.errCodes[recover.testPos - recover.curPos -1] = 'D';
+ recover.testPos--; /* DON'T advance downstream */
+ recover.deletions++;
+ recover.depth++;
+ (void )testForNRepeat(seqP,&recover);
+ if (recover.curScore > bestOfABadLot.curScore)
+ copyMSPtr(&bestOfABadLot,&recover);
+
+ /* try insertion */
+ copyMSPtr(&recover,&curMatch);
+ if ((recover.testPos - recover.curPos) < MAX_ERRCODES)
+ recover.errCodes[recover.testPos - recover.curPos -1] = 'I';
+ /* RETEST for this base in the repeating seq */
+ if (recover.testCtr == 0)
+ recover.testCtr = recover.testLen - 1;
+ else
+ recover.testCtr--;
+ recover.insertions++;
+ recover.depth++;
+ (void )testForNRepeat(seqP,&recover);
+ if (recover.curScore > bestOfABadLot.curScore)
+ copyMSPtr(&bestOfABadLot,&recover);
+
+ /* take the best of a bad lot */
+ bestOfABadLot.depth--; /* dec recursion counter */
+ copyMSPtr(&curMatch, &bestOfABadLot);
+ } /* it was worth carrying on */
+ } /* no match, found best of bad lot */
+
+ /* whatever happened, the best we could do is now in matchP */
+ if (curMatch.curScore > bestSoFar.curScore)
+ copyMSPtr(&bestSoFar, &curMatch);
+
+ } /* while loop to test a single base */
+
+ /* for whatever reason, we've stopped searching for more of this
+ putative repeat. if there were any matches that passed
+ the global threshold, return the best of them. note that this
+ has the effect of NOT advancing the pointer(s) if nothing
+ rang the bell. remember that we will test the same position
+ for ntide repeats of several different lengths. */
+ if (bestSoFar.curScore > MATCH_MIN_SCORE)
+ {
+ copyMSPtr(matchP, &bestSoFar);
+ return(True);
+ }
+ return(False); /* the whole thing was a waste of time */
+} /* testForNRepeat */
+
+
+/*
+ * returns True if the sequence we want to look for repeats of is
+ *
+ * a) all the same base (i.e. 'AAA' or 'GG'). This filters out
+ * single nucleotide repeats
+ *
+ * b) conains 'N'. we search against these, but don't use them
+ * as wildcards.
+ */
+Boolean ignoreSeq(SeqStructPtr seqP,
+ MatchStructPtr matchP)
+{
+ int i;
+
+ /* firstly, never search for any pattern that contains N */
+ for (i = 0; i < matchP->testLen; i++)
+ if ((seqP->seqStr[matchP->curPos+i] == 'N') || (seqP->seqStr[matchP->curPos+i] == '-'))
+ return(True);
+
+ /* now test for mononucleotide repeat. other tests may get
+ added, in which case this one will beed to be changed. */
+ for (i = 1; i < matchP->testLen; i++)
+ if (seqP->seqStr[matchP->curPos] != seqP->seqStr[matchP->curPos+i])
+ return(False); /* they're not all the same */
+ return (False); /* they ARE all same:changed by Guru to allow mononucleotide repeats */
+}
+
+
+void findRepeats(SeqStructPtr seqP)
+{
+ int curPos;
+ Boolean anyMatchThisSeq, matchAtThisPos;
+ MatchStruct match;
+
+ memset( (char *)&match, 0, sizeof(MatchStruct) ); /* clear match struct */
+
+ anyMatchThisSeq = False; /* avoid dumping description more than once. */
+ /* loop on all positions in the sequence. note that a match
+ will advance curPos past all matching chars to the first
+ unmatched char. */
+ while ( match.curPos <= seqP->seqLen)
+ {
+ /* now loop on all the different lengths of repeats we're
+ looking for (i.e. di, tri, tetra nucleotides. if we
+ find a match at a shorter repeat length, forego testing
+ for longer lengths. */
+ match.testLen = MIN_UNIT_LENGTH;
+ matchAtThisPos = False;
+ while ((match.testLen <= MAX_UNIT_LENGTH) && (!matchAtThisPos))
+ {
+ /* initialize the state of the match */
+ match.curScore = 0; /* no points yet */
+ match.testCtr = 0; /* no chars tested yet */
+ match.testPos = match.curPos + match.testLen;
+ match.insertions = 0;
+ match.deletions = 0;
+ match.missense = 0;
+ /* there are some things we don't want to test for */
+ if (! ignoreSeq(seqP,&match))
+ matchAtThisPos = testForNRepeat(seqP, &match);
+ else
+ matchAtThisPos = False;
+ if (! matchAtThisPos) match.testLen++;
+ }
+
+ if (matchAtThisPos)
+ {
+ dumpMatch(seqP,&match,anyMatchThisSeq);
+ anyMatchThisSeq |= matchAtThisPos;
+ match.curPos = match.testPos;
+ }
+ else
+ match.curPos++; /* no, so advance to next base. */
+ }
+}
+
+
+main(int argc, char* argv[])
+{
+ SeqStructPtr seqP;
+ int count;
+
+ if (argc != 2)
+ {
+ fprintf(stderr,"Usage: %s <fasta format sequence file name>\n", argv[0]);
+ exit(1);
+ }
+
+ openFile(argv[1]);
+
+ initBuffer();
+
+ count = 0;
+ while (! endOfFile)
+ if (seqP = getSeq(argv[1]))
+ {
+#ifdef DEBUG_SPUTNIK
+ fprintf(stdout,"processing sequence %d\n", count++);
+#endif
+ /* dumpSeq(seqP); */
+ findRepeats(seqP);
+ free((void *)seqP);
+ }
+}