libStatGen Software 1
Loading...
Searching...
No Matches
SmithWaterman.h
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#if !defined(_SMITH_WATERMAN_)
19#define _SMITH_WATERMAN_
20
21#include <string.h> // for inline use of strcat, etc
22#include <limits.h> // for INT_MAX
23#include <stdint.h> // for uint32_t and friends
24
25//
26// This file implements a bi-directional, banded Smith Waterman matching
27// algorithm.
28//
29// The design is dictated by several observations:
30//
31// - building the full matrix H for the read and reference is slow,
32// so we perform it only for a band down the diagonal, thus speeding
33// up the algorithm at the cost of reduced detection of insert/deletes.
34//
35// - we must minimize data copying
36//
37// - we must have the ability to test the algorithm quickly and simply,
38// hence we implement the class as a template so that this file doesn't
39// have to depend on Karma's GenomeSequence object, which is a relatively
40// heavy weight object to do testing against.
41//
42// - because Karma uses index words to determine match candidate locations
43// across the genome, and because we use the banded Smith Waterman approach,
44// we must provide bi-directional Smith Waterman matching. See example below.
45//
46// To fully understand the examples below, make sure you understand
47// Phred quality scores, CIGAR strings, and pattern matching.
48//
49// Simple Functional Examples:
50//
51// Given a read, a read quality and a reference, we want to obtain some
52// measure of how well that read matches the reference at that given location.
53// So, we have:
54// Read: "ATCG"
55// Quality: "$$$$" (Phred scores - all are decimal 3)
56// Reference: "ATCG"
57// We expect: sumQ = 0, and Cigar "4M"
58//
59// Complex Functional Examples:
60// Read: "AATCG"
61// Quality: "$$$$$" (Phred scores - all are decimal 3)
62// Reference: "ATCG"
63// We expect: sumQ = 3, and Cigar "1M1I3M"
64//
65// Backwards matching:
66// It is harder for me to construct a clear example, so imagine a read with
67// cumulative inserts or deletes that sum up to a number larger than the
68// width of the band along the diagonal. If we perform SW in the 'wrong'
69// direction, we will lose the read entirely, whereas if we start from the
70// end that is known to be matching, we may obtain a good match for the bulk
71// of the read.
72//
73// For example, you could imagine a read where the first 10 bases had a mess
74// of inserts, but then was clean for the next 100 bases. You'd want it
75// to match as many of the good bases as practical, even if you knew you were
76// losing information at the end.
77//
78
79#include <assert.h>
80#include <stdio.h>
81#include <stdlib.h>
82#include <string>
83#include <iostream>
84#include <iomanip>
85#include <utility>
86#include <vector>
87
88#include "CigarRoller.h"
89#include "Generic.h"
90
91using std::cout;
92using std::cin;
93using std::cerr;
94using std::setw;
95using std::endl;
96using std::pair;
97using std::vector;
98
99#if !defined(MAX)
100#define MAX(x,y) ((x)>(y) ? (x) : (y))
101#endif
102#if !defined(MIN)
103#define MIN(x,y) ((x)<(y) ? (x) : (y))
104#endif
105
106//
107// Implement the core of Smith Waterman as described in:
108// http://en.wikipedia.org/wiki/Smith_waterman
109//
110// The only variation from the basic SW algorithm is the
111// use of a banded approach - to limit the algorithm to
112// a band along the diagonal. This limits the maximum
113// additive number of indels, but allows an O(c*M) approach,
114// where c is the constant max number of indels allowed.
115//
116// This is implemented as function templates because for testing, it is easier
117// to use character arrays. In our mapper, we will be using a
118// combination of a String object for the read and the genome object
119// as the reference. Both can be indexed, and give a character (base)
120// value, but the code would be duplicated if we implement SW for
121// each type of argument.
122//
123// Htype -> the type of the array H cell (8 or 16 bit unsigned int)
124// Atype -> the read string type (must have Atype::operator [] defined)
125//
126template <int maxReadLengthH, int maxReferenceLengthH, typename HCellType, typename Atype, typename Btype, typename QualityType, typename readIndexType, typename referenceIndexType>
128{
129public:
130
131 //
132 // XXX in theory, this weight should be sensitive
133 // to the quality of the base, and should have
134 // an appropriate cost for an indel, as well.
135 //
136 // I think we need to get rid of this, since it is
137 // basically wrong for our needs.
138 //
139 struct weight
140 {
141 weight()
142 {
143 match=2;
144 misMatch=-1;
145 insert=-1;
146 del=-1;
147 };
148
149 int match;
150 int misMatch;
151 int insert;
152 int del;
153 };
154
155 HCellType H[maxReadLengthH][maxReferenceLengthH];
156 Atype *A;
157 Btype *B;
158 QualityType *qualities;
159
160 int m,n;
161 readIndexType MOffset; // constant offset to m (read)
162 referenceIndexType NOffset; // constant offset to n (reference)
163 weight w;
164 int allowedInsertDelete;
165 int direction;
166 int gapOpenCount;
167 int gapCloseCount;
168 int gapExtendCount;
169 vector<pair<int,int> > alignment;
170 void clearAlignment()
171 {
172 alignment.clear();
173 }
174
175 HCellType maxCostValue; // max Cost value in H
176 pair<int,int> maxCostPosition; // row/col of max cost value in H
177
178 //
179 // Clear the member variables only.
180 // To clear H, call clearH().
181 //
182 // In theory, clear() plus set() should be sufficient to
183 // get a clean run, but I haven't tested this extensively.
184 //
185 void clear()
186 {
187 maxCostPosition.first = 0;
188 maxCostPosition.second = 0;
189 A = NULL;
190 B = NULL;
191 qualities = NULL;
192 m = 0;
193 n = 0;
194 MOffset = 0;
195 NOffset = 0;
196 allowedInsertDelete = 0;
197 direction = 0;
198 gapOpenCount = 0;
199 gapCloseCount = 0;
200 gapExtendCount = 0;
201 }
202
203 // caller will be using set* methods to set everything up.
205 {
206 clear();
207 clearH();
208 }
209
210 // construct with everything and the kitchen sink:
212 Atype *A,
213 QualityType *qualities,
214 Btype *B,
215 int m,
216 int n,
217 int allowedInsertDelete = INT_MAX,
218 int direction = 1,
219 readIndexType MOffset = 0,
220 referenceIndexType NOffset = 0):
221 A(A),
222 qualities(qualities),
223 B(B),
224 m(m),
225 n(n),
226 allowedInsertDelete(allowedInsertDelete),
227 direction(direction),
228 MOffset(MOffset),
229 NOffset(NOffset),
230 maxCostValue((HCellType) 0)
231 {
232 }
233
234 void setRead(Atype *A)
235 {
236 this->A = A;
237 }
238 void setReadQuality(QualityType *qualities)
239 {
240 this->qualities = qualities;
241 }
242 void setReference(Btype *B)
243 {
244 this->B = B;
245 }
246
247 // Caller may wish to index into the read to do the matching against
248 // only part of the read.
249 // NB: the quality length and offset are the same as the read.
250 void setReadLength(int m)
251 {
252 this->m = m;
253 }
254 void setReadOffset(readIndexType MOffset)
255 {
256 this->MOffset = MOffset;
257 }
258
259 // The reference is typically long, and not necessarily a char *,
260 // so we provide an offset here. If it were always a char *,
261 // we'd just modify the caller to point directly at the reference
262 // location.
263 void setReferenceLength(int n)
264 {
265 this->n = n;
266 }
267 void setReferenceOffset(referenceIndexType NOffset)
268 {
269 this->NOffset = NOffset;
270 }
271
272 //
273 // Configuration: how wide is the band on the diagonal?
274 // We should keep this small -- 1, 2, 3 or similar. If
275 // the value is default (INT_MAX), then the full matrix
276 // will be built, which is fine, but quite slow.
277 //
278 // If this paramater is made smaller than when a previous
279 // call to populateH was made, clearH will also need to be called.
280 //
281 void setAllowedInsertDelete(int allowedInsertDelete = INT_MAX)
282 {
283 this->allowedInsertDelete = allowedInsertDelete;
284 }
285
286 //
287 // Configuration: which end do we begin performing SW matching
288 // from? We need this because of index 'anchors' in the karma
289 // matcher.
290 void setDirection(int direction)
291 {
292 this->direction = direction;
293 }
294
295 void clearH()
296 {
297 memset(H, 0, sizeof(H));
298 }
299
300 void populateH()
301 {
302
303 maxCostValue = 0;
304
305 for (int i=1; i<=m ; i++)
306 {
307
308 // implement a banded Smith-Waterman approach:
309 int low = MAX(1, i - allowedInsertDelete);
310 int high = MIN(n, i + allowedInsertDelete);
311
312 for (int j=low; j<=high ; j++)
313 {
314 HCellType c;
315 c = 0;
316 if (direction>0) c = MAX(c, H[i-1][j-1] + (((*A)[MOffset + i-1]==(*B)[NOffset + j-1]) ? w.match : w.misMatch));
317 else c = MAX(c, H[i-1][j-1] + (((*A)[MOffset + m-i+0]==(*B)[NOffset + n-j+0]) ? w.match : w.misMatch));
318 c = MAX(c, H[i-1][j] + w.del);
319 c = MAX(c, H[i][j-1] + w.insert);
320 H[i][j] = c;
321 if (c>maxCostValue)
322 {
323 maxCostValue = c;
324 maxCostPosition.first = i;
325 maxCostPosition.second = j;
326 }
327 }
328 }
329 }
330
331//
332// Given the matrix H as filled in by above routine, print it out.
333//
334 void printH(bool prettyPrint = true)
335 {
336 // print the scoring matrix:
337 for (int i=-1; i<=m ; i++)
338 {
339 for (int j=-1; j<=n ; j++)
340 {
341 if (prettyPrint) cout << setw(3);
342 if (i==-1 && j==-1)
343 {
344 if (prettyPrint) cout << " ";
345 else cout << "\t";
346 }
347 else if (j==-1)
348 {
349 if (!prettyPrint) cout << "\t";
350 if (i==0) cout << "-";
351 else cout << (*A)[MOffset + direction>0 ? i-1 : m - i];
352 }
353 else if (i==-1)
354 {
355 if (!prettyPrint) cout << "\t";
356 if (j==0) cout << "-";
357 else cout << (*B)[NOffset + direction>0 ? j-1 : n - j];
358 }
359 else
360 {
361 if (!prettyPrint) cout << "\t";
362 cout << H[i][j];
363 }
364 }
365 cout << endl;
366 }
367 }
368
369 void debugPrint(bool doPrintH = true)
370 {
371 if (doPrintH) printH();
372 cout << "maxCostPosition = " << maxCostPosition << std::endl;
373 if (alignment.empty()) cout << "alignment vector is empty.\n";
374 else
375 {
376 cout << "alignment vector:\n";
377 for (vector<pair<int,int> >::iterator i=alignment.begin(); i < alignment.end(); i++)
378 {
379 cout << (i - alignment.begin()) << ": " << *i << "\n";
380 }
381 }
382 cout << std::endl;
383 }
384
385//
386// Given the Matrix H as filled in by populateH, fill in the
387// alignment vector with the indeces of the optimal match.
388//
389 void populateAlignment()
390 {
391 alignment.clear();
392 int i = m, j = n;
393
394 i = maxCostPosition.first;
395 j = maxCostPosition.second;
396
397 //
398 // Stop when we either reach zero cost cell or
399 // when we reach the upper left corner of H.
400 // A zero cost cell to the lower right means we
401 // are soft clipping that end.
402 //
403 while (H[i][j] > 0 || (i>0 && j>0))
404 {
405// #define DEBUG_ALIGNMENT_VECTOR
406#if defined(DEBUG_ALIGNMENT_VECTOR)
407 cout << "alignment.push_back(" << i << ", " << j << ")" << endl;
408#endif
409 alignment.push_back(pair<int,int>(i,j));
410 if (H[i-1][j-1]>=H[i-1][j] && H[i-1][j-1]>=H[i][j-1])
411 {
412 // diagonal upper left cell is biggest
413 i--;
414 j--;
415 }
416 else if (H[i-1][j] < H[i][j-1])
417 {
418 // upper cell is biggest
419 j--;
420 }
421 else
422 {
423 // left cell is biggest
424 i--;
425 }
426 }
427 alignment.push_back(pair<int,int>(i,j));
428#if defined(DEBUG_ALIGNMENT_VECTOR)
429 cout << "alignment.push_back(" << i << ", " << j << ")" << endl;
430 cout << "alignment.size(): " << alignment.size() << endl;
431#endif
432 }
433
434 //
435 // Compute the sumQ for a read that has been mapped using populateH().
436 //
437 // In the simplest case, the read lies on the diagonal of the
438 // matrix H, which means it has only matches and mismatches:
439 // no inserts or deletes.
440 //
441 // However, in general, it is possible to have 0 or more insert,
442 // delete, mismatch and soft clipped bases in the read, so we
443 // need to accomodate all of those variations.
444 //
445 // XXX finish this.
446 //
447
448 int getSumQ()
449 {
450 if (direction>0) return getSumQForward();
451 else return getSumQBackward();
452 }
453
454 int getSumQForward()
455 {
456 int sumQ = 0;
457 vector<pair<int,int> >::reverse_iterator i;
458
459 for (i=alignment.rbegin(); i < alignment.rend() - 1; i++)
460 {
461// #define DEBUG_GETSUMQ
462#if defined(DEBUG_GETSUMQ)
463 cout << *i << ": ";
464#endif
465 if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second + 1))
466 {
467 // match/mismatch
468#if defined(DEBUG_GETSUMQ)
469 cout << "Match/Mismatch";
470#endif
471 if ((*A)[MOffset + (*i).first] != (*B)[NOffset + (*i).second])
472 sumQ += (*qualities)[MOffset + (*i).first] - '!';
473 }
474 else if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second))
475 {
476 // insert?
477#if defined(DEBUG_GETSUMQ)
478 cout << "Insert";
479#endif
480 sumQ += 50;
481 }
482 else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second + 1))
483 {
484 // delete?
485#if defined(DEBUG_GETSUMQ)
486 cout << "Delete";
487#endif
488 sumQ += 50;
489 }
490 }
491#if defined(DEBUG_GETSUMQ)
492 cout << endl;
493#endif
494 return sumQ;
495 }
496
497 int getSumQBackward()
498 {
499 int sumQ = 0;
500 vector<pair<int,int> >::iterator i;
501
502 for (i=alignment.begin(); i < alignment.end() - 1; i++)
503 {
504#if defined(DEBUG_GETSUMQ)
505 cout << *i << ": ";
506#endif
507 if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second - 1))
508 {
509 // match/mismatch
510#if defined(DEBUG_GETSUMQ)
511 cout << "Match/Mismatch";
512#endif
513 if ((*A)[MOffset + m - (*i).first] != (*B)[NOffset + n - (*i).second])
514 sumQ += (*qualities)[MOffset + m - (*i).first] - '!';
515 }
516 else if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second))
517 {
518 // insert?
519#if defined(DEBUG_GETSUMQ)
520 cout << "Insert?";
521#endif
522 sumQ += 50;
523 }
524 else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second - 1))
525 {
526 // delete?
527#if defined(DEBUG_GETSUMQ)
528 cout << "Delete?";
529#endif
530 sumQ += 50;
531 }
532 }
533#if defined(DEBUG_GETSUMQ)
534 cout << endl;
535#endif
536 return sumQ;
537 }
538
539#if 0
540 int getSumQ()
541 {
542 vector<pair<int,int> >::reverse_iterator i;
543 int sumQ = 0;
544 for (i=alignment.rbegin(); i < alignment.rend() - 1; i++)
545 {
546#if defined(DEBUG_ALIGNMENT_VECTOR)
547 cout << "i: " << i - alignment.rbegin() << *i << endl;
548#endif
549 // XXX NOT THIS SIMPLE - need to account for indels
550 if (direction>0)
551 {
552 if ((*A)[MOffset + (*i).first] != (*B)[NOffset + (*i).second])
553 sumQ += (*qualities)[MOffset + (*i).first] - '!';
554 }
555 else
556 {
557 // m and n are sizes, first and second are 1 based offsets
558 if ((*A)[MOffset + m - (*i).first] != (*B)[NOffset + n - (*i).second])
559 sumQ += (*qualities)[MOffset + m - (*i).first] - '!';
560 }
561 }
562 return sumQ;
563 }
564#endif
565
566 //
567 // Append cigar operations to an existing cigar list.
568 //
569 // XXX we no longer need the CigarRoller += methods.
570 //
571 // In this case, the Smith Waterman array H was created from
572 // the read and reference in the forward direction.
573 //
574 void rollCigarForward(CigarRoller &cigar)
575 {
576 vector<pair<int,int> >::reverse_iterator i;
577
578 for (i=alignment.rbegin(); i < alignment.rend() - 1; i++)
579 {
580// #define DEBUG_CIGAR
581#if defined(DEBUG_CIGAR)
582 cout << *i << ": ";
583#endif
584 if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second + 1))
585 {
586 // match/mismatch
587#if defined(DEBUG_CIGAR)
588 cout << "Match/Mismatch";
589#endif
590 cigar.Add(CigarRoller::match, 1);
591 }
592 else if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second))
593 {
594 // insert?
595#if defined(DEBUG_CIGAR)
596 cout << "Insert";
597#endif
598 cigar.Add(CigarRoller::insert, 1);
599 }
600 else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second + 1))
601 {
602 // delete?
603#if defined(DEBUG_CIGAR)
604 cout << "Delete";
605#endif
606 cigar.Add(CigarRoller::del, 1);
607 }
608 }
609 // if there is soft clipping, allow for it (::Add will
610 // ignore if the count is 0):
611 cigar.Add(CigarRoller::softClip, getSoftClipCount());
612#if defined(DEBUG_CIGAR)
613 cout << endl;
614#endif
615 }
616
617 //
618 // Append cigar operations to an existing cigar list.
619 //
620 // XXX we no longer need the CigarRoller += methods.
621 //
622 // In this case, the Smith Waterman array H was created from
623 // the read and reference in the reverse direction.
624 //
625 void rollCigarBackward(CigarRoller &cigar)
626 {
627 vector<pair<int,int> >::iterator i;
628
629 // if there is soft clipping, allow for it (::Add will
630 // ignore if the count is 0):
631 cigar.Add(CigarRoller::softClip, getSoftClipCount());
632
633 i = alignment.begin();
634
635 for (i=alignment.begin();
636 i < alignment.end() - 1;
637 i++)
638 {
639#if defined(DEBUG_CIGAR)
640 cout << *i << ": ";
641#endif
642 if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second - 1))
643 {
644 // match/mismatch
645#if defined(DEBUG_CIGAR)
646 cout << "Match/Mismatch";
647#endif
648 cigar.Add(CigarRoller::match, 1);
649 }
650 else if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second))
651 {
652 // insert?
653#if defined(DEBUG_CIGAR)
654 cout << "Insert?";
655#endif
656 cigar.Add(CigarRoller::insert, 1);
657 }
658 else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second - 1))
659 {
660 // delete?
661#if defined(DEBUG_CIGAR)
662 cout << "Delete?";
663#endif
664 cigar.Add(CigarRoller::del, 1);
665 }
666 }
667#if defined(DEBUG_CIGAR)
668 cout << endl;
669#endif
670 }
671
672 //
673 // Given the direction, and the alignment vector, obtain
674 // the soft clip (the mismatches at the end of the string which
675 // can in Smith Waterman matching be considered as a separate case).
676 //
677 // NB: be careful that the backward case is correct - it passes
678 // all of two built in tests, but it may not be generally correct.
679 //
680 int getSoftClipCount()
681 {
682 if (direction>0)
683 {
684 // invariant: assert(maxCostPosition == alignment.front());
685 return m - maxCostPosition.first;
686 }
687 else
688 {
689// return alignment.back().first; // nope, this always returns 0
690 // XXX BE CAREFUL... not sure this is right, either.
691// return n - maxCostPosition.second;
692 return m - maxCostPosition.first;
693 }
694 }
695
696 void rollCigar(CigarRoller &cigar)
697 {
698 if (direction>0) rollCigarForward(cigar);
699 else rollCigarBackward(cigar);
700 }
701
702 //
703 // all in one local alignment:
704 //
705 // Steps:
706 // 1 - do internal setup
707 // 2 - populate H
708 // 3 - create alignment vector (this chooses the best path)
709 // 4 - compute sumQ
710 // 5 - compute the cigar string
711 // 6 - compute and update the softclip for the read
712 //
713 bool localAlignment(
714 uint32_t bandSize,
715 Atype &read,
716 readIndexType readLength,
717 QualityType &quality,
718 Btype &reference,
719 referenceIndexType referenceLength,
720 referenceIndexType referenceOffset,
721 CigarRoller &cigarRoller,
722 uint32_t &softClipCount,
723 referenceIndexType &cigarStartingPoint,
724 int &sumQ
725 )
726 {
727
728 clear();
729
730 cigarRoller.clear();
731
732 setDirection(+1);
733 setAllowedInsertDelete(bandSize);
734
735 setRead(&read);
736 setReadOffset(0);
737 setReadLength(readLength);
738
739 setReadQuality(&quality);
740
741 setReference(&reference);
742 setReferenceOffset(referenceOffset);
743 setReferenceLength(referenceLength);
744
745 populateH();
746
747 softClipCount = getSoftClipCount();
748
749 populateAlignment();
750
751 rollCigar(cigarRoller);
752
753 sumQ = getSumQ();
754
755 return false;
756
757 };
758
759};
760
761#endif
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object....
Definition CigarRoller.h:67
void Add(Operation operation, int count)
Append the specified operation with the specified count to this object.
void clear()
Clear this object so that it has no Cigar Operations.
@ del
deletion from the reference (the reference contains bases that have no corresponding base in the quer...
Definition Cigar.h:92
@ match
match/mismatch operation. Associated with CIGAR Operation "M"
Definition Cigar.h:89
@ insert
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition Cigar.h:91
@ softClip
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)....
Definition Cigar.h:94