libStatGen Software 1
Loading...
Searching...
No Matches
Pileup.h
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#ifndef __PILEUP_H__
19#define __PILEUP_H__
20
21#include <stdexcept>
22#include "SamFile.h"
23#include "PosList.h"
24
25
27{
28public:
29 static const int DEFAULT_WINDOW_SIZE = 1024;
30};
31
32template <class PILEUP_TYPE>
34{
35public:
36 bool operator() (PILEUP_TYPE& element)
37 {
38 element.analyze();
39 return(true);
40 }
41 void analyze(PILEUP_TYPE element)
42 {
43 element.analyze();
44 }
45};
46
47template <class PILEUP_TYPE>
48void defaultPileupAnalyze(PILEUP_TYPE& element)
49{
50 element.analyze();
51}
52
53
54/// Class to perform a pileup of all reads by position, assuming
55/// the reads are coordinate sorted.
56template <class PILEUP_TYPE,
57 class FUNC_CLASS = defaultPileup<PILEUP_TYPE> >
58class Pileup
59{
60public:
61 /// Constructor using the default maximum number of bases a read spans.
62 Pileup(const FUNC_CLASS& fp = FUNC_CLASS());
63
64 /// Constructor that sets the maximum number of bases a read spans.
65 /// This is the "window" the length of the buffer that holds
66 /// the pileups for each position until the read start has moved
67 /// past the position.
68 Pileup(int window, const FUNC_CLASS& fp = FUNC_CLASS());
69
70 /// Perform pileup with a reference.
71 Pileup(const std::string& refSeqFileName,
72 const FUNC_CLASS& fp = FUNC_CLASS());
73
74 /// Perform pileup with a reference and a specified window size.
75 Pileup(int window, const std::string& refSeqFileName,
76 const FUNC_CLASS& fp = FUNC_CLASS());
77
78 /// Destructor
79 virtual ~Pileup();
80
81 /// Performs a pileup on the specified file.
82 /// \param excludeFlag if specified, if any bit set in the exclude flag
83 /// is set in the record's flag, it will be dropped.
84 /// Defaulted to exclude:
85 /// * unmapped,
86 /// * not primary alignment
87 /// * failed platform/vendor quality check
88 /// * PCR or optical duplicate
89 /// \param includeFlag if specified, every bit must be set in the
90 /// record's flag for it to be included -
91 /// defaulted to 0, no bits are required to be set.
92 /// \return 0 for success and non-zero for failure.
93 virtual int processFile(const std::string& fileName,
94 uint16_t excludeFlag = 0x0704,
95 uint16_t includeFlag = 0);
96
97 /// Add an alignment to the pileup.
98 virtual void processAlignment(SamRecord& record);
99
100 /// Add only positions that fall within the specified region of the
101 /// alignment to the pileup and outside of the specified excluded positions.
102 /// \param record alignment to be added to the pileup.
103 /// \param startPos 0-based start position of the bases that should be
104 /// added to the pileup.
105 /// \param endPos 0-based end position of the bases that should be added
106 /// to the pileup (this position is not added).
107 /// Set to -1 if there is no end position to the region.
108 /// \param excludeList list of refID/positions to exclude from processing.
109 virtual void processAlignmentRegion(SamRecord& record,
110 int startPos,
111 int endPos,
112 PosList* excludeList = NULL);
113
114 /// Done processing, flush every position that is currently being stored
115 /// in the pileup.
117
118protected:
119 FUNC_CLASS myAnalyzeFuncPtr;
120
121 // Always need the reference position.
122 void addAlignmentPosition(int refPosition, SamRecord& record);
123
124
125 virtual void flushPileup(int refID, int refPosition);
126 void flushPileup(int refPosition);
127
128 // Get the position in the myElements container that is associated
129 // with the specified position. If the specified position cannot
130 // fit within the myElements container, -1 is returned.
131 int pileupPosition(int refPosition);
132
133 virtual void resetElement(PILEUP_TYPE& element, int position);
134 virtual void addElement(PILEUP_TYPE& element, SamRecord& record);
135 virtual void analyzeElement(PILEUP_TYPE& element);
136 virtual void analyzeHead();
137
138 std::vector<PILEUP_TYPE> myElements;
139
140 int pileupStart;
141 int pileupHead;
142 int pileupTail; // last used position
143 int pileupWindow;
144
145 int myCurrentRefID;
146
147 GenomeSequence* myRefPtr;
148};
149
150
151template <class PILEUP_TYPE, class FUNC_CLASS>
153 : myAnalyzeFuncPtr(fp),
154 myElements(),
155 pileupStart(0),
156 pileupHead(0),
157 pileupTail(-1),
158 pileupWindow(PileupHelper::DEFAULT_WINDOW_SIZE),
159 myCurrentRefID(-2),
160 myRefPtr(NULL)
161{
162 // Not using pointers since this is templated.
163 myElements.resize(pileupWindow);
164}
165
166
167template <class PILEUP_TYPE, class FUNC_CLASS>
168Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const FUNC_CLASS& fp)
169 : myAnalyzeFuncPtr(fp),
170 myElements(),
171 pileupStart(0),
172 pileupHead(0),
173 pileupTail(-1),
174 pileupWindow(window),
175 myCurrentRefID(-2),
176 myRefPtr(NULL)
177{
178 // Not using pointers since this is templated.
179 myElements.resize(window);
180}
181
182
183template <class PILEUP_TYPE, class FUNC_CLASS>
184Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const std::string& refSeqFileName, const FUNC_CLASS& fp)
185 : myAnalyzeFuncPtr(fp),
186 myElements(),
187 pileupStart(0),
188 pileupHead(0),
189 pileupTail(-1),
190 pileupWindow(PileupHelper::DEFAULT_WINDOW_SIZE),
191 myCurrentRefID(-2),
192 myRefPtr(NULL)
193{
194 myRefPtr = new GenomeSequence(refSeqFileName.c_str());
195
196 // Not using pointers since this is templated.
197 myElements.resize(pileupWindow);
198
199 PILEUP_TYPE::setReference(myRefPtr);
200}
201
202
203template <class PILEUP_TYPE, class FUNC_CLASS>
204Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const std::string& refSeqFileName, const FUNC_CLASS& fp)
205 : myAnalyzeFuncPtr(fp),
206 myElements(),
207 pileupStart(0),
208 pileupHead(0),
209 pileupTail(-1),
210 pileupWindow(window),
211 myCurrentRefID(-2),
212 myRefPtr(NULL)
213{
214 myRefPtr = new GenomeSequence(refSeqFileName.c_str());
215
216 // Not using pointers since this is templated.
217 myElements.resize(window);
218
219 PILEUP_TYPE::setReference(myRefPtr);
220}
221
222
223template <class PILEUP_TYPE, class FUNC_CLASS>
225{
226 flushPileup();
227 if(myRefPtr != NULL)
228 {
229 delete myRefPtr;
230 myRefPtr = NULL;
231 }
232}
233
234template <class PILEUP_TYPE, class FUNC_CLASS>
235int Pileup<PILEUP_TYPE, FUNC_CLASS>::processFile(const std::string& fileName,
236 uint16_t excludeFlag,
237 uint16_t includeFlag)
238{
239 SamFile samIn;
240 SamFileHeader header;
241 SamRecord record;
242
243 if(myRefPtr != NULL)
244 {
245 samIn.SetReference(myRefPtr);
246 }
247
248 if(!samIn.OpenForRead(fileName.c_str()))
249 {
250 fprintf(stderr, "%s\n", samIn.GetStatusMessage());
251 return(samIn.GetStatus());
252 }
253
254 if(!samIn.ReadHeader(header))
255 {
256 fprintf(stderr, "%s\n", samIn.GetStatusMessage());
257 return(samIn.GetStatus());
258 }
259
260 // The file needs to be sorted by coordinate.
262
263 // Iterate over all records
264 while (samIn.ReadRecord(header, record))
265 {
266 uint16_t flag = record.getFlag();
267 if(flag & excludeFlag)
268 {
269 // This record has an excluded flag set,
270 // so continue to the next one.
271 continue;
272 }
273 if((flag & includeFlag) != includeFlag)
274 {
275 // This record does not have all required flags set,
276 // so continue to the next one.
277 continue;
278 }
279 processAlignment(record);
280 }
281
282 flushPileup();
283
284 int returnValue = 0;
286 {
287 // Failed to read a record.
288 fprintf(stderr, "%s\n", samIn.GetStatusMessage());
289 returnValue = samIn.GetStatus();
290 }
291 return(returnValue);
292}
293
294
295template <class PILEUP_TYPE, class FUNC_CLASS>
297{
298 int refPosition = record.get0BasedPosition();
299 int refID = record.getReferenceID();
300
301 // Flush any elements from the pileup that are prior to this record
302 // since the file is sorted, we are done with those positions.
303 flushPileup(refID, refPosition);
304
305 // Loop through for each reference position covered by the record.
306 // It is up to the PILEUP_TYPE to handle insertions/deletions, etc
307 // that are related with the given reference position.
308 for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition)
309 {
310 addAlignmentPosition(refPosition, record);
311 }
312}
313
314
315template <class PILEUP_TYPE, class FUNC_CLASS>
317 int startPos,
318 int endPos,
319 PosList* excludeList)
320{
321 int refPosition = record.get0BasedPosition();
322 int refID = record.getReferenceID();
323
324 // Flush any elements from the pileup that are prior to this record
325 // since the file is sorted, we are done with those positions.
326 flushPileup(refID, refPosition);
327
328 // Check if the region starts after this reference starts. If so,
329 // we only want to start adding at the region start position.
330 if(startPos > refPosition)
331 {
332 refPosition = startPos;
333 }
334
335 // Loop through for each reference position covered by the record.
336 // It is up to the PILEUP_TYPE to handle insertions/deletions, etc
337 // that are related with the given reference position.
338 for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition)
339 {
340 // Check to see if we have gone past the end of the region, in which
341 // case we can stop processing this record. Check >= since the
342 // end position is not in the region.
343 if((endPos != -1) && (refPosition >= endPos))
344 {
345 break;
346 }
347
348 // Check to see if this position is in the exclude list.
349 bool addPos = true;
350 if(excludeList != NULL)
351 {
352 // There is an exclude list, so lookup the position.
353 if(excludeList->hasPosition(refID, refPosition))
354 {
355 // This position is in the exclude list, so don't add it.
356 addPos = false;
357 }
358 }
359 if(addPos)
360 {
361 addAlignmentPosition(refPosition, record);
362 }
363 }
364}
365
366
367template <class PILEUP_TYPE, class FUNC_CLASS>
369{
370 // while there are still entries between the head and tail, flush,
371 // but no need to flush if pileupTail == -1 because in that case
372 // no entries have been added
373 while ((pileupHead <= pileupTail) && (pileupTail != -1))
374 {
375 flushPileup(pileupHead+1);
376 }
377 pileupStart = pileupHead = 0;
378 pileupTail = -1;
379}
380
381
382// Always need the reference position.
383template <class PILEUP_TYPE, class FUNC_CLASS>
385 SamRecord& record)
386{
387 int offset = 0;
388 try{
389 offset = pileupPosition(refPosition);
390 }
391 catch(std::runtime_error& err)
392 {
393 const char* overflowErr = "Overflow on the pileup buffer:";
394 String errorMessage = err.what();
395 if(strncmp(err.what(), overflowErr, strlen(overflowErr)) == 0)
396 {
397
398 errorMessage += "\n\tPileup Buffer Overflow: recordName = ";
399 errorMessage += record.getReadName();
400 errorMessage += "; Cigar = ";
401 errorMessage += record.getCigar();
402 }
403 throw std::runtime_error(errorMessage.c_str());
404 }
405
406 if((offset < 0) || (offset >= pileupWindow))
407 {
408 std::cerr << "Pileup Buffer Overflow: position = " << refPosition
409 << "; refID = " << record.getReferenceID()
410 << "; recStartPos = " << record.get1BasedPosition()
411 << "; pileupStart = " << pileupStart
412 << "; pileupHead = " << pileupHead
413 << "; pileupTail = " << pileupTail;
414 }
415
416 addElement(myElements[offset], record);
417}
418
419
420template <class PILEUP_TYPE, class FUNC_CLASS>
421void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int refID, int position)
422{
423 // if new chromosome, flush the entire pileup.
424 if(refID != myCurrentRefID)
425 {
426 // New chromosome, flush everything.
427 flushPileup();
428 myCurrentRefID = refID;
429 }
430 else
431 {
432 // on the same chromosome, so flush just up to this new position.
433 flushPileup(position);
434 }
435}
436
437
438template <class PILEUP_TYPE, class FUNC_CLASS>
440{
441 // Flush up to this new position, but no reason to flush if
442 // pileupHead has not been set.
443 while((pileupHead < position) && (pileupHead <= pileupTail))
444 {
445 analyzeHead();
446
447 pileupHead++;
448
449 if(pileupHead - pileupStart >= pileupWindow)
450 pileupStart += pileupWindow;
451 }
452
453 if(pileupHead > pileupTail)
454 {
455 // All positions have been flushed, so reset pileup info
456 pileupHead = pileupStart = 0;
457 pileupTail = -1;
458 }
459}
460
461
462// Get the position in the myElements container that is associated
463// with the specified position. If the specified position cannot
464// fit within the myElements container, -1 is returned.
465template <class PILEUP_TYPE, class FUNC_CLASS>
467{
468 // Check to see if this is the first position (pileupTail == -1)
469 if(pileupTail == -1)
470 {
471 pileupStart = pileupHead = position;
472 // This is the first time this position is being used, so
473 // reset the element.
474 resetElement(myElements[0], position);
475 pileupTail = position;
476 return(0);
477 }
478
479
480 if((position < pileupHead) || (position > (pileupHead + pileupWindow)))
481 {
482 String errorMessage =
483 "Overflow on the pileup buffer: specifiedPosition = ";
484 errorMessage += position;
485 errorMessage += ", pileup buffer start position: ";
486 errorMessage += pileupHead;
487 errorMessage += ", pileup buffer end position: ";
488 errorMessage += pileupHead + pileupWindow;
489
490 throw std::runtime_error(errorMessage.c_str());
491 }
492
493 // int offset = position - pileupStart;
494 int offset = position - pileupStart;
495
496 if(offset >= pileupWindow)
497 {
498 offset -= pileupWindow;
499 }
500
501 // Check to see if position is past the end of the currently
502 // setup pileup positions.
503 while(position > pileupTail)
504 {
505 // Increment pileupTail to the next position since the current
506 // pileupTail is already in use.
507 ++pileupTail;
508
509 // Figure out the offset for this next position.
510 offset = pileupTail - pileupStart;
511 if(offset >= pileupWindow)
512 {
513 offset -= pileupWindow;
514 }
515
516 // This is the first time this position is being used, so
517 // reset the element.
518 resetElement(myElements[offset], pileupTail);
519 }
520
521 return(offset);
522}
523
524
525template <class PILEUP_TYPE, class FUNC_CLASS>
526void Pileup<PILEUP_TYPE, FUNC_CLASS>::resetElement(PILEUP_TYPE& element,
527 int position)
528{
529 element.reset(position);
530}
531
532
533template <class PILEUP_TYPE, class FUNC_CLASS>
534void Pileup<PILEUP_TYPE, FUNC_CLASS>::addElement(PILEUP_TYPE& element,
535 SamRecord& record)
536{
537 element.addEntry(record);
538}
539
540
541template <class PILEUP_TYPE, class FUNC_CLASS>
543{
544 myAnalyzeFuncPtr(element);
545}
546
547
548template <class PILEUP_TYPE, class FUNC_CLASS>
550{
551 myAnalyzeFuncPtr(myElements[pileupHead - pileupStart]);
552}
553
554
555#endif
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
Class to perform a pileup of all reads by position, assuming the reads are coordinate sorted.
Definition Pileup.h:59
virtual void processAlignment(SamRecord &record)
Add an alignment to the pileup.
Definition Pileup.h:296
virtual ~Pileup()
Destructor.
Definition Pileup.h:224
virtual int processFile(const std::string &fileName, uint16_t excludeFlag=0x0704, uint16_t includeFlag=0)
Performs a pileup on the specified file.
Definition Pileup.h:235
Pileup(int window, const std::string &refSeqFileName, const FUNC_CLASS &fp=FUNC_CLASS())
Perform pileup with a reference and a specified window size.
Definition Pileup.h:204
void flushPileup()
Done processing, flush every position that is currently being stored in the pileup.
Definition Pileup.h:368
Pileup(const FUNC_CLASS &fp=FUNC_CLASS())
Constructor using the default maximum number of bases a read spans.
Definition Pileup.h:152
Pileup(const std::string &refSeqFileName, const FUNC_CLASS &fp=FUNC_CLASS())
Perform pileup with a reference.
Definition Pileup.h:184
Pileup(int window, const FUNC_CLASS &fp=FUNC_CLASS())
Constructor that sets the maximum number of bases a read spans.
Definition Pileup.h:168
virtual void processAlignmentRegion(SamRecord &record, int startPos, int endPos, PosList *excludeList=NULL)
Add only positions that fall within the specified region of the alignment to the pileup and outside o...
Definition Pileup.h:316
Store refID/position, but does not store values < 0.
Definition PosList.h:25
bool hasPosition(int refID, int refPosition)
Return whether or not this list contains the specified reference ID and position (negative values wil...
Definition PosList.cpp:81
This class allows a user to get/set the fields in a SAM/BAM Header.
Allows the user to easily read/write a SAM/BAM file.
Definition SamFile.h:36
bool ReadHeader(SamFileHeader &header)
Reads the header section from the file and stores it in the passed in header.
Definition SamFile.cpp:450
@ COORDINATE
file is sorted by coordinate.
Definition SamFile.h:49
bool ReadRecord(SamFileHeader &header, SamRecord &record)
Reads the next record from the file & stores it in the passed in record.
Definition SamFile.cpp:514
void SetReference(GenomeSequence *reference)
Sets the reference to the specified genome sequence object.
Definition SamFile.cpp:380
bool OpenForRead(const char *filename, SamFileHeader *header=NULL)
Open a sam/bam file for reading with the specified filename, determing the type of file and SAM/BAM b...
Definition SamFile.cpp:93
SamStatus::Status GetStatus()
Get the Status of the last call that sets status.
Definition SamFile.h:212
const char * GetStatusMessage()
Get the Status Message of the last call that sets status.
Definition SamFile.h:218
void setSortedValidation(SortedType sortType)
Set the flag to validate that the file is sorted as it is read/written.
Definition SamFile.cpp:682
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition SamRecord.h:52
int32_t get1BasedPosition()
Get the 1-based(SAM) leftmost position (POS) of the record.
int32_t getReferenceID()
Get the reference sequence id of the record (BAM format rid).
uint16_t getFlag()
Get the flag (FLAG).
int32_t get0BasedAlignmentEnd()
Returns the 0-based inclusive rightmost position of the clipped sequence.
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
const char * getCigar()
Returns the SAM formatted CIGAR string.
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
@ NO_MORE_RECS
NO_MORE_RECS: failed to read a record since there are no more to read either in the file or section i...