libStatGen Software 1
Loading...
Searching...
No Matches
SamFilter.cpp
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18//////////////////////////////////////////////////////////////////////////
19
20
21#include "SamFilter.h"
22
23#include "SamQuerySeqWithRefHelper.h"
24#include "BaseUtilities.h"
25#include "SamFlag.h"
26
28 GenomeSequence& refSequence,
29 double mismatchThreshold)
30{
31 // Read & clip from the left & right.
32 SamQuerySeqWithRefIter iterFromFront(record, refSequence, true);
33 SamQuerySeqWithRefIter iterFromBack(record, refSequence, false);
34
35 SamSingleBaseMatchInfo baseMatchInfo;
36
37 int32_t readLength = record.getReadLength();
38 // Init last front clip to be prior to the lastFront index (0).
39 const int32_t initialLastFrontClipPos = -1;
40 int32_t lastFrontClipPos = initialLastFrontClipPos;
41 // Init first back clip to be past the last index (readLength).
42 int32_t firstBackClipPos = readLength;
43
44 bool fromFrontComplete = false;
45 bool fromBackComplete = false;
46 int32_t numBasesFromFront = 0;
47 int32_t numBasesFromBack = 0;
48 int32_t numMismatchFromFront = 0;
49 int32_t numMismatchFromBack = 0;
50
51 //////////////////////////////////////////////////////////
52 // Determining the clip positions.
53 while(!fromFrontComplete || !fromBackComplete)
54 {
55 // Read from the front (left to right) of the read until
56 // more have been read from that direction than the opposite direction.
57 while(!fromFrontComplete &&
58 ((numBasesFromFront <= numBasesFromBack) ||
59 (fromBackComplete)))
60 {
61 if(iterFromFront.getNextMatchMismatch(baseMatchInfo) == false)
62 {
63 // Nothing more to read in this direction.
64 fromFrontComplete = true;
65 break;
66 }
67 // Got a read. Check to see if it is to or past the last clip.
68 if(baseMatchInfo.getQueryIndex() >= firstBackClipPos)
69 {
70 // This base is past where we are clipping, so we
71 // are done reading in this direction.
72 fromFrontComplete = true;
73 break;
74 }
75 // This is an actual base read from the left to the
76 // right, so up the counter and determine if it was a mismatch.
77 ++numBasesFromFront;
78
79 if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
80 {
81 // Mismatch
82 ++numMismatchFromFront;
83 // Check to see if it is over the threshold.
84 double mismatchPercent =
85 (double)numMismatchFromFront / numBasesFromFront;
86 if(mismatchPercent > mismatchThreshold)
87 {
88 // Need to clip.
89 lastFrontClipPos = baseMatchInfo.getQueryIndex();
90 // Reset the counters.
91 numBasesFromFront = 0;
92 numMismatchFromFront = 0;
93 }
94 }
95 }
96
97 // Now, read from right to left until more have been read
98 // from the back than from the front.
99 while(!fromBackComplete &&
100 ((numBasesFromBack <= numBasesFromFront) ||
101 (fromFrontComplete)))
102 {
103 if(iterFromBack.getNextMatchMismatch(baseMatchInfo) == false)
104 {
105 // Nothing more to read in this direction.
106 fromBackComplete = true;
107 break;
108 }
109 // Got a read. Check to see if it is to or past the first clip.
110 if(baseMatchInfo.getQueryIndex() <= lastFrontClipPos)
111 {
112 // This base is past where we are clipping, so we
113 // are done reading in this direction.
114 fromBackComplete = true;
115 break;
116 }
117 // This is an actual base read from the right to the
118 // left, so up the counter and determine if it was a mismatch.
119 ++numBasesFromBack;
120
121 if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
122 {
123 // Mismatch
124 ++numMismatchFromBack;
125 // Check to see if it is over the threshold.
126 double mismatchPercent =
127 (double)numMismatchFromBack / numBasesFromBack;
128 if(mismatchPercent > mismatchThreshold)
129 {
130 // Need to clip.
131 firstBackClipPos = baseMatchInfo.getQueryIndex();
132 // Reset the counters.
133 numBasesFromBack = 0;
134 numMismatchFromBack = 0;
135 }
136 }
137 }
138 }
139
140 //////////////////////////////////////////////////////////
141 // Done determining the clip positions, so clip.
142 // To determine the number of clips from the front, add 1 to the
143 // lastFrontClipPos since the index starts at 0.
144 // To determine the number of clips from the back, subtract the
145 // firstBackClipPos from the readLength.
146 // Example:
147 // Pos: 012345
148 // Read: AAAAAA
149 // Read Length = 6. If lastFrontClipPos = 2 and firstBackClipPos = 4, numFrontClips = 3 & numBack = 2.
150 return(softClip(record, lastFrontClipPos + 1, readLength - firstBackClipPos));
151}
152
153
154// Soft clip the record from the front and/or the back.
156 int32_t numFrontClips,
157 int32_t numBackClips)
158{
159 //////////////////////////////////////////////////////////
160 Cigar* cigar = record.getCigarInfo();
161 FilterStatus status = NONE;
162 int32_t startPos = record.get0BasedPosition();
163 CigarRoller updatedCigar;
164
165 status = softClip(*cigar, numFrontClips, numBackClips,
166 startPos, updatedCigar);
167
168 if(status == FILTERED)
169 {
170 /////////////////////////////
171 // The entire read is clipped, so rather than clipping it,
172 // filter it out.
173 filterRead(record);
174 return(FILTERED);
175 }
176 else if(status == CLIPPED)
177 {
178 // Part of the read was clipped, and now that we have
179 // an updated cigar, update the read.
180 record.setCigar(updatedCigar);
181
182 // Update the starting position.
183 record.set0BasedPosition(startPos);
184 }
185 return(status);
186}
187
188
189// Soft clip the cigar from the front and/or the back, writing the value
190// into the new cigar.
192 int32_t numFrontClips,
193 int32_t numBackClips,
194 int32_t& startPos,
195 CigarRoller& updatedCigar)
196{
197 int32_t readLength = oldCigar.getExpectedQueryBaseCount();
198 int32_t endClipPos = readLength - numBackClips;
199 FilterStatus status = NONE;
200
201 if((numFrontClips != 0) || (numBackClips != 0))
202 {
203 // Clipping from front and/or from the back.
204
205 // Check to see if the entire read was clipped.
206 int32_t totalClips = numFrontClips + numBackClips;
207 if(totalClips >= readLength)
208 {
209 /////////////////////////////
210 // The entire read is clipped, so rather than clipping it,
211 // filter it out.
212 return(FILTERED);
213 }
214
215 // Part of the read was clipped.
216 status = CLIPPED;
217
218 // Loop through, creating an updated cigar.
219 int origCigarOpIndex = 0;
220
221 // Track how many read positions are covered up to this
222 // point by the cigar to determine up to up to what
223 // point in the cigar is affected by this clipping.
224 int32_t numPositions = 0;
225
226 // Track if any non-clips are in the new cigar.
227 bool onlyClips = true;
228
229 const Cigar::CigarOperator* op = NULL;
230
231 //////////////////
232 // Clip from front
233 while((origCigarOpIndex < oldCigar.size()) &&
234 (numPositions < numFrontClips))
235 {
236 op = &(oldCigar.getOperator(origCigarOpIndex));
237 switch(op->operation)
238 {
239 case Cigar::hardClip:
240 // Keep this operation as the new clips do not
241 // affect other clips.
242 updatedCigar += *op;
243 break;
244 case Cigar::del:
245 case Cigar::skip:
246 // Skip and delete are going to be dropped, and
247 // are not in the read, so the read index doesn't
248 // need to be updated
249 break;
250 case Cigar::insert:
251 case Cigar::match:
252 case Cigar::mismatch:
253 case Cigar::softClip:
254 // Update the read index as these types
255 // are found in the read.
256 numPositions += op->count;
257 break;
258 case Cigar::none:
259 default:
260 // Nothing to do for none.
261 break;
262 };
263 ++origCigarOpIndex;
264 }
265
266 // If bases were clipped from the front, add the clip and
267 // any partial cigar operation as necessary.
268 if(numFrontClips != 0)
269 {
270 // Add the softclip to the front of the read.
271 updatedCigar.Add(Cigar::softClip, numFrontClips);
272
273 // Add the rest of the last Cigar operation if
274 // it is not entirely clipped.
275 int32_t newCount = numPositions - numFrontClips;
276 if(newCount > 0)
277 {
278 // Before adding it, check to see if the same
279 // operation is clipped from the end.
280 // numPositions greater than the endClipPos
281 // means that it is equal or past that position,
282 // so shorten the number of positions.
283 if(numPositions > endClipPos)
284 {
285 newCount -= (numPositions - endClipPos);
286 }
287 if(newCount > 0)
288 {
289 updatedCigar.Add(op->operation, newCount);
290 if(!Cigar::isClip(op->operation))
291 {
292 onlyClips = false;
293 }
294 }
295 }
296 }
297
298 // Add operations until the point of the end clip is reached.
299 // For example...
300 // 2M1D3M = MMDMMM readLength = 5
301 // readIndex: 01 234
302 // at cigarOpIndex 0 (2M), numPositions = 2.
303 // at cigarOpIndex 1 (1D), numPositions = 2.
304 // at cigarOpIndex 2 (3M), numPositions = 5.
305 // if endClipPos = 2, we still want to consume the 1D, so
306 // need to keep looping until numPositions > endClipPos
307 while((origCigarOpIndex < oldCigar.size()) &&
308 (numPositions <= endClipPos))
309 {
310 op = &(oldCigar.getOperator(origCigarOpIndex));
311
312 // Update the numPositions count if the operations indicates
313 // bases within the read.
314 if(!Cigar::foundInQuery(op->operation))
315 {
316 // This operation is not in the query read sequence,
317 // so it is not yet to the endClipPos, just add the
318 // operation do not increment the number of positions.
319 updatedCigar += *op;
320 if(!Cigar::isClip(op->operation))
321 {
322 onlyClips = false;
323 }
324 }
325 else
326 {
327 // This operation appears in the query sequence, so
328 // check to see if the clip occurs in this operation.
329
330 // endClipPos is 0 based & numPositions is a count.
331 // If endClipPos is 4, then it is the 5th position.
332 // If 4 positions are covered so far (numPositions = 4),
333 // then we are right at endCLipPos: 4-4 = 0, none of
334 // this operation should be kept.
335 // If only 3 positions were covered, then we are at offset
336 // 3, so offset 3 should be added: 4-3 = 1.
337 uint32_t numPosTilClip = endClipPos - numPositions;
338
339 if(numPosTilClip < op->count)
340 {
341 // this operation is partially clipped, write the part
342 // that was not clipped if it is not all clipped.
343 if(numPosTilClip != 0)
344 {
345 updatedCigar.Add(op->operation,
346 numPosTilClip);
347 if(!Cigar::isClip(op->operation))
348 {
349 onlyClips = false;
350 }
351 }
352 }
353 else
354 {
355 // This operation is not clipped, so add it
356 updatedCigar += *op;
357 if(!Cigar::isClip(op->operation))
358 {
359 onlyClips = false;
360 }
361 }
362 // This operation occurs in the query sequence, so
363 // increment the number of positions covered.
364 numPositions += op->count;
365 }
366
367 // Move to the next cigar position.
368 ++origCigarOpIndex;
369 }
370
371 //////////////////
372 // Add the softclip to the back.
373 if(numBackClips != 0)
374 {
375 // Add the softclip to the end
376 updatedCigar.Add(Cigar::softClip, numBackClips);
377 }
378
379 //////////////////
380 // Add any hardclips remaining in the original cigar to the back.
381 while(origCigarOpIndex < oldCigar.size())
382 {
383 op = &(oldCigar.getOperator(origCigarOpIndex));
384 if(op->operation == Cigar::hardClip)
385 {
386 // Keep this operation as the new clips do not
387 // affect other clips.
388 updatedCigar += *op;
389 }
390 ++origCigarOpIndex;
391 }
392
393 // Check to see if the new cigar is only clips.
394 if(onlyClips)
395 {
396 // Only clips in the new cigar, so mark the read as filtered
397 // instead of updating the cigar.
398 /////////////////////////////
399 // The entire read was clipped.
400 status = FILTERED;
401 }
402 else
403 {
404 // Part of the read was clipped.
405 // Update the starting position if a clip was added to
406 // the front.
407 if(numFrontClips > 0)
408 {
409 // Convert from query index to reference position (from the
410 // old cigar)
411 // Get the position for the last front clipped position by
412 // getting the position associated with the clipped base on
413 // the reference. Then add one to get to the first
414 // non-clipped position.
415 int32_t lastFrontClipPos = numFrontClips - 1;
416 int32_t newStartPos = oldCigar.getRefPosition(lastFrontClipPos,
417 startPos);
418 if(newStartPos != Cigar::INDEX_NA)
419 {
420 // Add one to get first non-clipped position.
421 startPos = newStartPos + 1;
422 }
423 }
424 }
425 }
426 return(status);
427}
428
429
431 GenomeSequence& refSequence,
432 uint32_t qualityThreshold,
433 uint8_t defaultQualityInt)
434{
435 uint32_t totalMismatchQuality =
436 sumMismatchQuality(record, refSequence, defaultQualityInt);
437
438 // If the total mismatch quality is over the threshold,
439 // filter the read.
440 if(totalMismatchQuality > qualityThreshold)
441 {
442 filterRead(record);
443 return(FILTERED);
444 }
445 return(NONE);
446}
447
448
449// NOTE: Only positions where the reference and read both have bases that
450// are different and not 'N' are considered mismatches.
452 GenomeSequence& refSequence,
453 uint8_t defaultQualityInt)
454{
455 // Track the mismatch info.
456 int mismatchQual = 0;
457 int numMismatch = 0;
458
459 SamQuerySeqWithRefIter sequenceIter(record, refSequence);
460
461 SamSingleBaseMatchInfo baseMatchInfo;
462 while(sequenceIter.getNextMatchMismatch(baseMatchInfo))
463 {
464 if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
465 {
466 // Got a mismatch, get the associated quality.
467 char readQualityChar =
468 record.getQuality(baseMatchInfo.getQueryIndex());
469 uint8_t readQualityInt =
470 BaseUtilities::getPhredBaseQuality(readQualityChar);
471
472 if(readQualityInt == BaseUtilities::UNKNOWN_QUALITY_INT)
473 {
474 // Quality was not specified, so use the configured setting.
475 readQualityInt = defaultQualityInt;
476 }
477 mismatchQual += readQualityInt;
478 ++numMismatch;
479 }
480 }
481
482 return(mismatchQual);
483}
484
485
487{
488 // Filter the read by marking it as unmapped.
489 uint16_t flag = record.getFlag();
491 // Clear N/A flags.
492 flag &= ~SamFlag::PROPER_PAIR;
493 flag &= ~SamFlag::SECONDARY_ALIGNMENT;
494 flag &= ~SamFlag::SUPPLEMENTARY_ALIGNMENT;
495 record.setFlag(flag);
496 // Clear Cigar
497 record.setCigar("*");
498 // Clear mapping quality
499 record.setMapQuality(0);
500}
static const uint8_t UNKNOWN_QUALITY_INT
Int value used when the quality is unknown.
static uint8_t getPhredBaseQuality(char charQuality)
Get phred base quality from the specified ascii quality.
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object....
Definition CigarRoller.h:67
void Add(Operation operation, int count)
Append the specified operation with the specified count to this object.
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that).
Definition Cigar.h:84
int size() const
Return the number of cigar operations.
Definition Cigar.h:364
int getExpectedQueryBaseCount() const
Return the length of the read that corresponds to the current CIGAR string.
Definition Cigar.cpp:95
static const int32_t INDEX_NA
Value associated with an index that is not applicable/does not exist, used for converting between que...
Definition Cigar.h:492
static bool foundInQuery(Operation op)
Return true if the specified operation is found in the query sequence, false if not.
Definition Cigar.h:219
int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos)
Return the reference position associated with the specified query index or INDEX_NA based on this cig...
Definition Cigar.cpp:217
@ del
deletion from the reference (the reference contains bases that have no corresponding base in the quer...
Definition Cigar.h:92
@ mismatch
mismatch operation. Associated with CIGAR Operation "M"
Definition Cigar.h:90
@ hardClip
Hard clip on the read (clipped sequence not present in the query sequence or reference)....
Definition Cigar.h:95
@ match
match/mismatch operation. Associated with CIGAR Operation "M"
Definition Cigar.h:89
@ insert
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition Cigar.h:91
@ skip
skipped region from the reference (the reference contains bases that have no corresponding base in th...
Definition Cigar.h:93
@ softClip
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)....
Definition Cigar.h:94
@ none
no operation has been set.
Definition Cigar.h:88
const CigarOperator & getOperator(int i) const
Return the Cigar Operation at the specified index (starting at 0).
Definition Cigar.h:354
static bool isClip(Operation op)
Return true if the specified operation is a clipping operation, false if not.
Definition Cigar.h:261
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
FilterStatus
Enum describing what sort of filtering was done.
Definition SamFilter.h:29
@ NONE
The filter did not affect the read.
Definition SamFilter.h:30
@ FILTERED
Filtering caused the read to be modified to unmapped.
Definition SamFilter.h:32
@ CLIPPED
Filtering clipped the read.
Definition SamFilter.h:31
static FilterStatus softClip(SamRecord &record, int32_t numFrontClips, int32_t numBackClips)
Soft clip the record from the front and/or the back.
static uint32_t sumMismatchQuality(SamRecord &record, GenomeSequence &refSequence, uint8_t defaultQualityInt)
Get the sum of the qualities of all mismatches in the record.
static FilterStatus clipOnMismatchThreshold(SamRecord &record, GenomeSequence &refSequence, double mismatchThreshold)
Clip the read based on the specified mismatch threshold.
Definition SamFilter.cpp:27
static void filterRead(SamRecord &record)
Filter the read by marking it as unmapped.
static FilterStatus filterOnMismatchQuality(SamRecord &record, GenomeSequence &refSequence, uint32_t qualityThreshold, uint8_t defaultQualityInt)
Filter the read based on the specified quality threshold.
Class for extracting information from a SAM Flag.
Definition SamFlag.h:29
static void setUnmapped(uint16_t &flag)
Mark the passed in flag as unmapped.
Definition SamFlag.h:104
Iterates through the query and compare with reference.
bool getNextMatchMismatch(SamSingleBaseMatchInfo &matchMismatchInfo)
Returns information for the next position where the query and the reference match or mismatch.
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition SamRecord.h:52
Cigar * getCigarInfo()
Returns a pointer to the Cigar object associated with this record.
bool setMapQuality(uint8_t mapQuality)
Set the mapping quality (MAPQ).
bool setFlag(uint16_t flag)
Set the bitwise FLAG to the specified value.
uint16_t getFlag()
Get the flag (FLAG).
bool setCigar(const char *cigar)
Set the CIGAR to the specified SAM formatted cigar string.
int32_t getReadLength()
Get the length of the read.
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
bool set0BasedPosition(int32_t position)
Set the leftmost position using the specified 0-based (BAM format) value.
const char * getQuality()
Returns the SAM formatted quality string (QUAL).
This class contains the match/mismatch information between the reference and a read for a single base...
int32_t getQueryIndex()
Get the query index for this object.
Type getType()
Get the type (match/mismatch/unknown) for this object.