BamTools  2.4.1
BamAlignment.h
Go to the documentation of this file.
1 // ***************************************************************************
2 // BamAlignment.h (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 25 July 2013 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the BamAlignment data structure
8 // ***************************************************************************
9 
10 #ifndef BAMALIGNMENT_H
11 #define BAMALIGNMENT_H
12 
13 #include "api/api_global.h"
14 #include "api/BamAux.h"
15 #include "api/BamConstants.h"
16 #include <cstdlib>
17 #include <cstring>
18 #include <string>
19 #include <vector>
20 
21 namespace BamTools {
22 
24 // forward declaration of BamAlignment's "friends"
25 namespace Internal {
26  class BamReaderPrivate;
27  class BamWriterPrivate;
28 } // namespace Internal
30 
31 // BamAlignment data structure
33 
34  // constructors & destructor
35  public:
36  BamAlignment(void);
37  BamAlignment(const BamAlignment& other);
38  ~BamAlignment(void);
39 
40  // queries against alignment flags
41  public:
42  bool IsDuplicate(void) const; // returns true if this read is a PCR duplicate
43  bool IsFailedQC(void) const; // returns true if this read failed quality control
44  bool IsFirstMate(void) const; // returns true if alignment is first mate on read
45  bool IsMapped(void) const; // returns true if alignment is mapped
46  bool IsMateMapped(void) const; // returns true if alignment's mate is mapped
47  bool IsMateReverseStrand(void) const; // returns true if alignment's mate mapped to reverse strand
48  bool IsPaired(void) const; // returns true if alignment part of paired-end read
49  bool IsPrimaryAlignment(void) const; // returns true if reported position is primary alignment
50  bool IsProperPair(void) const; // returns true if alignment is part of read that satisfied paired-end resolution
51  bool IsReverseStrand(void) const; // returns true if alignment mapped to reverse strand
52  bool IsSecondMate(void) const; // returns true if alignment is second mate on read
53 
54  // manipulate alignment flags
55  public:
56  void SetIsDuplicate(bool ok); // sets value of "PCR duplicate" flag
57  void SetIsFailedQC(bool ok); // sets value of "failed quality control" flag
58  void SetIsFirstMate(bool ok); // sets value of "alignment is first mate" flag
59  void SetIsMapped(bool ok); // sets value of "alignment is mapped" flag
60  void SetIsMateMapped(bool ok); // sets value of "alignment's mate is mapped" flag
61  void SetIsMateReverseStrand(bool ok); // sets value of "alignment's mate mapped to reverse strand" flag
62  void SetIsPaired(bool ok); // sets value of "alignment part of paired-end read" flag
63  void SetIsPrimaryAlignment(bool ok); // sets value of "position is primary alignment" flag
64  void SetIsProperPair(bool ok); // sets value of "alignment is part of read that satisfied paired-end resolution" flag
65  void SetIsReverseStrand(bool ok); // sets value of "alignment mapped to reverse strand" flag
66  void SetIsSecondMate(bool ok); // sets value of "alignment is second mate on read" flag
67 
68  // tag data access methods
69  public:
70 
71  // add a new tag
72  template<typename T> bool AddTag(const std::string& tag, const std::string& type, const T& value);
73  template<typename T> bool AddTag(const std::string& tag, const std::vector<T>& values);
74 
75  // edit (or append) tag
76  template<typename T> bool EditTag(const std::string& tag, const std::string& type, const T& value);
77  template<typename T> bool EditTag(const std::string& tag, const std::vector<T>& values);
78 
79  // retrieves tag data
80  template<typename T> bool GetTag(const std::string& tag, T& destination) const;
81  template<typename T> bool GetTag(const std::string& tag, std::vector<T>& destination) const;
82 
83  // retrieves all current tag names
84  std::vector<std::string> GetTagNames(void) const;
85 
86  // retrieves the SAM/BAM type-code for requested tag name
87  bool GetTagType(const std::string& tag, char& type) const;
88 
89  // retrieves the SAM/BAM type-code for the data elements in an array tag
90  bool GetArrayTagType(const std::string& tag, char& type) const;
91 
92  // returns true if alignment has a record for this tag name
93  bool HasTag(const std::string& tag) const;
94 
95  // removes a tag
96  void RemoveTag(const std::string& tag);
97 
98  // additional methods
99  public:
100  // populates alignment string fields
101  bool BuildCharData(void);
102 
103  // calculates alignment end position
104  int GetEndPosition(bool usePadded = false, bool closedInterval = false) const;
105 
106  // returns a description of the last error that occurred
107  std::string GetErrorString(void) const;
108 
109  // retrieves the size, read locations and reference locations of soft-clip operations
110  bool GetSoftClips(std::vector<int>& clipSizes,
111  std::vector<int>& readPositions,
112  std::vector<int>& genomePositions,
113  bool usePadded = false) const;
114 
115  // public data fields
116  public:
117  std::string Name; // read name
118  int32_t Length; // length of query sequence
119  std::string QueryBases; // 'original' sequence (contained in BAM file)
120  std::string AlignedBases; // 'aligned' sequence (QueryBases plus deletion, padding, clipping chars)
121  std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
122  std::string TagData; // tag data (use provided methods to query/modify)
123  int32_t RefID; // ID number for reference sequence
124  int32_t Position; // position (0-based) where alignment starts
125  uint16_t Bin; // BAM (standard) index bin number for this alignment
126  uint16_t MapQuality; // mapping quality score
127  uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify)
128  std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
129  int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
130  int32_t MatePosition; // position (0-based) where alignment's mate starts
131  int32_t InsertSize; // mate-pair insert size
132  std::string Filename; // name of BAM file which this alignment comes from
133 
135  // internal utility methods
136  private:
137  bool FindTag(const std::string& tag,
138  char*& pTagData,
139  const unsigned int& tagDataLength,
140  unsigned int& numBytesParsed) const;
141  bool IsValidSize(const std::string& tag, const std::string& type) const;
142  void SetErrorString(const std::string& where, const std::string& what) const;
143  bool SkipToNextTag(const char storageType,
144  char*& pTagData,
145  unsigned int& numBytesParsed) const;
147  // internal data
148  private:
149 
150  struct BamAlignmentSupportData {
151 
153  // data members
154  std::string AllCharData;
155  uint32_t BlockLength;
156  uint32_t NumCigarOperations;
157  uint32_t QueryNameLength;
158  uint32_t QuerySequenceLength;
159  bool HasCoreOnly;
160 
162  // constructor
163  BamAlignmentSupportData(void)
164  : BlockLength(0)
165  , NumCigarOperations(0)
166  , QueryNameLength(0)
167  , QuerySequenceLength(0)
168  , HasCoreOnly(false)
169  { }
170  };
171  BamAlignmentSupportData SupportData;
172  friend class Internal::BamReaderPrivate;
173  friend class Internal::BamWriterPrivate;
174 
175  mutable std::string ErrorString; // mutable to allow updates even in logically const methods
176 };
177 
178 // ---------------------------------------------------------
179 // BamAlignment tag access methods
180 
192 template<typename T>
193 inline bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const T& value) {
194 
195  // if char data not populated, do that first
196  if ( SupportData.HasCoreOnly )
197  BuildCharData();
198 
199  // check tag/type size
200  if ( !IsValidSize(tag, type) ) {
201  // TODO: set error string?
202  return false;
203  }
204 
205  // check that storage type code is OK for T
206  if ( !TagTypeHelper<T>::CanConvertTo(type.at(0)) ) {
207  // TODO: set error string?
208  return false;
209  }
210 
211  // localize the tag data
212  char* pTagData = (char*)TagData.data();
213  const unsigned int tagDataLength = TagData.size();
214  unsigned int numBytesParsed = 0;
215 
216  // if tag already exists, return false
217  // use EditTag explicitly instead
218  if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
219  // TODO: set error string?
220  return false;
221  }
222 
223  // otherwise, convert value to string
224  union { T value; char valueBuffer[sizeof(T)]; } un;
225  un.value = value;
226 
227  // copy original tag data to temp buffer
228  const std::string newTag = tag + type;
229  const size_t newTagDataLength = tagDataLength + newTag.size() + sizeof(T); // leave room for new T
230  RaiiBuffer originalTagData(newTagDataLength);
231  memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
232 
233  // append newTag
234  strcat(originalTagData.Buffer + tagDataLength, newTag.data());
235  memcpy(originalTagData.Buffer + tagDataLength + newTag.size(), un.valueBuffer, sizeof(T));
236 
237  // store temp buffer back in TagData
238  const char* newTagData = (const char*)originalTagData.Buffer;
239  TagData.assign(newTagData, newTagDataLength);
240  return true;
241 }
242 
243 template<>
244 inline bool BamAlignment::AddTag<std::string>(const std::string& tag,
245  const std::string& type,
246  const std::string& value)
247 {
248  // if char data not populated, do that first
249  if ( SupportData.HasCoreOnly )
250  BuildCharData();
251 
252  // check tag/type size
253  if ( !IsValidSize(tag, type) ) {
254  // TODO: set error string?
255  return false;
256  }
257 
258  // check that storage type code is OK for string
259  if ( !TagTypeHelper<std::string>::CanConvertTo(type.at(0)) ) {
260  // TODO: set error string?
261  return false;
262  }
263 
264  // localize the tag data
265  char* pTagData = (char*)TagData.data();
266  const unsigned int tagDataLength = TagData.size();
267  unsigned int numBytesParsed = 0;
268 
269  // if tag already exists, return false
270  // use EditTag explicitly instead
271  if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
272  // TODO: set error string?
273  return false;
274  }
275 
276  // otherwise, copy tag data to temp buffer
277  const std::string newTag = tag + type + value;
278  const size_t newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term
279  RaiiBuffer originalTagData(newTagDataLength);
280  memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term
281 
282  // append newTag (removes original null-term, then appends newTag + null-term)
283  strcat(originalTagData.Buffer + tagDataLength, newTag.data());
284 
285  // store temp buffer back in TagData
286  const char* newTagData = (const char*)originalTagData.Buffer;
287  TagData.assign(newTagData, newTagDataLength);
288  return true;
289 }
290 
301 template<typename T>
302 inline bool BamAlignment::AddTag(const std::string& tag, const std::vector<T>& values) {
303 
304  // if char data not populated, do that first
305  if ( SupportData.HasCoreOnly )
306  BuildCharData();
307 
308  // check for valid tag name length
309  if ( tag.size() != Constants::BAM_TAG_TAGSIZE )
310  return false;
311 
312  // localize the tag data
313  char* pTagData = (char*)TagData.data();
314  const unsigned int tagDataLength = TagData.size();
315  unsigned int numBytesParsed = 0;
316 
317  // if tag already exists, return false
318  // use EditTag explicitly instead
319  if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
320  // TODO: set error string?
321  return false;
322  }
323 
324  // build new tag's base information
325  char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
326  memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
327  newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
328  newTagBase[3] = TagTypeHelper<T>::TypeCode();
329 
330  // add number of array elements to newTagBase
331  const int32_t numElements = values.size();
332  memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
333 
334  // copy current TagData string to temp buffer, leaving room for new tag's contents
335  const size_t newTagDataLength = tagDataLength +
337  numElements*sizeof(T);
338  RaiiBuffer originalTagData(newTagDataLength);
339  memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term
340 
341  // write newTagBase (removes old null term)
342  strcat(originalTagData.Buffer + tagDataLength, (const char*)newTagBase);
343 
344  // add vector elements to tag
345  int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
346  for ( int i = 0 ; i < numElements; ++i ) {
347  const T& value = values.at(i);
348  memcpy(originalTagData.Buffer + elementsBeginOffset + i*sizeof(T), &value, sizeof(T));
349  }
350 
351  // store temp buffer back in TagData
352  const char* newTagData = (const char*)originalTagData.Buffer;
353  TagData.assign(newTagData, newTagDataLength);
354  return true;
355 }
356 
371 template<typename T>
372 inline bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const T& value) {
373 
374  // if char data not populated, do that first
375  if ( SupportData.HasCoreOnly )
376  BuildCharData();
377 
378  // remove existing tag if present, then append tag with new value
379  if ( HasTag(tag) )
380  RemoveTag(tag);
381  return AddTag(tag, type, value);
382 }
383 
395 template<typename T>
396 inline bool BamAlignment::EditTag(const std::string& tag, const std::vector<T>& values) {
397 
398  // if char data not populated, do that first
399  if ( SupportData.HasCoreOnly )
400  BuildCharData();
401 
402  // remove existing tag if present, then append tag with new values
403  if ( HasTag(tag) )
404  RemoveTag(tag);
405  return AddTag(tag, values);
406 }
407 
408 
416 template<typename T>
417 inline bool BamAlignment::GetTag(const std::string& tag, T& destination) const {
418 
419  // skip if alignment is core-only
420  if ( SupportData.HasCoreOnly ) {
421  // TODO: set error string?
422  return false;
423  }
424 
425  // skip if no tags present
426  if ( TagData.empty() ) {
427  // TODO: set error string?
428  return false;
429  }
430 
431  // localize the tag data
432  char* pTagData = (char*)TagData.data();
433  const unsigned int tagDataLength = TagData.size();
434  unsigned int numBytesParsed = 0;
435 
436  // return failure if tag not found
437  if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
438  // TODO: set error string?
439  return false;
440  }
441 
442  // fetch data type
443  const char type = *(pTagData - 1);
444  if ( !TagTypeHelper<T>::CanConvertFrom(type) ) {
445  // TODO: set error string ?
446  return false;
447  }
448 
449  // determine data length
450  int destinationLength = 0;
451  switch ( type ) {
452 
453  // 1 byte data
457  destinationLength = 1;
458  break;
459 
460  // 2 byte data
463  destinationLength = 2;
464  break;
465 
466  // 4 byte data
470  destinationLength = 4;
471  break;
472 
473  // var-length types not supported for numeric destination
477  SetErrorString("BamAlignment::GetTag",
478  "cannot store variable length tag data into a numeric destination");
479  return false;
480 
481  // unrecognized tag type
482  default:
483  const std::string message = std::string("invalid tag type: ") + type;
484  SetErrorString("BamAlignment::GetTag", message);
485  return false;
486  }
487 
488  // store data in destination
489  destination = 0;
490  memcpy(&destination, pTagData, destinationLength);
491 
492  // return success
493  return true;
494 }
495 
496 template<>
497 inline bool BamAlignment::GetTag<std::string>(const std::string& tag,
498  std::string& destination) const
499 {
500  // skip if alignment is core-only
501  if ( SupportData.HasCoreOnly ) {
502  // TODO: set error string?
503  return false;
504  }
505 
506  // skip if no tags present
507  if ( TagData.empty() ) {
508  // TODO: set error string?
509  return false;
510  }
511 
512  // localize the tag data
513  char* pTagData = (char*)TagData.data();
514  const unsigned int tagDataLength = TagData.size();
515  unsigned int numBytesParsed = 0;
516 
517  // return failure if tag not found
518  if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
519  // TODO: set error string?
520  return false;
521  }
522 
523  // otherwise copy data into destination
524  const unsigned int dataLength = strlen(pTagData);
525  destination.clear();
526  destination.resize(dataLength);
527  memcpy( (char*)destination.data(), pTagData, dataLength );
528 
529  // return success
530  return true;
531 }
532 
540 template<typename T>
541 inline bool BamAlignment::GetTag(const std::string& tag, std::vector<T>& destination) const {
542 
543  // skip if alignment is core-only
544  if ( SupportData.HasCoreOnly ) {
545  // TODO: set error string?
546  return false;
547  }
548 
549  // skip if no tags present
550  if ( TagData.empty() ) {
551  // TODO: set error string?
552  return false;
553  }
554 
555  // localize the tag data
556  char* pTagData = (char*)TagData.data();
557  const unsigned int tagDataLength = TagData.size();
558  unsigned int numBytesParsed = 0;
559 
560  // return false if tag not found
561  if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
562  // TODO: set error string?
563  return false;
564  }
565 
566  // check that tag is array type
567  const char tagType = *(pTagData - 1);
568  if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) {
569  SetErrorString("BamAlignment::GetTag", "cannot store a non-array tag in array destination");
570  return false;
571  }
572 
573  // fetch element type
574  const char elementType = *pTagData;
575  if ( !TagTypeHelper<T>::CanConvertFrom(elementType) ) {
576  // TODO: set error string ?
577  return false;
578  }
579  ++pTagData;
580 
581  // calculate length of each element in tag's array
582  int elementLength = 0;
583  switch ( elementType ) {
587  elementLength = sizeof(uint8_t);
588  break;
589 
592  elementLength = sizeof(uint16_t);
593  break;
594 
598  elementLength = sizeof(uint32_t);
599  break;
600 
601  // var-length types not supported for numeric destination
605  SetErrorString("BamAlignment::GetTag",
606  "invalid array data, variable-length elements are not allowed");
607  return false;
608 
609  // unknown tag type
610  default:
611  const std::string message = std::string("invalid array element type: ") + elementType;
612  SetErrorString("BamAlignment::GetTag", message);
613  return false;
614  }
615 
616  // get number of elements
617  int32_t numElements;
618  memcpy(&numElements, pTagData, sizeof(int32_t));
619  pTagData += 4;
620  destination.clear();
621  destination.reserve(numElements);
622 
623  // read in elements
624  T value;
625  for ( int i = 0 ; i < numElements; ++i ) {
626  memcpy(&value, pTagData, sizeof(T));
627  pTagData += sizeof(T);
628  destination.push_back(value);
629  }
630 
631  // return success
632  return true;
633 }
634 
635 typedef std::vector<BamAlignment> BamAlignmentVector;
636 
637 } // namespace BamTools
638 
639 #endif // BAMALIGNMENT_H
const char BAM_TAG_TYPE_FLOAT
Definition: BamConstants.h:80
const char BAM_TAG_TYPE_UINT16
Definition: BamConstants.h:77
const uint8_t BAM_TAG_ARRAYBASE_SIZE
Definition: BamConstants.h:87
int32_t InsertSize
mate-pair insert size
Definition: BamAlignment.h:131
std::string TagData
tag data (use the provided methods to query/modify)
Definition: BamAlignment.h:122
const char BAM_TAG_TYPE_INT8
Definition: BamConstants.h:74
The main BAM alignment data structure.
Definition: BamAlignment.h:32
uint32_t AlignmentFlag
alignment bit-flag (use the provided methods to query/modify)
Definition: BamAlignment.h:127
const char BAM_TAG_TYPE_INT32
Definition: BamConstants.h:78
bool GetTag(const std::string &tag, T &destination) const
Definition: BamAlignment.h:417
uint16_t Bin
BAM (standard) index bin number for this alignment.
Definition: BamAlignment.h:125
std::string Filename
name of BAM file which this alignment comes from
Definition: BamAlignment.h:132
const char BAM_TAG_TYPE_ARRAY
Definition: BamConstants.h:83
std::vector< CigarOp > CigarData
CIGAR operations for this alignment.
Definition: BamAlignment.h:128
#define API_EXPORT
Definition: api_global.h:18
const char BAM_TAG_TYPE_HEX
Definition: BamConstants.h:82
const char BAM_TAG_TYPE_ASCII
Definition: BamConstants.h:73
std::string QueryBases
'original' sequence (as reported from sequencing machine)
Definition: BamAlignment.h:119
int32_t Position
position (0-based) where alignment starts
Definition: BamAlignment.h:124
bool AddTag(const std::string &tag, const std::string &type, const T &value)
Definition: BamAlignment.h:193
int32_t MateRefID
ID number for reference sequence where alignment's mate was aligned.
Definition: BamAlignment.h:129
std::string Name
read name
Definition: BamAlignment.h:117
std::string AlignedBases
'aligned' sequence (includes any indels, padding, clipping)
Definition: BamAlignment.h:120
const char BAM_TAG_TYPE_INT16
Definition: BamConstants.h:76
const char BAM_TAG_TYPE_STRING
Definition: BamConstants.h:81
uint16_t MapQuality
mapping quality score
Definition: BamAlignment.h:126
Contains all BamTools classes & methods.
Definition: Sort.h:24
const uint8_t BAM_TAG_TAGSIZE
Definition: BamConstants.h:85
int32_t MatePosition
position (0-based) where alignment's mate starts
Definition: BamAlignment.h:130
std::vector< BamAlignment > BamAlignmentVector
Definition: BamAlignment.h:635
int32_t Length
length of query sequence
Definition: BamAlignment.h:118
const char BAM_TAG_TYPE_UINT8
Definition: BamConstants.h:75
std::string Qualities
FASTQ qualities (ASCII characters, not numeric values)
Definition: BamAlignment.h:121
const char BAM_TAG_TYPE_UINT32
Definition: BamConstants.h:79
int32_t RefID
ID number for reference sequence.
Definition: BamAlignment.h:123
bool EditTag(const std::string &tag, const std::string &type, const T &value)
Definition: BamAlignment.h:372