libStatGen Software  1
SamTags.cpp
1 /*
2  * Copyright (C) 2011 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "SamTags.h"
19 #include "BaseUtilities.h"
20 
21 const char* SamTags::BQ_TAG = "BQ";
22 const char SamTags::BQ_TAG_TYPE = 'Z';
23 const char* SamTags::MD_TAG = "MD";
24 const char SamTags::MD_TAG_TYPE = 'Z';
25 const char* SamTags::ORIG_POS_TAG = "OP";
26 const char SamTags::ORIG_POS_TAG_TYPE = 'i';
27 const char* SamTags::ORIG_CIGAR_TAG = "OC";
28 const char SamTags::ORIG_CIGAR_TAG_TYPE = 'Z';
29 const char* SamTags::ORIG_QUAL_TAG = "OQ";
30 const char SamTags::ORIG_QUAL_TAG_TYPE = 'Z';
31 
32 
33 // Create the MD tag for the specified input record and the genome.
34 bool SamTags::createMDTag(String& outputMDtag, SamRecord& inputRec,
35  GenomeSequence& genome)
36 {
37  outputMDtag.Clear();
38  // Get the cigar to use for determing alignment.
39  Cigar* cigarInfo = inputRec.getCigarInfo();
40  if(cigarInfo == NULL)
41  {
42  throw(std::runtime_error("Cannot createMDTag - failed to get the cigar"));
43  return(false);
44  }
45  int32_t queryIndex = Cigar::INDEX_NA;
46 
47  // get where this read starts on the reference.
48  uint32_t startOfReadOnRefIndex =
49  genome.getGenomePosition(inputRec.getReferenceName());
50  if(startOfReadOnRefIndex == (uint32_t)INVALID_CHROMOSOME_INDEX)
51  {
52  // Failed to find the reference for this chromosome, so return false.
53  return(false);
54  }
55  startOfReadOnRefIndex += inputRec.get0BasedPosition();
56 
57  // Track the number of consecutive matches.
58  int32_t matchCount = 0;
59  // Track if it is currently in a deletion so it knows when not to add
60  // a '^'.
61  bool currentDeletion = false;
62 
63  // Loop through the Reference indices (ignores insertions/pads/clips).
64  for(int refOffset = 0;
65  refOffset < cigarInfo->getExpectedReferenceBaseCount();
66  ++refOffset)
67  {
68  // Get the query index for this reference position..
69  queryIndex = cigarInfo->getQueryIndex(refOffset);
70 
71  char refBase = genome[startOfReadOnRefIndex + refOffset];
72 
73  if(queryIndex != Cigar::INDEX_NA)
74  {
75  // Both the reference and the read have a base, so get the bases.
76  char readBase = inputRec.getSequence(queryIndex);
77  currentDeletion = false;
78 
79  // If neither base is unknown and they are the same, count it
80  // as a match.
81  if(!BaseUtilities::isAmbiguous(readBase) &&
82  !BaseUtilities::isAmbiguous(refBase) &&
83  (BaseUtilities::areEqual(readBase, refBase)))
84  {
85  // Match, so update counter.
86  ++matchCount;
87  }
88  else
89  {
90  // Mismatch, so output the number of matches if any.
91  if(matchCount != 0)
92  {
93  outputMDtag += matchCount;
94  matchCount = 0;
95  }
96  outputMDtag += refBase;
97  }
98  }
99  else
100  {
101  // This reference position is not in the query, so it is a deletion.
102  // Deletion, so output the number of matches if any.
103  if(matchCount != 0)
104  {
105  outputMDtag += matchCount;
106  matchCount = 0;
107  }
108 
109  if(!currentDeletion)
110  {
111  // Not currently in a deletion, so add the ^
112  outputMDtag += '^';
113  }
114  // Add the deleted base.
115  outputMDtag += refBase;
116  currentDeletion = true;
117  }
118  }
119 
120  // output the match count at the end.
121  outputMDtag += matchCount;
122  return(true);
123 }
124 
125 // Check to see if the MD tag in the record is accurate.
127 {
128  String calcMDtag;
129  if(!createMDTag(calcMDtag, inputRec, genome))
130  {
131  // Could not generate the MD tag, so just return that it is incorrect.
132  return(false);
133  }
134 
135  const String* origMDtag = inputRec.getStringTag(MD_TAG);
136 
137  if(origMDtag == NULL)
138  {
139  // There was no tag.
140  // if there is not a new tag, then they are the same and true
141  // should be returned. If there is a new tag, then the old one was
142  // wrong so false should be returned. So just return the result of
143  // IsEmpty.
144  return(calcMDtag.IsEmpty());
145  }
146  else
147  {
148  // origMDtag is not NULL, so just compare the two tags.
149  return(calcMDtag == *origMDtag);
150  }
151 }
152 
153 
154 // Update/Add the MD tag in the inputRec.
155 bool SamTags::updateMDTag(SamRecord& inputRec, GenomeSequence& genome)
156 {
157  // Calculate the new MD tag.
158  String calcMDtag;
159  createMDTag(calcMDtag, inputRec, genome);
160 
161  // Add the MD tag. If it is already there and is different it will
162  // replace it. If it is already there and it is the same, it won't
163  // do anything.
164  return(inputRec.addTag(MD_TAG, MD_TAG_TYPE, calcMDtag.c_str()));
165 }
static bool isAmbiguous(char base)
Returns whether or not the specified bases is an indicator for ambiguity.
static bool areEqual(char base1, char base2)
Returns whether or not two bases are equal (case insensitive), if one of the bases is '=',...
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that).
Definition: Cigar.h:84
static const int32_t INDEX_NA
Value associated with an index that is not applicable/does not exist, used for converting between que...
Definition: Cigar.h:492
int getExpectedReferenceBaseCount() const
Return the number of bases in the reference that this CIGAR "spans".
Definition: Cigar.cpp:120
int32_t getQueryIndex(int32_t refOffset)
Return the query index associated with the specified reference offset or INDEX_NA based on this cigar...
Definition: Cigar.cpp:202
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
genomeIndex_t getGenomePosition(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and position, return the genome position
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition: SamRecord.h:52
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
Definition: SamRecord.cpp:1298
Cigar * getCigarInfo()
Returns a pointer to the Cigar object associated with this record.
Definition: SamRecord.cpp:1836
bool addTag(const char *tag, char vtype, const char *value)
Add the specified tag,vtype,value to the record.
Definition: SamRecord.cpp:791
const String * getStringTag(const char *tag)
Get the string value for the specified tag.
Definition: SamRecord.cpp:2180
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
Definition: SamRecord.cpp:1319
const char * getSequence()
Returns the SAM formatted sequence string (SEQ), translating the base as specified by setSequenceTran...
Definition: SamRecord.cpp:1568
static bool isMDTagCorrect(SamRecord &inputRec, GenomeSequence &genome)
Check to see if the MD tag in the record is accurate.
Definition: SamTags.cpp:126
static bool createMDTag(String &outputMDtag, SamRecord &inputRec, GenomeSequence &genome)
Create the MD tag for the specified input record and the genome.
Definition: SamTags.cpp:34