libStatGen Software  1
BamIndex Class Reference
Inheritance diagram for BamIndex:
Collaboration diagram for BamIndex:

Public Member Functions

virtual void resetIndex ()
 Reset the member data for a new index file.
 
SamStatus::Status readIndex (const char *filename)
 
bool getChunksForRegion (int32_t refID, int32_t start, int32_t end, SortedChunkList &chunkList)
 Get the list of chunks associated with this region. More...
 
uint64_t getMaxOffset () const
 
bool getReferenceMinMax (int32_t refID, uint64_t &minOffset, uint64_t &maxOffset) const
 Get the minimum and maximum file offsets for the specfied reference ID. More...
 
int32_t getNumMappedReads (int32_t refID)
 Get the number of mapped reads for this reference id. More...
 
int32_t getNumUnMappedReads (int32_t refID)
 Get the number of unmapped reads for this reference id. More...
 
void printIndex (int32_t refID, bool summary=false)
 Print the index information. More...
 
- Public Member Functions inherited from IndexBase
int32_t getNumRefs () const
 Get the number of references in this index. More...
 
bool getMinOffsetFromLinearIndex (int32_t refID, uint32_t position, uint64_t &minOffset) const
 

Static Public Attributes

static const int32_t UNKNOWN_NUM_READS = -1
 The number used for an unknown number of reads.
 
static const int32_t REF_ID_UNMAPPED = -1
 The number used for the reference id of unmapped reads.
 
static const int32_t REF_ID_ALL = -2
 The number used to indicate that all reference ids should be used.
 

Additional Inherited Members

- Static Protected Member Functions inherited from IndexBase
static void getBinsForRegion (uint32_t start, uint32_t end, bool binMap[MAX_NUM_BINS+1])
 
- Protected Attributes inherited from IndexBase
int32_t n_ref
 
std::vector< ReferencemyRefs
 
- Static Protected Attributes inherited from IndexBase
static const uint32_t MAX_NUM_BINS = 37450
 
static const uint32_t MAX_POSITION = 536870911
 
static const uint32_t LINEAR_INDEX_SHIFT = 14
 

Detailed Description

Definition at line 31 of file BamIndex.h.

Member Function Documentation

◆ getChunksForRegion()

bool BamIndex::getChunksForRegion ( int32_t  refID,
int32_t  start,
int32_t  end,
SortedChunkList chunkList 
)

Get the list of chunks associated with this region.

For an entire reference ID, set start and end to -1. To start at the beginning of the region, set start to 0/-1. To go to the end of the region, set end to -1.

Definition at line 218 of file BamIndex.cpp.

220 {
221  chunkList.clear();
222 
223  // If start is >= to end, there will be no sections, return no
224  // regions.
225  if((start >= end) && (end != -1))
226  {
227  std::cerr << "Warning, requesting region where start <= end, so "
228  << "no values will be returned.\n";
229  return(false);
230  }
231 
232  // Handle REF_ID_UNMAPPED. This uses a default chunk which covers
233  // from the max offset to the end of the file.
234  if(refID == REF_ID_UNMAPPED)
235  {
236  Chunk refChunk;
237  // The start of the unmapped region is the max offset found
238  // in the index file.
239  refChunk.chunk_beg = getMaxOffset();
240  // The end of the unmapped region is the end of the file, so
241  // set chunk end to the max value.
242  refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE;
243  return(chunkList.insert(refChunk));
244  }
245 
246  if((refID < 0) || (refID >= n_ref))
247  {
248  // The specified refID is out of range, return false.
249  std::cerr << "Warning, requesting refID is out of range, so "
250  << "no values will be returned.\n";
251  return(false);
252  }
253 
254  const Reference* ref = &(myRefs[refID]);
255 
256  // Handle where start/end are defaults.
257  if(start == -1)
258  {
259  if(end == -1)
260  {
261  // This is whole chromosome, so take a shortcut.
262  if(ref->maxChunkOffset == 0)
263  {
264  // No chunks for this region, but this is not an error.
265  return(true);
266  }
267  Chunk refChunk;
268  refChunk.chunk_beg = ref->minChunkOffset;
269  refChunk.chunk_end = ref->maxChunkOffset;
270  return(chunkList.insert(refChunk));
271  }
272  else
273  {
274  start = 0;
275  }
276  }
277  if(end == -1)
278  {
279  // MAX_POSITION is inclusive, but end is exclusive, so add 1.
280  end = MAX_POSITION + 1;
281  }
282 
283  // Determine the minimum offset for the given start position. This
284  // is done by using the linear index for the specified start position.
285  uint64_t minOffset = 0;
286  getMinOffsetFromLinearIndex(refID, start, minOffset);
287 
288  bool binInRangeMap[MAX_NUM_BINS+1];
289 
290  getBinsForRegion(start, end, binInRangeMap);
291 
292  // Loop through the bins in the ref and if they are in the region, get the chunks.
293  for(int i = 0; i < ref->n_bin; ++i)
294  {
295  const Bin* bin = &(ref->bins[i]);
296  if(binInRangeMap[bin->bin] == false)
297  {
298  // This bin is not in the region, so check the next one.
299  continue;
300  }
301 
302  // Add each chunk in the bin to the map.
303  for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++)
304  {
305  // If the end of the chunk is less than the minimum offset
306  // for the 16K block that starts our region, then no
307  // records in this chunk will cross our region, so do
308  // not add it to the chunks we need to use.
309  if(bin->chunks[chunkIndex].chunk_end < minOffset)
310  {
311  continue;
312  }
313  // Add the chunk to the map.
314  if(!chunkList.insert(bin->chunks[chunkIndex]))
315  {
316  // Failed to add to the map, return false.
317  std::cerr << "Warning, Failed to add a chunk, so "
318  << "no values will be returned.\n";
319  return(false);
320  }
321  }
322  }
323 
324  // Now that all chunks have been added to the list,
325  // handle overlapping chunks.
326  return(chunkList.mergeOverlapping());
327 }
static const int32_t REF_ID_UNMAPPED
The number used for the reference id of unmapped reads.
Definition: BamIndex.h:86

References REF_ID_UNMAPPED.

◆ getNumMappedReads()

int32_t BamIndex::getNumMappedReads ( int32_t  refID)

Get the number of mapped reads for this reference id.

Returns -1 for out of range refIDs.

Parameters
refIDreference ID for which to extract the number of mapped reads.
Returns
number of mapped reads for the specified reference id.

Definition at line 355 of file BamIndex.cpp.

356 {
357  // If it is the reference id of unmapped reads, return
358  // that there are no mapped reads.
359  if(refID == REF_ID_UNMAPPED)
360  {
361  // These are by definition all unmapped reads.
362  return(0);
363  }
364 
365  if((refID < 0) || (refID >= (int32_t)myRefs.size()))
366  {
367  // Reference ID is out of range for this index file.
368  return(-1);
369  }
370 
371  // Get this reference.
372  return(myRefs[refID].n_mapped);
373 }

References REF_ID_UNMAPPED.

Referenced by SamFile::getNumMappedReadsFromIndex().

◆ getNumUnMappedReads()

int32_t BamIndex::getNumUnMappedReads ( int32_t  refID)

Get the number of unmapped reads for this reference id.

Returns -1 for out of range refIDs.

Parameters
refIDreference ID for which to extract the number of unmapped reads.
Returns
number of unmapped reads for the specified reference id

Definition at line 377 of file BamIndex.cpp.

378 {
379  // If it is the reference id of unmapped reads, return
380  // that value.
381  if(refID == REF_ID_UNMAPPED)
382  {
383  return(myUnMappedNumReads);
384  }
385 
386  if((refID < 0) || (refID >= (int32_t)myRefs.size()))
387  {
388  // Reference ID is out of range for this index file.
389  return(-1);
390  }
391 
392  // Get this reference.
393  return(myRefs[refID].n_unmapped);
394 }

References REF_ID_UNMAPPED.

Referenced by SamFile::getNumUnMappedReadsFromIndex().

◆ getReferenceMinMax()

bool BamIndex::getReferenceMinMax ( int32_t  refID,
uint64_t &  minOffset,
uint64_t &  maxOffset 
) const

Get the minimum and maximum file offsets for the specfied reference ID.

Parameters
refIDthe reference ID to locate in the file.
minOffsetreturns the min file offset for the specified reference
maxOffsetreturns the max file offset for the specified reference
Returns
whether or not the reference was found in the file

Definition at line 337 of file BamIndex.cpp.

340 {
341  if((refID < 0) || (refID >= (int32_t)myRefs.size()))
342  {
343  // Reference ID is out of range for this index file.
344  return(false);
345  }
346 
347  // Get this reference.
348  minOffset = myRefs[refID].minChunkOffset;
349  maxOffset = myRefs[refID].maxChunkOffset;
350  return(true);
351 }

◆ printIndex()

void BamIndex::printIndex ( int32_t  refID,
bool  summary = false 
)

Print the index information.

Parameters
refIDreference ID for which to print info for. -1 means print for all references.
summarywhether or not to just print a summary (defaults to false). The summary just contains summary info for each reference and not every bin/chunk.

Definition at line 398 of file BamIndex.cpp.

399 {
400  std::cout << "BAM Index: " << std::endl;
401  std::cout << "# Reference Sequences: " << n_ref << std::endl;
402 
403  unsigned int startRef = 0;
404  unsigned int endRef = myRefs.size() - 1;
405  std::vector<Reference> refsToProcess;
406  if(refID != -1)
407  {
408  // Set start and end ref to the specified reference id.
409  startRef = refID;
410  endRef = refID;
411  }
412 
413  // Print out the information for each bin.
414  for(unsigned int i = startRef; i <= endRef; ++i)
415  {
416  std::cout << std::dec
417  << "\tReference ID: " << std::setw(4) << i
418  << "; #Bins: "<< std::setw(6) << myRefs[i].n_bin
419  << "; #Linear Index Entries: "
420  << std::setw(6) << myRefs[i].n_intv
421  << "; Min Chunk Offset: "
422  << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset
423  << "; Max Chunk Offset: "
424  << std::setw(18) << myRefs[i].maxChunkOffset
425  << std::dec;
426  // Print the mapped/unmapped if set.
427  if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
428  {
429  std::cout << "; " << myRefs[i].n_mapped << " Mapped Reads";
430  }
431  if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
432  {
433  std::cout << "; " << myRefs[i].n_unmapped << " Unmapped Reads";
434  }
435  std::cout << std::endl;
436 
437  // Only print more details if not summary.
438  if(!summary)
439  {
440  std::vector<Bin>::iterator binIter;
441  for(binIter = myRefs[i].bins.begin();
442  binIter != myRefs[i].bins.end();
443  ++binIter)
444  {
445  Bin* binPtr = &(*binIter);
446  if(binPtr->bin == Bin::NOT_USED_BIN)
447  {
448  // This bin is not used, continue.
449  continue;
450  }
451  // Print the bin info.
452  std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl;
453  std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl;
454  std::cout << std::hex << std::showbase;
455 
456  for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk;
457  ++chunkIndex)
458  {
459  // If this is the last chunk of the MAX_NUM_BINS - it
460  // contains a mapped/unmapped count rather than the regular
461  // chunk addresses.
462  if((binPtr->bin != MAX_NUM_BINS) ||
463  (chunkIndex != (binPtr->n_chunk - 1)))
464  {
465  std::cout << "\t\t\t\tchunk_beg: "
466  << binPtr->chunks[chunkIndex].chunk_beg
467  << std::endl;
468  std::cout << "\t\t\t\tchunk_end: "
469  << binPtr->chunks[chunkIndex].chunk_end
470  << std::endl;
471  }
472  }
473  }
474  std::cout << std::dec;
475 
476  // Print the linear index.
477  for(int linearIndex = 0; linearIndex < myRefs[i].n_intv;
478  ++linearIndex)
479  {
480  if(myRefs[i].ioffsets[linearIndex] != 0)
481  {
482  std::cout << "\t\t\tLinearIndex["
483  << std::dec << linearIndex << "] Offset: "
484  << std::hex << myRefs[i].ioffsets[linearIndex]
485  << std::endl;
486  }
487  }
488  }
489  }
490 }

◆ readIndex()

SamStatus::Status BamIndex::readIndex ( const char *  filename)
virtual
Parameters
filenamethe bam index file to be read.
Returns
the status of the read.

Implements IndexBase.

Definition at line 45 of file BamIndex.cpp.

46 {
47  // Reset the index from anything that may previously be set.
48  resetIndex();
49 
50  IFILE indexFile = ifopen(filename, "rb");
51 
52  // Failed to open the index file.
53  if(indexFile == NULL)
54  {
55  return(SamStatus::FAIL_IO);
56  }
57 
58  // generate the bam index structure.
59 
60  // Read the magic string.
61  char magic[4];
62  if(ifread(indexFile, magic, 4) != 4)
63  {
64  // Failed to read the magic
65  ifclose(indexFile);
66  return(SamStatus::FAIL_IO);
67  }
68 
69  // If this is not an index file, set num references to 0.
70  if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1)
71  {
72  // Not a BAM Index file.
73  ifclose(indexFile);
74  return(SamStatus::FAIL_PARSE);
75  }
76 
77  // It is a bam index file.
78  // Read the number of reference sequences.
79  if(ifread(indexFile, &n_ref, 4) != 4)
80  {
81  // Failed to read.
82  ifclose(indexFile);
83  return(SamStatus::FAIL_IO);
84  }
85 
86  // Size the references.
87  myRefs.resize(n_ref);
88 
89  for(int refIndex = 0; refIndex < n_ref; refIndex++)
90  {
91  // Read each reference.
92  Reference* ref = &(myRefs[refIndex]);
93 
94  // Read the number of bins.
95  if(ifread(indexFile, &(ref->n_bin), 4) != 4)
96  {
97  // Failed to read the number of bins.
98  // Return failure.
99  ifclose(indexFile);
100  return(SamStatus::FAIL_PARSE);
101  }
102 
103  // If there are no bins, then there are no
104  // mapped/unmapped reads.
105  if(ref->n_bin == 0)
106  {
107  ref->n_mapped = 0;
108  ref->n_unmapped = 0;
109  }
110 
111  // Resize the bins so they can be indexed by bin number.
112  ref->bins.resize(ref->n_bin + 1);
113 
114  // Read each bin.
115  for(int binIndex = 0; binIndex < ref->n_bin; binIndex++)
116  {
117  uint32_t binNumber;
118 
119  // Read in the bin number.
120  if(ifread(indexFile, &(binNumber), 4) != 4)
121  {
122  // Failed to read the bin number.
123  // Return failure.
124  ifclose(indexFile);
125  return(SamStatus::FAIL_IO);
126  }
127 
128  // Add the bin to the reference and get the
129  // pointer back so the values can be set in it.
130  Bin* binPtr = &(ref->bins[binIndex]);
131  binPtr->bin = binNumber;
132 
133  // Read in the number of chunks.
134  if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4)
135  {
136  // Failed to read number of chunks.
137  // Return failure.
138  ifclose(indexFile);
139  return(SamStatus::FAIL_IO);
140  }
141 
142  // Read in the chunks.
143  // Allocate space for the chunks.
144  uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk);
145  binPtr->chunks = (Chunk*)malloc(sizeOfChunkList);
146  if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList)
147  {
148  // Failed to read the chunks.
149  // Return failure.
150  ifclose(indexFile);
151  return(SamStatus::FAIL_IO);
152  }
153 
154  // Determine the min/max for this bin if it is not the max bin.
155  if(binPtr->bin != MAX_NUM_BINS)
156  {
157  for(int i = 0; i < binPtr->n_chunk; i++)
158  {
159  if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset)
160  {
161  ref->minChunkOffset = binPtr->chunks[i].chunk_beg;
162  }
163  if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset)
164  {
165  ref->maxChunkOffset = binPtr->chunks[i].chunk_end;
166  }
167  if(binPtr->chunks[i].chunk_end > maxOverallOffset)
168  {
169  maxOverallOffset = binPtr->chunks[i].chunk_end;
170  }
171  }
172  }
173  else
174  {
175  // Mapped/unmapped are the last chunk of the
176  // MAX BIN
177  ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg;
178  ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end;
179  }
180  }
181 
182  // Read the number of intervals.
183  if(ifread(indexFile, &(ref->n_intv), 4) != 4)
184  {
185  // Failed to read, set to 0.
186  ref->n_intv = 0;
187  // Return failure.
188  ifclose(indexFile);
189  return(SamStatus::FAIL_IO);
190  }
191 
192  // Allocate space for the intervals and read them.
193  uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t);
194  ref->ioffsets = (uint64_t*)malloc(linearIndexSize);
195  if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize)
196  {
197  // Failed to read the linear index.
198  // Return failure.
199  ifclose(indexFile);
200  return(SamStatus::FAIL_IO);
201  }
202  }
203 
204  int32_t numUnmapped = 0;
205  if(ifread(indexFile, &numUnmapped, sizeof(int32_t)) == sizeof(int32_t))
206  {
207  myUnMappedNumReads = numUnmapped;
208  }
209 
210  // Successfully read the bam index file.
211  ifclose(indexFile);
212  return(SamStatus::SUCCESS);
213 }
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout.
Definition: InputFile.h:562
int ifclose(IFILE &file)
Close the file.
Definition: InputFile.h:580
unsigned int ifread(IFILE file, void *buffer, unsigned int size)
Read up to size bytes from the file into the buffer.
Definition: InputFile.h:600
virtual void resetIndex()
Reset the member data for a new index file.
Definition: BamIndex.cpp:35
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:37
@ SUCCESS
method completed successfully.
Definition: StatGenStatus.h:32
@ FAIL_IO
method failed due to an I/O issue.
Definition: StatGenStatus.h:37
@ FAIL_PARSE
failed to parse a record/header - invalid format.
Definition: StatGenStatus.h:42

References StatGenStatus::FAIL_IO, StatGenStatus::FAIL_PARSE, ifclose(), ifopen(), ifread(), resetIndex(), and StatGenStatus::SUCCESS.

Referenced by SamFile::ReadBamIndex().


The documentation for this class was generated from the following files: