libStatGen Software  1
InputFile.cpp
1 /*
2  * Copyright (C) 2010-2012 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "InputFile.h"
19 #include "StringBasics.h"
20 #include "GzipHeader.h"
21 #include "BgzfFileType.h"
22 #include "BgzfFileTypeRecovery.h"
23 #include "GzipFileType.h"
24 #include "UncompressedFileType.h"
25 
26 #include <stdarg.h>
27 
28 InputFile::InputFile(const char * filename, const char * mode,
29  InputFile::ifileCompression compressionMode)
30 {
31  // XXX duplicate code
32  myAttemptRecovery = false;
33  myFileTypePtr = NULL;
34  myBufferIndex = 0;
35  myCurrentBufferSize = 0;
36  myAllocatedBufferSize = DEFAULT_BUFFER_SIZE;
37  myFileBuffer = new char[myAllocatedBufferSize];
38  myFileName.clear();
39 
40  openFile(filename, mode, compressionMode);
41 }
42 
43 
44 int InputFile::readTilChar(const std::string& stopChars, std::string& stringRef)
45 {
46  int charRead = 0;
47  size_t pos = std::string::npos;
48  // Loop until the character was not found in the stop characters.
49  while(pos == std::string::npos)
50  {
51  charRead = ifgetc();
52 
53  // First Check for EOF. If EOF is found, just return -1
54  if(charRead == EOF)
55  {
56  return(-1);
57  }
58 
59  // Try to find the character in the stopChars.
60  pos = stopChars.find(charRead);
61 
62  if(pos == std::string::npos)
63  {
64  // Didn't find a stop character and it is not an EOF,
65  // so add it to the string.
66  stringRef += charRead;
67  }
68  }
69  return(pos);
70 }
71 
72 
73 int InputFile::readTilChar(const std::string& stopChars)
74 {
75  int charRead = 0;
76  size_t pos = std::string::npos;
77  // Loop until the character was not found in the stop characters.
78  while(pos == std::string::npos)
79  {
80  charRead = ifgetc();
81 
82  // First Check for EOF. If EOF is found, just return -1
83  if(charRead == EOF)
84  {
85  return(-1);
86  }
87 
88  // Try to find the character in the stopChars.
89  pos = stopChars.find(charRead);
90  }
91  return(pos);
92 }
93 
94 
96 {
97  int charRead = 0;
98  // Loop until the character was not found in the stop characters.
99  while((charRead != EOF) && (charRead != '\n'))
100  {
101  charRead = ifgetc();
102  }
103  // First Check for EOF. If EOF is found, just return -1
104  if(charRead == EOF)
105  {
106  return(-1);
107  }
108  return(0);
109 }
110 
111 
112 int InputFile::readLine(std::string& line)
113 {
114  int charRead = 0;
115  while(!ifeof())
116  {
117  charRead = ifgetc();
118  if(charRead == EOF)
119  {
120  return(-1);
121  }
122  if(charRead == '\n')
123  {
124  return(0);
125  }
126  line += charRead;
127  }
128  // Should never get here.
129  return(-1);
130 }
131 
132 
133 int InputFile::readTilTab(std::string& field)
134 {
135  int charRead = 0;
136  while(!ifeof())
137  {
138  charRead = ifgetc();
139  if(charRead == EOF)
140  {
141  return(-1);
142  }
143  if(charRead == '\n')
144  {
145  return(0);
146  }
147  if(charRead == '\t')
148  {
149  return(1);
150  }
151  field += charRead;
152  }
153  return(-1);
154 }
155 
156 
157 #ifdef __ZLIB_AVAILABLE__
158 
159 // Open a file. Called by the constructor.
160 // Returns true if the file was successfully opened, false otherwise.
161 bool InputFile::openFile(const char * filename, const char * mode,
162  InputFile::ifileCompression compressionMode)
163 {
164  //
165  // if recovering, we don't want to issue big readaheads, since
166  // that interferes with the decompression - we only want to
167  // decompress one at a time, and handle the exceptions immediately
168  // rather than at some indeterminate point in time.
169  //
170  if(myAttemptRecovery) {
171  bufferReads(1);
172  }
173  // If a file is for write, just open a new file.
174  if (mode[0] == 'w' || mode[0] == 'W')
175  {
176  openFileUsingMode(filename, mode, compressionMode);
177  }
178  else
179  {
180  // Check if reading from stdin.
181  if((strcmp(filename, "-") == 0) || (strcmp(filename, "-.gz") == 0))
182  {
183  // Reading from stdin, open it based on the
184  // compression mode.
185  openFileUsingMode(filename, mode, compressionMode);
186  }
187  else
188  {
189  // Not from stdin, so determine the file type.
190 
191  // Open the file read only to determine file type.
192  UncompressedFileType file(filename, "r");
193  // If the file could not be opened, either create a new one or
194  // return failure.
195  if (!file.isOpen())
196  {
197  // If the mode is for read, then the file must exist, otherwise,
198  // create a new file.
199  if (mode[0] == 'r' || mode[0] == 'R')
200  {
201  // File must exist.
202  if (myFileTypePtr != NULL)
203  {
204  delete myFileTypePtr;
205  myFileTypePtr = NULL;
206  }
207  // Return false, was not opened.
208  return false;
209  }
210  else
211  {
212  openFileUsingMode(filename, mode, compressionMode);
213  }
214  }
215  else
216  {
217  // File was successfully opened, so try to determine the
218  // filetype from the file.
219  // Read the file to see if it a gzip file.
220  GzipHeader gzipHeader;
221  bool isGzip = gzipHeader.readHeader(file);
222 
223  // The file header has been read, so close the file, so it can
224  // be re-opened as the correct type.
225  file.close();
226 
227  if (isGzip)
228  {
229  // This file is a gzip file.
230  // Check to see if it is BGZF Compression.
231  if (gzipHeader.isBgzfFile())
232  {
233  // This file has BGZF Compression, so set the file
234  // pointer.
235  if(myAttemptRecovery) {
236  // NB: this reader will throw std::runtime_error when it recovers
237  myFileTypePtr = new BgzfFileTypeRecovery(filename, mode);
238  } else {
239  // use the standard bgzf reader (samtools)
240  myFileTypePtr = new BgzfFileType(filename, mode);
241  }
242  }
243  else
244  {
245  // Not BGZF, just a normal gzip.
246  myFileTypePtr = new GzipFileType(filename, mode);
247  }
248  }
249  else
250  {
251  // The file is a uncompressed, uncompressed file,
252  // so set the myFileTypePtr accordingly.
253  myFileTypePtr = new UncompressedFileType(filename, mode);
254  }
255  }
256  }
257  }
258  if(myFileTypePtr == NULL)
259  {
260  return(false);
261  }
262  if (!myFileTypePtr->isOpen())
263  {
264  // The file was not opened, so delete the pointer and set to null.
265  delete myFileTypePtr;
266  myFileTypePtr = NULL;
267  return false;
268  }
269 
270  if(myAllocatedBufferSize == 1)
271  {
272  myFileTypePtr->setBuffered(false);
273  }
274  else
275  {
276  myFileTypePtr->setBuffered(true);
277  }
278  myFileName = filename;
279  return true;
280 }
281 
282 
283 // Open a file. This method will open a file with the specified name and
284 // mode with the fileTypePtr associated with the specified compressionMode.
285 void InputFile::openFileUsingMode(const char * filename, const char * mode,
286  ifileCompression compressionMode)
287 {
288  switch (compressionMode)
289  {
290  case GZIP:
291  // Gzipped.
292  myFileTypePtr = new GzipFileType(filename, mode);
293  break;
294  case BGZF:
295  //
296  // BGZF compression - recovery is possible, so use
297  // Bgzf recovery reader if asked.
298  //
299  if(myAttemptRecovery && ((mode[0] == 'r') || (mode[0] == 'R')))
300  {
301  // NB: this reader will throw std::runtime_error when it recovers
302  myFileTypePtr = new BgzfFileTypeRecovery(filename, mode);
303  }
304  else
305  {
306  myFileTypePtr = new BgzfFileType(filename, mode);
307  }
308  break;
309  case UNCOMPRESSED:
310  myFileTypePtr = new UncompressedFileType(filename, mode);
311  break;
312  case InputFile::DEFAULT:
313  default:
314  // Check the extension. If it is ".gz", treat as gzip.
315  // otherwise treat it as UNCOMPRESSED.
316  int lastchar = 0;
317  while (filename[lastchar] != 0) lastchar++;
318  if ((lastchar >= 3 &&
319  filename[lastchar - 3] == '.' &&
320  filename[lastchar - 2] == 'g' &&
321  filename[lastchar - 1] == 'z'))
322  {
323  // .gz files files should be gzipped.
324  myFileTypePtr = new GzipFileType(filename, mode);
325  }
326  else
327  {
328  // Create an uncompressed file.
329  myFileTypePtr = new UncompressedFileType(filename, mode);
330  }
331  break;
332  }
333 
334  if(myFileTypePtr == NULL)
335  {
336  return;
337  }
338  if(myAllocatedBufferSize == 1)
339  {
340  myFileTypePtr->setBuffered(false);
341  }
342  else
343  {
344  myFileTypePtr->setBuffered(true);
345  }
346 }
347 
348 #else
349 
350 // No zlib, so just treat all files as std files.
351 // Open a file. Called by the constructor.
352 // Returns true if the file was successfully opened, false otherwise.
353 bool InputFile::openFile(const char * filename, const char * mode,
354  InputFile::ifileCompression compressionMode)
355 {
356  // No zlib, so it is a uncompressed, uncompressed file.
357  myFileTypePtr = new UncompressedFileType(filename, mode);
358 
359  if(myFileTypePtr == NULL)
360  {
361  return(false);
362  }
363  if (!myFileTypePtr->isOpen())
364  {
365  // The file was not opened, so delete the pointer and set to null.
366  delete myFileTypePtr;
367  myFileTypePtr = NULL;
368  return false;
369  }
370  if(myAllocatedBufferSize == 1)
371  {
372  myFileTypePtr->setBuffered(false);
373  }
374  else
375  {
376  myFileTypePtr->setBuffered(true);
377  }
378  myFileName = filename;
379  return true;
380 }
381 
382 #endif
383 
384 
386 {
387  delete myFileTypePtr;
388  myFileTypePtr = NULL;
389 
390  if(myFileBuffer != NULL)
391  {
392  delete[] myFileBuffer;
393  myFileBuffer = NULL;
394  }
395 }
396 
397 
398 int ifprintf(IFILE output, const char * format, ...)
399 {
400  String buffer;
401 
402  va_list ap;
403  va_start(ap, format);
404 
405  buffer.vprintf(format, ap);
406 
407  va_end(ap);
408 
409  return ::ifwrite(output, (const char *) buffer, buffer.Length());
410 }
411 
412 
413 InputFile& operator << (InputFile& stream, double num)
414 {
415  String val;
416  val = num;
417  stream << val;
418  return(stream);
419 }
420 
421 
422 InputFile& operator << (InputFile& stream, int num)
423 {
424  String val;
425  val = num;
426  stream << val;
427  return(stream);
428 }
429 
430 
431 InputFile& operator << (InputFile& stream, unsigned int num)
432 {
433  String val;
434  val = num;
435  stream << val;
436  return(stream);
437 }
int ifprintf(IFILE output, const char *format,...)
Write to a file using fprintf format.
Definition: InputFile.cpp:398
InputFile & operator<<(InputFile &stream, const std::string &str)
Write to a file using streaming.
Definition: InputFile.h:736
unsigned int ifwrite(IFILE file, const void *buffer, unsigned int size)
Write the specified number of bytes from the specified buffer into the file.
Definition: InputFile.h:669
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:37
void bufferReads(unsigned int bufferSize=DEFAULT_BUFFER_SIZE)
Set the buffer size for reading from files so that bufferSize bytes are read at a time and stored unt...
Definition: InputFile.h:83
~InputFile()
Destructor.
Definition: InputFile.cpp:385
int readLine(std::string &line)
Read, appending the characters into the specified string until new line or EOF is found,...
Definition: InputFile.cpp:112
InputFile()
Default constructor.
Definition: InputFile.h:52
int ifeof() const
Check to see if we have reached the EOF.
Definition: InputFile.h:386
int ifgetc()
Get a character from the file.
Definition: InputFile.h:324
int discardLine()
Read until the end of the line, discarding the characters, returning -1 for EOF and returnin...
Definition: InputFile.cpp:95
ifileCompression
Compression to use when writing a file & decompression used when reading a file from stdin.
Definition: InputFile.h:44
@ BGZF
bgzf file.
Definition: InputFile.h:48
@ GZIP
gzip file.
Definition: InputFile.h:47
@ DEFAULT
Check the extension, if it is ".gz", treat as gzip, otherwise treat it as UNCOMPRESSED.
Definition: InputFile.h:45
@ UNCOMPRESSED
uncompressed file.
Definition: InputFile.h:46
int readTilTab(std::string &field)
Read, appending the characters into the specified string until tab, new line, or EOF is found,...
Definition: InputFile.cpp:133
int readTilChar(const std::string &stopChars, std::string &stringRef)
Read until the specified characters, returning which character was found causing the stop,...
Definition: InputFile.cpp:44