libStatGen Software  1
TrimSequence.h
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #ifndef _TRIMSEQUENCE_H
19 #define _TRIMSEQUENCE_H
20 
21 #include <assert.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 
25 #ifndef __WIN32__
26 #include <unistd.h>
27 #endif
28 
29 ///
30 /// TrimSequence is a templated function to find bases
31 /// which are below a certain moving mean threshold,
32 /// and can be applied to either end of the sequence string.
33 ///
34 /// @param sequence is the input sequence
35 /// @param meanValue is the value below which we wish to trim.
36 /// @return the iterator of the location at which untrimmed values begin
37 ///
38 /// Details:
39 ///
40 /// trimFromLeft is a bool indicating which direction we wish
41 /// to trim. true -> left to right, false is right to left.
42 ///
43 /// The code is convoluted enough here, so for implementation
44 /// and testing sanity, the following definitions are made:
45 ///
46 /// When trimFromLeft is true:
47 /// result == sequence.begin() implies no trimming
48 /// result == sequence.end() implies all values are trimmed
49 ///
50 /// When trimFromLeft is false:
51 /// result == sequence.begin() implies all values are trimmed
52 /// result == sequence.end() no values are trimmed
53 ///
54 /// result will always be in the range [sequence.begin() , sequence.end())
55 /// (begin is inclusive, end is exclusive).
56 ///
57 /// NOTE: See TrimSequence.h and test/TrimSequence_test.cpp for examples
58 ///
59 /// THIS CODE IS EXCEPTIONALLY FRAGILE. DO NOT ATTEMPT TO FIX OR
60 /// IMPROVE WITHOUT INCLUDING DOCUMENTED, UNDERSTANABLE TEST CASES THAT CLEARLY
61 /// SHOW WHY OR WHY NOT SOMETHING WORKS.
62 ///
63 template<typename sequenceType, typename meanValueType>
64 typename sequenceType::iterator trimSequence(sequenceType &sequence, meanValueType meanValue, const bool trimFromLeft)
65 {
66  const int howManyValues = 4; // this is used in signed arithmetic below
67  int windowThreshold = howManyValues * meanValue;
68  int64_t sumOfWindow = 0;
69  typename sequenceType::iterator it;
70 
71  //
72  // Sanity check to weed out what otherwise would be
73  // a large number of boundary checks below. If the input
74  // is too small, just punt it back to the caller. Technically,
75  // we can still trim, but we'd just do the simple iteration
76  // loop. Save that for when we care.
77  //
78 
79  if (sequence.size() < (size_t) howManyValues)
80  return trimFromLeft? sequence.begin() : sequence.end();
81 
82  typename sequenceType::iterator sequenceBegin;
83  typename sequenceType::iterator sequenceEnd;
84 
85  // The algorithm should be clear and efficient
86  // so it does not bother to write codes for two directions.
87  // It that way, we avoid thinking trimming from left and right interchangably.
88  if (trimFromLeft)
89  {
90  // sequenceBegin is inclusive, sequenceEnd is exclusive,
91  sequenceBegin = sequence.begin();
92  sequenceEnd = sequence.end();
93 
94  for (it = sequenceBegin; it < sequenceBegin + howManyValues; it++)
95  sumOfWindow += *it;
96 
97  for (; it < sequenceEnd; it ++)
98  {
99  if (sumOfWindow > windowThreshold)
100  break;
101  sumOfWindow += *it;
102  sumOfWindow -= *(it - howManyValues);
103  }
104  // here it is in the range of [sequenceBegin+howManyValues, sequenceEnd] inclusively
105  // the range is also [sequence.begin() + howManyValues, sequence.end()]
106  while (*(it-1) >= meanValue && (it-1) >= sequenceBegin)
107  it--;
108  }
109  else
110  {
111  sequenceBegin = sequence.end() - 1;
112  sequenceEnd = sequence.begin() - 1;
113 
114  for (it = sequenceBegin; it > sequenceBegin - howManyValues; it--)
115  sumOfWindow += *it;
116 
117  for (; it > sequenceEnd; it--)
118  {
119  if (sumOfWindow > windowThreshold)
120  break;
121  sumOfWindow += *it;
122  sumOfWindow -= *(it + howManyValues);
123  }
124 
125  // here it is in the range of [sequenceEnd, sequenceBegin - howManyValues] inclusively
126  // the range is also [sequence.begin() -1, sequence.end() - 1 - howManyValues]
127  while (*(it+1) >= meanValue && (it+1) <= sequenceBegin)
128  it ++;
129  // note, the return value should in the range [sequence.begin(), sequence.end()]
130  it += 1;
131  }
132 
133  // 'it' may be sequence.end() in some cases
134  assert(it >= sequence.begin() && it <= sequence.end());
135 
136  return it;
137 }
138 
139 #endif