libStatGen Software  1
PedigreeAlleleFreq.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "PedigreeAlleleFreq.h"
19 #include "QuickIndex.h"
20 #include "Error.h"
21 
22 #include <math.h>
23 
24 int CountAlleles(Pedigree & /* ped */, int marker)
25 {
26  // With automatic recoding in the pedigree loader there
27  // is no need to iterate through the pedigree ...
28  MarkerInfo * info = Pedigree::GetMarkerInfo(marker);
29 
30  return info->CountAlleles();
31 }
32 
33 void LumpAlleles(Pedigree & ped, int marker, double threshold, bool reorder)
34 {
35  // find out how many alleles there are
36  int alleles = ped.CountAlleles(marker);
37 
38  if (alleles < 2) return;
39 
40  MarkerInfo * info = PedigreeGlobals::GetMarkerInfo(marker);
41 
42  if (alleles < info->freq.Length())
43  alleles = info->freq.Length() - 1;
44 
45  IntArray counts(alleles + 1);
46  counts.Zero();
47 
48  // Count number of occurrences for each allele
49  for (int i = 0; i < ped.count; i++)
50  {
51  counts[int(ped[i].markers[marker][0])]++;
52  counts[int(ped[i].markers[marker][1])]++;
53  }
54 
55  // Calculate treshold for lumping alleles
56  int total = 0;
57  for (int i = 1; i <= alleles; i++)
58  total += counts[i];
59  int thresh = int(total * threshold);
60 
61  // If threshold is set at zero, we artificially increase
62  // counts for alleles that do not appear in the pedigree
63  // but whose frequencies are set > 0.0. This ensures that
64  // allele frequency data does not get discarded when simply
65  // recoding alleles (vs. lumping)
66  if (thresh == 0)
67  for (int i = 1; i < info->freq.Length(); i++)
68  if (counts[i] == 0 && info->freq[i] > 0.0)
69  counts[i] = 1, total++;
70 
71  // If allele reordering is disabled, put in dummy allele
72  // counts so as to ensure that allele have desired ordering
73  if (!reorder)
74  {
75  QuickIndex index(info->alleleLabels);
76  index.Reverse();
77 
78  for (int i = 0; i < index.Length(); i++)
79  counts[index[i]] = i + 1;
80 
81  total = counts.Sum(1, counts.Length() - 1);
82  }
83 
84  // Order all alleles according to their frequency
85  // Zero, which corresponds to missing values, stays put!
86  counts[0] = total + 1;
87  QuickIndex index(counts);
88  index.Reverse();
89 
90  // recode alleles
91  // all alleles where frequency < thresh are labelled N
92  // use counts array to keep track of labels
93  int N = 0;
94  bool rare = false;
95  for (int i = 0; i <= alleles; i++)
96  if (counts[index[i]] > thresh)
97  {
98  counts[index[i]] = i;
99  N++;
100  }
101  else
102  {
103  if (counts[index[i]] > 0)
104  rare = true;
105  counts[index[i]] = N;
106  }
107 
108  // This loop does the recoding
109  for (int i = 0; i < ped.count; i++)
110  {
111  Alleles & current = ped[i].markers[marker];
112  current[0] = counts[current[0]];
113  current[1] = counts[current[1]];
114  }
115 
116  StringArray oldLabels(info->alleleLabels);
117  String label;
118 
119  info->alleleLabels.Clear();
120  info->alleleNumbers.Clear();
121 
122  for (int i = 0; i < N; i++)
123  {
124  if (oldLabels.Length() <= index[i])
125  info->alleleLabels.Push(label = index[i]);
126  else
127  info->alleleLabels.Push(oldLabels[index[i]]);
128 
129  if (i) info->alleleNumbers.SetInteger(info->alleleLabels.Last(), i);
130  }
131 
132  // Reorder allele frequencies if necessary
133  if (info->freq.Length())
134  {
135  Vector freq(info->freq);
136 
137  info->freq.Dimension(N);
138  info->freq[0] = 0.0;
139 
140  for (int i = 1; i < N; i++)
141  {
142  info->freq[i] = freq[index[i]];
143  freq[index[i]] = 0;
144  }
145 
146  if ((1.0 - info->freq.Sum()) > 1e-10)
147  rare = true;
148 
149  if (rare)
150  {
151  info->freq.Dimension(N + 1);
152  info->freq[N] = 1.0 - info->freq.Sum();
153  }
154  }
155 
156  if (rare)
157  {
158  info->alleleLabels.Push("OTHER");
159  info->alleleNumbers.SetInteger("OTHER", info->alleleLabels.Length());
160  }
161 }
162 
163 bool EstimateFrequencies(Pedigree & ped, int marker, int estimator)
164 {
165  int alleleCount = CountAlleles(ped, marker);
166 
167  IntArray founder(alleleCount + 1);
168  IntArray all(alleleCount + 1);
169 
170  founder.Zero();
171  all.Zero();
172 
173  for (int i = 0; i < ped.count; i++)
174  {
175  // When counting alleles, note that males only carry one X chromosome
176  // and are arbitrarily scored as homozygous.
177  all[ped[i].markers[marker][0]]++;
178  if (!ped.chromosomeX || ped[i].sex != SEX_MALE)
179  all[ped[i].markers[marker][1]]++;
180  if (!ped[i].isFounder()) continue;
181  founder[ped[i].markers[marker][0]]++;
182  if (!ped.chromosomeX || ped[i].sex != SEX_MALE)
183  founder[ped[i].markers[marker][1]]++;
184  }
185 
186  MarkerInfo * info = ped.GetMarkerInfo(marker);
187 
188  if (info->freq.dim > 0)
189  {
190  // previous allele frequency information is available
191  if (alleleCount >= info->freq.dim)
192  error("For marker %s, input files define %d alleles, but at least\n"
193  "one other allele (named '%s') occurs in the pedigree\n",
194  (const char *) info->name, info->freq.dim - 1,
195  (const char *) info->GetAlleleLabel(alleleCount));
196 
197  for (int i = 1; i <= alleleCount; i++)
198  if (all[i] > 0 && info->freq[i] <= 0.0)
199  error("Although allele %s for marker %s has frequency zero,\n"
200  "it occurs %d times in the pedigree",
201  (const char *) info->GetAlleleLabel(i), (const char *) info->name, all[i]);
202 
203  return false;
204  }
205  else
206  {
207  if (alleleCount < 1)
208  {
209  // If no one is genotyped, default to two equifrequent allele
210  // since some programs do not like monomorphic markers
211  info->freq.Dimension(3);
212  info->freq[0] = 0.0;
213  info->freq[1] = 0.99999;
214  info->freq[2] = 0.00001;
215  return true;
216  }
217 
218  info->freq.Dimension(alleleCount + 1);
219  info->freq.Zero();
220 
221  if (estimator == FREQ_FOUNDERS && founder.Sum() > founder[0])
222  {
223  // Make sure the frequency of alleles occuring in the pedigree
224  // is never zero
225  for (int i = 1; i <= alleleCount; i++)
226  if (founder[i] == 0 && all[i] > 0)
227  founder[i] = 1;
228 
229  // To get frequencies, just multiply counts by 1 / total_counts
230  double factor = 1.0 / (founder.Sum() - founder[0]);
231 
232  for (int i = 1; i <= alleleCount; i++)
233  info->freq[i] = founder[i] * factor;
234  }
235  else if (estimator == FREQ_ALL || estimator == FREQ_FOUNDERS)
236  {
237  // To get frequencies, just multiply counts by 1 / total_counts
238  double factor = 1.0 / (all.Sum() - all[0]);
239 
240  for (int i = 1; i <= alleleCount; i++)
241  info->freq[i] = all[i] * factor;
242  }
243  else if (estimator == FREQ_EQUAL)
244  // Assume all alleles have equal frequency
245  {
246  // Count the number of observed alleles
247  all[0] = 0;
248  int alleles = all.CountIfGreater(0);
249  double freq = 1.0 / alleles;
250 
251  // Set equal frequencies for all occuring alleles
252  for (int i = 0; i <= alleleCount; i++)
253  info->freq[i] = all[i] ? freq : 0.0;
254  }
255  }
256 
257  return true;
258 }
259