Skip to content

Commit ae5bec8

Browse files
committed
genetic map replaces ad-hoc estimation of mutation rate from genomic position
Former-commit-id: 7c497cce8d43c1618b4ac79c1c0eae248a59e8b1
1 parent 53dd012 commit ae5bec8

11 files changed

+215
-39
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
43c81daf7921ce98d6ff972a5f03093c2e0340bb

src/Makefile

+6-3
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ BDIR := ../bin
77
VERSION := $(shell grep VERSION_MAJOR version.hpp | cut -f3 -d' ').$(shell grep VERSION_MINOR version.hpp | cut -f3 -d' ').$(shell grep VERSION_REVISION version.hpp | cut -f3 -d' ')
88

99
# object and header files are defined here
10-
OBJECT_FILES := main.o impute.o insti.o emcchain.o olivier/utils.o haplotype.o kMedoids.o kNN.o hapPanel.o sampler.o glPack.o tabixpp/tabix.o glVQ.o
10+
OBJECT_FILES := main.o impute.o insti.o emcchain.o olivier/utils.o haplotype.o kMedoids.o kNN.o hapPanel.o sampler.o glPack.o tabixpp/tabix.o glVQ.o geneticMap.o
1111
ifndef NCUDA
1212
OBJECT_FILES += hmmLike.cu.o allCuda.link.o hmmLike.o
1313
endif
1414

15-
HEADER_FILES := insti.hpp.gch impute.hpp.gch emcchain.hpp.gch olivier/utils.hpp.gch version.hpp.gch bio.hpp.gch hapPanel.hpp.gch MHSampler.hpp.gch sampler.hpp.gch tabixpp/tabix.hpp.gch globals.h glVQ.hpp
15+
HEADER_FILES := insti.hpp.gch impute.hpp.gch emcchain.hpp.gch olivier/utils.hpp.gch version.hpp.gch bio.hpp.gch hapPanel.hpp.gch MHSampler.hpp.gch sampler.hpp.gch tabixpp/tabix.hpp.gch globals.h glVQ.hpp geneticMap.hpp.gch
1616
ifndef NCUDA
1717
HEADER_FILES += hmmLike.hpp.gch
1818
endif
@@ -125,7 +125,10 @@ kMedoids.hpp.gch: kMedoids.hpp haplotype.hpp.gch sampler.hpp.gch
125125
MHSampler.hpp.gch: MHSampler.hpp olivier/utils.hpp.gch
126126
$(CXX) $(CXXFLAGS) -x c++-header $<
127127

128-
INSTIHEADERPRE :=insti.hpp impute.hpp.gch emcchain.hpp.gch olivier/utils.hpp.gch sampler.hpp.gch version.hpp.gch bio.hpp.gch hapPanel.hpp.gch MHSampler.hpp.gch kNN.hpp.gch kMedoids.hpp.gch glPack.hpp.gch tabixpp/tabix.hpp.gch
128+
geneticMap.hpp.gch: geneticMap.hpp olivier/utils.hpp.gch
129+
$(CXX) $(CXXFLAGS) -x c++-header $<
130+
131+
INSTIHEADERPRE :=insti.hpp impute.hpp.gch emcchain.hpp.gch olivier/utils.hpp.gch sampler.hpp.gch version.hpp.gch bio.hpp.gch hapPanel.hpp.gch MHSampler.hpp.gch kNN.hpp.gch kMedoids.hpp.gch glPack.hpp.gch tabixpp/tabix.hpp.gch geneticMap.hpp.gch
129132

130133
ifndef NCUDA
131134
INSTIHEADERPRE += hmmLike.hpp.gch

src/geneticMap.cpp

+78
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#include "geneticMap.hpp"
2+
3+
using namespace std;
4+
GeneticMap::GeneticMap(string &fileName) {
5+
6+
// populate genetic map vector
7+
ifile gmFD(fileName);
8+
if (!gmFD.isGood())
9+
throw std::runtime_error("Could not open file: " + fileName);
10+
11+
// read header
12+
string buffer;
13+
getline(gmFD, buffer, '\n');
14+
if (buffer != "position COMBINED_rate(cM/Mb) Genetic_Map(cM)")
15+
throw std::runtime_error("header of genetic map does not look right. "
16+
"Expected 'position COMBINED_rate(cM/Mb) "
17+
"Genetic_Map(cM)', but got '" +
18+
buffer + "'");
19+
20+
// read in each line
21+
vector<string> cols;
22+
unsigned rowNum = 2;
23+
assert(m_sortedMap.empty());
24+
while (getline(gmFD, buffer, '\n')) {
25+
cols.clear();
26+
boost::split(cols, buffer, boost::is_any_of(" "));
27+
if (cols.size() != 3)
28+
throw std::runtime_error("Could not find three columns in line " +
29+
to_string(rowNum));
30+
31+
// check the next position is greater than the previous one
32+
unsigned position = stoul(cols[0]);
33+
if (!m_sortedMap.empty() && position <= m_sortedMap.back().first)
34+
throw std::runtime_error(
35+
"Genetic map does not appear to be sorted by position at line " +
36+
to_string(rowNum));
37+
38+
m_sortedMap.push_back(make_pair(position, stod(cols[2])));
39+
40+
++rowNum;
41+
}
42+
if (m_sortedMap.empty())
43+
throw std::runtime_error("Genetic map is empty.");
44+
}
45+
46+
double GeneticMap::GeneticLocation(unsigned genomLoc) {
47+
48+
// test assumptions
49+
assert(!m_sortedMap.empty());
50+
if (genomLoc < m_sortedMap.front().first ||
51+
genomLoc > m_sortedMap.back().first)
52+
throw std::runtime_error("Genomic location " + to_string(genomLoc) +
53+
" is not on genetic map");
54+
55+
// find position in map that is greater or equal to genomLoc
56+
auto high = std::lower_bound(
57+
m_sortedMap.begin(), m_sortedMap.end(), genomLoc,
58+
[](const pair<unsigned, double> &a, unsigned b) { return a.first < b; });
59+
60+
// genomLoc is equal to high
61+
if (high->first == genomLoc)
62+
return high->second;
63+
64+
// interpolate if genomLoc is less than high
65+
auto low = high - 1;
66+
double diffGenD = high->second - low->second;
67+
double diffGenomD = high->first - low->first;
68+
69+
return low->second + (genomLoc - low->first) / diffGenomD * diffGenD;
70+
}
71+
72+
double GeneticMap::GeneticDistance(unsigned first, unsigned second) {
73+
74+
if (first > second)
75+
throw std::runtime_error("first is larger than second");
76+
77+
return GeneticLocation(second) - GeneticLocation(first);
78+
}

src/geneticMap.hpp

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
/* @(#)geneticMap.hpp
2+
*/
3+
4+
#ifndef _GENETICMAP_HPP
5+
#define _GENETICMAP_HPP 1
6+
7+
static_assert(__cplusplus > 199711L, "Program requires C++11 capable compiler");
8+
9+
#include <vector>
10+
#include <algorithm>
11+
#include <string>
12+
#include <exception>
13+
#include <utility>
14+
#include "utils.hpp"
15+
#include <boost/algorithm/string.hpp>
16+
17+
class GeneticMap {
18+
19+
private:
20+
std::vector<std::pair<unsigned, double> > m_sortedMap;
21+
22+
public:
23+
GeneticMap(std::string &fileName);
24+
double GeneticLocation(unsigned genomLoc);
25+
double GeneticDistance(unsigned first, unsigned second);
26+
};
27+
28+
#endif /* _GENETICMAP_HPP */

src/insti.cpp

+21-10
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,8 @@ Insti::Insti(InstiHelper::Init &init)
7373
: m_reclusterEveryNGen(init.reclusterEveryNGen),
7474
m_scaffoldHapsFile(init.scaffoldHapsFile),
7575
m_scaffoldSampleFile(init.scaffoldSampleFile),
76-
m_initPhaseFromScaffold(init.initPhaseFromScaffold), m_init(init),
76+
m_initPhaseFromScaffold(init.initPhaseFromScaffold),
77+
m_geneticMap(init.geneticMap), m_init(init),
7778
m_tag(boost::uuids::random_generator()()) {
7879

7980
omp_set_num_threads(init.numThreads);
@@ -1240,13 +1241,17 @@ void Insti::initialize() {
12401241
// If S is a harmonic series of length hn (number of haplotypes),
12411242
// then mu = 1/S ( hn + 1/S)
12421243
// initialize recombination rate rho based on SNP density
1243-
fast mu = 0, rho;
1244+
fast mu = 0; //, rho;
12441245

12451246
for (unsigned i = 1; i < hn; i++)
12461247
mu += 1.0 / i;
12471248

1249+
// mu * (mn-1) looks like the Watterson estimator of the population mutation
1250+
// rate theta
1251+
// 0.5 / (posi[mn - 1] - posi[0]) / density looks like a correction for finite
1252+
// sites
12481253
mu = 1 / mu;
1249-
rho = 0.5 * mu * (mn - 1) / (posi[mn - 1] - posi[0]) / density;
1254+
// rho = 0.5 * mu * (mn - 1) / (posi[mn - 1] - posi[0]) / density;
12501255
mu = mu / (hn + mu); // rho is recombination rate? mu is mutation rate
12511256

12521257
// initialzie the site transition matrix tran
@@ -1255,13 +1260,18 @@ void Insti::initialize() {
12551260
// tran is a site's recombination probability matrix
12561261
// r therefore must be a recombination rate estimate
12571262
for (unsigned m = mn - 1; m; m--) {
1258-
posi[m] = (posi[m] - posi[m - 1]) * rho;
1259-
fast r = posi[m] / (posi[m] + hn);
1263+
1264+
// replaced ad-hoc genetic distance estimate by genetic map
1265+
// posi[m] = (posi[m] - posi[m - 1]) * rho;
1266+
// fast r = posi[m] / (posi[m] + hn);
1267+
const float r = m_geneticMap.GeneticDistance(posi[m - 1], posi[m]);
1268+
1269+
// 4 state HMM with three transitions at each position
1270+
// for each position, transition. r= recombination,
1271+
// 1-r= no recombination
12601272
tran[m * 3] = (1 - r) * (1 - r);
12611273
tran[m * 3 + 1] = r * (1 - r);
1262-
tran[m * 3 + 2] = r * r; // for each position, transition. r= alternative,
1263-
// 1-r= refrence? 4 state HMM with three
1264-
// transitions at each position
1274+
tran[m * 3 + 2] = r * r;
12651275
}
12661276

12671277
// initialize site mutation probability matrix
@@ -2196,20 +2206,21 @@ void Insti::document() {
21962206
cerr << "\nhaplotype imputation by cFDSL distribution";
21972207
cerr << "\nAuthor\tYi Wang @ Fuli Yu' Group @ BCM-HGSC";
21982208
cerr << "\n\nusage\timpute [options] 1.bin 2.bin ...";
2199-
cerr << "\n\t-d <density> relative SNP density to Sanger sequencing (1)";
2209+
// cerr << "\n\t-d <density> relative SNP density to Sanger sequencing (1)";
22002210

22012211
// cerr << "\n\t-b <burn> burn-in generations (56)";
22022212
cerr << "\n\t-l <file> list of input files";
22032213
cerr << "\n\t-n <fold> sample size*fold of nested MH sampler "
22042214
"iteration "
22052215
"(2)";
22062216
cerr << "\n\t-o <name>\tPrefix to use for output files";
2207-
22082217
cerr << "\n\t-P <thread> number of threads (0=MAX,default=1)";
22092218
cerr << "\n\t-v <vcf> integrate known genotype in VCF format";
22102219
cerr << "\n\t-c <conf> confidence of known genotype (0.9998)";
22112220
cerr << "\n\t-x <gender> impute x chromosome data";
22122221
cerr << "\n\t-e <file> write log to file";
2222+
cerr << "\n\t-g <file> genetic map (required)";
2223+
22132224
cerr << "\n\n GENERATION OPTIONS";
22142225
cerr << "\n\t-m <mcmc> sampling generations (200)";
22152226
cerr << "\n\t-C <integer> number of cycles to estimate an individual's "

src/insti.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include "bio.hpp"
3636
#include "globals.h"
3737
#include <omp.h>
38+
#include "geneticMap.hpp"
3839

3940
#ifndef NCUDA
4041
#include "hmmLike.hpp"
@@ -107,6 +108,7 @@ class Insti : public Impute {
107108
const std::string m_scaffoldSampleFile; // location of scaffold sample file
108109
double m_scaffoldFreqCutoff; // cutoff MAF for what to fix in scaffold
109110
const bool m_initPhaseFromScaffold;
111+
GeneticMap m_geneticMap;
110112
const InstiHelper::Init m_init;
111113

112114
// holds chrom, start and end position, etc.

src/kNN.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,18 @@ KNN::KNN(unsigned numClust, const std::vector<uint64_t> &haplotypes,
1414
m_numClusters(numClust), m_distMetric(distMetric), m_freqLB(freqLB),
1515
m_freqUB(freqUB), m_usingMAF(usingMAF), m_usingScaffold(true) {
1616

17-
// check bounds
17+
// check bounds
1818
if (m_freqLB < 0)
1919
throw std::range_error("[KNN] freq lower bound is less than 0: " +
2020
to_string(m_freqLB));
21-
if (m_usingMAF)
21+
if (m_usingMAF) {
2222
if (m_freqUB > 0.5)
2323
throw std::range_error("[KNN] freq upper bound is greater than 0.5: " +
2424
to_string(m_freqUB));
2525
else if (freqUB > 1)
2626
throw std::range_error("[KNN] freq upper bound is greater than 0.5: " +
2727
to_string(m_freqUB));
28+
}
2829
if (m_freqLB > m_freqUB)
2930
throw std::range_error("[KNN] freq upper bound(" + to_string(m_freqUB) +
3031
") less than lower bound(" + to_string(m_freqLB) +

src/main.cpp

+25-20
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ int main(int ac, char **av) {
3434

3535
Impute::sn = 200;
3636
Impute::nn = 2;
37-
Impute::density = 1.0;
37+
Impute::density = 1.0; // this is not used anymore
3838
Impute::conf = 0.9998;
3939
Impute::is_x = false;
4040
Impute::is_y = false;
@@ -47,20 +47,21 @@ int main(int ac, char **av) {
4747
bool optMSet = false;
4848
while ((opt = getopt(
4949
ac, av,
50-
"Vd:l:m:n:v:c:x:e:E:p:C:L:H:kK:t:B:i:M:h:s:q:Q:fo:DTr:P:a")) >=
50+
"Vl:m:n:v:c:x:e:E:p:C:L:H:kK:t:B:i:M:h:s:q:Q:fo:DTr:P:ag:")) >=
5151
0) {
5252
switch (opt) {
53-
case 'd':
54-
Impute::density = atof(optarg);
55-
break;
56-
/* case 'b':
57-
Impute::bn = atoi(optarg);
58-
break; */
53+
54+
/* case 'd':
55+
Impute::density = atof(optarg);
56+
break;
57+
case 'b':
58+
Impute::bn = stoul(optarg);
59+
break; */
5960
case 'm':
60-
Impute::sn = atoi(optarg);
61+
Impute::sn = stoul(optarg);
6162
break;
6263
case 'n':
63-
Impute::nn = atoi(optarg);
64+
Impute::nn = stoul(optarg);
6465
break;
6566
case 'v':
6667
Impute::vcf_file.push_back(optarg);
@@ -84,24 +85,25 @@ int main(int ac, char **av) {
8485
sLogFile = optarg;
8586
break;
8687
case 'E':
87-
init.estimator = atoi(optarg);
88+
init.estimator = stoul(optarg);
8889
if (init.estimator > 3) {
8990
cerr << "-E needs to be between 0 and 3" << endl;
9091
Insti::document();
9192
}
9293
break;
94+
9395
case 'D':
9496
Insti::s_MHSamplerType = MHType::DRMH;
9597
break;
9698
case 'p': {
97-
uint uP = atoi(optarg);
99+
uint uP = stoul(optarg);
98100
if (uP < 2)
99101
Insti::document();
100102
Insti::s_uParallelChains = uP;
101103
break;
102104
}
103105
case 'C':
104-
Insti::s_uCycles = atoi(optarg);
106+
Insti::s_uCycles = stoul(optarg);
105107
break;
106108
case 'L':
107109
Insti::s_sRefLegendFile = optarg;
@@ -113,22 +115,22 @@ int main(int ac, char **av) {
113115
Insti::s_bKickStartFromRef = true;
114116
break;
115117
case 'P':
116-
init.numThreads = atoi(optarg);
118+
init.numThreads = stoul(optarg);
117119
break;
118120
case 'K':
119-
Insti::s_uNumClusters = atoi(optarg);
121+
Insti::s_uNumClusters = stoul(optarg);
120122
break;
121123
case 't':
122-
Insti::s_uClusterType = atoi(optarg);
124+
Insti::s_uClusterType = stoul(optarg);
123125
break;
124126
case 'B':
125-
Insti::s_uSABurninGen = atoi(optarg);
127+
Insti::s_uSABurninGen = stoul(optarg);
126128
break;
127129
case 'i':
128-
Insti::s_uNonSABurninGen = atoi(optarg);
130+
Insti::s_uNonSABurninGen = stoul(optarg);
129131
break;
130132
case 'M':
131-
Insti::s_uStartClusterGen = atoi(optarg);
133+
Insti::s_uStartClusterGen = stoul(optarg);
132134
optMSet = true;
133135
break;
134136
case 'h':
@@ -159,7 +161,10 @@ int main(int ac, char **av) {
159161
Insti::s_clusterDistanceMetric = kNNDistT::tracLen;
160162
break;
161163
case 'r':
162-
init.reclusterEveryNGen = atoi(optarg);
164+
init.reclusterEveryNGen = stoul(optarg);
165+
break;
166+
case 'g':
167+
init.geneticMap = optarg;
163168
break;
164169
default:
165170
Insti::document();

0 commit comments

Comments
 (0)