Commit bef88834 authored by Damien Leroux's avatar Damien Leroux
Browse files

Embryo of an R package for the mapping tools.

parent 82a8cd1e
......@@ -137,7 +137,8 @@ endif()
if (${BUILD_FOR_DEPLOYMENT})
add_executable(spell-pedigree ${SPELL_PEDIGREE_SRC} ${libstdcpp})
add_executable(spell-marker ${SPELL_MARKER_SRC})
add_executable(spell-map ${SPELL_MAP_SRC} ${libstdcpp})
add_executable(spell-marker ${SPELL_MARKER_SRC} ${libstdcpp})
add_executable(spell-qtl ${SPELL_QTL_SRC} ${libstdcpp})
add_custom_command(
OUTPUT glibc.h libstdc++.a
......
......@@ -712,27 +712,39 @@ struct rw_base : public rw_any<rw_base> {
/** **/
typedef union _gamete_lv_value_type {
Eigen::Vector4d binary;
Eigen::Vector2d unary;
static _gamete_lv_value_type unknown_binary() { static _gamete_lv_value_type ret = {(Eigen::Vector4d() << .25, .25, .25, .25).finished()}; return ret; }
static _gamete_lv_value_type unknown_unary() { static _gamete_lv_value_type ret = {(Eigen::Vector4d() << .5, .5, 0, 0).finished()}; return ret; }
_gamete_lv_value_type&
operator = (const _gamete_lv_value_type& other)
{
binary = other.binary;
return *this;
}
template <typename STREAM_TYPE>
void
file_io(STREAM_TYPE& fs)
{
rw_base rw;
rw(fs, binary);
}
_gamete_lv_value_type() : binary() {}
_gamete_lv_value_type(const _gamete_lv_value_type& other) : binary(other.binary) {}
_gamete_lv_value_type(const Eigen::Vector2d& v) { unary = v; }
_gamete_lv_value_type(const Eigen::Vector4d& v) { binary = v; }
friend std::ostream& operator << (std::ostream& os, const _gamete_lv_value_type& glv) { return os << glv.binary.transpose(); }
} gamete_lv_value_type;
struct gamete_LV_type {
bool is_single;
typedef union _gamete_lv_value_type {
Eigen::Vector2d unary;
Eigen::Vector4d binary;
template <typename STREAM_TYPE>
void
file_io(STREAM_TYPE& fs)
{
rw_base rw;
rw(fs, binary);
}
_gamete_lv_value_type() : binary() {}
_gamete_lv_value_type(const _gamete_lv_value_type& other) : binary(other.binary) {}
_gamete_lv_value_type(const Eigen::Vector2d& v) { unary = v; }
_gamete_lv_value_type(const Eigen::Vector4d& v) { binary = v; }
friend std::ostream& operator << (std::ostream& os, const _gamete_lv_value_type& glv) { return os << glv.binary.transpose(); }
} gamete_lv_value_type;
std::map<std::string, gamete_lv_value_type> lv;
friend std::ostream& operator << (std::ostream& os, const gamete_LV_type& glv) { return os << (glv.is_single ? "Unary{" : "Binary{") << glv.lv << '}'; }
......@@ -763,17 +775,186 @@ struct gamete_LV_type {
void add(const std::string& key, const T& value) { add_normalized(key, value); }
template <typename STREAM_TYPE>
void
file_io(STREAM_TYPE& fs)
{
rw_base rw;
if (rw.fourcc(fs, "GLVT")) { return; }
rw(fs, is_single);
rw(fs, lv);
void
file_io(STREAM_TYPE& fs)
{
rw_base rw;
if (rw.fourcc(fs, "GLVT")) { return; }
rw(fs, is_single);
rw(fs, lv);
}
};
#if 0
struct em_dist_opt : public Eigen::Functor<double> {
em_dist_opt() : Eigen::Functor<double>(1, 5) {}
int
operator() (const VectorXd& b, VectorXd& fvec) const
{
double r = b(0);
double r2 = r * r;
fvec <<
(r2 - r - r + 1),
(r2 - r),
(r2),
(r),
(1 - r);
return 0;
}
int
df(const VectorXd& b, VectorXd& fjac) const
{
double r = b(0);
double _2r = r + r;
fjac <<
(_2r - 2),
(_2r - 1),
(_2r),
(1),
(-1);
return 0;
}
};
#endif
struct EM_helpers {
# define sqrt5 2.236067977499789696
struct max_return_type {
double a, b, fa, fb;
};
template <typename F>
max_return_type
find_max(F&& f, double a0, double b0)
{
# define interval_too_small(_x, _y) ((_y - _x) < tolerance)
static const double lambda = 0.5 * (sqrt5 - 1.0);
static const double mu = 0.5 * (3.0 - sqrt5); // = 1 - lambda
static const double tolerance = 3.e-8;
double x1;
double x2;
double fx1;
double fx2;
max_return_type ret = {a0, b0, f(a0), f(b0)};
// Find first two internal points and evaluate
// the function at the two internal points.
x1 = ret.b - lambda * (ret.b - ret.a);
x2 = ret.a + lambda * (ret.b - ret.a);
fx1 = f(x1);
fx2 = f(x2);
// Verify that the tolerance is an acceptable number
// if (tolerance <= 0.0) tolerance = sqrt(DBL_EPSILON) * (ret.b - ret.a);
// Loop by exluding segments from current endpoints a, b
// to current internal points x1, x2 and then calculating
// a new internal point until the length of the interval
// is less than or equal to the tolerance.
while (!interval_too_small(ret.a, ret.b)) {
if (fx1 < fx2) {
ret.a = x1;
ret.fa = fx1;
if (interval_too_small(ret.a, ret.b)) break;
x1 = x2;
fx1 = fx2;
x2 = ret.b - mu * (ret.b - ret.a);
fx2 = f(x2);
} else {
ret.b = x2;
ret.fb = fx2;
if (interval_too_small(ret.a, ret.b)) break;
x2 = x1;
fx2 = fx1;
x1 = ret.a + mu * (ret.b - ret.a);
fx1 = f(x1);
}
}
return ret;
}
static
const Eigen::Matrix2d&
S_kernel(const Eigen::Vector2d&)
{
static Eigen::Matrix2d kernel = (Eigen::Matrix2d() << 1, 0, 0, 1).finished();
return kernel;
}
static
const Eigen::Matrix4d&
S_kernel(const Eigen::Vector4d&)
{
static Eigen::Matrix4d kernel = (Eigen::Matrix4d() << 2, 1, 1, 0,
1, 2, 0, 1,
1, 0, 2, 1,
0, 1, 1, 2).finished();
return kernel;
}
template <typename VTYPE>
static
double
compute_S(VTYPE&& a, VTYPE&& b)
{
const auto& kernel = S_kernel(a);
double ret = a.transpose() * kernel * b;
scoped_indent _("[compute_S] ");
MSG_DEBUG("a = " << a.transpose());
MSG_DEBUG("b = " << b.transpose());
MSG_DEBUG("kernel" << std::endl << kernel);
MSG_DEBUG("ret = " << ret);
return ret;
}
/*
static
double
compute_S(const std::vector<gamete_LV_type>& v1, const std::vector<gamete_LV_type>& v2)
{
size_t n_mei = 0;
auto i1 = v1.begin(), i2 = v2.begin(), j = v1.end();
double s = 0;
// should assert that v1.size() == v2.size()
for (; i1 != j; ++i1, ++i2) {
n_mei += 2 - i1->is_single;
s += compute_S(*i1, *i2);
}
return s / n_mei;
}
//*/
static
double
S_to_dist(double s)
{
scoped_indent _("[S_to_dist] ");
#define EM_S_MIN .499999
#define EM_S_MAX .999999
MSG_DEBUG("s = " << s);
if (s > EM_S_MAX) {
s = EM_S_MAX;
} else if (s < EM_S_MIN) {
s = EM_S_MIN;
}
MSG_DEBUG("s = " << s);
return -.5 * log(s + s - 1);
}
};
struct gamete_LV_database {
struct gamete_LV_database : public EM_helpers {
/* IND.ped / MARK => 1 or 2 gametes (.second will be empty when cross type is DH) */
std::map<int, gamete_LV_type> data;
std::map<double, Eigen::Matrix2d> cache2;
......@@ -784,27 +965,39 @@ struct gamete_LV_database {
gamete_LV_database() : data(), cache2(), cache4() {}
Eigen::Matrix2d
get_TR_unary(double dist)
get_TR_unary(double dist_or_r, bool use_r=false)
{
auto it = cache2.find(dist);
double r, s;
if (use_r) {
r = dist_or_r;
s = 1. - r;
} else {
s = .5 + exp(-2. * dist_or_r) * .5;
r = 1. - s;
}
auto it = cache2.find(r);
if (it == cache2.end()) {
double s = .5 + exp(-2. * dist) * .5;
double r = 1. - s;
Eigen::Matrix2d ret;
ret << s, r, r, s;
cache2[dist] = ret;
cache2[r] = ret;
return ret;
}
return it->second;
}
Eigen::Matrix4d
get_TR_binary(double dist)
get_TR_binary(double dist_or_r, bool use_r=false)
{
auto it = cache4.find(dist);
double r, s;
if (use_r) {
r = dist_or_r;
s = 1. - r;
} else {
s = .5 + exp(-2. * dist_or_r) * .5;
r = 1. - s;
}
auto it = cache4.find(r);
if (it == cache4.end()) {
double s = .5 + exp(-2. * dist) * .5;
double r = 1. - s;
double rs = r * s;
double r2 = r * r;
double s2 = s * s;
......@@ -814,7 +1007,7 @@ struct gamete_LV_database {
rs, s2, r2, rs,
rs, r2, s2, rs,
r2, rs, rs, s2;
cache4[dist] = ret;
cache4[r] = ret;
return ret;
}
return it->second;
......@@ -834,24 +1027,8 @@ struct gamete_LV_database {
MSG_ERROR("Individual #" << ind << " was first declared as " << (it->second.is_single ? "intercross" : "doubled haploid") << " and is now requested to be the other kind.", "");
}
}
data[ind].add(mark, (lv.array() > 0).select(VectorXd::Ones(lv.size()), VectorXd::Zero(lv.size())).matrix());
// data[ind].add(mark, lv * (lv.array() > 0).cast<double>().sum());
// data[ind].add(mark, lv);
#if 0
Eigen::Vector2d g1, g2;
MSG_DEBUG("add_gametes mark=" << mark << " ind=" << ind << " lv=" << lv.transpose() << " dh=" << std::boolalpha << dh);
if (!dh) {
g1(0) = lv(0) + lv(1);
g1(1) = lv(2) + lv(3);
g2(0) = lv(0) + lv(2);
g2(1) = lv(1) + lv(3);
} else {
g1 = lv;
g2(0) = g2(1) = 0;
}
data[ind][mark] = g1;
data[-ind][mark] = g2;
#endif
// data[ind].add(mark, (lv.array() > 0).select(VectorXd::Ones(lv.size()), VectorXd::Zero(lv.size())).matrix());
data[ind].add(mark, lv);
}
void init_tr(bool is_single, const std::vector<double>& distances)
......@@ -876,7 +1053,7 @@ struct gamete_LV_database {
}
}
}
template <typename V, typename M>
double gamete_likelihood_impl(std::vector<M>& TR, V& accum, const std::vector<std::string>& marker_names, const gamete_LV_type& gam)
{
......@@ -894,7 +1071,7 @@ struct gamete_LV_database {
MSG_DEBUG("final accum = " << accum.transpose());
return accum.sum();
}
double gamete_likelihood(int ind, const gamete_LV_type& gam, const std::vector<std::string>& marker_names, const std::vector<double>& distances)
{
init_tr(gam.is_single, distances);
......@@ -919,51 +1096,323 @@ struct gamete_LV_database {
ret += log(gamete_likelihood(kv.first, kv.second, marker_names, distances));
}
return ret;
#if 0
// implicitly, we just garanteed there is at least one marker name in the list
Eigen::Vector2d all_accum;
all_accum(0) = all_accum(1) = 0;
std::vector<Eigen::Matrix2d> TR;
TR.reserve(distances.size());
for (double d: distances) {
TR.push_back(get_TR(d));
}
double ret = 0;
MSG_DEBUG("DATA" << std::endl << data);
for (auto& kv: data) {
// int ind = kv.first;
MSG_DEBUG("ON GAMETE #" << kv.first);
auto& LV = kv.second;
auto name_i = marker_names.begin();
auto& gam0 = LV[*name_i++];
Eigen::Vector2d accum = gam0;
// accum << .5, .5;
// std::vector<double> norms;
// double norm_accum = 0;
// norms.reserve(distances.size());
// norms.push_back(1.);
// norm_accum -= log(norms.back());
for (size_t i = 0; i < TR.size(); ++i, ++name_i) {
// MSG_DEBUG("accum = " << accum.transpose());
// MSG_DEBUG("TR[" << i << ']' << std::endl << TR[i]);
MSG_DEBUG("accum = " << accum.transpose() << " LV[" << (*name_i) << "] = " << LV[*name_i].transpose());
accum = accum.transpose() * TR[i] * LV[*name_i].asDiagonal();
// norms.push_back(accum.lpNorm<2>());
// if (norms.back() == 0) {
// MSG_ERROR("Have a zero vector, aborting.", "");
// return NAN;
}
std::vector<double> em_distances;
std::vector<double> em_old_distances;
std::vector<Eigen::Matrix2d> em_TR2;
std::vector<Eigen::Matrix4d> em_TR4;
bool em_have_single = false;
bool em_have_double = false;
std::vector<std::vector<gamete_lv_value_type>> em_state_per_mark_per_ind;
std::vector<std::vector<gamete_lv_value_type>> em_forward, em_backward, em_obs;
double em_likelihood;
std::vector<bool> em_chain_is_single;
void
EM_init(const std::vector<std::string>& marker_names)
{
em_distances.clear();
em_distances.resize(marker_names.size() - 1, .5);
em_old_distances.clear();
em_old_distances.resize(marker_names.size() - 1, 0);
em_chain_is_single.clear();
em_chain_is_single.reserve(data.size());
em_state_per_mark_per_ind.resize(marker_names.size());
auto spmpi = em_state_per_mark_per_ind.begin();
em_forward.clear();
em_forward.resize(marker_names.size());
em_backward.clear();
em_backward.resize(marker_names.size());
auto forwardi = em_forward.begin();
auto backwardi = em_backward.begin();
MSG_DEBUG("forward " << em_forward);
MSG_DEBUG("backward " << em_backward);
// size_t i = 0;
for (const auto& m: marker_names) {
(spmpi++)->resize(data.size());
(forwardi++)->resize(data.size(), gamete_lv_value_type::unknown_binary());
(backwardi++)->resize(data.size(), gamete_lv_value_type::unknown_binary());
}
MSG_DEBUG("data");
MSG_DEBUG(data);
for (const auto& kv: data) { // all markers for one individual
em_chain_is_single.push_back(kv.second.is_single);
em_have_single |= kv.second.is_single;
em_have_double |= !kv.second.is_single;
// auto mi = marker_names.begin();
// MSG_DEBUG("marker_names " << marker_names);
// if (em_chain_is_single.back()) {
// for (auto& vec: *spmpi) {
// vec.unary = kv.second.lv.find(*mi++)->second.unary;
// }
// accum.array() /= norms.back();
// norm_accum += log(norms.back());
// } else {
// for (auto& vec: *spmpi) {
// vec.binary = kv.second.lv.find(*mi++)->second.binary;
// }
// }
// ++i;
}
auto di = data.begin();
for (size_t i = 0; i < data.size(); ++i, ++di) {
size_t mi = 0;
for (const auto& m: marker_names) {
em_state_per_mark_per_ind[mi][i].binary = di->second.lv.find(m)->second.binary;
++mi;
}
}
if (em_have_double) {
em_TR4.resize(em_distances.size());
}
if (em_have_single) {
em_TR2.resize(em_distances.size());
}
em_obs = em_state_per_mark_per_ind;
MSG_DEBUG("data");
MSG_DEBUG(data);
MSG_DEBUG("em_state_per_mark_per_ind");
MSG_DEBUG(em_state_per_mark_per_ind);
MSG_DEBUG("Have single? " << std::boolalpha << em_have_single);
MSG_DEBUG("Have double? " << std::boolalpha << em_have_double);
/*EM_update_distances();*/
}
void
EM_init_TR()
{
if (em_have_double) {
for (size_t i = 0; i < em_distances.size(); ++i) {
em_TR4[i] = get_TR_binary(em_distances[i], true);
}
}
if (em_have_single) {
for (size_t i = 0; i < em_distances.size(); ++i) {
em_TR2[i] = get_TR_unary(em_distances[i], true);
}
MSG_DEBUG("final accum = " << accum.transpose());
// ret += norm_accum + log(accum.lpNorm<1>());
ret += log(accum.lpNorm<1>());
}
}
template <typename TYPE>
double
norm(TYPE&& m_or_v, double& sum)
{
sum = m_or_v.sum();
if (sum > 0) {
sum = 1. / sum;
m_or_v *= sum;
return sum;
}
return std::numeric_limits<double>::signaling_NaN();
}
template <typename TYPE>
double
norm(TYPE&& m_or_v)
{
double sum;
return norm(m_or_v, sum);
}
template <typename VTYPE>
typename std::decay<VTYPE>::type
mult_and_norm(VTYPE&& a, VTYPE&& b)
{
typename std::decay<VTYPE>::type ret = (a.array() * b.array()).matrix();
norm(ret);
return ret;
#endif
}
template <typename VTYPE>
typename std::decay<VTYPE>::type
mult_and_norm(VTYPE&& a, VTYPE&& b, VTYPE&& c)
{
typename std::decay<VTYPE>::type ret = (a.array() * b.array() * c.array()).matrix();
norm(ret);
return ret;
}
void
EM_update_gamete_prob()
{
scoped_indent _("[EM update state] ");
EM_init_TR();
Eigen::Vector4d v4;
Eigen::Vector2d v2;
for (size_t i = 0; i < em_chain_is_single.size(); ++i) {
if (em_chain_is_single[i]) {
size_t m = 0;
em_forward[m][i].unary = Eigen::Vector2d::Constant(.5);
// forward.
for (; m < em_distances.size(); ++m) {
double norm_accum;
em_forward[m + 1][i].unary = em_TR2[m] * mult_and_norm(em_forward[m][i].unary, em_obs[m][i].unary);
norm(em_forward[m + 1][i].unary, norm_accum);
}
// backward.
em_backward[m][i].unary = Eigen::Vector2d::Constant(.5);
for (; m > 0; --m) {
em_backward[m - 1][i].unary = em_TR2[m - 1] * mult_and_norm(em_backward[m][i].unary, em_obs[m][i].unary);
norm(em_backward[m - 1][i].unary);
}
// forward * backward * obs then normalize.
for (; m < em_state_per_mark_per_ind.size(); ++m) {
em_state_per_mark_per_ind[m][i].unary = mult_and_norm(em_forward[m][i].unary, em_backward[m][i].unary, em_obs[m][i].unary);
}
} else {
size_t m = 0;
em_forward[m][i].binary = Eigen::Vector4d::Constant(.25);
// forward.
for (; m < em_distances.size(); ++m) {
em_forward[m + 1][i].binary = em_TR4[m] * mult_and_norm(em_forward[m][i].binary, em_obs[m][i].binary);
norm(em_forward[m + 1][i].binary);
}
// backward.
em_backward[m][i].binary = Eigen::Vector4d::Constant(.25);
for (; m > 0; --m) {
em_backward[m - 1][i].binary = em_TR4[m - 1] * mult_and_norm(em_backward[m][i].binary, em_obs[m][i].binary);
norm(em_backward[m - 1][i].binary);
}
// forward * backward * obs then normalize.
for (; m < em_state_per_mark_per_ind.size(); ++m) {
em_state_per_mark_per_ind[m][i].binary = mult_and_norm(em_forward[m][i].binary, em_backward[m][i].binary, em_obs[m][i].binary);
}
}
}
MSG_DEBUG("em_forward");
MSG_DEBUG(em_forward);
MSG_DEBUG("em_backward");
MSG_DEBUG(em_backward);
MSG_DEBUG("em_obs");
MSG_DEBUG(em_obs);
MSG_DEBUG("em_state_per_mark_per_ind");
MSG_DEBUG(em_state_per_mark_per_ind);
}