Commit a0c76b0a authored by Damien Leroux's avatar Damien Leroux
Browse files

Now emitting pop_data_type in output file.

parent cf1df0c4
#ifndef _SPELL_BAYES_BN_H_
#define _SPELL_BAYES_BN_H_
#include <map>
#include <vector>
#include <string>
#include <iostream>
#include <iomanip>
#include <fstream>
#include "error.h"
#include "chrono.h"
#include "generation_rs_fwd.h"
#include "input/read_mark.h"
#include "pedigree.h"
#include "bayes.h"
#include "bayes/factor_var2.h"
#include "input/observations.h"
struct pedigree_bayesian_network {
bayesian_network bn;
std::map<std::string, std::vector<size_t>> evidence_by_gen_name;
std::map<std::string, generation_rs*> generations;
std::map<const generation_rs*, SparseMatrix<double>> geno_to_state;
std::map<std::string, std::vector<int>> families;
std::map<size_t, const generation_rs*> gen_by_id;
std::vector<pedigree_item> pedigree;
std::map<int, size_t> vars;
struct vtsh_item {
size_t var;
std::vector<size_t> parents;
/*size_t p1;*/
/*size_t p2;*/
const generation_rs* gen;
vtsh_item(size_t v, const std::vector<size_t>& p, const generation_rs* g) : var(v), parents(p), gen(g) {}
};
std::vector<vtsh_item> var_to_state_helper;
double tol;
pedigree_bayesian_network(size_t n_parents, size_t n_alleles, double noise, double tolerance)
: bn(n_parents, n_alleles, noise)
, evidence_by_gen_name()
, generations()
, geno_to_state()
, families()
, pedigree()
, var_to_state_helper()
, tol(tolerance)
{}
std::map<std::string, std::vector<VectorXd>>
run(std::function<VectorXd(const std::string&, size_t)> get_obs, size_t verbosity=0) const
{
auto comp = bn.instance();
for (const auto& gv: evidence_by_gen_name) {
for (size_t i = 0; i < gv.second.size(); ++i) {
comp.evidence(gv.second[i]) = get_obs(gv.first, i);
}
}
chrono::start("LoopyBP");
comp.run(tol, verbosity);
chrono::stop("LoopyBP");
std::map<std::string, std::vector<VectorXd>> ret;
#if 1
chrono::start("Geno->State");
std::map<size_t, VectorXd> state_vectors;
/*std::map<const generation_rs*, SparseMatrix<double>> lincombs;*/
/*for (const auto& kv: generations) {*/
/*VectorLC lc = kv.second->design->lincomb();*/
/*SparseMatrix<double> mat(*/
/*lincombs*/
/*}*/
std::map<const generation_rs*, VectorLC> lincombs;
for (const auto& kv: generations) {
lincombs[kv.second] = kv.second->this_lincomb;
}
state_vectors[0] = VectorXd::Ones(1);
for (const auto& vts: var_to_state_helper) {
/*VectorLC lc = vts.gen->design->lincomb();*/
/*auto par = vts.gen->design->get_parents();*/
const auto& lc = lincombs[vts.gen];
VectorXd v1(lc.size());
VectorXd v2(lc.size());
/*VectorXd p1 = state_vectors[vts.p1], p2 = state_vectors[vts.p2];*/
std::map<size_t, VectorXd> pmap;
/* FIXME: not always 2 parents */
if (lc.size() == 1 && vts.parents[0] == 0 && vts.parents[1] == 0) {
/* ancestor! */
v1(0) = 1.;
} else {
for (size_t i = 0; i < vts.parents.size(); ++i) {
pmap[i] = state_vectors[vts.parents[i]];
MSG_DEBUG("PMAP[" << i << "] = #" << vts.parents[i] << " : " << pmap[i]);
}
for (int i = 0; i < lc.size(); ++i) {
/* FIXME the parent key should be {gen, #id} or #id. NOT gen only. */
/*v1(i) = lc(i, 0).apply({{par.first, p1}, {par.second, p2}});*/
v1(i) = lc(i, 0).apply(pmap);
}
}
/*v1 = (v1.array() == 0).select(VectorXd::Zero(v1.size()), VectorXd::Ones(v1.size()));*/
const SparseMatrix<double>& g2s = geno_to_state.find(vts.gen)->second;
VectorXd b = comp.parental_origin_belief(vts.var);
VectorXd bp = g2s.transpose() * v1;
VectorXd norm2 = (b.array() / (bp.array() == 0).select(VectorXd::Ones(bp.size()), bp).array()).matrix();
v2 = g2s * norm2;
/*v2 = g2s * b;*/
/*VectorXd b01 = (b.array() == 0).select(VectorXd::Zero(b.size()), VectorXd::Ones(b.size()));*/
/*VectorXd bs = (g2s.array().rowwise() * b01.array().transpose()).array().colwise().sum();*/
/*VectorXd norm2 = g2s * bs;*/
/*v2.array() /= (norm2.array() == 0).select(VectorXd::Ones(norm2.size()), norm2).array();*/
VectorXd tmp = (v1.array() * v2.array()).matrix();
double s = tmp.sum();
state_vectors[vts.var] = tmp / (s ? s : 1.);
/*MSG_DEBUG(vts.gen->name << ':' << vts.var);*/
/*MSG_DEBUG("p1 " << p1.transpose());*/
/*MSG_DEBUG("p2 " << p2.transpose());*/
/*MSG_DEBUG("v1 " << v1.transpose());*/
/*MSG_DEBUG("b " << b.transpose());*/
/*MSG_DEBUG("bp " << bp.transpose());*/
/*MSG_DEBUG("norm2 " << norm2.transpose());*/
/*MSG_DEBUG("v2 " << v2.transpose());*/
/*MSG_DEBUG("STATE_VEC " << vts.var << " " << state_vectors[vts.var].transpose());*/
}
std::map<std::string, size_t> sizes;
for (const auto& pi: pedigree) {
++sizes[pi.gen_name];
}
for (const auto& kv: sizes) {
ret[kv.first].reserve(kv.second);
}
for (const auto& pi: pedigree) {
ret[pi.gen_name].emplace_back(state_vectors[vars.find(pi.id)->second]);
}
chrono::stop("Geno->State");
#endif
return ret;
}
};
pedigree_bayesian_network
make_bn(const std::vector<pedigree_item>& pedigree, const std::map<std::string, ObservationDomain>& obs_gen, /*const std::string& query_gen,*/ size_t n_alleles=1, double obs_noise=0, double tolerance=1.e-10);
#endif
......@@ -6,6 +6,7 @@
#include <vector>
#include "generation_rs.h"
#include "commandline.h"
#include "bn.h"
enum JobDispatchScheme { JDS_None, JDS_MT, JDS_SGE, JDS_SSH };
......@@ -27,14 +28,25 @@ struct bn_settings_t {
std::string job_name;
std::string fifo_path;
/* Configuration */
std::string pop_name;
std::string qtl_generation_name;
std::string work_directory;
std::vector<chromosome> map;
std::string map_filename;
std::map<std::string, marker_observation_spec> marker_observation_specs;
std::vector<std::string> marker_observation_specs_filenames;
std::map<std::string, population_marker_observation> observed_mark;
double noise;
double tolerance;
/* Inputs */
std::string pedigree_filename;
std::string design_filename;
/* BN */
pedigree_bayesian_network* bn;
/* Management */
std::map<std::string, std::map<char, VectorXd>> compiled_obs_specs;
std::vector<std::string> marker_names;
std::map<std::string, size_t> marker_index;
bn_settings_t()
: scheme(JDS_None)
......@@ -46,13 +58,23 @@ struct bn_settings_t {
, job_end(SPELL_CASTER)
, job_name()
, fifo_path()
, pop_name()
, qtl_generation_name()
, work_directory(".")
, map()
, map_filename()
, marker_observation_specs()
, marker_observation_specs_filenames()
, observed_mark()
, noise(0.)
, tolerance(1.e-10)
, pedigree_filename()
, design_filename()
, bn(NULL)
, compiled_obs_specs()
, marker_names()
, marker_index()
{}
size_t count_markers() const
......
#ifndef _SPELL_BAYES_DISPATCH_H_
#define _SPELL_BAYES_DISPATCH_H_
#include "bayes/cli.h"
typedef std::function<bool(const bn_settings_t*, size_t)> job_type;
typedef std::function<size_t(const bn_settings_t*)> count_jobs_type;
extern std::map<std::string, std::pair<count_jobs_type, job_type>> job_registry;
void do_the_job(bn_settings_t* settings, std::string job);
#endif
......@@ -761,6 +761,13 @@ struct bayesian_network {
bool run(double threshold, size_t verbose=0)
{
if (verbose >= 1) { MSG_DEBUG("Running " << (m_mode == PM_Async ? "a" : "") << "synchronous convergence"); }
if (verbose >= 2) {
if (verbose >= 3) {
MSG_DEBUG("MESSAGES");
dump_messages();
dump_beliefs();
}
}
m_exact = true;
m_inexact_messages.clear();
m_inexact_var_message_counters.clear();
......@@ -1281,8 +1288,8 @@ void new_self_prob2(size_t n_par, size_t n_al, SparseMatrix<double>& M1, SparseM
{
size_t dom = n_par * n_par * n_al * n_al;
M1.resize(dom, dom * dom);
M2.resize(dom, dom * dom);
M1.resize(dom, dom);
M2.resize(dom, dom);
std::vector<Eigen::Triplet<double>> t1, t2, t3;
t1.reserve(dom * dom);
......
#ifndef _SPELL_BAYES_OUTPUT_H_
#define _SPELL_BAYES_OUTPUT_H_
#include <map>
#include <vector>
#include <string>
#include <iostream>
#include <fstream>
#include <cstring>
#include "eigen.h"
#include "error.h"
#include "generation_rs_fwd.h"
/** FOURCC **/
inline
bool check_fourcc(std::ifstream& ifs, const char* fourcc)
{
char buf[4] = {0, 0, 0, 0};
ifs.read(buf, 4);
return strncmp(fourcc, buf, 4);
}
inline
void write_fourcc(std::ofstream& ofs, const char* fourcc)
{
ofs.write(fourcc, 4);
}
/** SIZE_T **/
inline
void write_size(std::ofstream& ofs, size_t sz)
{
ofs.write((const char*) &sz, sizeof sz);
}
inline
size_t read_size(std::ifstream& ifs)
{
size_t ret;
ifs.read((char*) &ret, sizeof ret);
return ret;
}
/** STRING **/
inline
std::string read_str(std::ifstream& ifs)
{
size_t sz = read_size(ifs);
std::vector<char> tmp(sz);
ifs.read(&tmp[0], sz);
return {tmp.begin(), tmp.end()};
}
inline
void write_str(std::ofstream& ofs, const std::string& s)
{
write_size(ofs, s.size());
ofs.write(s.c_str(), s.size());
}
/** DOUBLE **/
inline
void write_double(std::ofstream& ofs, double sz)
{
ofs.write((const char*) &sz, sizeof sz);
}
inline
double read_double(std::ifstream& ifs)
{
double ret;
ifs.read((char*) &ret, sizeof ret);
return ret;
}
/** INT **/
inline
void write_int(std::ofstream& ofs, int sz)
{
ofs.write((const char*) &sz, sizeof sz);
}
inline
int read_int(std::ifstream& ifs)
{
int ret;
ifs.read((char*) &ret, sizeof ret);
return ret;
}
/** FAST_POLYNOM **/
inline
void write_fast_polynom(std::ofstream& ofs, const fast_polynom& fp)
{
impl::f_polynom f = fp;
write_int(ofs, fp.value);
write_int(ofs, f.r_exp);
write_int(ofs, f.s_exp);
write_size(ofs, f.P.size());
for (double c: f.P) {
write_double(ofs, c);
}
}
inline
fast_polynom read_fast_polynom(std::ifstream& ifs, int& original_key)
{
original_key = read_int(ifs);
impl::f_polynom ret = fast_polynom::zero;
ret.r_exp = read_int(ifs);
ret.s_exp = read_int(ifs);
size_t sz = read_size(ifs);
ret.P.resize(sz);
for (size_t i = 0; i < sz; ++i) {
ret.P[i] = read_double(ifs);
}
return ret;
}
/** ALGREBRAIC GENOTYPE **/
inline void write_algebraic_genotype(std::ofstream& ofs, const algebraic_genotype& ag)
{
ofs.write((const char*) &ag.genotype, sizeof ag.genotype);
ofs.write((const char*) &ag.type, sizeof ag.type);
write_int(ofs, ag.poly.value);
}
inline algebraic_genotype read_algebraic_genotype(std::ifstream& ifs, const std::map<int, fast_polynom>& pt)
{
algebraic_genotype ag;
ifs.read((char*) &ag.genotype, sizeof ag.genotype);
ifs.read((char*) &ag.type, sizeof ag.type);
ag.poly = pt.find(read_int(ifs))->second;
return ag;
}
/** GENOMATRIX **/
inline void write_genomatrix(std::ofstream& ofs, const GenoMatrix& mat)
{
write_fourcc(ofs, "SGEM");
/*std::map<decltype(fast_polynom::value), impl::f_polynom>*/
std::set<fast_polynom> poly_table;
for (int j = 0; j < mat.cols(); ++j) {
for (int i = 0; i < mat.rows(); ++i) {
poly_table.insert(mat(i, j).poly);
}
}
write_size(ofs, poly_table.size());
for (const auto& fp: poly_table) {
write_fast_polynom(ofs, fp);
}
write_int(ofs, mat.cols());
write_int(ofs, mat.rows());
/*MSG_DEBUG("[write_genomatrix] cols=" << mat.cols() << " rows=" << mat.rows());*/
for (int j = 0; j < mat.cols(); ++j) {
for (int i = 0; i < mat.rows(); ++i) {
/*write_int(ofs, mat(i, j).value);*/
write_algebraic_genotype(ofs, mat(i, j));
}
}
}
inline
void read_genomatrix(std::ifstream& ifs, GenoMatrix& mat)
{
if (check_fourcc(ifs, "SGEM")) {
MSG_ERROR("File is not valid or has been corrupted", "");
return;
}
std::map<int, fast_polynom> poly_map;
size_t table_size = read_size(ifs);
int key;
for (size_t i = 0; i < table_size; ++i) {
auto f = read_fast_polynom(ifs, key);
poly_map[key] = f;
}
int cols = read_int(ifs);
int rows = read_int(ifs);
/*MSG_DEBUG("[read_genomatrix] cols=" << cols << " rows=" << rows);*/
mat.resize(rows, cols);
for (int j = 0; j < mat.cols(); ++j) {
for (int i = 0; i < mat.rows(); ++i) {
mat(i, j) = read_algebraic_genotype(ifs, poly_map);
}
}
}
/** GENERATION_RS **/
inline void write_generation_rs(std::ofstream& ofs, const generation_rs* gen)
{
write_fourcc(ofs, "SGRS");
write_str(ofs, gen->name);
write_size(ofs, gen->P.size());
for (const auto& p: gen->P) {
write_double(ofs, p.weight);
write_genomatrix(ofs, p.G.data);
}
}
inline
generation_rs* read_generation_rs(std::ifstream& ifs)
{
if (check_fourcc(ifs, "SGRS")) {
MSG_ERROR("File is not valid or has been corrupted", "");
}
/*MSG_DEBUG("pouet 1"); MSG_QUEUE_FLUSH();*/
std::string name = read_str(ifs);
/*MSG_DEBUG("pouet 2"); MSG_QUEUE_FLUSH();*/
generation_rs* ret = generation_rs::blank(name);
/*MSG_DEBUG("pouet 3"); MSG_QUEUE_FLUSH();*/
size_t n_p = read_size(ifs);
/*MSG_DEBUG("Have " << n_p << " processes"); MSG_QUEUE_FLUSH();*/
ret->P.resize(n_p);
for (size_t i = 0; i < n_p; ++i) {
ret->P[i].weight = read_double(ifs);
GenoMatrix tmp;
read_genomatrix(ifs, tmp);
ret->P[i].G = convert(tmp);
/*MSG_DEBUG("Read process");*/
/*MSG_DEBUG(ret->P[i]);*/
}
ret->precompute();
return ret;
}
/** MATRIX<SCALAR, R, C> **/
template <typename MATRIX_TYPE>
struct resize_matrix_impl;
template <> struct resize_matrix_impl<VectorXd> { void operator () (VectorXd& v, size_t rows, size_t) { v.resize(rows); } };
template <> struct resize_matrix_impl<MatrixXd> { void operator () (MatrixXd& m, size_t rows, size_t cols) { m.resize(rows, cols); } };
template <typename MATRIX_TYPE>
void resize_matrix(MATRIX_TYPE& m, size_t r, size_t c) { resize_matrix_impl<MATRIX_TYPE>()(m, r, c); }
template <typename SCALAR, int ROW, int COL, int C, int D, int E>
void read_matrix(std::ifstream& ifs, Eigen::Matrix<SCALAR, ROW, COL, C, D, E>& mat)
{
size_t scalar_sz = read_size(ifs);
if (scalar_sz != sizeof(SCALAR)) {
MSG_ERROR("WRONG SIZE OF SCALAR, CAN'T READ FILE", "Make sure spell-marker and spell-qtl are always executed on machines with same word size.");
}
size_t n_row = read_size(ifs);
size_t n_col = read_size(ifs);
if (ROW != Eigen::Dynamic && ((int) n_row) != ROW) {
MSG_ERROR("WRONG ROW COUNT. FILE IS NOT A LOCUS VECTOR FILE OR IS CORRUPTED", "You may want to run spell-marker again");
}
if (COL != Eigen::Dynamic && ((int) n_col) != COL) {
MSG_ERROR("WRONG COLUMN COUNT. FILE IS NOT A LOCUS VECTOR FILE OR IS CORRUPTED", "You may want to run spell-marker again");
}
resize_matrix(mat, n_row, n_col);
ifs.read((char*) mat.data(), n_row * n_col * sizeof(SCALAR));
}
template <typename SCALAR, int ROW, int COL, int C, int D, int E>
void write_matrix(std::ofstream& ofs, const Eigen::Matrix<SCALAR, ROW, COL, C, D, E>& mat)
{
write_size(ofs, sizeof(SCALAR));
write_size(ofs, mat.rows());
write_size(ofs, mat.cols());
ofs.write((const char*) mat.data(), mat.size() * sizeof(SCALAR));
}
/** **/
struct LV_database {
std::map<std::string, std::map<std::string, std::vector<MatrixXd>>> data;
LV_database() : data() {}
MatrixXd& operator () (const std::string& chr, const std::string& gen, size_t ind)
{
return data[chr][gen][ind];
}
const MatrixXd& operator () (const std::string& chr, const std::string& gen, size_t ind) const
{
return data.find(chr)->second.find(gen)->second[ind];
}
static
bool lv_check_fourcc(std::ifstream& ifs, const char* fourcc)
{
if (check_fourcc(ifs, fourcc)) {
MSG_ERROR("FILE IS NOT A LOCUS VECTOR FILE OR IS CORRUPTED", "You may want to run spell-marker again.");
return true;
}
return false;
}
static
LV_database load_from(const std::string& filename)
{
std::ifstream ifs(filename);
return load_from(ifs);
}
static
LV_database load_from(std::ifstream& ifs)
{
LV_database LV;
if (lv_check_fourcc(ifs, "SMLV")) { return {}; }
size_t n_chrom = read_size(ifs);
size_t n_gen = read_size(ifs);
for (size_t c = 0; c < n_chrom; ++c) {
if (lv_check_fourcc(ifs, "SCHR")) { return {}; }
std::string chr_name = read_str(ifs);
for (size_t g = 0; g < n_gen; ++g) {
if (lv_check_fourcc(ifs, "SGEN")) { return {}; }
std::string gen_name = read_str(ifs);
size_t n_ind = read_size(ifs);
LV.data[chr_name][gen_name].resize(n_ind);
for (size_t i = 0; i < n_ind; ++i) {
if (lv_check_fourcc(ifs, "SLV_")) { return {}; }
read_matrix(ifs, LV.data[chr_name][gen_name][i]);
}
}
}
return LV;
}
void save_to(const std::string& filename)
{
std::ofstream ofs(filename);
save_to(ofs);
}
void save_to(std::ofstream& ofs)
{
write_fourcc(ofs, "SMLV");
write_size(ofs, data.size());
write_size(ofs, data.begin()->second.size());
for (const auto& chr_gen_vec_lv: data) {
write_fourcc(ofs, "SCHR");
write_str(ofs, chr_gen_vec_lv.first);
for (const auto& gen_vec_lv: chr_gen_vec_lv.second) {
write_fourcc(ofs, "SGEN");
write_str(ofs, gen_vec_lv.first);
write_size(ofs, gen_vec_lv.second.size());
for (const auto& lv: gen_vec_lv.second) {
write_fourcc(ofs, "SLV_");