spellmaptools.cc 14.2 KB
Newer Older
1
2
3
4
5
6
// #ifndef SPELL_PEDIGREE_MAIN
// #  define SPELL_PEDIGREE_MAIN spell_pedigree
// #endif
// #ifndef SPELL_BAYES_MAIN
// #  define SPELL_BAYES_MAIN spell_marker
// #endif
7
8
9

#include "eigen.h"
#include <vector>
10
#include <map>
11
#include <string>
12
13
14
15
16
17
18
19
// #include <boost/filesystem.hpp>
#include "error.h"
#include "geno_matrix.h"
#include "bayes/output.h"
#include "map-likelihood/cli.h"
#include "cache/md5.h"
#include "RWrap/RWrap.h"
#include <numeric>
20
21
22

// Load and initialize data

23
// SEM (order)
24
25
26
27
28
29
30
31
32
33

// Try (order, marker)

// Flips (order)

// LOD2pt (order, order)

typedef std::vector<std::string> marker_vec;


34
35
36
37
extern int SPELL_BAYES_MAIN(int, const char**);
extern int SPELL_PEDIGREE_MAIN(int, const char**);


38
39
40
41
42
class Spell2PtMatrix;


class SpellMapTools {
private:
43
    gamete_LV_database gamete_LV;
44
    double m_convergence_threshold = 1.e-6;
45
    int m_max_iterations = 100;
46
    bool m_active = false;
47
public:
48
    /* observations_specs must have three columns: generation, format, filename */
49
    SpellMapTools(std::string ped_filename, Rwrap::DataFrame observation_specs, int mt);
50
51
52
53
54
55
56
57
    Rwrap::List
    SEM(marker_vec order);
    Rwrap::List
    Try(marker_vec order, std::string marker);
    double
    Flips(marker_vec order, int window_size);
    Spell2PtMatrix*
    LOD2pt(marker_vec rows, marker_vec cols);
58
59
    Spell2PtMatrix*
    R2pt(marker_vec rows, marker_vec cols);
60
61
62
63
    double
    map_likelihood(const marker_vec& order, const std::vector<double>& distances)
    { return gamete_LV.map_likelihood(order, distances); }
    EM_map
64
65
    EM(const marker_vec& order, bool dist_if_true_else_r=true)
    { return gamete_LV.EM(order, dist_if_true_else_r, m_convergence_threshold, m_max_iterations); }
66
    
Damien Leroux's avatar
Damien Leroux committed
67
68
//     void init_2pt_tr_at_inf() { gamete_LV.init_2pt_tr_at_inf(); }
    double twoMarkerLikelihood() { return gamete_LV.twoMarkerLikelihoodAtInf(); }
69
70
71
72
73
74
75
76
77
78
79
80
    
    int max_iterations(int n)
    {
        if (n) { std::swap(m_max_iterations, n); return n; }
        else { return m_max_iterations; }
    }

    double convergence_threshold(double t)
    {
        if (t) { std::swap(m_convergence_threshold, t); return t; }
        else { return m_convergence_threshold; }
    }
81
82
83
84
85
86
87
88
89
90
91
    
    bool active_check()
    {
        if (!m_active) {
            MSG_ERROR("This instance is not active. There was a problem during initialization.", "");
            return true;
        }
        return false;
    }
    
    bool is_active() { return m_active; }
92
93
94
};


95

96
97
class Spell2PtMatrix {
private:
98
    SpellMapTools* instance;
99
100
101
    std::vector<std::string> rownames, colnames;
    std::map<std::string, int> row_indices, col_indices;
    std::map<std::pair<std::string, std::string>, double> cache;
102
103
    std::map<std::pair<std::string, std::string>, double> r_cache;
    bool lod_if_true_else_r;
104
public:
105
    Spell2PtMatrix(SpellMapTools* smt, std::vector<std::string> r, std::vector<std::string> c)
106
        : instance(smt), rownames(r), colnames(c), row_indices(), col_indices(), cache(), r_cache(), lod_if_true_else_r(true)
107
108
109
110
111
112
113
114
115
    {
        for (const auto& m: rownames) {
            row_indices.emplace(m, row_indices.size());
        }
        for (const auto& m: colnames) {
            col_indices.emplace(m, col_indices.size());
        }
    }

116
117
118
119
120
121
    void rate() { lod_if_true_else_r = false; }
    void lod() { lod_if_true_else_r = true; }
    
    int is_rate() { return (int) !lod_if_true_else_r; }
    int is_lod() { return (int) lod_if_true_else_r; }
    
122
123
124
    marker_vec getRownames() { return rownames; }
    marker_vec getColnames() { return colnames; }
    
125
    std::vector<int>
126
    getDim()
127
    {
128
        return {(int) rownames.size(), (int) colnames.size()};
129
    }
130

131
132
    void
    get_value_impl(int i, int j, double& lod, double& r)
133
    {
134
        if (i < 1 || j < 1 || i > rownames.size() || j > colnames.size()) {
135
            lod = r = NA_REAL;
136
137
138
        }
        --i; --j;
        if (rownames[i] == colnames[j]) {
139
            lod = r = 0;
140
        }
141
        std::pair<std::string, std::string> key = {rownames[i], colnames[j]};
142
143
144
145
146
        if (key.first > key.second) {
            std::swap(key.first, key.second);
        }
        auto it = cache.find(key);
        if (it == cache.end()) {
147
148
149
//             MSG_INFO("Key " << key.first << ':' << key.second << " not in cache!");
            static const std::vector<double> at_worlds_end = {.5};
            marker_vec minimap = {rownames[i], colnames[j]};
150
            auto em = instance->EM(minimap, false);
151
            double best = em.likelihood;
152
153
            r = em.distances[0];
            r_cache.emplace(key, r);
154
155
//             instance->init_2pt_tr_at_inf();
//             double inf_ref = instance->map_likelihood(minimap, at_worlds_end);
Damien Leroux's avatar
Damien Leroux committed
156
            double inf_ref = instance->twoMarkerLikelihood();
157
158
159
            lod = best - inf_ref;
//             MSG_INFO("LOD(" << minimap[0] << " … " << minimap[1] << " => d=" << em.distances[0] << " best=" << best << " inf_ref=" << inf_ref << " value=" << lod);
            cache.emplace(key, lod);
160
        } else {
161
//             MSG_INFO("Key " << key.first << ':' << key.second << " in cache! " << it->first.first << ':' << it->first.second);
162
163
164
165
166
167
168
169
170
            lod = it->second;
            r = r_cache[key];
        }
    }

    double get2PtValue(int i, int j)
    {
        if (!instance->is_active()) {
            return NA_REAL;
171
        }
172
173
174
        double lod, r;
        get_value_impl(i, j, lod, r);
        return lod_if_true_else_r ? lod : r;
175
176
177
178
    }
};


179
180
181

int
invoke_spell(int (&main_func) (int argc, const char** argv), const std::vector<std::string>& arglist)
182
{
183
184
185
186
187
188
189
190
    std::vector<const char*> argv;
    argv.resize(arglist.size() + 1, NULL);
    for (size_t i = 0; i < arglist.size(); ++i) {
        argv[i] = arglist[i].c_str();
    }
    int ret = main_func(arglist.size(), argv.data());
    argv.clear();
    return ret;
191
192
}

193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217


extern "C" {
#   include <sys/types.h>
#   include <sys/stat.h>
#   include <unistd.h>

    unsigned int
    get_file_date(const char* filename)
    {
        struct stat buffer;
        int result;
        if ((result = stat(filename, &buffer))) {
//             MSG_INFO("stat result (" << filename << ") = " << result << " errno=" << strerror(errno));
            /* display filesystem error? */
            return 0;
        } else {
            return buffer.st_mtim.tv_sec;
        }
    }

}



218
SpellMapTools::SpellMapTools(std::string ped_filename, Rwrap::DataFrame observation_specs, int mt)
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
    : gamete_LV()
{
//     MSG_DEBUG(__FILE__ << ':' << __LINE__);
    std::vector<const char*> obs_gen = observation_specs[0];
    std::vector<const char*> obs_format = observation_specs[1];
    std::vector<const char*> obs_filename = observation_specs[2];

    md5_digest md5;
    
    md5 << ped_filename;

//     MSG_DEBUG(__FILE__ << ':' << __LINE__);
    
    std::string wd = std::string("spell-map-tools-") + (std::string) md5;

    unsigned int ped_time, data_time;
    
    ped_time = data_time = get_file_date(ped_filename.c_str());
    
//     MSG_DEBUG(__FILE__ << ':' << __LINE__);

    for (size_t i = 0; i < obs_gen.size(); ++i) {
        md5 << obs_gen[i] << obs_format[i] << obs_filename[i];
        std::time_t t = get_file_date(obs_filename[i]);
        if (t > data_time) {
            data_time = t;
        }
    }
    
//     MSG_DEBUG(__FILE__ << ':' << __LINE__);

    std::string name = md5;
251

252
253
    std::string spell_ped_file = SPELL_STRING(wd << '/' << name << ".cache/" << name << ".spell-pedigree.data");
    std::string spell_mark_file = SPELL_STRING(wd << '/' << name << ".cache/gamete.data");
254

255
256
    std::time_t spell_ped_t = get_file_date(spell_ped_file.c_str());
    std::time_t spell_mark_t = 0;
257

258
259
260
261
262
263
264
265
266
267
//     MSG_DEBUG(__FILE__ << ':' << __LINE__);

//     MSG_INFO("ped_time=" << ped_time);
//     MSG_INFO("spell_ped_t=" << spell_ped_t);
    
    /* Run spell-pedigree if needed */
    if (spell_ped_t < ped_time) {
//         MSG_DEBUG(__FILE__ << ':' << __LINE__);
        data_time = std::time(nullptr);
        std::vector<std::string> args = {"spell-pedigree", "-wd", wd, "-n", name, "-p", ped_filename};
Damien Leroux's avatar
Damien Leroux committed
268
269
270
271
272
        int ret = invoke_spell(SPELL_PEDIGREE_MAIN, args);
        if (ret) {
            MSG_ERROR("Something went wrong while loading the pedigree…", "");
            return;
        }
273
274
275
276
277
278
279
280
281
282
283
    } else {
//         MSG_DEBUG(__FILE__ << ':' << __LINE__);
        data_time = spell_ped_t;
        spell_mark_t = get_file_date(spell_mark_file.c_str());
    }
//     MSG_DEBUG(__FILE__ << ':' << __LINE__);

//     MSG_INFO("data_time=" << data_time);
//     MSG_INFO("spell_mark_t=" << spell_mark_t);

    /* Run spell-marker if needed */
Damien Leroux's avatar
Damien Leroux committed
284
    if (spell_mark_t < data_time && obs_gen.size() > 0) {
285
//         MSG_DEBUG(__FILE__ << ':' << __LINE__);
286
//         std::stringstream out;
287
        std::vector<std::string> args = {"spell-marker", "-wd", wd, "-n", name, "-Og", "-Op"};
288
289
290
291
        if (mt > 1) {
            args.push_back("-mt");
            args.push_back(SPELL_STRING(mt));
        }
292
293
294
295
        for (size_t i = 0; i < obs_gen.size(); ++i) {
            args.push_back("-m");
            args.push_back(SPELL_STRING(obs_gen[i] << ':' << obs_format[i]));
            args.push_back(obs_filename[i]);
296
297
298
299
300
301
302
303
304
305
//             if (out.tellp()) {
//                 out << ',';
//             }
//             out << obs_gen[i];
        }
//         args.push_back("-o");
//         args.push_back(out.str());
        if (invoke_spell(SPELL_BAYES_MAIN, args)) {
            MSG_ERROR("Aborting spell-map-tools initialization because of errors…", "");
            return;
306
307
308
309
310
311
312
313
314
315
316
        }
    }
//     MSG_DEBUG(__FILE__ << ':' << __LINE__);

    {
//         MSG_DEBUG(__FILE__ << ':' << __LINE__);
        ifile gam(spell_mark_file);
        rw_base() (gam, gamete_LV);
//         MSG_DEBUG(__FILE__ << ':' << __LINE__);
    }
//     MSG_DEBUG(__FILE__ << ':' << __LINE__);
317
    m_active = true;
318
319
320
321
322
323
324
325
326
327
328
329
330
}



Rwrap::List
map2list(const EM_map& map)
{
    Rwrap::List ret;
    ret.add("distances", map.distances);
    ret.add("markers", map.marker_names);
    ret.add("likelihood", map.likelihood);
    ret.add("n_iterations", map.n_iterations);
    ret.add("converged", (int) map.converged);
331
332
    ret.add("delta", map.delta);
    ret.add("r", map.r);
333
334
335
336
337
338
339
340
    return ret;
}



Rwrap::List
SpellMapTools::SEM(marker_vec order)
{
341
342
343
    if (active_check()) {
        return {};
    }
344
345
346
347
348
    return map2list(EM(order));
}



349

350
351
Rwrap::List
SpellMapTools::Try(marker_vec order, std::string marker)
352
{
353
354
355
    if (active_check()) {
        return {};
    }
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
    std::vector<Rwrap::List> map_results;
    marker_vec full_order;
    full_order.reserve(order.size() + 1);
    full_order.push_back(marker);
    full_order.insert(full_order.end(), order.begin(), order.end());
    map_results.resize(full_order.size());
    EM_map tmp;
    tmp = EM(full_order);
    map_results[0] = map2list(tmp);
    double best = tmp.likelihood;
    size_t i_best = 0;
    for (size_t i = 1; i < full_order.size(); ++i) {
        std::swap(full_order[i - 1], full_order[i]);
        tmp = EM(full_order);
        map_results[i] = map2list(tmp);
        if (tmp.likelihood > best) {
            best = tmp.likelihood;
            i_best = i;
        }
    }
    Rwrap::List ret;
    ret.add("best", i_best + 1);
    ret.add("maps", map_results);
    return ret;
380
381
}

382
383


384
double
385
SpellMapTools::Flips(marker_vec order, int window_size)
386
{
387
388
389
    if (active_check()) {
        return NA_REAL;
    }
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
    std::vector<size_t> window(window_size);
    double best = EM(order).likelihood;
    double ref = best;
    size_t size = order.size() - window.size() + 1;
    for (size_t i = 0; i < size; ++i) {
        marker_vec tmp_order(order);
        std::iota(window.begin(), window.end(), i);
        if (0) {
            std::stringstream ss;
            auto i = window.begin(), j = window.end();
            ss << *i;
            for (++i; i != j; ++i) {
                ss << ' ' << *i;
            }
            MSG_INFO("Window " << ss.str());
        }
        while (std::next_permutation(window.begin(), window.end())) {
            if (0) {
                std::stringstream ss;
                auto i = window.begin(), j = window.end();
                ss << *i;
                for (++i; i != j; ++i) {
                    ss << ' ' << *i;
                }
                MSG_INFO("current window " << ss.str());
            }
            auto it = tmp_order.begin() + i;
            for (size_t x: window) {
                *it++ = order[x];
            }
            double tmp = EM(tmp_order).likelihood;
            if (tmp > best) {
                best = tmp;
            }
            if (0) {
                std::stringstream ss;
                auto i = tmp_order.begin(), j = tmp_order.end();
                ss << *i;
                for (++i; i != j; ++i) {
                    ss << "…" << *i;
                }
                MSG_INFO("Flips on map " << ss.str() << " => " << tmp);
            }
        }
//         MSG_DEBUG("Done permuting.");
    }
    return ref - best;
437
438
}

439
440


441
Spell2PtMatrix*
442
SpellMapTools::LOD2pt(marker_vec row_order, marker_vec col_order)
443
{
444
    active_check();
445
    return new Spell2PtMatrix(this, row_order, col_order);
446
447
448
449
}



450
451


452
CLASS(Spell2PtMatrix)
453
    .method(Spell2PtMatrix, get2PtValue).arg("i").arg("j").auto_glue()
454
455
456
457
    .method(Spell2PtMatrix, rate).auto_glue()
    .method(Spell2PtMatrix, lod).auto_glue()
    .method(Spell2PtMatrix, is_lod).auto_glue()
    .method(Spell2PtMatrix, is_rate).auto_glue()
458
459
460
    .method(Spell2PtMatrix, getDim).auto_glue()
    .method(Spell2PtMatrix, getRownames).auto_glue()
    .method(Spell2PtMatrix, getColnames).auto_glue()
461
462
463
    ;

CLASS(SpellMapTools)
464
    .ctor<std::string, Rwrap::DataFrame, int>("pedigree_filename", "observation_specs", "mt")
465
    .method(SpellMapTools, SEM).arg("order").auto_glue()
466
467
468
    .method(SpellMapTools, Try).arg("order").arg("marker").auto_glue()
    .method(SpellMapTools, Flips).arg("order").arg("window_size").auto_glue()
    .method(SpellMapTools, LOD2pt).arg("row_order").arg("col_order").auto_glue()
469
470
    .method(SpellMapTools, max_iterations).arg("n", "0").auto_glue()
    .method(SpellMapTools, convergence_threshold).arg("t", "0").auto_glue()
471
472
473
474
475
476
477
478
    ;


MODULE(spellmaptools)
    .add_class(SpellMapTools)
    .add_class(Spell2PtMatrix)
    .add_s3method("'['", "Spell2PtMatrix")
    .add_s3method("dim", "Spell2PtMatrix")
479
480
481
482
    .add_s3method("as.matrix", "Spell2PtMatrix")
    .add_s3method("rownames", "Spell2PtMatrix")
    .add_s3method("colnames", "Spell2PtMatrix")
//     .add_s3method("print", "Spell2PtMatrix")
483
    ;