Commit 9f008bbc authored by Damien Leroux's avatar Damien Leroux
Browse files

Now handling pleiotropy, covariables, and defining subsets to work on in commandline.

parent 90db7d98
......@@ -2,5 +2,6 @@
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
<mapping directory="$PROJECT_DIR$/PAG_XXV" vcs="Git" />
</component>
</project>
\ No newline at end of file
This diff is collapsed.
......@@ -18,9 +18,9 @@
#ifndef _SPEL_FRONTENDS_H_
#define _SPEL_FRONTENDS_H_
#include "cache2.h"
#include "basic_data.h"
#include "model.h"
#include "../include/cache2.h"
#include "../include/computations/basic_data.h"
#include "../include/computations/model.h"
std::pair<bool, double>
detect_strongest_qtl(chromosome_value chr, const locus_key& lk,
......
......@@ -18,12 +18,12 @@
#ifndef _SPEL_FRONTENDS_H_
#define _SPEL_FRONTENDS_H_
#include "../cache2.h"
#include "basic_data.h"
#include "model.h"
#include "../include/cache2.h"
#include "../include/computations/basic_data.h"
#include "../include/computations/model.h"
/*#include "model/tests.h"*/
#include <boost/math/distributions/normal.hpp> // for normal_distribution
#include "../../../../../usr/include/boost/math/distributions/normal.hpp"
using boost::math::normal; // typedef provides default type is double.
using boost::math::cdf;
using boost::math::mean;
......@@ -273,7 +273,7 @@ struct QTL {
}
std::pair<double, double>
confidence_interval(const std::string& trait, const std::vector<QTL>& selection);
confidence_interval(const std::string &trait, const std::vector<QTL> &selection, ComputationType lod_test_type);
#if 0
{
/*MSG_DEBUG_INDENT_EXPR("[Confidence interval] ");*/
......@@ -451,13 +451,13 @@ struct analysis_report {
void report_final_model(model_manager& mm);
void report_qtls(std::vector<QTL>& qtls);
void report_qtls(std::vector<QTL> &qtls, ComputationType lod_test_type);
};
struct model_manager {
/* name of the studied trait */
/* name of the studied single_trait */
std::string trait_name;
/* populations used in this model */
collection<population_value> all_pops;
......@@ -1022,7 +1022,7 @@ struct model_manager {
if (pmode == Joint) {
/*model Mperm(trait_permutations(trait_name, active_settings->n_permutations), Mcurrent);*/
/*Mperm.compute();*/
std::string old_title = active_settings->set_title(MESSAGE("Computing threshold for trait " << trait_name << " given selection " << Mcurrent.keys() << " using " << active_settings->n_permutations << " and quantile " << active_settings->qtl_threshold_quantile));
std::string old_title = active_settings->set_title(MESSAGE("Computing threshold for single_trait " << trait_name << " given selection " << Mcurrent.keys() << " using " << active_settings->n_permutations << " and quantile " << active_settings->qtl_threshold_quantile));
double ret = *make_value<Disk|Sync>(qtl_threshold_all_chromosomes_for_model,
as_value(trait_name),
as_value(active_settings->qtl_threshold_quantile),
......@@ -1471,8 +1471,8 @@ void analysis_report::report_final_model(model_manager& mm)
/*size_t dof = mm.Mcurrent.rank();*/
/*size_t n = mm.Mcurrent.Y().rows();*/
auto qtls = mm.QTLs();
section_header(report_file, MESSAGE("Report for trait " << mm.trait_name));
report_qtls(qtls);
section_header(report_file, MESSAGE("Report for single_trait " << mm.trait_name));
report_qtls(qtls, mm.lod_test_type());
//*
/*header(report_file, "RSS ") << mm.Mcurrent.rss() << std::endl;*/
......@@ -1623,10 +1623,10 @@ void analysis_report::report_final_model(model_manager& mm)
}
inline
void analysis_report::report_qtls(std::vector<QTL>& qtls)
void analysis_report::report_qtls(std::vector<QTL> &qtls, ComputationType lod_test_type)
{
for (auto& qtl: qtls) {
const auto& ci = roi[qtl.chromosome][qtl.locus] = qtl.confidence_interval(trait_name, qtls);
const auto& ci = roi[qtl.chromosome][qtl.locus] = qtl.confidence_interval(trait_name, qtls, lod_test_type);
report_lod(qtl);
std::string name = MESSAGE(trait_name << " @ " << qtl.locus << " [" << ci.first << ':' << ci.second << ']');
if (poi[qtl.chromosome][qtl.locus].size()) {
......@@ -1992,7 +1992,7 @@ struct test_all_domain : public test_domain_search_method {
inline
std::pair<double, double>
QTL::confidence_interval(const std::string& trait, const std::vector<QTL>& selection)
QTL::confidence_interval(const std::string &trait, const std::vector<QTL> &selection, ComputationType lod_test_type)
{
test_all_domain tad;
test_manager tm(trait, 1, 0, true);
......
......@@ -15,7 +15,7 @@ The example datasets each consist in a set of files :
- the pedigree in a **.ped** file,
- the genetic map in a **.map** file,
- one or more genotypic or allelic observation files in **.gen** files,
- one or more sets of trait observations in **.phen** files.
- one or more sets of single_trait observations in **.phen** files.
Additionally, a **README** file in each directory describes the dataset features and the commands to run to process them.
......
......@@ -29,7 +29,7 @@ spell-qtl – Compute n-point Parental Origin Probabilities and perform QTL anal
**-p**,**--population** *GEN* *TRAITS*
: Specify a new population (dataset) to work on.
*GEN* is the name of the phenotyped generation. The *TRAITS* path must point to a trait observation file with the
*GEN* is the name of the phenotyped generation. The *TRAITS* path must point to a single_trait observation file with the
**same** number of individuals as defined for the given generation in the pedigree for this population.
# OPTIONS
......@@ -92,10 +92,10 @@ The standard pipeline is:
**qtl-threshold-quantile** *VALUE*
: Set the quantile value in range [0:1] to select the QTL threshold value in automatic mode. Default is **0.05**.
**qtl-threshold-value** *trait=value,...*
**qtl-threshold-value** *single_trait=value,...*
: Set the QTL threshold value manually for some traits. If not specified, will be automatically computed using the above settings.
**cofactor-threshold** *trait=value,...*
**cofactor-threshold** *single_trait=value,...*
: Set the cofactor threshold value manually for some traits. Defaults to **value of QTL threshold * .9**.
**cofactor-exclusion-window** *DISTANCE*
......
......@@ -108,7 +108,7 @@ volume = {143},
number = {1},
pages = {571-577},
year = {1996},
abstract ={The recent advent of molecular markers has created a great potential for the understanding of quantitative inheritance. In parallel to rapid developments and improvements in molecular marker technologies, biometrical models have been constructed, refined and generalized for the mapping of quantitative trait loci (QTL). However, current models present restricitions in terms of breeding designs to which they apply. In this paper, we develop an approach for the generalization of the mixture model for progeny from a single bi-parental cross of inbred lines. Detailed derivations are given for genetic designs involving populations developed by selfing, i.e., where marker genotypes are obtained from Fx (x ≤ 2) individuals and where phenotypes are measured on Fy (y ≥ x) individuals or families. Extensions to designs involving doubled-haploids, backcrossderived individuals and random matings are outlined. The derivations presented here can easily be combined with current QTL mapping approaches.},
abstract ={The recent advent of molecular markers has created a great potential for the understanding of quantitative inheritance. In parallel to rapid developments and improvements in molecular marker technologies, biometrical models have been constructed, refined and generalized for the mapping of quantitative single_trait loci (QTL). However, current models present restricitions in terms of breeding designs to which they apply. In this paper, we develop an approach for the generalization of the mixture model for progeny from a single bi-parental cross of inbred lines. Detailed derivations are given for genetic designs involving populations developed by selfing, i.e., where marker genotypes are obtained from Fx (x ≤ 2) individuals and where phenotypes are measured on Fy (y ≥ x) individuals or families. Extensions to designs involving doubled-haploids, backcrossderived individuals and random matings are outlined. The derivations presented here can easily be combined with current QTL mapping approaches.},
URL = {http://www.genetics.org/content/143/1/571.abstract},
eprint = {http://www.genetics.org/content/143/1/571.full.pdf+html},
journal = {Genetics}
......@@ -150,7 +150,7 @@ number = {3},
pages = {1373-1386},
year = {2006},
doi = {10.1534/genetics.106.056416},
abstract ={In the data collection of the QTL experiments using recombinant inbred (RI) populations, when individuals are genotyped for markers in a population, the trait values (phenotypes) can be obtained from the genotyped individuals (from the same population) or from some progeny of the genotyped individuals (from the different populations). Let Fu be the genotyped population and Fv (v ≥ u) be the phenotyped population. The experimental designs that both marker genotypes and phenotypes are recorded on the same populations can be denoted as (Fu/Fv, u = v) designs and that genotypes and phenotypes are obtained from the different populations can be denoted as (Fu/Fv, v > u) designs. Although most of the QTL mapping experiments have been conducted on the backcross and F2(F2/F2) designs, the other (Fu/Fv, v ≥ u) designs are also very popular. The great benefits of using the other (Fu/Fv, v ≥ u) designs in QTL mapping include reducing cost and environmental variance by phenotyping several progeny for the genotyped individuals and taking advantages of the changes in population structures of other RI populations. Current QTL mapping methods including those for the (Fu/Fv, u = v) designs, mostly for the backcross or F2/F2 design, and for the F2/F3 design based on a one-QTL model are inadequate for the investigation of the mapping properties in the (Fu/Fv, u ≤ v) designs, and they can be problematic due to ignoring their differences in population structures. In this article, a statistical method considering the differences in population structures between different RI populations is proposed on the basis of a multiple-QTL model to map for QTL in different (Fu/Fv, v ≥ u) designs. In addition, the QTL mapping properties of the proposed and approximate methods in different designs are discussed. Simulations were performed to evaluate the performance of the proposed and approximate methods. The proposed method is proven to be able to correct the problems of the approximate and current methods for improving the resolution of genetic architecture of quantitative traits and can serve as an effective tool to explore the QTL mapping study in the system of RI populations.},
abstract ={In the data collection of the QTL experiments using recombinant inbred (RI) populations, when individuals are genotyped for markers in a population, the single_trait values (phenotypes) can be obtained from the genotyped individuals (from the same population) or from some progeny of the genotyped individuals (from the different populations). Let Fu be the genotyped population and Fv (v ≥ u) be the phenotyped population. The experimental designs that both marker genotypes and phenotypes are recorded on the same populations can be denoted as (Fu/Fv, u = v) designs and that genotypes and phenotypes are obtained from the different populations can be denoted as (Fu/Fv, v > u) designs. Although most of the QTL mapping experiments have been conducted on the backcross and F2(F2/F2) designs, the other (Fu/Fv, v ≥ u) designs are also very popular. The great benefits of using the other (Fu/Fv, v ≥ u) designs in QTL mapping include reducing cost and environmental variance by phenotyping several progeny for the genotyped individuals and taking advantages of the changes in population structures of other RI populations. Current QTL mapping methods including those for the (Fu/Fv, u = v) designs, mostly for the backcross or F2/F2 design, and for the F2/F3 design based on a one-QTL model are inadequate for the investigation of the mapping properties in the (Fu/Fv, u ≤ v) designs, and they can be problematic due to ignoring their differences in population structures. In this article, a statistical method considering the differences in population structures between different RI populations is proposed on the basis of a multiple-QTL model to map for QTL in different (Fu/Fv, v ≥ u) designs. In addition, the QTL mapping properties of the proposed and approximate methods in different designs are discussed. Simulations were performed to evaluate the performance of the proposed and approximate methods. The proposed method is proven to be able to correct the problems of the approximate and current methods for improving the resolution of genetic architecture of quantitative traits and can serve as an effective tool to explore the QTL mapping study in the system of RI populations.},
URL = {http://www.genetics.org/content/174/3/1373.abstract},
eprint = {http://www.genetics.org/content/174/3/1373.full.pdf+html},
journal = {Genetics}
......@@ -170,7 +170,7 @@ journal = {Genetics}
@article{GRH:Kao5486696,
author = {Kao,Chen-Hung and Zeng,Miao-Hui},
title = {A study on the mapping of quantitative trait loci in advanced populations derived from two inbred lines},
title = {A study on the mapping of quantitative single_trait loci in advanced populations derived from two inbred lines},
journal = {Genetics Research},
volume = {91},
issue = {02},
......@@ -224,7 +224,7 @@ number = {4},
pages = {1981-1993},
year = {2004},
doi = {10.1534/genetics.166.4.1981},
abstract ={In plants and laboratory animals, QTL mapping is commonly performed using F2 or BC individuals derived from the cross of two inbred lines. Typical QTL mapping statistics assume that each F2 individual is genotyped for the markers and phenotyped for the trait. For plant traits with low heritability, it has been suggested to use the average phenotypic values of F3 progeny derived from selfing F2 plants in place of the F2 phenotype itself. All F3 progeny derived from the same F2 plant belong to the same F2:3 family, denoted by F2:3. If the size of each F2:3 family (the number of F3 progeny) is sufficiently large, the average value of the family will represent the genotypic value of the F2 plant, and thus the power of QTL mapping may be significantly increased. The strategy of using F2 marker genotypes and F3 average phenotypes for QTL mapping in plants is quite similar to the daughter design of QTL mapping in dairy cattle. We study the fundamental principle of the plant version of the daughter design and develop a new statistical method to map QTL under this F2:3 strategy. We also propose to combine both the F2 phenotypes and the F2:3 average phenotypes to further increase the power of QTL mapping. The statistical method developed in this study differs from published ones in that the new method fully takes advantage of the mixture distribution for F2:3 families of heterozygous F2 plants. Incorporation of this new information has significantly increased the statistical power of QTL detection relative to the classical F2 design, even if only a single F3 progeny is collected from each F2:3 family. The mixture model is developed on the basis of a single-QTL model and implemented via the EM algorithm. Substantial computer simulation was conducted to demonstrate the improved efficiency of the mixture model. Extension of the mixture model to multiple QTL analysis is developed using a Bayesian approach. The computer program performing the Bayesian analysis of the simulated data is available to users for real data analysis.},
abstract ={In plants and laboratory animals, QTL mapping is commonly performed using F2 or BC individuals derived from the cross of two inbred lines. Typical QTL mapping statistics assume that each F2 individual is genotyped for the markers and phenotyped for the single_trait. For plant traits with low heritability, it has been suggested to use the average phenotypic values of F3 progeny derived from selfing F2 plants in place of the F2 phenotype itself. All F3 progeny derived from the same F2 plant belong to the same F2:3 family, denoted by F2:3. If the size of each F2:3 family (the number of F3 progeny) is sufficiently large, the average value of the family will represent the genotypic value of the F2 plant, and thus the power of QTL mapping may be significantly increased. The strategy of using F2 marker genotypes and F3 average phenotypes for QTL mapping in plants is quite similar to the daughter design of QTL mapping in dairy cattle. We study the fundamental principle of the plant version of the daughter design and develop a new statistical method to map QTL under this F2:3 strategy. We also propose to combine both the F2 phenotypes and the F2:3 average phenotypes to further increase the power of QTL mapping. The statistical method developed in this study differs from published ones in that the new method fully takes advantage of the mixture distribution for F2:3 families of heterozygous F2 plants. Incorporation of this new information has significantly increased the statistical power of QTL detection relative to the classical F2 design, even if only a single F3 progeny is collected from each F2:3 family. The mixture model is developed on the basis of a single-QTL model and implemented via the EM algorithm. Substantial computer simulation was conducted to demonstrate the improved efficiency of the mixture model. Extension of the mixture model to multiple QTL analysis is developed using a Bayesian approach. The computer program performing the Bayesian analysis of the simulated data is available to users for real data analysis.},
URL = {http://www.genetics.org/content/166/4/1981.abstract},
eprint = {http://www.genetics.org/content/166/4/1981.full.pdf+html},
journal = {Genetics}
......
......@@ -9,6 +9,6 @@ POP F2 100
POP CD 100
POP AB 100
QTL 104.61 1.64 52.56
trait t1 qtl [('M_2_14', 104.61)] effect [{'a': 20.0, 'A': 20.0, 'c': 10.0, 'B': 30.0, 'd': 0.0, 'C': 10.0, 'b': 30.0, 'D': 0.0}] epistasis {} noise 1.0 0.0 1.0
trait t3 qtl [('M_2_14', 104.61)] effect [{'a': 100.0, 'A': 100.0, 'c': 0.0, 'B': 10.0, 'd': 2000.0, 'C': 0.0, 'b': 10.0, 'D': 200.0}] epistasis {} noise 1.0 0.0 5.0
trait t2 qtl [('M_2_9', 52.56)] effect [{'a': 5.0, 'A': 5.0, 'c': 6.0, 'B': 3.0, 'd': 4.0, 'C': 6.0, 'b': 3.0, 'D': 4.0}] epistasis {} noise 1.0 0.0 0.5
single_trait t1 qtl [('M_2_14', 104.61)] effect [{'a': 20.0, 'A': 20.0, 'c': 10.0, 'B': 30.0, 'd': 0.0, 'C': 10.0, 'b': 30.0, 'D': 0.0}] epistasis {} noise 1.0 0.0 1.0
single_trait t3 qtl [('M_2_14', 104.61)] effect [{'a': 100.0, 'A': 100.0, 'c': 0.0, 'B': 10.0, 'd': 2000.0, 'C': 0.0, 'b': 10.0, 'D': 200.0}] epistasis {} noise 1.0 0.0 5.0
single_trait t2 qtl [('M_2_9', 52.56)] effect [{'a': 5.0, 'A': 5.0, 'c': 6.0, 'B': 3.0, 'd': 4.0, 'C': 6.0, 'b': 3.0, 'D': 4.0}] epistasis {} noise 1.0 0.0 0.5
......@@ -9,6 +9,6 @@ cross,F1,AB,CD,100,None
self,F2,F1,100,None
self,F3,F2,1000,aa:0 ab:0 ac:0 ad:0 Aa:1 Ab:1 Ac:1 Ad:1 ba:0 bb:0 bc:0 bd:0 Ba:1 Bb:1 Bc:1 Bd:1 ca:0 cb:0 cc:0 cd:0 Ca:1 Cb:1 Cc:1 Cd:1 da:0 db:0 dc:0 dd:0 Da:1 Db:1 Dc:1 Dd:1 aA:1 aB:1 aC:1 aD:1 AA:2 AB:2 AC:2 AD:2 bA:1 bB:1 bC:1 bD:1 BA:2 BB:2 BC:2 BD:2 cA:1 cB:1 cC:1 cD:1 CA:2 CB:2 CC:2 CD:2 dA:1 dB:1 dC:1 dD:1 DA:2 DB:2 DC:2 DD:2
qtls,3
trait,t1,F3,1,a:20 A:20 b:30 B:30 c:10 C:10 d:0 D:0,None,1,0,1
trait,t2,F3,1,a:5 A:5 b:3 B:3 c:6 C:6 d:4 D:4,None,1,0,.5
trait,t3,F3,1,a:100 A:100 b:10 B:10 c:0 C:0 d:2000 D:200,None,1,0,5
single_trait,t1,F3,1,a:20 A:20 b:30 B:30 c:10 C:10 d:0 D:0,None,1,0,1
single_trait,t2,F3,1,a:5 A:5 b:3 B:3 c:6 C:6 d:4 D:4,None,1,0,.5
single_trait,t3,F3,1,a:100 A:100 b:10 B:10 c:0 C:0 d:2000 D:200,None,1,0,5
......@@ -8,7 +8,7 @@ A(1) B(1) C(1)
The genetic map features two chromosomes respectively 150cM and 120cM long with respectively 20 and 17 markers.
One trait (t1) is simulated for F2 and F2C, affected by two QTLs:
One single_trait (t1) is simulated for F2 and F2C, affected by two QTLs:
- one on chromosome 1 on marker M_1_4 at 14.97cM, with effects A=1, B=0, C=-1, and gaussian noise with parameters (mu=0, sigma=2)
- one on chromosome 2 on marker M_2_6 at 68.37cM, with effects A=1, B=0, C=-1, and gaussian noise with parameters (mu=0, sigma=1)
......
......@@ -7,5 +7,5 @@ POP F1C 1
POP F2 100
POP F2C 100
QTL 123.28 142.46
trait t1 qtl [('M_1_16', 123.28), ('M_1_19', 142.46)] effect [{'a': 2.0, 'c': -1.0}, {'a': 3.0, 'c': -2.0}] epistasis {} noise 1.0 0.0 2.0
trait t1 qtl [('M_1_19', 142.46), ('M_1_16', 123.28)] effect [{'a': 2.0, 'b': 0.0}, {'a': 3.0, 'b': 0.0}] epistasis {} noise 1.0 0.0 1.0
single_trait t1 qtl [('M_1_16', 123.28), ('M_1_19', 142.46)] effect [{'a': 2.0, 'c': -1.0}, {'a': 3.0, 'c': -2.0}] epistasis {} noise 1.0 0.0 2.0
single_trait t1 qtl [('M_1_19', 142.46), ('M_1_16', 123.28)] effect [{'a': 2.0, 'b': 0.0}, {'a': 3.0, 'b': 0.0}] epistasis {} noise 1.0 0.0 1.0
......@@ -7,5 +7,5 @@ cross,F1C,A,C,1,None
self,F2,F1,100,aa:A ab:H ba:H bb:B
self,F2C,F1C,100,aa:A ac:H ca:H cc:B
qtls,2
trait,t1,F2,2,a:2 b:0|a:3 b:0,None,1,0,1
trait,t1,F2C,2,a:2 c:-1|a:3 c:-2,None,1,0,2
single_trait,t1,F2,2,a:2 b:0|a:3 b:0,None,1,0,1
single_trait,t1,F2C,2,a:2 c:-1|a:3 c:-2,None,1,0,2
......@@ -35,6 +35,7 @@
/*#include "generation_rs_fwd.h"*/
#include "input/read_trait.h"
#include "tensor.h"
#include "linear_combination.h"
/** FOURCC **/
......@@ -953,13 +954,15 @@ struct qtl_pop_type {
std::string observed_traits_filename;
std::vector<trait> observed_traits;
std::map<char, std::string> ancestor_names;
trait covariables;
qtl_pop_type()
: name(), qtl_generation_name(), indices(), gen(), LV(), observed_traits_filename(), observed_traits()
: name(), qtl_generation_name(), indices(), gen(), LV(), observed_traits_filename(), observed_traits(), covariables()
{}
qtl_pop_type(const std::string& n, const std::string& qgn, const std::vector<size_t>& i, std::shared_ptr<geno_matrix> g,
const std::map<std::string, std::vector<MatrixXd>>& lv, const std::string& otf, const std::vector<trait>& ot, const std::map<char, std::string>& anam)
: name(n), qtl_generation_name(qgn), indices(i), gen(g), LV(lv), observed_traits_filename(otf), observed_traits(ot), ancestor_names(anam)
const std::map<std::string, std::vector<MatrixXd>>& lv, const std::string& otf, const std::vector<trait>& ot,
const std::map<char, std::string>& anam, const trait& covar)
: name(n), qtl_generation_name(qgn), indices(i), gen(g), LV(lv), observed_traits_filename(otf), observed_traits(ot), ancestor_names(anam), covariables(covar)
{
/*MSG_DEBUG("new qtl_pop with " << ancestor_names.size() << " ancestor names");*/
/*for (const auto& kv: ancestor_names) {*/
......@@ -987,6 +990,8 @@ struct qtl_pop_type {
auto filt = keep.begin();
auto filt_end = keep.end();
ret->covariables = covariables % this_trait;
/*MSG_DEBUG("Filtering <" << indices << "> with <" << keep << '>');*/
std::vector<size_t> keep_indices;
keep_indices.reserve(keep.size());
......@@ -1200,7 +1205,9 @@ struct pop_data_type {
return ret;
}
std::string extract_subpops(std::vector<std::shared_ptr<qtl_pop_type>>& pops, const std::string& traits_filename, const std::vector<trait>& traits, std::map<std::string, std::vector<std::vector<std::shared_ptr<qtl_pop_type>>>>& linked_pops)
std::string extract_subpops(std::vector<std::shared_ptr<qtl_pop_type>>& pops, const std::string& traits_filename, const std::vector<trait>& traits,
std::map<std::string, std::vector<std::vector<std::shared_ptr<qtl_pop_type>>>>& linked_pops,
const trait& covariables)
{
auto extract_traits
= [&] (const std::vector<size_t>& ind_vec)
......@@ -1209,36 +1216,14 @@ struct pop_data_type {
ret.resize(traits.size());
for (size_t ti = 0; ti < traits.size(); ++ti) {
ret[ti].name = traits[ti].name;
/* FIXME there may be less values, need to filter the ind_vec using good_indices */
ret[ti].values.reserve(ind_vec.size());
ret[ti].good_indices.reserve(ind_vec.size());
auto ivi = ind_vec.begin(), ivj = ind_vec.end();
auto vali = traits[ti].values.begin(), valj = traits[ti].values.end();
auto goodi = traits[ti].good_indices.begin();
while (true) {
if (ivi == ivj || vali == valj) {
break;
} else if (*ivi < *goodi) {
++ivi;
} else if (*ivi > *goodi) {
++goodi;
++vali;
} else {
ret[ti].values.push_back(*vali);
ret[ti].good_indices.push_back(*goodi);
++goodi;
++vali;
++ivi;
}
if (covariables.good_indices.size()) {
ret[ti].good_indices = ind_vec % traits[ti].good_indices % covariables.good_indices;
} else {
ret[ti].good_indices = ind_vec % traits[ti].good_indices;
}
/*for (size_t i: ind_vec) {*/
/*ret[ti].values.push_back(traits[ti].values[i]);*/
/*ret[ti].good_indices.push_back(traits[ti].good_indices[i]);*/
/*}*/
/*MSG_DEBUG("extracted " << ret.size() << " traits");*/
/*for (const auto& t: ret) {*/
/*MSG_DEBUG(" " << t.values.size() << " values");*/
/*}*/
ret[ti].P = traits[ti].P;
ret[ti].raw = reduce_with_good_indices(traits[ti].raw, traits[ti].good_indices, ret[ti].good_indices);
ret[ti].values = reduce_with_good_indices(traits[ti].values, traits[ti].good_indices, ret[ti].good_indices);
}
return ret;
};
......@@ -1249,9 +1234,13 @@ struct pop_data_type {
auto aqg = all_qtl_geno_matrix();
for (const auto& kv: aqg) {
/*MSG_DEBUG("subpop kv.first " << kv.first << " kv.second " << kv.second);*/
auto indices = kv.second;
if (covariables.good_indices.size()) {
indices = indices % covariables.good_indices;
}
std::shared_ptr<qtl_pop_type> this_pop = std::make_shared<qtl_pop_type>(name, qtl_generation_name, kv.second, kv.first,
LV.extract(qtl_generation_name, kv.second), traits_filename,
extract_traits(kv.second), ancestor_names);
extract_traits(kv.second), ancestor_names, covariables);
for (const auto& this_trait: traits) {
auto& lp = linked_pops[this_trait.name];
auto pop_ptr = this_pop->filter_clone(this_trait);
......
This diff is collapsed.
......@@ -32,6 +32,8 @@
#include "computations/probabilities.h"
#endif
#include "linear_combination.h"
template <typename L>
struct block_builder {
struct element {
......@@ -132,7 +134,7 @@ model extend(const model& M, const model_block_key& k, const value<model_block_t
MatrixXd
traits_to_matrix();
const std::vector<double>&
const MatrixXd&
trait_by_name(population_value pop, const std::string& name);
value<MatrixXd>
......@@ -150,6 +152,9 @@ f_tester(const model& M, const model& M0, const parental_origin_per_locus_type&
model_block_type
cross_indicator(const std::string& trait_name);
value<model_block_type>
make_covariables_block(const std::string& trait, const std::vector<std::string>& covar_names);
model
basic_model();
......@@ -283,18 +288,19 @@ MatrixXd
contrast_groups(const collection<population_value>& all_pops, const locus_key& lk);
VectorXd
f_test_with_permutations(const std::string& trait, int n_permut, chromosome_value chr, const locus_key& lk);
test_with_permutations(const std::string &trait, int n_permut, size_t y_block_size, chromosome_value chr, const locus_key &lk);
double
qtl_threshold(const std::string& trait_name, chromosome_value chr, const locus_key& lk, double quantile, int n_permut);
qtl_threshold(const std::string& trait_name, chromosome_value chr, const locus_key& lk, double quantile, int n_permut, size_t y_block_size);
double
qtl_threshold_all_chromosomes(const std::string& trait_name, double quantile, int n_permut);
qtl_threshold_all_chromosomes(const std::string& trait_name, size_t y_block_size, double quantile, int n_permut);
double
qtl_threshold_all_chromosomes_for_model(const std::string& trait_name, double quantile, int n_permut, const model& Mperm);
qtl_threshold_all_chromosomes_for_model(const std::string& trait_name, double quantile, int n_permut, size_t y_block_size, const model& Mperm);
std::vector<double>
compute_effective_loci(const locus_key& lk, const std::vector<double>& loci);
......@@ -323,10 +329,11 @@ struct computation_along_chromosome {
MatrixXd ftest_lod;
MatrixXd r2;
MatrixXd chi2;
MatrixXd chi2_lod;
MatrixXd mahalanobis;
computation_along_chromosome()
: coefficients(), residuals(), rank(), rss(), ftest_pvalue(), ftest_lod(), r2(), chi2(), mahalanobis()
: coefficients(), residuals(), rank(), rss(), ftest_pvalue(), ftest_lod(), r2(), chi2(), chi2_lod(), mahalanobis()
{}
friend
......@@ -341,7 +348,7 @@ inline void arg_write(std::ostream&, const computation_along_chromosome*) {}
inline bool arg_match(std::istream&, const computation_along_chromosome*) { return true; }
namespace std { template<> struct hash<computation_along_chromosome*> { bool operator () (const computation_along_chromosome*) const { return 1; } }; }
enum ComputationType { NoTest=0, FTest=1, FTestLOD=2, R2=4, Chi2=8, Mahalanobis=16 };
enum ComputationType { NoTest=0, FTest=1, FTestLOD=2, R2=4, Chi2=8, Chi2LOD=16, Mahalanobis=32 };
enum ComputationResults { Test=0, RSS=1, Residuals=2, Coefficients=4, Rank=8 };
constexpr ComputationType operator | (ComputationType a, ComputationType b) { return (ComputationType) (((int)a) | ((int)b)); }
......@@ -373,6 +380,7 @@ namespace std {
int
compute_one(int i, computation_along_chromosome* ret,
ComputationType ct, ComputationResults cr,
size_t y_block_size,
const model& Mcurrent, const model& M0,
const model_block_key& k,
const parental_origin_per_locus_type& popl);
......@@ -382,11 +390,13 @@ void
compute_along_chromosome(computation_along_chromosome& ret,
ComputationType ct, ComputationResults cr,
const value<model>& Mcurrent, const value<model>& M0,
size_t y_block_size,
const locus_key& base_key,
chromosome_value chr,
const std::vector<double>& loci,
const collection<parental_origin_per_locus_type>& popl)
{
MSG_DEBUG("INIT COMPUTATION ct=" << ct << " cr=" << cr);
if (cr & RSS) {
ret.rss.resize(Mcurrent->Y().outerSize(), popl.size());
}
......@@ -406,7 +416,16 @@ compute_along_chromosome(computation_along_chromosome& ret,
ret.ftest_lod.resize(Mcurrent->Y().outerSize(), popl.size());
}
if (ct & Chi2) {
ret.chi2.resize(popl.size(), 1);
MSG_DEBUG("INIT CHI2");
MSG_DEBUG("Y.cols = " << Mcurrent->Y().cols() << " y_block_size = " << y_block_size << " popl.size = " << popl.size());
MSG_QUEUE_FLUSH();
ret.chi2.resize(Mcurrent->Y().cols() / y_block_size, popl.size());
}
if (ct & Chi2LOD) {
MSG_DEBUG("INIT CHI2 LOD");
MSG_DEBUG("Y.cols = " << Mcurrent->Y().cols() << " y_block_size = " << y_block_size << " popl.size = " << popl.size());
MSG_QUEUE_FLUSH();
ret.chi2_lod.resize(Mcurrent->Y().cols() / y_block_size, popl.size());
}
if (ct & Mahalanobis) {
ret.mahalanobis.resize(popl.size(), 1);
......@@ -434,6 +453,7 @@ compute_along_chromosome(computation_along_chromosome& ret,
joiner.push_back(make_value<_Policy>(compute_one,
i, pcac, /* flower */
vct, vcr,
as_value(y_block_size),
Mcurrent, M0,
vmbk,
/*as_value(base_key + std::make_pair(chr, loci[i])),*/
......@@ -470,7 +490,10 @@ init_computation(computation_along_chromosome& ret, ComputationType ct, Computat
ret.ftest_lod.resize(Y_cols, n_steps);
}
if (ct & Chi2) {
ret.chi2.resize(n_steps, 1);
ret.chi2.resize(1, n_steps);
}
if (ct & Chi2LOD) {
ret.chi2_lod.resize(1, n_steps);
}
if (ct & Mahalanobis) {
ret.mahalanobis.resize(n_steps, 1);
......@@ -485,6 +508,7 @@ template <CachingPolicy _Policy = CachingPolicy::Oneshot>
void
compute_along_interval(int i0, computation_along_chromosome& ret,
value<ComputationType> vct, value<ComputationResults> vcr,
size_t y_block_cols,
const value<model>& Mcurrent, const value<model>& M0,
const locus_key& base_key,
chromosome_value chr,
......@@ -507,6 +531,7 @@ compute_along_interval(int i0, computation_along_chromosome& ret,
joiner.push_back(make_value<_Policy>(compute_one,
i, pcac, /* flower */
vct, vcr,
as_value(y_block_cols),
Mcurrent, M0,
vmbk,
/*as_value(base_key + std::make_pair(chr, loci[i])),*/
......
......@@ -22,33 +22,213 @@
#include <vector>
#include "file.h"
//#ifndef MATRIX_SIZE
#define dim_matrix(_x_) '(' << _x_.rows() << ", " << _x_.cols() << ')'
//#endif
inline
MatrixXd
reduce_with_good_indices(const MatrixXd& data, const std::vector<size_t>& source_indices, const std::vector<size_t>& indices)
{
/* REQUIRES that indices is a subset of source_indices */
MatrixXd ret(indices.size(), data.cols());
auto sii = source_indices.begin();
for (int i = 0; i < (int) indices.size(); ++i) {
while (*sii < indices[i]) { ++sii; }
ret.row(i) = data.row(sii - source_indices.begin());
}
return ret;
}
template <typename T>
std::vector<T>
reduce_with_good_indices(const std::vector<T>& data, const std::vector<size_t>& source_indices, const std::vector<size_t>& indices)
{
/* REQUIRES that indices is a subset of source_indices */
std::vector<T> ret;
ret.reserve(indices.size());
auto sii = source_indices.begin();
for (size_t i: indices) {
while (*sii < i) { ++sii; }
ret.push_back(data[sii - source_indices.begin()]);
}
return ret;
}
struct trait {
std::string name;
std::vector<double> values;
std::vector<std::string> dim_names;
std::vector<size_t> good_indices;
/* traits in columns */
MatrixXd values;
MatrixXd raw;
MatrixXd P;
trait()
: name(), dim_names(), good_indices(), values(), raw(), P(MatrixXd::Ones(1, 1))
{}
trait(const trait& other)
: name(other.name), dim_names(other.dim_names), good_indices(other.good_indices), values(other.values), raw(other.raw), P(other.P)
{}
trait(const std::vector<trait>& traits, const std::vector<std::string>& names, const std::string& new_name, double threshold=1.e-3)
: raw(), P()
{
name = new_name;
std::vector<size_t> indices;
for (const auto& n: names) {
auto it = std::find_if(traits.begin(), traits.end(), [&](const trait& t) { return t.name == n; });
if (it == traits.end()) {
/* WHINE */
MSG_WARNING("Didn't find trait " << n << " in trait list.");
continue;
}
if (indices.size() == 0) {
good_indices = it->good_indices;
} else {
std::vector<size_t> tmp = good_indices;
auto last = std::set_intersection(tmp.begin(), tmp.end(), it->good_indices.begin(), it->good_indices.end(), good_indices.begin());
good_indices.resize(last - good_indices.begin());
}
indices.push_back(it - traits.begin());
}
if (good_indices.size() == 0) {
/* WHINE */
} else {
int ncols = 0;
for (auto i: indices) {
ncols += traits[i].raw.cols();
}
raw.resize(good_indices.size(), ncols);
ncols = 0;
for (size_t j = 0; j < indices.size(); ++j) {
const auto& t = traits[indices[j]];
dim_names.insert(dim_names.end(), t.dim_names.begin(), t.dim_names.end());
auto t_gi = t.good_indices.begin();
int t_value_i = 0;
for (size_t i = 0; i < good_indices.size(); ++i) {
while (good_indices[i] != *t_gi) {
++t_gi;
++t_value_i;
}
// raw(i, j) = t.values(t_value_i, 0);
raw.block(i, ncols, 1, t.raw.cols()) = t.raw.block(t_value_i, 0, 1, t.raw.cols());
}
ncols += t.raw.cols();
}
P = MatrixXd::Identity(raw.cols(), raw.cols());
values = raw;
}
}
MatrixXd
XtX() const
{
MatrixXd centered = raw.rowwise() - raw.colwise().mean();
return centered.adjoint() * centered;