Commit cb9155a2 authored by damien's avatar damien
Browse files

Fixed few things in the CLIs and added man pages.

parent 9582208e
This diff is collapsed.
......@@ -17,6 +17,9 @@
cmake_minimum_required(VERSION 3.5)
project(spell_qtl)
#LIST(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/CMake")
#INCLUDE(pandocology)
# Restrict multi-config generators to the Debug and Release configurations.
# The cache signature is set(<var> <value>... CACHE <type> <docstring> [FORCE]);
# the previous "CACHE TYPE INTERNAL" passed a non-existent type name and used
# "INTERNAL" as the help string. STRING with a real help string is used here.
# NOTE(review): if the entry was meant to be hidden from cache editors, use
# the INTERNAL type instead -- confirm intent.
set(CMAKE_CONFIGURATION_TYPES Debug Release
    CACHE STRING "Available build configurations" FORCE)
# Echo the full compiler/linker command lines during the build.
set(CMAKE_VERBOSE_MAKEFILE ON)
......@@ -94,6 +97,39 @@ set(CMAKE_EXE_LINKER_FLAGS "-rdynamic")
# Place built executables under <build>/bin.
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
# Create the directories that receive the generated documentation. This runs
# at configure time, so they exist before any build step writes into them.
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/doc)
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/doc/man)
# Aggregate target, part of the default build (ALL), that drives man page
# generation.
add_custom_target(manpages ALL)
# add_manpage(<manpage>)
#
# Render ${CMAKE_SOURCE_DIR}/doc/man/<manpage>.md into
# ${CMAKE_CURRENT_BINARY_DIR}/doc/man/<manpage> with pandoc and make the
# existing `manpages` target depend on the result.
#
#   <manpage>  output file name including its section suffix,
#              e.g. spell-qtl.1
#
# The page is declared as a real build OUTPUT that DEPENDS on its markdown
# source, so pandoc only runs again when the source changes (the former
# TARGET/PRE_BUILD form ran it on every build).
#
# Example: add_manpage(spell-qtl.1)
function(add_manpage manpage)
  set(manpage_source "${CMAKE_SOURCE_DIR}/doc/man/${manpage}.md")
  set(manpage_dir "${CMAKE_CURRENT_BINARY_DIR}/doc/man")
  set(manpage_output "${manpage_dir}/${manpage}")

  add_custom_command(
    OUTPUT "${manpage_output}"
    # Make sure the output directory exists even if it was removed after
    # the configure step.
    COMMAND "${CMAKE_COMMAND}" -E make_directory "${manpage_dir}"
    COMMAND pandoc --to man --standalone "${manpage_source}" -o "${manpage_output}"
    DEPENDS "${manpage_source}"
    WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
    COMMENT "Generating manpage ${manpage}"
    VERBATIM)

  # A custom command OUTPUT needs a target that consumes it; give each page
  # its own small target and chain it to the aggregate `manpages` target.
  add_custom_target("manpage-${manpage}" DEPENDS "${manpage_output}")
  add_dependencies(manpages "manpage-${manpage}")
endfunction()
# One section-1 man page per command-line tool; each name <page> is rendered
# from doc/man/<page>.md in the source tree.
add_manpage(spell-pedigree.1)
add_manpage(spell-marker.1)
add_manpage(spell-qtl.1)
#add_custom_command(OUTPUT spell-marker.1
# COMMAND pandoc -t man ${CMAKE_SOURCE_DIR}/doc/man/spell-marker.1.md -o spell-marker.1
# MAIN_DEPENDENCY ${CMAKE_SOURCE_DIR}/doc/man/spell-marker.1.md
# WORKING_DIRECTORY doc/man)
#add_document(
# spell-marker.1
# SOURCES
# ${CMAKE_SOURCE_DIR}/doc/man/spell-marker.1.md
# PANDOC_DIRECTIVES
# -t man
# PRODUCT_DIRECTORY
# doc/man
#)
# experimental binaries
......@@ -101,6 +137,12 @@ SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
# Install the three command-line tools into <prefix>/bin.
install(
  TARGETS
    spell-pedigree
    spell-marker
    spell-qtl
  DESTINATION bin)
# Install the man pages found in the build tree's doc/man directory into the
# section-1 man directory.
install(
  FILES
    "${CMAKE_CURRENT_BINARY_DIR}/doc/man/spell-pedigree.1"
    "${CMAKE_CURRENT_BINARY_DIR}/doc/man/spell-marker.1"
    "${CMAKE_CURRENT_BINARY_DIR}/doc/man/spell-qtl.1"
  DESTINATION share/man/man1)
INCLUDE(InstallRequiredSystemLibraries)
......
% SPELL-MARKER(1) Spell-QTL software suite
% Damien Leroux <damien.leroux@inra.fr>, Sylvain Jasson <sylvain.jasson@inra.fr>
% December 2016
# NAME
spell-marker – Compute the 1-point Parental Origin Probabilities in a pedigree given genotype or allelic observations
# SYNOPSIS
**spell-marker** [options...] **-n** *NAME* **-p** *FILE* **-m** *GEN:FORMAT* *FILE* [**-m**...]
# DESCRIPTION
**spell-marker** computes the 1-point Parental Origin Probabilities using Bayesian inference.
It outputs a data file that can be used with **spell-qtl** to compute the n-point Parental Origin Probabilities and
perform the actual QTL analysis.
Because each marker is supposed to be independent, **spell-marker** can perform the computations in parallel in a
variety of ways. See the *Job control* subsection of the options for details.
**-n**,**--name** *population_name*
: The name of this population (will also be used to create the output filename)
**-p**,**--pedigree** *FILE*
: Path to the output from spell-pedigree.
**-m**,**--marker-obs** *GEN:FORMAT* *FILE*
: Path to the marker observations file of generation *GEN* with given format *FORMAT*. This file must have as many
individuals as the pedigree has for that generation.
**spell-marker** knows three marker observation formats by default. Bi-allelic SNP observations encoded as 0, 1, 2
(*02*), bi-parental genotype observations as in the Mapmaker format (*ABHCD*), and phased outbred parental observations
as in carthagene (*CP*). You can define other formats using the **-mos** option.
You can direct **spell-marker** to use only a slice of an observation file using the following syntaxes:
*FILE*:**single_column_index**
*FILE*:**first_column_index**:**last_column_index**
When using genotype observations in a pedigree with more than two ancestors, you can specify the format for each
generation as **Parent1_letter**/**Parent2_letter** or **Parent1_generation**/**Parent2_generation**. The format
will be *ABHCD* with *a* and *b* replaced with the corresponding letters.
# OPTIONS
## Miscellaneous
**-h**, **--help**
: Display usage.
**-z**,**--noise** *level*
: Set the noise level for marker observations. Defaults to **0**.
**-wd**,**--work-directory** *path*
: Path to directory for output files. Defaults to the **current directory**.
## Job control
Select and configure the job control scheme
**-mt**,**--dispatch-multithread** *n_threads*
: Use single-machine, multi-threading.
**-ssh**,**--dispatch-SSH** *HOSTS*
: Use SSH for job dispatch. *HOSTS* is a comma-separated list of hostnames. **spell-marker** expects to find the same
file system structure on all hosts.
**-sge**,**--dispatch-SGE** *n_jobs* *qsub options*
: Use SGE for job dispatch. Use '-' for *qsub options* if you don't wish to provide any specific option.
## Inputs
Input files and configuration of observations.
There are two essential parameters to compute the genotype probabilities: the number of ancestors and the number
of observed alleles (for SNP observations). The number of ancestors is automatically computed from the given pedigree
or breeding design specification. The number of alleles is computed from the marker observation specifications.
**-mos**,**--marker-observation-spec** *path*
: Path to a marker observation specification file.
**-o**,**--output-generations** *comma-separated list*
: Specifies the list of variables to extract after the computation.
The state probabilities for all individuals in the given generations will be extracted and made available for spell-qtl.
Defaults to all generations.
## Output modes
Set the output mode. By default, only the population data file will be written. If you specify -O1, only the 1-point
Parental Origin Probabilities will be written, unless you also specify -Op.
**-Op**,**--output-population-data**
: Output the population data file for use in spell-qtl. This is the default behaviour.
**-O1**,**--output-one-point-prob**
: Output the 1-point Parental Origin Probabilities. This will disable the output of the population data file
unless -Op is also used.
# MARKER OBSERVATION FORMAT SPECIFICATION
A format specification file is a JSON object (dictionary) where each key is a format name. Each corresponding value is
a JSON object containing the following keys:
**"domain"**
: either *"allele"* or *"ancestor"*.
**"alphabet_from"**
: a string containing all the characters (alleles or ancestor letters) that can be observed.
**"scores"**
: an object where each key is an observation and each value an array of all the possible genotype/allelic pairs it
encompasses.
## Example: the *02*, *ABHCD*, and *CP* formats
~~~~
{
"02": {
"domain": "allele",
"alphabet_from": "01",
"scores": {
"0": ["00"],
"1": ["01", "10"],
"2": ["11"],
"-": ["00", "01", "10", "11"]
}
},
"ABHCD": {
"domain": "ancestor",
"alphabet_from": "ab",
"scores": {
"A": ["aa"],
"H": ["ab", "ba"],
"B": ["bb"],
"-": ["aa", "ab", "ba", "bb"],
"C": ["ab", "ba", "bb"],
"D": ["aa", "ab", "ba"]
}
},
"CP": {
"domain": "ancestor",
"alphabet_from": "abcd",
"scores": {
"0": ["ac", "ad", "bc", "bd"],
"1": ["ac"],
"2": ["ad"],
"3": ["ac", "ad"],
"4": ["bc"],
"5": ["ac", "bc"],
"6": ["ad", "bc"],
"7": ["ac", "ad", "bc"],
"8": ["bd"],
"9": ["ac", "bd"],
"A": ["ad", "bd"],
"B": ["ac", "ad", "bd"],
"C": ["bc", "bd"],
"D": ["ac", "bc", "bd"],
"E": ["ad", "bc", "bd"],
"F": ["ac", "ad", "bc", "bd"],
"a": ["ad", "bd"],
"b": ["ac", "ad", "bd"],
"c": ["bc", "bd"],
"d": ["ac", "bc", "bd"],
"e": ["ad", "bc", "bd"],
"f": ["ac", "ad", "bc", "bd"],
"-": ["ac", "ad", "bc", "bd"]
}
}
}
~~~~
# SEE ALSO
`spell-pedigree` (1), `spell-qtl` (1).
\ No newline at end of file
% SPELL-PEDIGREE(1) Spell-QTL software suite
% Damien Leroux <damien.leroux@inra.fr>, Sylvain Jasson <sylvain.jasson@inra.fr>
% December 2016
# NAME
spell-pedigree – Precompute the Markov Models for a pedigree
# SYNOPSIS
**spell-pedigree** [**-h**] [**-wd** *PATH*] [**-s** *CHAR*] **-p** *FILE*
# DESCRIPTION
**spell-pedigree** computes Markov Models representing the evolution of the genotype on all the individuals in a
pedigree.
It outputs a data file that can be used with **spell-marker** to compute the Parental Origin
Probabilities for this pedigree given allelic or genotype observations on a set of markers.
**-p**, **--pedigree-file** *FILE*
: Path to the pedigree file.
| The expected pedigree file must be a CSV file with each row in the following format:
| **GENERATION_NAME** ; **Individual number** ; **Parent1 number** ; **Parent2 number**
| Any additional column will be silently ignored by **spell-pedigree**.
|
| Individual numbers are expected to be increasing and all GREATER than zero, and the parent numbers for a given individual are expected to be LESS than the individual number.
|
| Breeding lines are encoded with Parent1 = Parent2 = 0.
| Selfings are encoded with Parent1 = Parent2.
| Doubled haploids are encoded with Parent2 = 0.
|
| The generation names will be used when specifying genotype and phenotype observations in the later steps.
|
| The first line is expected to be a header line and will be ignored.
# OPTIONS
**-h**, **--help**
: Display usage.
**-wd**, **--work-directory** *PATH*
: Path to directory for output files. Defaults to current directory.
**-s**,**--separator** *CHAR*
: Column delimiter character used in the pedigree file. Defaults to ";".
# OUTPUT
**spell-pedigree** will create a file named *FILE***.ped-data** next to the input file.
# SEE ALSO
`spell-marker` (1), `spell-qtl` (1).
\ No newline at end of file
% SPELL-QTL(1) Spell-QTL software suite
% Damien Leroux <damien.leroux@inra.fr>, Sylvain Jasson <sylvain.jasson@inra.fr>
% December 2016
# NAME
spell-qtl – Compute n-point Parental Origin Probabilities and perform QTL analysis on modern genetic datasets.
# SYNOPSIS
**spell-qtl** [*options...*] **-n** *NAME* **-gm** *MAP* **-p** *POPDATA* *GEN* *TRAITS* [**-p**...] [model and algorithms configuration...]
# DESCRIPTION
**spell-qtl** computes the n-point Parental Origin Probabilities along the linkage groups using the data provided by
**spell-marker**.
**-n**,**--name** *NAME*
: User-friendly name for this configuration
**-gm**,**--genetic-map** *MAP*
: Path to the genetic map file
**-p**,**--population** *POPDATA* *GEN* *TRAITS*
: Specify a new population (dataset) to work on.
The *POPDATA* path must point to a file output by spell-marker. *GEN* is the name of the phenotyped generation.
The *TRAITS* path must point to a trait observation file with the **same** number of individuals as defined for
the given generation in the pedigree for this population.
The genetic map **MUST** be specified **BEFORE** any population data.
# OPTIONS
## Miscellaneous
**-v**,**--version**
: Display version and exit
**-h**,**--help**
: Display usage and exit
**-N**,**--notes** *TEXT*
: Optional free text
**-wd**,**--work-directory** *PATH*
: Path to directory for cache files and outputs. Defaults to **/tmp**.
**-P**,**--parallel** *N_CORES*
: Setup parallel computations (number of cores to use or 'auto'). Defaults to **0**.
**--clean**
: Clears all cached files in the specified working directory (the **-wd** parameter **MUST** appear before **--clean**).
**-a**,**--ansi**
: Use ANSI escape sequences to display colors and realtime progress information at the top of the terminal.
Enabled by default only if output is on a terminal.
**-na**,**--no-ansi**
: Don't use ANSI escape sequences, don't display colors or realtime progress information.
## Model options
The following configures the construction of the linear model.
**connected**
: Select connected mode. Disabled by default.
In connected mode, the same ancestors in two datasets share the same column in the linear model.
**epistasis**
: Detect epistasis. Disabled by default.
**pleiotropy** *TOLERANCE*
: Detect pleiotropic QTLs. Disabled by default.
## Processing options
The following configures the QTL analysis.
The standard pipeline is:
1. skeleton creation
2. cofactors detection
3. QTLs detection
4. effects estimation
**qtl-threshold-permutations** *VALUE*
: Set the number of permutations to compute the QTL threshold value in automatic mode. Default is **10000**.
**qtl-threshold-quantile** *VALUE*
: Set the quantile value in range [0:1] to select the QTL threshold value in automatic mode. Default is **0.05**.
**qtl-threshold-value** *trait=value,...*
: Set the QTL threshold value manually for some traits. If not specified, will be automatically computed using the above settings.
**cofactor-threshold** *trait=value,...*
: Set the cofactor threshold value manually for some traits. Defaults to **value of QTL threshold * .9**.
**cofactor-exclusion-window** *DISTANCE*
: Set the half-size (in cM) of the exclusion window around cofactors. No detection will be performed inside this window. Defaults to **30**.
**step** *VALUE*
: Step size in cM. Defaults to **1**.
**skeleton** *MODE* *marker,...\ OR \ distance*
: Setup the cofactor detection skeleton. Mode can be either *manual*, *auto* or *none*.
If *manual*, specify a comma-separated marker list.
If *auto*, specify the minimum interval between markers in cM.
By default, mode is *auto* and interval is *20*.
**cofactor-detection** *ALGORITHM*
: Specify the cofactor detection algorithm. Available algorithms are *forward*, *backward*, *none*, and *all*. Default is *forward*.
**initial-selection** *SELECTION*
: Specify the initial selection of QTLs for the detection algorithm.
The selection is a comma-separated list of CHROMOSOME:POSITION values.
Setting an initial selection overrides and cancels skeleton generation and cofactor detection.
**QTL-detection** *ALGORITHM*
: Specify the QTL detection algorithm. Available algorithms are *none*, *CIM*, *CIM-*, *iQTLm*, and *iQTLm-GW*. The default algorithm is *iQTLm*.
# SEE ALSO
`spell-pedigree` (1), `spell-marker` (1).
\ No newline at end of file
......@@ -188,6 +188,10 @@ struct settings_t {
size_t max_order;
bool cross_indicator_can_interact;
std::vector<std::string> npoint_gen;
bool output_npoint;
settings_t()
: notes()
, map_filename(), map()
......@@ -228,6 +232,8 @@ struct settings_t {
, thread_stacks()
, max_order(1)
, cross_indicator_can_interact(false)
, npoint_gen()
, output_npoint(false)
{}
std::vector<std::pair<const chromosome*, double>>
......
......@@ -336,7 +336,7 @@ arguments = {
ensure(target)->n_threads = to<int>(*++ai);
SAFE_IGNORE_CALLBACK_ARGS;
}},
{{"-ssh", "--dipatch-SSH"},
{{"-ssh", "--dispatch-SSH"},
{"comma-separated host list"},
"Use SSH for job dispatch.",
false,
......@@ -433,7 +433,7 @@ arguments = {
// }},
{{"-m", "--marker-obs"},
{"gen:format", "path"},
"Path to the marker observations file of generation 'gen' with given format.\nIf a pedigree file is specified, the generation name 'gen' refers to individuals so labelled in the pedigree data. Otherwise, it refers to the corresponding generation in the breeding design specification file.",
"Path to the marker observations file of generation 'gen' with given format. This file must have as many individuals as the pedigree has for that generation.",
false,
{true},
[](CALLBACK_ARGS)
......@@ -478,7 +478,7 @@ arguments = {
SAFE_IGNORE_CALLBACK_ARGS;
}},
{{"-o", "--output-generations"},
{"comma-separated list of generation names"},
{"comma-separated list"},
"Specifies the list of variables to extract after the computation.\nThe state probabilities for all individuals in the given generations will be extracted and made available for spell-qtl.",
false,
{"all generations"},
......
......@@ -814,6 +814,19 @@ arguments = {
// }},
{"Input datasets", "The following specify the datasets you want processed.\nA dataset specification starts with argument -p, followed by one or more arguments -m.\nArguments -l, -e, and -z are non-requisite and may appear anywhere after -p.", false, {
{{"-gm", "--genetic-map"},
{"path"},
"Path to the genetic map file",
false,
{true},
[](CALLBACK_ARGS)
{
ensure(target)->map_filename = *++ai;
ifile ifs(ensure(target)->map_filename);
ensure(target)->map = read_data::read_map(ifs);
SAFE_IGNORE_CALLBACK_ARGS;
}},
{{"-p", "--population"},
{"popdata path", "QTL generation name", "traits path"},
"Specify a new population (dataset) to work on.\nThe popdata path musts point to a file output by spell-marker.\nThe QTL generation name is the name of the phenotyped generation.\n The traits path must point to a trait observation file with the SAME number of individuals as defined for the given generation in the pedigree for this population.\nThe genetic map MUST be specified BEFORE any population data.",
......@@ -979,6 +992,22 @@ arguments = {
}},
{"Processing options", "The following configures the QTL analysis.\nThe standard pipeline is:\n - skeleton creation\n - cofactors detection\n - QTLs detection\n - effects estimation", false, {
{{"output-nppop"},
{},
"Output the n-point POP for the given datasets and exit",
false,
{false},
[](CALLBACK_ARGS)
{
// auto& list = ensure(target)->npoint_gen;
// std::istringstream iss(*ai++);
// std::string gen;
// while (std::getline(iss, gen, ',')) {
// list.push_back(gen);
// }
ensure(target)->output_npoint = true;
SAFE_IGNORE_CALLBACK_ARGS;
}},
{{"qtl-threshold-permutations"},
{"value"},
"Set the number of permutations to compute the QTL threshold value in automatic mode",
......
......@@ -74,6 +74,53 @@ locus_probabilities(const context_key& ck, const locus_key& lk,
int ind,
const std::vector<double>& loci);
/*
 * Name the markers located at `locus`, if any.
 *
 * `hi` designates a [first, second) range of marker indices on `chr`. When
 * the position of the first marker of that range equals `locus` exactly, the
 * names of all markers in the range are returned joined by ',' and `hi` is
 * advanced to the next range. Otherwise an empty string is returned and `hi`
 * is left where it was.
 *
 * NOTE(review): `hi` is dereferenced unconditionally, so it presumably must
 * not be the end iterator -- confirm against the caller.
 */
std::string
marker_names_and_increment(chromosome_value chr, chromosome::haplotype_iterator& hi, double locus) {
    const auto marker_range = *hi;
    const double range_locus = chr->raw.marker_locus[marker_range.first];

    /* No marker at this exact position: nothing to name, keep the iterator. */
    if (range_locus != locus) {
        return "";
    }

    std::stringstream names;
    names << chr->raw.marker_name[marker_range.first];
    size_t idx = marker_range.first + 1;
    while (idx < marker_range.second) {
        names << ',' << chr->raw.marker_name[idx];
        ++idx;
    }

    /* This range has been consumed; move on to the next one. */
    ++hi;
    return names.str();
}
int
dump_locus_probabilities(const context_key& ck, const locus_key& lk,
int ind, const std::vector<double>& loci) {
value<locus_probabilities_type> lp = make_value<Disk>(locus_probabilities,
as_value(ck), as_value(lk),
as_value(ind),
as_value(loci));
std::string filename = MESSAGE("n-point." << ck->chr->name << '.' << ck->pop->gen->name << '.' << ind << ".csv");
std::ofstream ofs(filename);
auto chr = ck->chr;
auto hi = chr->begin();
ofs << "markers";
for (double l: loci) {
ofs << ';' << marker_names_and_increment(chr, hi, l);
}
ofs << std::endl;
ofs << "locus";
for (double l: loci) {
ofs << ';' << l;
}
ofs << std::endl;
for (int row = 0; row < lp->rows(); ++row) {
ofs << lp->row_labels[row];
for (int col = 0; col < lp->cols(); ++col) {
ofs << ';' << (*lp)(row, col);
}
ofs << std::endl;
}
return 0;
}
int main(int argc, const char** argv)
{
(void)msg_handler_t::instance();
......@@ -106,6 +153,28 @@ int main(int argc, const char** argv)
MSG_DEBUG("\x1b[2J\x1b[0;0H" << std::endl << std::endl << std::endl << std::endl);
for (int i = 0; i < active_settings->parallel; ++i) { MSG_DEBUG(""); }
}
if (active_settings->output_npoint) {
auto all_loci = full_search_intervals();
collection<int> all_lp;
for (const auto& chr_interval: all_loci) {
chromosome_value chr = chr_interval.first;
const auto& loci = chr_interval.second.front().all_positions;
locus_key empty_lk;
// for (const std::string& gen: active_settings->npoint_gen) {
for (auto pop: active_settings->populations) {
context_key ck(new context_key_struc(pop.get(), chr, loci));
auto alp = make_collection<Disk>(dump_locus_probabilities,
as_value(ck), as_value(empty_lk),
range<int>(0, pop->size(), 1),
as_value(loci));
all_lp.insert(all_lp.end(), alp.begin(), alp.end());
}
}
for (auto& x: all_lp) { *x; }
return 0;
}
{
std::stringstream sargs;
for (const char** a = argv + 1; *a; ++a) {
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment