Commit dbb2bdec authored by Jean-Benoist Leger

initial commit
Package: LineImputer
Type: Package
Title: Merging and imputation of files
Version: 0.1.58
Author: Stanislav Lipin, Tristan Mary-Huard, Michel Koskas, Émilie Lebarbier, Jean-Benoist Leger
Maintainer: Tristan Mary-Huard <maryhuar@agroparistech.fr>
Description: This package allows one to merge and impute files. It contains three functions: 'mergevcf', 'imputation' and 'impThreshold'. The function 'mergevcf' is designed for merging two or more VCF files, but it can also be applied to other highly structured file types. The function 'imputation' predicts the unknown values in VCF files; unlike 'mergevcf', it can be applied only to VCF files. The function 'impThreshold' post-processes the files after imputation and can also be applied only to VCF files.
License: GPL-2
Encoding: UTF-8
LazyData: true
Imports: Rcpp (>= 0.12.11), stringr (>= 1.2.0), tools
LinkingTo: Rcpp
RoxygenNote: 6.0.1
NeedsCompilation: yes
Packaged: 2018-05-03 13:03:31 UTC; lebarbier
importFrom(Rcpp, evalCpp)
import(stringr)
import(tools)
useDynLib(LineImputer)
export(imputation)
export(mergevcf)
export(impThreshold)
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
ImputatorCpp <- function(markov_order, input_path, input_filename, output_path, threshold, initial_count, do_viterbi, do_probas) {
invisible(.Call('_LineImputer_ImputatorCpp', PACKAGE = 'LineImputer', markov_order, input_path, input_filename, output_path, threshold, initial_count, do_viterbi, do_probas))
}
ImpThreshold_cpp <- function(probas_file, output_file, threshold, nb_supp_elements, nb_cols, replace_values, do_threshold) {
invisible(.Call('_LineImputer_ImpThreshold_cpp', PACKAGE = 'LineImputer', probas_file, output_file, threshold, nb_supp_elements, nb_cols, replace_values, do_threshold))
}
txttovcfCpp <- function(txt_filename, vcf_filename, output_filename, nb_supp_elements, nb_cols, replace_values) {
invisible(.Call('_LineImputer_txttovcfCpp', PACKAGE = 'LineImputer', txt_filename, vcf_filename, output_filename, nb_supp_elements, nb_cols, replace_values))
}
fusionCpp <- function(liststring, output_path, output_filename, input_file1, input_file2, unk_sign, comment_sign, separator, issorted, nb_supp_elements, value_length, file_extension, nb_supp_lines, last_iteration, num_col_sort) {
invisible(.Call('_LineImputer_fusionCpp', PACKAGE = 'LineImputer', liststring, output_path, output_filename, input_file1, input_file2, unk_sign, comment_sign, separator, issorted, nb_supp_elements, value_length, file_extension, nb_supp_lines, last_iteration, num_col_sort))
}
#' Performs imputation based on thresholded posterior probabilities
#'
#' This function creates an imputed data file from a posterior probability file by applying a thresholded classification rule to each posterior probability.
#'
#' @param probas_filename name of a posterior probability file, as created by \code{\link{imputation}}.
#' @param probas_path name of directory where \code{probas_filename} is located.
#' @param output_filename name of the output file. By default, the same as \code{probas_filename} (with an additional extension).
#' @param output_path name of the output directory.
#' @param threshold a numeric value between 0.5 and 1.
#' @param nb_supp_elements a numeric value indicating the number of metadata columns.
#'
#' @return
#' An imputed file with a format identical to the one of \code{probas_filename}.
#' By default the output file is created in the current work directory.
#'
#' @details
#' This function outputs a file of imputed data obtained by applying a thresholded classification rule to the posterior probabilities read from \code{probas_filename}. The thresholded rule performs classification as follows:
#' \itemize{
#' \item if the posterior probability is higher than \code{threshold}, the imputed value is '1/1',
#' \item if the posterior probability is lower than 1 - \code{threshold}, the imputed value is '0/0',
#' \item if the posterior probability belongs to the interval [1 - \code{threshold}, \code{threshold}], then no imputation is performed.
#' }
#' When applied with \code{threshold = 0.5}, the classification rule reduces to the classical maximum a posteriori (MAP) classification rule.
#'
#'
#' @examples
#'
#' ## Merge two VCF files, impute the merged file using both the Viterbi and
#' ## Forward-Backward algorithms (with the default threshold = 0.5 for the latter),
#' ## then re-impute the posterior probabilities with another threshold.
#' file_path_1 <- system.file("extdata", "a.vcf", package = "LineImputer")
#' file_path_2 <- system.file("extdata", "b.vcf", package = "LineImputer")
#' mergevcf(filelist = c(file_path_1, file_path_2))
#'
#' imputation(markov_order = 3, input_filename = 'merged_file.vcf', do_viterbi = TRUE)
#'
#' impThreshold(probas_filename = 'merged_file_MarkOrd_3_PosteriorProbabilities.vcf', threshold = 0.6)
#'
impThreshold <- function(probas_filename , probas_path=getwd(),
output_filename = paste0(substr(probas_filename, 1, nchar(probas_filename) - 4), '_Thres_', threshold, '_Imputed.vcf'),
output_path = getwd(),
threshold = 0.5, nb_supp_elements = 9) {
#### CHECKS
if (is.character(probas_filename) == FALSE)
stop('probas_filename must be a string')
if (is.character(probas_path) == FALSE)
stop('probas_path must be a string')
if (is.character(output_filename) == FALSE)
stop('output_filename must be a string')
if (is.character(output_path) == FALSE)
stop('output_path must be a string')
if (!is.numeric(threshold) || threshold < 0.5 || threshold > 1) {
stop('threshold must be a numeric value between 0.5 and 1')
}
if (!is.numeric(nb_supp_elements) || nb_supp_elements < 0 || nb_supp_elements %% 1 != 0) {
stop('nb_supp_elements must be a non-negative integer value')
}
probas_filename <- file.path(probas_path, probas_filename)
if (!file.exists(probas_filename)) {
stop('File ', probas_filename, ' does not exist.')
}
if (substr(probas_filename, nchar(probas_filename) - 3, nchar(probas_filename)) != '.vcf') {
stop('probas_filename must be in VCF format')
}
output_filename <- file.path(output_path, output_filename)
if (file.exists(output_filename)) {
unlink(output_filename)
}
if (file.create(output_filename, showWarnings = FALSE) == FALSE) {
stop('The path or name of output_filename is invalid: the output file could not be created')
}
try_result <- try({
file_read <- file(probas_filename, 'r')
first_line <- readLines(file_read, n = 1)
while (substr(first_line, 1, 1) == '#') {
first_line <- readLines(file_read, n = 1)
}
close(file_read)
nb_cols <- str_count(string = first_line, pattern = '\t') - nb_supp_elements - 1
ImpThreshold_cpp(probas_filename, output_filename, threshold,
nb_supp_elements, nb_cols, replace_values = FALSE, do_threshold = TRUE)
}, silent = TRUE)
if (inherits(try_result, 'try-error')) {
stop('Imputation was stopped. The file ', probas_filename, ' probably has an invalid format')
}
}
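The thresholding rule documented above can be illustrated on a plain numeric vector of posterior probabilities. The helper below is only a minimal sketch for illustration: it is not part of the package, it does not read or write VCF files, and it assumes the vector contains posterior probabilities of observing '1/1'.

# Minimal illustrative sketch of the thresholding rule (not part of LineImputer):
# operates on a plain numeric vector of posterior probabilities of '1/1'.
threshold_rule <- function(post, threshold = 0.5) {
  stopifnot(is.numeric(post), threshold >= 0.5, threshold <= 1)
  out <- rep('./.', length(post))       # default: the value stays missing
  out[post > threshold] <- '1/1'        # confident call for '1/1'
  out[post < 1 - threshold] <- '0/0'    # confident call for '0/0'
  out                                   # probabilities in [1 - threshold, threshold] stay './.'
}
threshold_rule(c(0.95, 0.40, 0.55, 0.02), threshold = 0.6)
# [1] "1/1" "./." "./." "0/0"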
#' Perform genotype imputation
#'
#' This function performs imputation of missing genotypic data based on a Markov chain model.
#'
#' @param markov_order a numeric value between 2 and 20 indicating the Markov chain order.
#' @param input_path path name of the directory containing the file to be imputed.
#' @param input_filename name of the file to be imputed.
#' @param output_path path name where the output files should be saved. Default is the current work directory.
#' @param nb_supp_elements a numeric value indicating the number of columns corresponding to metadata.
#' @param threshold a numeric value between 0.5 and 1.
#' @param initial_count initial probability of each datum to be '0/0' or '1/1'. Default is 0.5.
#' @param do_viterbi a boolean indicating whether the Viterbi algorithm should be used for imputation. Default is \code{FALSE}.
#' @param do_probas a boolean indicating whether the posterior probabilities should be computed. Default is \code{TRUE}.
#'
#' @return If \code{do_probas=TRUE} (default), the function outputs two files:
#' \itemize{
#' \item "\code{input_filename}_MarkOrd_\code{markov_order}_PosteriorProbabilities.vcf" that corresponds to the posterior probability file obtained from the Forward-Backward algorithm,
#' \item "\code{input_filename}_MarkOrd_\code{markov_order}_PosteriorProbabilities_Thres_\code{threshold}_Imputed.vcf" that corresponds to the imputation based on the MAP rule applied to the posterior probabilities.
#' }
#' If \code{do_viterbi=TRUE}, the function outputs a \code{.vcf} file of imputed data named "\code{input_filename}_MarkOrd_\code{markov_order}_Imputed_Viterbi.vcf".
#' All output files have a structure identical to the initial file \code{input_filename} provided to the function (i.e. identical metadata). All files are created in the \code{output_path} directory.
#'
#' @details
#' The function performs imputation (prediction) of missing data based on a non-homogeneous Markov Chain (MC) model. The data to be imputed are assumed to be lines (i.e. pure homozygous individuals) described by bi-allelic markers (SNP), available from the \code{input_filename}.vcf file.
#' The \code{.vcf} format assumes that lines are displayed in columns and markers in rows. Additionally, the \code{input_filename} file may contain comment lines (starting with two or more "#" characters), and a header line starting with a single "#".
#' The meta-data, i.e. additional information about each marker, should be displayed in the first \code{nb_supp_elements} columns. These metadata are copied in all output files.
#'
#' The Markov model assumes that, at a given locus, the observation of value '0/0' or '1/1' depends on the observed values at the \code{markov_order} previous loci. One first needs to compute the transition matrix at each locus, then to infer the missing data either by inferring the most probable allelic sequence for each individual,
#' or by inferring at each locus and for each individual the most probable allele.
#'
#' \itemize{
#' \item Transition matrices are computed as follows. All values of all transition matrices are initialized at \code{initial_count}. The transition probabilities are then updated
#' by counting (for each locus and each allele) how many times each sequence of size \code{markov_order} is followed by '0/0' or '1/1'.
#' \item Finding the most probable sequence can be obtained using the Viterbi algorithm through the \code{do_viterbi} option. The most probable allelic sequence is then output.
#' \item Obtaining for each individual and at each locus the most probable allelic value can be done using the Forward-Backward algorithm. The output is then the posterior probability (for each individual and each locus) of observing '1/1'.
#' Obtaining an imputed value requires an additional thresholding step: depending on whether the posterior probability is higher than \code{threshold}, lower than 1 - \code{threshold} or in the interval [1-\code{threshold}, \code{threshold}], the output value will be '1/1', '0/0' or './.', respectively.
#' If \code{threshold=0.5} this boils down to the classical Maximum a Posteriori classification rule, and all missing data are imputed. If \code{threshold>0.5}, only some of the missing values will be imputed.
#' }
#'
#'
#' @examples
#' ## Imputation of a VCF file
#' file_path_folder <- system.file("extdata", package = "LineImputer")
#' imputation(markov_order = 3, input_path = file_path_folder, input_filename = 'a.vcf', do_viterbi = TRUE)
#'
#' ## Merge two VCF files, then impute the merged file.
#' file_path_1 <- system.file("extdata", "a.vcf", package = "LineImputer")
#' file_path_2 <- system.file("extdata", "b.vcf", package = "LineImputer")
#' mergevcf(filelist = c(file_path_1, file_path_2))
#'
#' imputation(markov_order = 3, input_filename = 'merged_file.vcf', do_viterbi = TRUE)
#'
# Sys.setenv("PKG_CXXFLAGS"="-O3")
imputation <- function(markov_order = 2, input_path = getwd(),
input_filename, output_path = getwd(),
nb_supp_elements = 9, threshold = 0.5, initial_count = 0.5,
do_viterbi = FALSE, do_probas = TRUE){
if (is.character(input_path) == FALSE)
stop('input_path must be a string')
if (is.character(input_filename) == FALSE)
stop('input_filename must be a string')
if (substr(input_filename, nchar(input_filename) - 3, nchar(input_filename)) != '.vcf')
stop ('input_filename must have VCF format')
if (is.character(output_path) == FALSE)
stop('output_path must be a string')
if (!file.exists(file.path(input_path, input_filename))) {
stop('File ', file.path(input_path, input_filename), ' doesn\'t exist')
}
if (!is.numeric(markov_order) || markov_order < 2 || markov_order > 20 || markov_order %% 1 != 0) {
stop('markov_order must be a whole number between 2 and 20')
}
if (do_viterbi != FALSE & do_viterbi != TRUE)
stop('do_viterbi must be TRUE or FALSE')
if (do_probas != FALSE & do_probas != TRUE)
stop('do_probas must be TRUE or FALSE')
if (do_viterbi == FALSE & do_probas == FALSE) {
stop('You should select at least one mode of imputation: viterbi or probas.')
}
if (!is.numeric(threshold) || threshold < 0.5 || threshold > 1) {
stop('threshold must be a numeric value between 0.5 and 1')
}
if (!is.numeric(initial_count) || initial_count < 0 || initial_count > 1) {
stop('initial_count must be a numeric value between 0 and 1')
}
file_read <- file(file.path(input_path, input_filename), 'r')
text_line <- readLines(file_read, n = 1)
while (substr(text_line, 1, 1) == '#') {
text_line <- readLines(file_read, n = 1)
}
text_line <- readLines(file_read, n = markov_order)
close(file_read)
if (length(text_line) < markov_order) {
stop('The minimal number of rows must be: markov_order + 1')
}
input_filename <- substr(input_filename, 1, nchar(input_filename) - 4)
try_ForwardBackward <- try({
ImputatorCpp(markov_order, dirpath(input_path), input_filename, dirpath(output_path), threshold, initial_count, do_viterbi, do_probas)
if (do_probas == TRUE) {
file_read <- file(file.path(output_path, paste0(input_filename, '_MarkOrd_',markov_order,'_Thres_',threshold,'_InitialProb_',initial_count, '_ImputedData_ForwardBackward.txt')), 'r')
} else {
file_read <- file(file.path(output_path, paste0(input_filename, '_MarkOrd_',markov_order,'_Thres_',threshold,'_InitialProb_',initial_count,'_ImputedData_Viterbi.txt')), 'r')
}
first_line <- readLines(file_read, n = 1)
close(file_read)
nb_cols <- stringr::str_count(string = first_line, pattern = '\t') + 1
if (do_probas == TRUE) {
txttovcfCpp(txt_filename = file.path(output_path, paste0(input_filename, '_MarkOrd_',markov_order,'_Thres_',threshold,'_InitialProb_',initial_count, '_ImputedData_ForwardBackward.txt')),
vcf_filename = file.path(input_path, paste0(input_filename, '.vcf')),
output_filename = file.path(output_path, paste0(input_filename, '_MarkOrd_',markov_order,'_PosteriorProbabilities_Thres_',threshold, '_Imputed.vcf')),
nb_supp_elements, nb_cols, replace_values = TRUE)
txttovcfCpp(txt_filename = file.path(output_path, paste0(input_filename, '_MarkOrd_',markov_order,'_Thres_',threshold,'_InitialProb_',initial_count, '_ImputedData_Probabilities.txt')),
vcf_filename = file.path(input_path, paste0(input_filename, '.vcf')),
output_filename = file.path(output_path, paste0(input_filename, '_MarkOrd_',markov_order, '_PosteriorProbabilities.vcf')),
nb_supp_elements, nb_cols, replace_values = FALSE)
}
if (do_viterbi == TRUE) {
txttovcfCpp(txt_filename = file.path(output_path, paste0(input_filename, '_MarkOrd_',markov_order,'_Thres_',threshold,'_InitialProb_',initial_count, '_ImputedData_Viterbi.txt')),
vcf_filename = file.path(input_path, paste0(input_filename, '.vcf')),
output_filename = file.path(output_path, paste0(input_filename, '_MarkOrd_',markov_order, '_Imputed_Viterbi.vcf')),
nb_supp_elements, nb_cols, replace_values = TRUE)
}
}, silent = TRUE)
if (inherits(try_ForwardBackward, 'try-error')) {
stop('Imputation was stopped. The file ', file.path(input_path, paste0(input_filename, '.vcf')), ' probably has an invalid format')
}
#### CLEAN UP: remove the intermediate text files produced by the C++ routine
tmp_prefix <- file.path(output_path, paste0(input_filename, '_MarkOrd_', markov_order, '_Thres_', threshold, '_InitialProb_', initial_count))
tmp_suffixes <- c('_InitialMatrix.txt', '_TransposedMatrix.txt', '_ImputedData_ForwardBackward.txt', '_ImputedData_Probabilities.txt', '_ImputedData_Viterbi.txt', '_ImputedData_ForwardBackward_adjusted.txt', '_ImputedData_Viterbi_adjusted.txt')
for (tmp_suffix in tmp_suffixes) {
tmp_file <- paste0(tmp_prefix, tmp_suffix)
if (file.exists(tmp_file)) {
unlink(tmp_file)
}
}
}
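To fix ideas about the transition-count step described in the details of imputation, here is a toy sketch that estimates locus-specific transition probabilities for a first-order chain on a plain 0/1 matrix. It is only an illustration under simplifying assumptions: the package itself estimates order-markov_order transitions, handles missing values, and runs the Viterbi and Forward-Backward steps in C++.

# Toy sketch (not part of LineImputer): locus-specific transition probabilities
# for a first-order chain on {0, 1}, with pseudo-count 'initial_count' as in the docs.
toy_transitions <- function(geno, initial_count = 0.5) {
  # geno: matrix of 0/1 values, markers (loci) in rows and individuals in columns
  n_loci <- nrow(geno)
  trans <- array(initial_count, dim = c(n_loci - 1, 2, 2),
                 dimnames = list(NULL, from = c('0', '1'), to = c('0', '1')))
  for (l in 1:(n_loci - 1)) {
    for (from in 0:1) for (to in 0:1) {
      # count how often allele 'from' at locus l is followed by allele 'to' at locus l + 1
      trans[l, from + 1, to + 1] <- trans[l, from + 1, to + 1] +
        sum(geno[l, ] == from & geno[l + 1, ] == to, na.rm = TRUE)
    }
    trans[l, , ] <- trans[l, , ] / rowSums(trans[l, , ])   # normalise each 'from' row
  }
  trans
}
set.seed(1)
geno <- matrix(rbinom(5 * 8, 1, 0.5), nrow = 5)   # 5 loci, 8 individuals
toy_transitions(geno)[1, , ]                      # transition matrix between loci 1 and 2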
# Append a trailing path separator to a directory path (used when passing paths to the C++ routines)
dirpath <- function(x) substr(file.path(x, '.'), 1, nchar(x) + 1)
fusionlist <- function(filelist, output_path, output_filename, ext, unk_sign, comment_sign, separator, nb_supp_elements, value_length, nb_supp_lines,num_col_sort) {
nb <- 1
issorted <- FALSE
filelist_init <- filelist
filenamelist <- filelist
files_nb <- length(filelist)
last_iteration = FALSE
# Pairwise merging: repeatedly merge the first two files of the queue and push
# the intermediate result back at the end, until a single merged file remains
while (length(filelist) > 1) {
if (length(filelist) == 2) {
last_iteration = TRUE
}
merged_file <- paste0('merged', nb, '.', ext)
if (nb > (files_nb + files_nb%%2)/2) issorted <- TRUE
liststring <- paste0(filenamelist[1], '; ', filenamelist[2])
filenamelist[[length(filenamelist) + 1]] <- liststring
filenamelist <- filenamelist[-c(1,2)]
try_result <- try(fusionCpp(liststring, filelist[[1]], filelist[[2]], dirpath(output_path), merged_file, unk_sign, comment_sign, separator, issorted, nb_supp_elements, value_length, file_extension = ext, nb_supp_lines, last_iteration, num_col_sort), silent=FALSE)
if (inherits(try_result, 'try-error')) {
if (filelist[[1]] %in% filelist_init & filelist[[2]] %in% filelist_init) {
stop('Execution stopped while processing files ', filelist[[1]], ' and ', filelist[[2]])
} else stop('Execution stopped while processing file ', filelist[[1]])
}
filelist <- filelist[-c(1, 2)]
filelist[[length(filelist) + 1]] <- file.path(output_path, merged_file)
nb <- nb + 1
}
if (nb - 2 >= 1) {
for (i in 1:(nb - 2)) {
try(file.remove(file.path(output_path, paste0('merged', i, '.', ext))), silent = TRUE)
}
}
file.rename(file.path(output_path, paste0('merged', nb - 1, '.', ext)), file.path(output_path, output_filename))
}
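Stripped of all file handling, fusionlist follows a simple pairwise-reduce pattern on a queue: merge the first two elements, push the result at the back, repeat. A minimal sketch of that pattern (illustration only, not part of the package):

# Minimal sketch of the pairwise-merge queue pattern used above (not part of LineImputer)
pairwise_reduce <- function(items, merge_fun) {
  while (length(items) > 1) {
    merged <- merge_fun(items[[1]], items[[2]])   # merge the two items at the head of the queue
    items <- c(items[-c(1, 2)], list(merged))     # push the intermediate result at the back
  }
  items[[1]]
}
pairwise_reduce(list('a', 'b', 'c', 'd'), function(x, y) paste(x, y, sep = '+'))
# [1] "a+b+c+d"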
#' Merge two or more VCF files
#'
#' This function merges two or more VCF files by common markers and individuals. Additionally, it can be applied to merge files with other (highly structured) formats.
#'
#' @param output_path path name where the output file should be saved. Default is the current work directory.
#' @param output_filename name of the output file.
#' @param filelist a list/vector of names of files to be merged.
#' @param dirlist a list/vector of names of directories that contain the files to be merged.
#' @param ext a string indicating the extension for the files to be merged (only files in filelist or dirlist with the given extension will be merged). The output merged file will have the same extension. Default value is "vcf".
#' @param unk_sign a single string which is to be interpreted as the NA value. Note that this string should be of size \code{value_length}.
#' @param comment_sign a single character to be interpreted as the comment sign. Following the VCF format convention, file lines beginning with two comment signs are treated as comment lines; a line beginning with a single comment sign is interpreted as the header.
#' @param separator the field separator character. Default is tabulation.
#' @param nb_supp_elements a numeric value indicating the number of columns corresponding to metadata.
#' @param value_length a numeric value indicating the length (i.e. the number of digits) of a datum in each file.
#' @param recursive logical. Whether the listing should recurse into directories or not. This is of use only if a \code{dirlist} is specified. Default is \code{TRUE}.
#' @param num_col_sort a numeric value indicating which column should be used to sort the rows in the merged file.
#'
#' @return A file with extension given in the argument 'ext' corresponding to the merging of all files listed in filelist and/or belonging to directories listed in dirlist.
#' Additionally, two companion files, 'Incoherences.txt' and 'log.txt', are created in the same directory as the merged file.
#' The file 'Incoherences.txt' contains the names of columns and rows corresponding to mismatches (a mismatch occurs when two data sharing the same column and row names in two different files do not have the same value).
#' The file 'log.txt' contains the number of columns and rows of each merged file including intermediate merged files which are removed once the merging is completed. This file also contains the list of merged files. The same list appears in the console at the end of merging.
#'
#' @details
#' The \code{mergevcf} function merges two or more VCF files by common markers and individuals. All files are assumed to have a classical \code{.vcf} format: rows correspond to markers, and columns correspond to biological samples. The first lines of the files correspond to comment lines starting with two or more \code{comment_sign}s, and the headers are displayed in a line starting with a single \code{comment_sign}. Each line contains additional information about the marker, contained in the first \code{nb_supp_elements} columns, that constitute the metadata. These metadata are pasted in the output file.
#' The files are merged according to their column and row names. The merging rules are:
#' \enumerate{
#' \item for an unobserved combination (rowname,colname), the resulting coded value is \code{unk_sign},
#' \item for a combination (rowname, colname) for which either \code{unk_sign} or an identical value V is observed in all files, the resulting coded value is V,
#' \item for mismatches, the resulting coded value is \code{unk_sign}.
#' }
#' The default settings of the arguments \code{ext}, \code{unk_sign}, \code{comment_sign}, \code{separator}, \code{nb_supp_elements} and \code{value_length} correspond to those of the VCF format. Alternatively, \code{mergevcf} may be used to merge other types of structured files, albeit with severe restrictions regarding their format.
#'
#' @examples
#'
#' ## merge two VCF files
#' file_path_1 <- system.file("extdata", "a.vcf", package = "LineImputer")
#' file_path_2 <- system.file("extdata", "b.vcf", package = "LineImputer")
#' mergevcf(filelist = c(file_path_1, file_path_2))
#'
#' ## merge all VCF files in a directory
#' folder_path <- system.file("extdata", package = "LineImputer")
#' mergevcf(dirlist = folder_path)
#'
mergevcf <- function(output_path = getwd(), output_filename = 'merged_file',
filelist = NULL, dirlist = NULL,
ext = 'vcf', unk_sign = './.', comment_sign = '#', separator = '\t',
nb_supp_elements = 9, value_length = 3, recursive = TRUE,num_col_sort=2) {
### CHECKS
#Check the num_col_sort format
if (!is.numeric(num_col_sort) || num_col_sort < 0 || num_col_sort %% 1 != 0)
stop('num_col_sort should be a non-negative integer')
#Check the filelist format. If a list, transform as vector
if (is.vector(filelist) == FALSE & is.list(filelist) == FALSE & is.null(filelist) == FALSE)
stop('filelist has a wrong format')
filelist <- unlist(filelist)
#Check the dirlist format
if (is.vector(dirlist) == FALSE & is.list(dirlist) == FALSE & is.null(dirlist) == FALSE)
stop('dirlist has a wrong format')
dirlist <- unlist(dirlist)
# if (!(is.null(dirlist)))
# if (substr(dirlist, nchar(dirlist), nchar(dirlist)) == '/')
# dirlist <- substr(dirlist, 1, nchar(dirlist) - 1)
#Check the value_length format
if (!is.numeric(value_length) || value_length < 1 || value_length %% 1 != 0)
stop('value_length should be a positive integer')
#Check the nb_supp_elements format
if (!is.numeric(nb_supp_elements) || nb_supp_elements < 0 || nb_supp_elements %% 1 != 0)
stop('nb_supp_elements should be a non-negative integer')
#Check the recursive format
if (recursive != TRUE & recursive != FALSE)
stop('recursive should be a boolean')
#Check the comment_sign format
if (is.character(comment_sign) == FALSE | nchar(comment_sign) != 1)
stop('comment_sign should be a single character')
#Check the ouput_filename format
if (is.character(output_filename) == FALSE)
stop('output_filename must be a string')
#Check the ouput_path format
if (is.character(output_path) == FALSE)
stop('output_path must be a string')
# if (substr(output_path, nchar(output_path), nchar(output_path)) != '/')
# stop('output_path must have \'/\' at the end')
#Check the ext format and format the output filename
if (is.character(ext) == FALSE | ext == '')
stop('ext must be a string of length at least 1')
output_filename <- paste0(output_filename, '.', ext)
#Check the unk_sign format
if (is.character(unk_sign) == FALSE | nchar(unk_sign) < 1)
stop('unk_sign should be a string of length at least 1')
# if (ext != '') {
# output_filename <- paste0(output_filename, '.', ext)
# } else {
# output_filename <- paste0(output_filename, '.vcf')
# }
#Check that the output file can be created
if (file.create(file.path(output_path, output_filename), showWarnings = FALSE) == FALSE) {
stop('Unable to create the output_file.')
}
#Additional checks on dirlist
if (is.null(dirlist) == FALSE){
#Check the existence of each directory
for (j in seq_along(dirlist)) {
if (dir.exists(dirlist[j]) == FALSE) {
warning('Directory ', dirlist[j], ' not found')
}
}
dirlist <- dirlist[dir.exists(dirlist)]
#Check if there are files to be merged
pat <- paste0('\\.', ext, '$')
files <- list.files(dirlist, recursive = recursive, pattern = pat, full.names = TRUE)
if (length(files) == 0) {
warning(paste('dirlist does not contain any file with extension',ext))
} else {
for (k in 1:length(files)) {
filelist[[length(filelist) + 1]] <- files[k]
}
}
}
#Additional checks on filelist
for (i in seq_along(filelist)) {
if (file.exists(filelist[[i]]) == FALSE) {
warning('File ', filelist[[i]], ' not found')
}
}
filelist <- filelist[file.exists(as.character(filelist))]
if (length(filelist) < 2) stop('At least two files required for merging')
#Remove files whose name is redundant with files created during merging
if (file.exists(file.path(output_path, 'Incoherences.txt'))) {
unlink(file.path(output_path, 'Incoherences.txt'))
}
if (file.exists(file.path(output_path, 'log.txt'))) {
unlink(file.path(output_path, 'log.txt'))
}
#Assign an arbitrary value to nb_supp_lines, which is now obsolete
nb_supp_lines <- -1
### PERFORM MERGING
fusionlist(filelist, output_path, output_filename, ext, unk_sign, comment_sign, separator, nb_supp_elements, value_length, nb_supp_lines,num_col_sort)
write('Merged files: ', file = file.path(output_path, 'log.txt'), append = TRUE)
write(filelist, file = file.path(output_path, 'log.txt'), append = TRUE)
print('Merged files: ')
print(filelist)
#return()
}
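The three cell-wise merging rules listed in the details of mergevcf can be illustrated on two small named vectors of genotype calls. The helper below is only a hypothetical sketch: the package performs the merge in C++ on whole VCF files and additionally records mismatches in 'Incoherences.txt'.

# Illustrative sketch of the cell-wise merging rule (not part of LineImputer):
# merges two named vectors of genotype calls keyed by marker name.
merge_calls <- function(x, y, unk_sign = './.') {
  markers <- union(names(x), names(y))
  vapply(markers, function(m) {
    vals <- c(x[m], y[m])
    vals <- vals[!is.na(vals) & vals != unk_sign]    # keep observed, non-missing values
    if (length(vals) == 0) return(unk_sign)          # rule 1: never observed
    if (length(unique(vals)) == 1) return(vals[[1]]) # rule 2: consistent observed value
    unk_sign                                         # rule 3: mismatch
  }, character(1))
}
a <- c(m1 = '0/0', m2 = './.', m3 = '1/1')
b <- c(m2 = '1/1', m3 = '0/0', m4 = '0/0')
merge_calls(a, b)
#    m1    m2    m3    m4
# "0/0" "1/1" "./." "0/0"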
##fileformat=VCFv4.1
##filedate=20120310
##source="beagle2vcf 1.2"
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B73 11430 A3 A310 A340 A347 A374
10 275561 AX-91807321 A G . PASS . GT 0/0 1/1 0/0 0/0 1/1 0/0 0/0
10 276788 AX-91807318 T C . PASS . GT ./. 1/1 1/1 1/1 ./. 1/1 0/0
10 307655 AX-91157302 T G . PASS . GT 1/1 1/1 1/1 1/1 1/1 1/1 0/0
10 311032 AX-91157303 T C . PASS . GT 0/0 0/0 0/0 ./. 0/0 0/0 0/0
##fileformat=VCFv4.1
##filedate=20120310
##source="beagle2vcf 1.2"
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B73 11430 A3 A310 A340 A347 A374
10 275561 AX-91807321 A G . PASS . GT 0/0 1/1 0/0 0/0 1/1 0/0 0/0
10 276788 AX-91807318 T C . PASS . GT 1/1 0/0 1/1 1/1 1/1 1/1 0/0
10 307655 AX-91157302 T G . PASS . GT 1/1 1/1 1/1 1/1 1/1 1/1 0/0
10 311032 AX-91157303 T C . PASS . GT 0/0 0/0 0/0 0/0 1/1 0/0 0/0
##fileformat=VCFv4.1
##filedate=20120310
##source="beagle2vcf 1.2"
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B73 11430 A158 A188 A310 A340 A347 A374
10 307546 AX-91157301 A G . PASS . GT 0/0 0/0 1/1 0/0 0/0 0/0 0/0 1/1
10 307655 AX-91157302 T G . PASS . GT 1/1 1/1 0/0 1/1 1/1 1/1 1/1 0/0
10 311032 AX-91157303 T C . PASS . GT 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
\name{LineImputer-internal}
\alias{ImputatorCpp}
\alias{ImpThreshold_cpp}
\alias{txttovcfCpp}
\alias{fusionCpp}
\alias{fusionlist}
\title{Internal LineImputer Functions}
\description{
Internal LineImputer functions
}
\details{
These are not to be called by the user.
}
\keyword{ internal }
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/impThreshold.R
\name{impThreshold}
\alias{impThreshold}
\title{Performs imputation based on thresholded posterior probabilities}
\usage{
impThreshold(probas_filename, probas_path, output_filename,
output_path,threshold = 0.5, nb_supp_elements = 9)
}
\arguments{
\item{probas_filename}{name of a posterior probability file, as created by \code{\link{imputation}}.}
\item{probas_path}{name of directory where \code{probas_filename} is located.}
\item{output_filename}{name of the output file. By default, the same as \code{probas_filename} (with an additional extension).}
\item{output_path}{name of the output directory.}
\item{threshold}{a numeric value between 0.5 and 1.}
\item{nb_supp_elements}{a numeric value indicating the number of metadata columns.}
}
\value{An imputed file with a format identical to the one of \code{probas_filename}. By default the output file is created in the current work directory.
}
\description{This function creates an imputed data file from a posterior probability file by applying a thresholded classification rule to each posterior probability.
}
\details{This function outputs a file of imputed data obtained by applying a thresholded classification rule to the posterior probabilities read from \code{probas_filename}. The thresholded rule performs classification as follows:
\itemize{
\item if the posterior probability is higher than \code{threshold}, the imputed value is '1/1',
\item if the posterior probability is lower than 1-\code{threshold}, the imputed value is '0/0',
\item if the posterior probability belongs to the interval [1 - \code{threshold}, \code{threshold}], then no imputation is performed.
}
When applied with \code{threshold = 0.5}, the classification rule reduces to the classical maximum a posteriori (MAP) classification rule.
}
\examples{
## Merge two VCF files,
## impute the merged file using both the Viterbi and Forward-Backward algorithms,
## then re-impute the posterior probabilities with another threshold.