Commit c1068e53 authored by Maria Bernard

RADSeq ustacks component and RADSeq init

parent 4ed025f7
@@ -26,7 +26,7 @@ from weaver.function import ShellFunction
class Ustacks (Analysis):
def define_parameters(self, samples_id, read1_files, min_cov=3, primary_mismatch=2, secondary_mismatch=4, uncall_hap=False, removal_algo=True, \
deleveraging_algo=True, max_locus=3, model_type="snp", alpha=0.05, bound_low=0, bound_high=1, bc_err_freq=0):
deleveraging_algo=True, max_locus=3, model_type="snp", alpha=0.05, bound_low=0.0, bound_high=1.0, bc_err_freq=0.0):
"""
@param samples_id (-i): list of individuals id
@param read1_files (-1): paths to reads 1
@@ -48,26 +48,26 @@ class Ustacks (Analysis):
if len(self.read1_files) != len(self.samples_id):
raise Exception("[ERROR] : the number of files and the number of id are not equal. Please check your inputs")
self.add_parameter("min_cov", "Minimum depth of coverage required to create a stack (default 3).", default=min_cov)
self.add_parameter("primary_mismatch", "Maximum distance (in nucleotides) allowed between stacks (default 2).", default=primary_mismatch)
self.add_parameter("secondary_mismatch", "Maximum distance allowed to align secondary reads to primary stacks (adviced: M + 2).", default=secondary_mismatch)
self.add_parameter("uncall_hap", "disable calling haplotypes from secondary reads.", default=uncall_hap)
self.add_parameter("removal_algo", "enable the Removal algorithm, to drop highly-repetitive stacks (and nearby errors) from the algorithm.", default=removal_algo)
self.add_parameter("deleveraging_algo", "enable the Deleveraging algorithm, used for resolving over merged tags.", default=deleveraging_algo)
self.add_parameter("max_locus", "maximum number of stacks at a single de novo locus (default 3).", default=max_locus)
self.add_parameter("model_type", "either 'snp' (default), 'bounded', or 'fixed'", default=model_type)
self.add_parameter("min_cov", "Minimum depth of coverage required to create a stack (default 3).", default=min_cov, type=int)
self.add_parameter("primary_mismatch", "Maximum distance (in nucleotides) allowed between stacks (default 2).", default=primary_mismatch, type=int)
self.add_parameter("secondary_mismatch", "Maximum distance allowed to align secondary reads to primary stacks (adviced: M + 2).", default=secondary_mismatch, type=int)
self.add_parameter("uncall_hap", "disable calling haplotypes from secondary reads.", default=uncall_hap, type=bool)
self.add_parameter("removal_algo", "enable the Removal algorithm, to drop highly-repetitive stacks (and nearby errors) from the algorithm.", default=removal_algo, type=bool)
self.add_parameter("deleveraging_algo", "enable the Deleveraging algorithm, used for resolving over merged tags.", default=deleveraging_algo, type=bool)
self.add_parameter("max_locus", "maximum number of stacks at a single de novo locus (default 3).", default=max_locus, type=int)
self.add_parameter("model_type", "either 'snp' (default), 'bounded', or 'fixed'", default=model_type, type=str)
if self.model_type not in ['snp', 'bounded', 'fixed'] :
raise Exception("[ERROR] : the ustacks model type must be either 'snp' (default), 'bounded', or 'fixed'")
if self.model_type == "snp" or self.model_type =="bounded" :
self.add_parameter("alpha", "chi square significance level required to call a heterozygote or homozygote, either 0.1, 0.05 (default), 0.01, or 0.001.", default=alpha)
self.add_parameter("alpha", "chi square significance level required to call a heterozygote or homozygote, either 0.1, 0.05 (default), 0.01, or 0.001.", default=alpha, type=float)
if self.model_type == "bounded" :
self.add_parameter("bound_high", "upper bound for epsilon, the error rate, between 0 and 1.0 (default 1).", default=bound_high)
self.add_parameter("bound_low", "lower bound for epsilon, the error rate, between 0 and 1.0 (default 0).", default=bound_low)
self.add_parameter("bound_high", "upper bound for epsilon, the error rate, between 0 and 1.0 (default 1).", default=bound_high, type=float)
self.add_parameter("bound_low", "lower bound for epsilon, the error rate, between 0 and 1.0 (default 0).", default=bound_low, type=float)
if self.bound_high < self.bound_low:
raise Exception("[ERROR] : lower bound for epsilon must be smaller than upper bound")
if self.model_type == "fixed" :
self.add_parameter("bc_err_freq", "specify the barcode error frequency, between 0 and 1.0.", default=bc_err_freq)
self.add_parameter("bc_err_freq", "specify the barcode error frequency, between 0 and 1.0.", default=bc_err_freq, type=float)
self.add_output_file_list("tags", "cluster files", pattern='{basename_woext}.tags.tsv', items=self.read1_files)
self.add_output_file_list("alleles", "haplotype files", pattern='{basename_woext}.alleles.tsv', items=self.read1_files)
@@ -130,7 +130,7 @@ class Ustacks (Analysis):
if self.model_type == "fixed" :
self.options+= " --bc_err_freq " + str(self.bc_err_freq)
print self.options
for idx, read1 in enumerate(self.read1_files):
id=self.samples_id[idx]
@@ -58,19 +58,18 @@ class RADseq (NG6Workflow):
self.add_exclusion_rule('trim_reads2', 'trim_barcode')
# ustacks
self.add_multiple_parameter("ustacks_opt", "ustacks options",flag="--ustacks-opt")
self.add_parameter("minCov", "stacks minimum coverage", add_to = "ustacks_opt")
self.add_parameter("primary_mismatch", "stacks max mismatch", add_to = "ustacks_opt")
self.add_parameter("secondary_mismatch", "2nd read max mismatch", add_to = "ustacks_opt")
self.add_parameter("uncall_hap", "uncall hap with 2nd read", add_to = "ustacks_opt")
self.add_parameter("removal_algo", "removal cluster correction algo", add_to = "ustacks_opt")
self.add_parameter("deleveraging_algo", "deleveraging cluster correction algo", add_to = "ustacks_opt")
self.add_parameter("max_locus", "max stacks per cluster", add_to = "ustacks_opt")
self.add_parameter("model_type", "model_type : snp, bounded or fixed", add_to = "ustacks_opt")
self.add_parameter("alpha", "alpha threshold for snp or bounded model", add_to = "ustacks_opt")
self.add_parameter("bound_low", "bound low threshold for bounded model", add_to = "ustacks_opt")
self.add_parameter("bound_high", "bound low threshold for bounded model", add_to = "ustacks_opt")
self.add_parameter("bc_err_freq", "barcode error frequency for fixed model", add_to = "ustacks_opt")
self.add_parameter("min_cov", "stacks minimum coverage", type=int)
self.add_parameter("primary_mismatch", "stacks max mismatch", type=int)
self.add_parameter("secondary_mismatch", "2nd read max mismatch", type=int)
self.add_parameter("uncall_hap", "uncall hap with 2nd read", type=bool)
self.add_parameter("disable_removal_algo", "removal cluster correction algo", type=bool)
self.add_parameter("disable_deleveraging_algo", "deleveraging cluster correction algo", type=bool)
self.add_parameter("max_locus", "max stacks per cluster", type=int)
self.add_parameter("model_type", "model_type : snp, bounded or fixed", type=str)
self.add_parameter("alpha", "alpha threshold for snp or bounded model", type=float)
self.add_parameter("bound_low", "bound low threshold for bounded model", type=float)
self.add_parameter("bound_high", "bound low threshold for bounded model", type=float)
self.add_parameter("bc_err_freq", "barcode error frequency for fixed model", type=float)
# cstacks
self.add_parameter("catalog_mismatches", "How many mismatches allowed between sample to generate the cluster catalog",default = 1, type=int)
@@ -89,18 +88,18 @@ class RADseq (NG6Workflow):
if pools.has_key(p["id"]) :
raise ValueError, "Duplicated pool id." + p['id']
pools[p["id"]] = (p, [])
for sample in self.samples :
pool_id = sample.get_metadata('pool_id')
if not pools.has_key(pool_id):
raise ValueError, "The pool id " + pool_id + " does not exists in (individual " + sample.name + ")"
pools[pool_id][1].append(sample)
# prepare fastq files
fastq_files_1 = []
fastq_files_2 = []
barcode_files = []
for pool_id, data in pools.iteritems() :
p = data[0]
samples = data[1]
@@ -109,47 +108,67 @@ class RADseq (NG6Workflow):
            fastq_files_1.append(p['read1'])
if p['read2'] :
fastq_files_2.append(p['read2'])
# write barcode file
with open(tmp_barcode, "w") as ff:
for sample in samples :
ff.write(sample.name + "\t" + sample.get_metadata('barcode') +"\n")
splitbc = self.add_component("Splitbc", [ fastq_files_1, barcode_files, fastq_files_2, rad, rad_tag, self.mismatches,
self.tag_mismatch, self.trim_barcode, self.trim_reads2])
        # build the lists
samples_id = []
read1_files = []
read2_files = []
# for idx, read1 in enumerate(splitbc.output_read1):
# samples_id.append(idx)
# read1_files.append(read1)
# read2_files.append(splitbc.output_read2[idx])
#
#
        # # PROCESS_RADTAG
# process_radtag(read1_files,read2_files)
        # check non-empty and update sample_id
# # USTACKS
        # # check non-empty and update sample_id
# for idx, read1 in enumerate(process_radtag.output_read_1):
# if empty read1:
# sample_id.pop(idx)
# ustacks = self.add_component("Ustacks", [sample_id,process_radtag.outputread1])
# test
samples_id = []
read1_files = []
        # test the Ustacks component before process_radtags ==> to be removed
for idx, read1 in enumerate(splitbc.output_read1):
samples_id.append(idx)
read1_files.append(read1)
# print samples_id
# print read1_files
ustacks_opt={}
ustacks = self.add_component("Ustacks", [samples_id,read1_files])
ustacks_opt={"samples_id":samples_id,"read1_files":read1_files}
# ustacks_opt={"samples_id":samples_id,"read1_files":process_radtag.output_read_1}
if self.min_cov :
ustacks_opt["min_cov"] = self.min_cov
if self.primary_mismatch :
ustacks_opt["primary_mismatch"] = self.primary_mismatch
if self.secondary_mismatch :
ustacks_opt["secondary_mismatch"] = self.secondary_mismatch
if self.uncall_hap :
ustacks_opt["uncall_hap"] = self.uncall_hap
if self.disable_removal_algo :
ustacks_opt["removal_algo"] = False
if self.disable_deleveraging_algo :
ustacks_opt["deleveraging_algo"] = False
if self.max_locus :
ustacks_opt["max_locus"] = self.max_locus
if self.model_type :
ustacks_opt["model_type"] = self.model_type
if self.alpha :
ustacks_opt["alpha"] = self.alpha
if self.bound_low :
ustacks_opt["bound_low"] = self.bound_low
if self.bound_high :
ustacks_opt["bound_high"] = self.bound_high
if self.bc_err_freq :
ustacks_opt["bc_err_freq"] = self.bc_err_freq
print ustacks_opt
ustacks = self.add_component("Ustacks", [],ustacks_opt)
#cstacks = self.add_component("Cstacks", [ustacks.alleles, ustacks.snps, ustacks.tags, self.args["catalog_mismatches"]])
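
As a possible cleanup (a sketch only, not what this commit does), the if-chain above could be collapsed so that only the options the user actually set are forwarded, keeping the same truthiness behaviour and the same inversion of the two disable_* flags:

optional = {"min_cov": self.min_cov, "primary_mismatch": self.primary_mismatch,
            "secondary_mismatch": self.secondary_mismatch, "uncall_hap": self.uncall_hap,
            "max_locus": self.max_locus, "model_type": self.model_type,
            "alpha": self.alpha, "bound_low": self.bound_low,
            "bound_high": self.bound_high, "bc_err_freq": self.bc_err_freq}
ustacks_opt = {"samples_id": samples_id, "read1_files": read1_files}
# keep only the options that evaluate to True, exactly as the if-chain does
ustacks_opt.update(dict((k, v) for k, v in optional.items() if v))
if self.disable_removal_algo:
    ustacks_opt["removal_algo"] = False
if self.disable_deleveraging_algo:
    ustacks_opt["deleveraging_algo"] = False
ustacks = self.add_component("Ustacks", [], ustacks_opt)
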
@@ -71,4 +71,4 @@ HiSeq 2000
--type
Region 1
--species
Escherichia coli
Escherichia coli
\ No newline at end of file
@@ -59,6 +59,8 @@ Region 1
--species
Escherichia coli
# ustacks option if not default
--ustacks-opt
minCov=3
\ No newline at end of file
# ustacks option
--min-cov
2
--uncall-hap
--disable-removal-algo
\ No newline at end of file
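
Assuming flag names keep being derived from the parameter names, as --min-cov, --uncall-hap and --disable-removal-algo are above, a bounded-model run could be configured in the same flag-per-line layout (values below are only an illustration):

# ustacks option, bounded model (hypothetical example)
--model-type
bounded
--alpha
0.05
--bound-high
0.1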