Commit fb8fa92e authored by Maria Bernard's avatar Maria Bernard
Browse files

No commit message

No commit message
parent c1068e53
......@@ -84,7 +84,7 @@ tmp_directory = <path>/tmp
#mothur = /usr/bin/mothur
#fastx_reverse_complement = /usr/bin/fastx_reverse_complement
#STAR = /usr/bin/STAR
#fastuniq = /usr/local/bin/fastuniq
#
# workflow specific
#
......
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import re
import numpy as np
from subprocess import Popen, PIPE
from ng6.analysis import Analysis
from weaver.function import ShellFunction
class Fastuniq (Analysis):
    """
    ng6 analysis component that removes duplicated read pairs from paired
    fastq files using the FastUniq tool (one FastUniq run per read1/read2 pair).
    """

    def define_parameters(self, read1_files, read2_files):
        """
        @param read1_files : paths to reads 1
        @param read2_files : paths to reads 2
        """
        self.add_input_file_list( "read1_files", "paths to reads 1", default=read1_files, required=True)
        self.add_input_file_list( "read2_files", "paths to reads 2", default=read2_files, required=True)
        # read1 and read2 lists must pair up one-to-one
        if len(self.read1_files) != len(self.read2_files):
            raise Exception("[ERROR] : the number of files for read 1 and read 2 are not equal. Please check your inputs")
        self.add_output_file_list("out_read1_files", "uniq read1 files", pattern='{basename}', items=self.read1_files)
        self.add_output_file_list("out_read2_files", "uniq read2 files", pattern='{basename}', items=self.read2_files)

    def define_analysis(self):
        # metadata displayed by the ng6 framework for this analysis
        self.name = "fastuniq"
        self.description = "duplicate removed fastq paired files "
        self.software = "fastuniq"
        self.options = "-t q -c 0"

    def get_version(self):
        """
        Return the FastUniq version string.

        NOTE(review): the version is hard-coded because it is not obtained by
        running the binary here — update this constant when the deployed
        fastuniq executable changes.
        """
        version = "1.1"
        return version

    def process(self):
        self.options = "-t q -c 0"
        for idx, read1 in enumerate(self.read1_files):
            read2 = self.read2_files[idx]
            # fastuniq reads its two input fastq paths from a list file (-i),
            # so the pair is written to a temporary file, one path per line
            tmp_list_file = self.get_temporary_file()
            with open(tmp_list_file, 'w') as tmp_file :
                tmp_file.write(read1 + "\n" + read2 + "\n")
            fastuniq = ShellFunction(self.get_exec_path("fastuniq") + " -i $1 -t q -o $2 -p $3 -c 0 ", cmd_format='{EXE} {IN} {OUT}')
            # includes= declares the fastq files as dependencies: they are read
            # through the list file, not passed directly on the command line
            fastuniq(inputs = [tmp_list_file], outputs = [self.out_read1_files[idx], self.out_read2_files[idx]], includes=[self.read1_files[idx], self.read2_files[idx]])

    def post_process(self):
        """
        Compute, for each pair, the percentage of duplicated reads removed and
        record it as a "percent_dup" result element.
        """
        # read counts before / after deduplication (read2 mirrors read1 counts)
        nb_before = [self.__count_reads(path) for path in self.read1_files]
        nb_after = [self.__count_reads(path) for path in self.out_read1_files]
        for i in range(len(nb_before)):
            # guard: an empty input file would otherwise raise ZeroDivisionError
            if nb_before[i] == 0:
                percent_dup = 0.0
            else:
                percent_dup = round((nb_before[i] - nb_after[i]) * 100.0 / nb_before[i], 2)
            self._add_result_element("fastuniq", "percent_dup", percent_dup)

    def __count_reads(self, fastq_path):
        """
        Return the number of reads in an uncompressed fastq file
        (4 lines per record), streaming the file instead of loading
        it fully in memory.
        """
        with open(fastq_path, "r") as handle:
            nb_lines = sum(1 for _line in handle)
        # floor division matches the original Python 2 integer `/ 4`
        return nb_lines // 4
\ No newline at end of file
......@@ -130,12 +130,10 @@ class Ustacks (Analysis):
if self.model_type == "fixed" :
self.options+= " --bc_err_freq " + str(self.bc_err_freq)
print self.options
for idx, read1 in enumerate(self.read1_files):
id=self.samples_id[idx]
format=gzfastq if read1.endswith(".gz") else "fastq"
# print self.get_exec_path("ustacks") + " -t " + format + " -f $1 -o " + self.output_directory + " -i " + str(id) + self.options
format="gzfastq" if read1.endswith(".gz") else "fastq"
ustacks = ShellFunction(self.get_exec_path("ustacks") + " -t " + format + " -f $1 -o " + self.output_directory + " -i " + str(id)
+ self.options + " 2> $2", cmd_format='{EXE} {IN} {OUT}')
ustacks(inputs = [read1], outputs = [self.stderrs[idx], self.tags[idx], self.alleles[idx], self.snps[idx]])
......@@ -152,6 +150,8 @@ class Ustacks (Analysis):
nb_sample,sum_reads_used,mean_reads_used,std_reads_used,min_reads_used,max_reads_used,median_reads_used=self.__colstat(nb_used)
# print "usacks nb lectures\nnb_sample,nb_tot,sum_reads_used,mean_reads_used,std_reads_used,min_reads_used,max_reads_used,median_reads_used\n",nb_sample,nb_tot,sum_reads_used,mean_reads_used,std_reads_used,min_reads_used,max_reads_used,median_reads_used
self._add_result_element("ustacks_log", "nb_sample", nb_sample)
self._add_result_element("ustacks_log", "nb_tot_reads", nb_tot)
percent_used=round((sum_reads_used*100.0/nb_tot),2)
......@@ -174,6 +174,8 @@ class Ustacks (Analysis):
nb_sample,sum_clust,mean_clust,std_clust,min_clust,max_clust,median_clust=self.__colstat(nb_cons_white)
# print "ustacks nb cluster\nnb_sample,nb_cons_tot,sum_clust,mean_clust,std_clust,min_clust,max_clust,median_clust\n",nb_sample,nb_cons_tot,sum_clust,mean_clust,std_clust,min_clust,max_clust,median_clust
self._add_result_element("ustacks_tags", "nb_sample", nb_sample)
self._add_result_element("ustacks_tags", "nb_tot_clust", nb_cons_tot)
percent_clust=round((sum_clust*100.0/nb_cons_tot),2)
......@@ -205,6 +207,8 @@ class Ustacks (Analysis):
mean_stacks=round(np.mean(clust_stacks),2)
med_stacks=np.median(clust_stacks)
# print "ustacks couv cluster\nnb_sample,max_cov,mean_cov,med_cov,max_2nd,mean_2nd,med_2nd,max_stacks,mean_stacks,med_stacks\n",nb_sample,max_cov,mean_cov,med_cov,max_2nd,mean_2nd,med_2nd,max_stacks,mean_stacks,med_stacks
self._add_result_element("ustacks_tags", "nb_sample", nb_sample)
self._add_result_element("ustacks_tags", "max_cov", max_cov)
self._add_result_element("ustacks_tags", "mean_cov", mean_cov)
......@@ -238,6 +242,8 @@ class Ustacks (Analysis):
nb_clust_var,sum_snp,mean_snp,std_snp,min_snp,max_snp,median_snp=self.__colstat(nb_snps_tot)
# print "ustacks nb snp\nnb_clust_var,sum_snp,mean_snp,std_snp,min_snp,max_snp,median_snp\n",nb_clust_var,sum_snp,mean_snp,std_snp,min_snp,max_snp,median_snp
self._add_result_element("ustacks_alleles", "nb_cluster_snp", nb_cluster_snp)
self._add_result_element("ustacks_alleles", "sum_snp", sum_snp)
self._add_result_element("ustacks_tags", "mean_clust", mean_snp)
......@@ -250,6 +256,8 @@ class Ustacks (Analysis):
percent_clust2_snp=round((cluster2_tot*100.0/nb_cluster_snp),2)
percent_clust3_snp=round((cluster3_tot*100.0/nb_cluster_snp),2)
# print "ustacks nb cluster var\npercent_clust1_snp,percent_clust2_snp,percent_clust3_snp\n",percent_clust1_snp,percent_clust2_snp,percent_clust3_snp
self._add_result_element("ustacks_alleles", "percent_cluster_1snp", percent_clust1_snp)
self._add_result_element("ustacks_alleles", "percent_cluster_2snp", percent_clust2_snp)
self._add_result_element("ustacks_alleles", "percent_cluster_3snp", percent_clust3_snp)
......@@ -279,11 +287,15 @@ class Ustacks (Analysis):
mediansnp_cov = round(np.median(snp_cov), 2)
maxsnp_cov = np.max(snp_cov)
# print "ustacks cov snp\nminsnp_cov,meansnp_cov,mediansnp_cov,maxsnp_cov\n",minsnp_cov,meansnp_cov,mediansnp_cov,maxsnp_cov
self._add_result_element("ustacks_alleles", "minsnp_cov", minsnp_cov)
self._add_result_element("ustacks_alleles", "meansnp_cov", meansnp_cov)
self._add_result_element("ustacks_alleles", "mediansnp_cov", mediansnp_cov)
self._add_result_element("ustacks_alleles", "maxsnp_cov", maxsnp_cov)
# print "ustacks\nnb_hap\n",nb_hap
self._add_result_element("ustacks_alleles", "clust_var_1hap", nb_hap["hap1"])
self._add_result_element("ustacks_alleles", "clust_var_2hap", nb_hap["hap2"])
self._add_result_element("ustacks_alleles", "clust_var_3hap", nb_hap["hap3"])
......
......@@ -41,7 +41,7 @@ class RADseq (NG6Workflow):
return 'radseq'
def get_description(self):
return "RADseq data analysis workflow"
return "RADseq data quality analysis workflow"
def define_parameters(self, function="process"):
......@@ -65,11 +65,6 @@ class RADseq (NG6Workflow):
self.add_parameter("disable_removal_algo", "removal cluster correction algo", type=bool)
self.add_parameter("disable_deleveraging_algo", "deleveraging cluster correction algo", type=bool)
self.add_parameter("max_locus", "max stacks per cluster", type=int)
self.add_parameter("model_type", "model_type : snp, bounded or fixed", type=str)
self.add_parameter("alpha", "alpha threshold for snp or bounded model", type=float)
self.add_parameter("bound_low", "bound low threshold for bounded model", type=float)
self.add_parameter("bound_high", "bound low threshold for bounded model", type=float)
self.add_parameter("bc_err_freq", "barcode error frequency for fixed model", type=float)
# cstacks
self.add_parameter("catalog_mismatches", "How many mismatches allowed between sample to generate the cluster catalog",default = 1, type=int)
......@@ -116,32 +111,32 @@ class RADseq (NG6Workflow):
splitbc = self.add_component("Splitbc", [ fastq_files_1, barcode_files, fastq_files_2, rad, rad_tag, self.mismatches,
self.tag_mismatch, self.trim_barcode, self.trim_reads2])
# creation des listes
samples_id = []
read1_files = []
read2_files = []
# for idx, read1 in enumerate(splitbc.output_read1):
# samples_id.append(idx)
# read1_files.append(read1)
# read2_files.append(splitbc.output_read2[idx])
# FASTUNIQ
fastuniq = self.add_component("Fastuniq", [splitbc.output_read1,splitbc.output_read2])
#
# # PROCESS_RATAG
# process_radtag(read1_files,read2_files)
# # PROCESS_RADTAG
# process_radtag(fastuniq.out_read1_files,fastuniq.out_read2_files)
#
# # USTACKS
# # creation des listes pour conserver l'ordre entre read1 read2 et sample id
# samples_id = []
# read1_files = []
# read2_files = []
# # check non empty and MAJ sample_id
# for idx, read1 in enumerate(process_radtag.output_read_1):
# if empty read1:
# sample_id.pop(idx)
# test composant Ustacks avant process_radtags ==> a supprimer
for idx, read1 in enumerate(splitbc.output_read1):
samples_id.append(idx)
read1_files.append(read1)
# if os.path.getsize(read1) > 0:
# read1_files.append(read1)
# read2_files.append(process_radtag.output_read_2[idx])
# samples_id.append(idx)
#
# test a supprimer
read1_files = fastuniq.out_read1_files
samples_id = range(0,len(read1_files))
ustacks_opt={"samples_id":samples_id,"read1_files":read1_files}
# ustacks_opt={"samples_id":samples_id,"read1_files":process_radtag.output_read_1}
if self.min_cov :
ustacks_opt["min_cov"] = self.min_cov
if self.primary_mismatch :
......@@ -156,18 +151,8 @@ class RADseq (NG6Workflow):
ustacks_opt["deleveraging_algo"] = False
if self.max_locus :
ustacks_opt["max_locus"] = self.max_locus
if self.model_type :
ustacks_opt["model_type"] = self.model_type
if self.alpha :
ustacks_opt["alpha"] = self.alpha
if self.bound_low :
ustacks_opt["bound_low"] = self.bound_low
if self.bound_high :
ustacks_opt["bound_high"] = self.bound_high
if self.bc_err_freq :
ustacks_opt["bc_err_freq"] = self.bc_err_freq
print ustacks_opt
# print ustacks_opt
ustacks = self.add_component("Ustacks", [],ustacks_opt)
#cstacks = self.add_component("Cstacks", [ustacks.alleles, ustacks.snps, ustacks.tags, self.args["catalog_mismatches"]])
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment