Commit b9143e84 authored by Maria Bernard's avatar Maria Bernard
Browse files

No commit message

No commit message
parent fb8fa92e
......@@ -16,30 +16,258 @@
#
import os
from subprocess import Popen, PIPE
from ng6.analysis import Analysis
from weaver.function import PythonFunction
from collections import Counter
import numpy as np
import re
from jflow.component import Component
from jflow.iotypes import OutputFile, InputFileList, Formats
from jflow.abstraction import MultiMap
def wrap_cstacks(exec_path, mismatch, batch_id, inputs_tags_list, output_tags, output_alleles, output_snps, stderr_path):
from subprocess import Popen, PIPE
inputs = ""
with open(inputs_tags_list, 'r') as input_tags :
for i in input_tags:
inputs += " -s " + os.path.abspath(i.strip()).replace(".tags"," ").split(" ")[0]
out_dir=os.path.dirname(output_tags)
cmd = [exec_path.strip(), " -b ", str(batch_id) , " -n ", str(mismatch), inputs, " -o ",out_dir ]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
# write down the stderr
stdeh = open(stderr_path, "w")
stdeh.write(stderr)
stdeh.close()
from weaver.function import ShellFunction
# cmd = exec_path+" -b "+str(batch_id)+" -n "+str(mismatch)+inputs+" -o "+out_dir+" 2> "+stderr_path
# os.system(cmd)
print cmd
class Cstacks (Component):
class Cstacks (Analysis):
def define_parameters(self, alleles_files, snps_files, tags_files, samples_id, population_list, batch_id=1, mismatch=1):
"""
@param alleles_files : list of individuals alleles files
@param snps_files : list of individuals snps files
@param tags_files : list of individuals clustering files
@param samples_id : list of individuals id
@param samples_id : list of individuals id
@param population_list :which sample belong to which population, same order as sample_id
@param batch_id (-b) : batch id
@param mismatch (-n) : number of mismatches allowed between sample tags when generating the catalog. default 1
"""
self.add_input_file_list( "alleles_files", "list of individuals alleles files", default=alleles_files, required=True)
self.add_input_file_list( "snps_files", "list of individuals snps files", default=snps_files, required=True)
self.add_input_file_list( "tags_files", "list of individuals clustering files", default=tags_files, required=True)
if len(self.tags_files) != len(self.alleles_files) and len(self.tags_files)!= len(self.snps_files) :
raise Exception("[ERROR] : number of ustacks input files are not equal. Please check the ustacks result")
self.add_parameter_list( "samples_id", "individuals id list", default=samples_id, required=False)
self.add_parameter_list( "population_list", "which sample belong to which population, same order as sample_id", default=population_list, required=False)
if len(self.samples_id) != len(self.population_list) :
raise Exception("[ERROR] : samples_id and population_list are not equal. Please check your inputs")
self.add_parameter("batch_id", "batch_id", default=batch_id, type=int)
self.add_parameter("mismatch", "number of mismatches allowed between sample tags when generating the catalog.", default=mismatch, type=int)
self.add_output_file("catalog_tags", "catalog cluster files", filename="batch_"+`self.batch_id`+".catalog.tags.tsv")
self.add_output_file("catalog_alleles", "catalog alleles files", filename="batch_"+`self.batch_id`+".catalog.alleles.tsv")
self.add_output_file("catalog_snps", "catalog snps files", filename="batch_"+`self.batch_id`+".catalog.snps.tsv")
self.add_output_file("stderr", "cstacks log", filename="cstacks_batch_"+`self.batch_id`+".stderr")
def define_parameters(self, alleles_files, snps_files, tags_files, mismatch=1):
self.alleles_files = InputFileList(alleles_files)
self.snps_files = InputFileList(snps_files)
self.tags_files = InputFileList(tags_files)
self.mismatch = mismatch
self.catalog_tags = OutputFile(os.path.join(self.output_directory, "batch_1.catalog.tags.tsv"))
self.catalog_alleles = OutputFile(os.path.join(self.output_directory, "batch_1.catalog.alleles.tsv"))
self.catalog_snps = OutputFile(os.path.join(self.output_directory, "batch_1.catalog.snps.tsv"))
self.stderrs = OutputFile(os.path.join(self.output_directory, "batch_1.catalog.stderrs"))
def get_version(self):
cmd = [self.get_exec_path("cstacks"), "--version"]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
return stderr.split()[1]
def define_analysis(self):
self.name = "cstacks"
self.description = "Construction of a catalog of consensus locus among all individuals"
self.software = "cstacks"
self.options = " -n " + str(self.mismatch)
def process(self):
inputs = ""
for i, input in enumerate(self.alleles_files):
inputs += " -s " + ".".join(input.split(".")[:-2])
cstacks = ShellFunction(self.get_exec_path("cstacks") + " -b 1 " + inputs + " -n " + str(self.mismatch) + \
" -o " + self.output_directory + " 2> $1", cmd_format='{EXE} {OUT} {IN}')
cstacks(inputs=[self.alleles_files, self.snps_files, self.tags_files], outputs = [self.stderrs, self.catalog_tags, self.catalog_alleles, self.catalog_snps])
tmp_list_file = self.get_temporary_file()
with open(tmp_list_file, 'w') as tmp_file :
for tag in self.tags_files:
tmp_file.write(tag+"\n")
cstacks = PythonFunction(wrap_cstacks, cmd_format='{EXE} {ARG} {IN} {OUT}')
cstacks(inputs = [tmp_list_file], outputs = [self.catalog_tags, self.catalog_alleles, self.catalog_snps, self.stderr], \
arguments = [self.get_exec_path("cstacks"), self.mismatch, self.batch_id], \
includes = [self.tags_files, self.alleles_files, self.snps_files])
def post_process(self):
# parse stderr
tot_locus = 0
tot_sample = 0
samples_name,nb_locus =[],[]
samples_name,nb_locus = self.__parse_stderr(self.stderr)
tot_locus=nb_locus[-1]
tot_sample = len(samples_name)
# ==> faire courbe f(idx)=elem sur nb_locus
# parse catalog.tags.tsv
locus_dict={"singleton":[], "multi_sample":[], "half_sample":[],"all_sample":[]}
if len(self.population_list) > 0:
self.__parse_tags(self.catalog_tags,tot_sample,locus_dict, self.samples_id, self.population_list)
else :
self.__parse_tags(self.catalog_tags,tot_sample,locus_dict)
print "tot_locus,len(locus_dict[singleton]), len(locus_dict[multi_sample]), len(locus_dict[half_sample]), \
len(locus_dict[all_sample])"
print tot_locus,len(locus_dict["singleton"]), len(locus_dict["multi_sample"]), len(locus_dict["half_sample"]), \
len(locus_dict["all_sample"])
self._add_result_element("cstacks_tags", "nb_locus", tot_locus)
self._add_result_element("cstacks_tags", "locus_singleton", len(locus_dict["singleton"]))
self._add_result_element("cstacks_tags", "locus_multi_sample", len(locus_dict["multi_sample"]))
if len(self.population_list) > 0:
self._add_result_element("cstacks_tags", "locus_multi_pop", locus_dict["multi_pop"])
self._add_result_element("cstacks_tags", "locus_half_sample", len(locus_dict["half_sample"]))
self._add_result_element("cstacks_tags", "locus_all_sample", len(locus_dict["all_sample"]))
# parse catalog.alleles.tsv
snp_dict={"nb_snp":{"clust_var":[],"singleton":[0,0], "multi_sample":[0,0], "half_sample":[0,0],"all_sample":[0,0]},\
"nb_hap":{"1_hap":0, "2_hap":0,"3_hap":0,"4_hap":0,"sup5_hap":0,"sup10_hap":0,"max_hap":0}}
self.__parse_alleles(self.catalog_alleles, locus_dict, snp_dict)
nb_clust_var=len(snp_dict["nb_snp"]["clust_var"])
tot_var = np.sum(snp_dict["nb_snp"]["clust_var"])
mean_var = np.mean(snp_dict["nb_snp"]["clust_var"])
min_var = np.min(snp_dict["nb_snp"]["clust_var"])
max_var = np.max(snp_dict["nb_snp"]["clust_var"])
print "nb_clust_var, tot_var, mean_var, min_var, max_var, snp_dict[nb_snp][singleton], snp_dict[nb_snp][multi_sample],\
snp_dict[nb_snp][half_sample], snp_dict[nb_snp][all_sample] "
print nb_clust_var, tot_var, mean_var, min_var, max_var, snp_dict["nb_snp"]["singleton"], snp_dict["nb_snp"]["multi_sample"],\
snp_dict["nb_snp"]["half_sample"], snp_dict["nb_snp"]["all_sample"]
self._add_result_element("cstacks_snps", "nb_locus_var", nb_clust_var)
self._add_result_element("cstacks_snps", "tot_var", tot_var)
self._add_result_element("cstacks_snps", "mean_var", mean_var)
self._add_result_element("cstacks_snps", "min_var", min_var)
self._add_result_element("cstacks_snps", "max_var", max_var)
self._add_result_element("cstacks_snps", "singleton_var", snp_dict["nb_snp"]["singleton"])
self._add_result_element("cstacks_snps", "multi_sample_var", snp_dict["nb_snp"]["multi_sample"])
self._add_result_element("cstacks_snps", "half_sample_var", snp_dict["nb_snp"]["half_sample"])
self._add_result_element("cstacks_snps", "all_sample_var", snp_dict["nb_snp"]["all_sample"])
# ==> courbe de distribution du nombre de cluster en fonction du nombre de variant par cluster
print "snp_dict[nb_hap][1_hap],snp_dict[nb_hap][2_hap], snp_dict[nb_hap][3_hap], snp_dict[nb_hap][4_hap], snp_dict[nb_hap][sup5_hap],\
snp_dict[nb_hap][sup10_hap], snp_dict[nb_hap][max_hap]"
print snp_dict["nb_hap"]["1_hap"], snp_dict["nb_hap"]["2_hap"], snp_dict["nb_hap"]["3_hap"], snp_dict["nb_hap"]["4_hap"], snp_dict["nb_hap"]["sup5_hap"],\
snp_dict["nb_hap"]["sup10_hap"], snp_dict["nb_hap"]["max_hap"]
self._add_result_element("cstacks_alleles", "locus_var_1hap", snp_dict["nb_hap"]["1_hap"])
self._add_result_element("cstacks_alleles", "locus_var_2hap", snp_dict["nb_hap"]["2_hap"])
self._add_result_element("cstacks_alleles", "locus_var_3_hap", snp_dict["nb_hap"]["3_hap"])
self._add_result_element("cstacks_alleles", "locus_var_4_hap", snp_dict["nb_hap"]["4_hap"])
self._add_result_element("cstacks_alleles", "locus_var_sup5_hap", snp_dict["nb_hap"]["sup5_hap"])
self._add_result_element("cstacks_alleles", "locus_var_sup10_hap", snp_dict["nb_hap"]["sup10_hap"])
self._add_result_element("cstacks_alleles", "locus_var_max_hap", snp_dict["nb_hap"]["max_hap"])
def __parse_stderr(self, cstacks_stderr):
samples_name=[]
nb_locus=[0]
with open(cstacks_stderr, 'r') as stderr :
for l in stderr:
sample_regex = re.search("Parsing (.*tags.tsv.*)",l)
nb_locus_regex = re.search("(\d+) loci in the catalog.*",l)
if sample_regex:
s = os.path.basename(sample_regex.group(1)).replace(".tags.tsv"," ").split()[0]
samples_name.append(s)
if nb_locus_regex :
nb = nb_locus_regex.group(1)
nb_locus.append(int(nb))
return samples_name,nb_locus
def __parse_tags(self,cstacks_tags, tot_sample, locus_dict, sample_id=[], pop_list=[]):
dic_pop={}
if len(sample_id) != 0:
locus_dict["multi_pop"]=0
dic_pop=dict(zip(sample_id, pop_list))
with open(cstacks_tags,"r") as tags:
for line in tags:
locus_id=line.split()[2]
compo={"ind":[],"pop":[]}
tab=line.split()[7].split(",")
for clust in tab :
indiv=clust.split("_")[0]
# compte les individus differents fusionne sur le meme locus
if not indiv in compo['ind']:
compo['ind'].append(indiv)
# si info de pop, compte les pop differentes fusionnees sur le meme locus
if dic_pop :
pop=dic_pop[indiv]
if not pop in compo['pop']:
compo['pop'].append(pop)
nb_ind=len(compo['ind'])
nb_pop=len(compo['pop'])
if nb_ind == 1 :
locus_dict["singleton"].append(locus_id)
elif nb_ind >= 2 :
locus_dict["multi_sample"].append(locus_id)
if nb_pop > 1 :
locus_dict["multi_pop"] += 1
elif nb_ind >= tot_sample/2 :
locus_dict["half_sample"].append(locus_id)
elif nb_ind == tot_sample :
locus_dict["all_sample"].append(locus_id)
def __parse_alleles(self,cstacks_alleles,locus_dict,snp_dict):
clust_hap=[]
with open(cstacks_alleles,"r") as alleles:
for line in alleles:
tab=line.split('\t')
if len(tab[3]) > 0:
locus_id=tab[2]
if not locus_id in clust_hap:
snp_dict["nb_snp"]["clust_var"].append(len(tab[3]))
if locus_id in locus_dict['singleton'] :
snp_dict["nb_snp"]["singleton"][0]+=1
snp_dict["nb_snp"]["singleton"][1]+=len(tab[3])
elif locus_id in locus_dict['multi_sample'] :
snp_dict["nb_snp"]["multi_sample"][0]+=1
snp_dict["nb_snp"]["multi_sample"][1]+=len(tab[3])
elif locus_id in locus_dict['half_sample'] :
snp_dict["nb_snp"]["half_sample"][0]+=1
snp_dict["nb_snp"]["half_sample"][1]+=len(tab[3])
elif locus_id in locus_dict['all_sample'] :
snp_dict["nb_snp"]["all_sample"][0]+=1
snp_dict["nb_snp"]["all_sample"][1]+=len(tab[3])
clust_hap.append(locus_id)
nb_hap = Counter(clust_hap)
max=0
for c in nb_hap:
if nb_hap[c] == 1 :
snp_dict["nb_hap"]["1_hap"] += 1
elif nb_hap[c] == 2 :
snp_dict["nb_hap"]["2_hap"] += 1
elif nb_hap[c] == 3 :
snp_dict["nb_hap"]["3_hap"] += 1
elif nb_hap[c] == 4 :
snp_dict["nb_hap"]["4_hap"] += 1
elif nb_hap[c] >= 5 :
snp_dict["nb_hap"]["sup5_hap"] += 1
if nb_hap[c] >= 10 :
snp_dict["nb_hap"]["sup10_hap"] += 1
if nb_hap[c] > max :
max = nb_hap[c]
snp_dict["nb_hap"]["max_hap"] = max
# print snp_dict
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment