Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • svdetection/popsim
1 result
Show changes
Commits on Source (1)
...@@ -56,7 +56,7 @@ def parse_args(): ...@@ -56,7 +56,7 @@ def parse_args():
:return: argparse arguments object :return: argparse arguments object
""" """
prg_path = os.path.dirname(os.path.realpath(__file__)) prg_path = os.path.dirname(os.path.realpath(__file__))
parser = argparse.ArgumentParser(description='Generate simulated populations with SV', parser = argparse.ArgumentParser(description='Generate simulated populations with SV',
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-n", "--nb-inds", help="Number of individuals to generate", type=int) parser.add_argument("-n", "--nb-inds", help="Number of individuals to generate", type=int)
...@@ -71,6 +71,8 @@ def parse_args(): ...@@ -71,6 +71,8 @@ def parse_args():
const=True, default=False) const=True, default=False)
parser.add_argument("-a", "--haploid", help="Make a haploid genome, instead of diploid one", action="store_const", parser.add_argument("-a", "--haploid", help="Make a haploid genome, instead of diploid one", action="store_const",
const=True, default=False) const=True, default=False)
parser.add_argument("--no-reads", help="Only simulate rearranged genomes, do not simulate reads", action="store_const",
const=True, default=False)
parser.add_argument("-pd", "--proba-del", help="Probabilty to have a deletion", type=float, default=0.000001) parser.add_argument("-pd", "--proba-del", help="Probabilty to have a deletion", type=float, default=0.000001)
parser.add_argument("-pu", "--proba-dup", help="Probabilty to have a duplication", type=float, default=0.000001) parser.add_argument("-pu", "--proba-dup", help="Probabilty to have a duplication", type=float, default=0.000001)
parser.add_argument("-pi", "--proba-inv", help="Probablity to have an insertion", type=float, default=0.000001) parser.add_argument("-pi", "--proba-inv", help="Probablity to have an insertion", type=float, default=0.000001)
...@@ -107,7 +109,7 @@ def parse_args(): ...@@ -107,7 +109,7 @@ def parse_args():
if args.proba_inv > 1 or args.proba_inv < 0: if args.proba_inv > 1 or args.proba_inv < 0:
parser.error("Probability of inversions must be between 0 and 1") parser.error("Probability of inversions must be between 0 and 1")
if args.proba_dup > 1 or args.proba_dup < 0: if args.proba_dup > 1 or args.proba_dup < 0:
parser.error("Probability of duplication must be between 0 and 1") parser.error("Probability of duplication must be between 0 and 1")
...@@ -198,7 +200,7 @@ def build_genotypes_vcf_list(deletions: dict, inversions: dict, duplications:dic ...@@ -198,7 +200,7 @@ def build_genotypes_vcf_list(deletions: dict, inversions: dict, duplications:dic
:param prg_path: program path {str} :param prg_path: program path {str}
:return: dictionary of genotypes for each variant for each dictionary {OrderedDict} :return: dictionary of genotypes for each variant for each dictionary {OrderedDict}
""" """
vcf_file = os.path.join(tmp_dir, "template.vcf") vcf_file = os.path.join(tmp_dir, "template.vcf")
# Build VCF header: # Build VCF header:
...@@ -255,7 +257,7 @@ def build_genotypes_vcf_list(deletions: dict, inversions: dict, duplications:dic ...@@ -255,7 +257,7 @@ def build_genotypes_vcf_list(deletions: dict, inversions: dict, duplications:dic
vcf_record = vcf.model._Record(chrm, inversion["start"], inversion["name"], "N", [vcf.model._SV("INV")], ## -1 for 0-based? or already done by vcf model? vcf_record = vcf.model._Record(chrm, inversion["start"], inversion["name"], "N", [vcf.model._SV("INV")], ## -1 for 0-based? or already done by vcf model?
".", ".", info, "GT", [0], genotypes) ".", ".", info, "GT", [0], genotypes)
records.append(vcf_record) records.append(vcf_record)
# Duplications: <MC> # Duplications: <MC>
genotypes_for_inds_DUP = OrderedDict() genotypes_for_inds_DUP = OrderedDict()
for chrm, the_duplications in duplications.items(): for chrm, the_duplications in duplications.items():
...@@ -445,33 +447,33 @@ def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_in ...@@ -445,33 +447,33 @@ def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_in
printer): printer):
printer.out("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n") printer.out("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n")
fasta_orig = SeqIO.index(reference, "fasta") fasta_orig = SeqIO.index(reference, "fasta")
### Building the list of genome modification, with the sequence of allele 0 and allele 1 ### Building the list of genome modification, with the sequence of allele 0 and allele 1
fasta_modif_list = OrderedDict() fasta_modif_list = OrderedDict()
for chrm, svs_infos in genotypes_for_inds_INV.items(): for chrm, svs_infos in genotypes_for_inds_INV.items():
if chrm not in fasta_modif_list.keys(): if chrm not in fasta_modif_list.keys():
fasta_modif_list[chrm]=[] fasta_modif_list[chrm]=[]
for sv in svs_infos: for sv in svs_infos:
fasta_modif_list[chrm].append(build_info_inversion_from_sv(svs_infos[sv],fasta_orig[chrm])) fasta_modif_list[chrm].append(build_info_inversion_from_sv(svs_infos[sv],fasta_orig[chrm]))
for chrm, svs_infos in genotypes_for_inds_DUP.items(): for chrm, svs_infos in genotypes_for_inds_DUP.items():
if chrm not in fasta_modif_list.keys(): if chrm not in fasta_modif_list.keys():
fasta_modif_list[chrm]=[] fasta_modif_list[chrm]=[]
for sv in svs_infos: for sv in svs_infos:
fasta_modif_list[chrm].append(build_info_duplication_from_sv(svs_infos[sv],fasta_orig[chrm])) fasta_modif_list[chrm].append(build_info_duplication_from_sv(svs_infos[sv],fasta_orig[chrm]))
for chrm, svs_infos in genotypes_for_inds_DEL.items(): for chrm, svs_infos in genotypes_for_inds_DEL.items():
if chrm not in fasta_modif_list.keys(): if chrm not in fasta_modif_list.keys():
fasta_modif_list[chrm]=[] fasta_modif_list[chrm]=[]
for sv in svs_infos: for sv in svs_infos:
fasta_modif_list[chrm].append(build_info_deletion_from_sv(svs_infos[sv],fasta_orig[chrm])) fasta_modif_list[chrm].append(build_info_deletion_from_sv(svs_infos[sv],fasta_orig[chrm]))
#print("List",fasta_modif_list) #print("List",fasta_modif_list)
### ###
### Building the complete sequence list, adding the common part between the modification and their correspondnig sequence ### Building the complete sequence list, adding the common part between the modification and their correspondnig sequence
## TODO : do it in one step ## TODO : do it in one step
fasta_frag={} fasta_frag={}
...@@ -491,18 +493,18 @@ def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_in ...@@ -491,18 +493,18 @@ def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_in
else: else:
fasta_frag_chrm.append(sv) fasta_frag_chrm.append(sv)
previous_end=sv['end'] previous_end=sv['end']
if fasta_frag_chrm[-1]['end'] < last_nt: if fasta_frag_chrm[-1]['end'] < last_nt:
fasta_frag_chrm.append(build_info_common(fasta_frag_chrm[-1]['end']+1,last_nt,fasta_orig[chrm],nb_inds)) ## test_seq fasta_frag_chrm.append(build_info_common(fasta_frag_chrm[-1]['end']+1,last_nt,fasta_orig[chrm],nb_inds)) ## test_seq
fasta_frag[chrm] = fasta_frag_chrm fasta_frag[chrm] = fasta_frag_chrm
for t in fasta_frag[chrm]: for t in fasta_frag[chrm]:
print(t['type'],t['start'],t['end'],t['genotypes']) print(t['type'],t['start'],t['end'],t['genotypes'])
### ###
### fasta buildings ### fasta buildings
### use the complete list to create the haplotype sequence ### use the complete list to create the haplotype sequence
### for each haplotype, we append the common sequence and the allele sequence corresponding to the haplotype (0 or 1) ### for each haplotype, we append the common sequence and the allele sequence corresponding to the haplotype (0 or 1)
## TOSO : it's a bit slow, try to fasten the process ## TOSO : it's a bit slow, try to fasten the process
for chrm in fasta_orig: for chrm in fasta_orig:
for ind in range(0, nb_inds): for ind in range(0, nb_inds):
...@@ -511,13 +513,13 @@ def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_in ...@@ -511,13 +513,13 @@ def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_in
for sv in fasta_frag[chrm]: for sv in fasta_frag[chrm]:
geno_haplo = int(sv['genotypes'][ind][haplo]) geno_haplo = int(sv['genotypes'][ind][haplo])
seq_haplo_ind += sv['seq'][geno_haplo] seq_haplo_ind += sv['seq'][geno_haplo]
seq_haplo_ind.id=fasta_orig[chrm].id seq_haplo_ind.id=fasta_orig[chrm].id
seq_haplo_ind.name=fasta_orig[chrm].name seq_haplo_ind.name=fasta_orig[chrm].name
seq_haplo_ind.description=fasta_orig[chrm].description seq_haplo_ind.description=fasta_orig[chrm].description
output_handle = open(os.path.join(output_dir, "indiv" + str(ind + 1) + "_chr_" + str(haplo) + ".fasta"),"w") output_handle = open(os.path.join(output_dir, "indiv" + str(ind + 1) + "_chr_" + str(haplo) + ".fasta"),"w")
SeqIO.write(seq_haplo_ind, output_handle, "fasta") SeqIO.write(seq_haplo_ind, output_handle, "fasta")
### ###
...@@ -530,7 +532,7 @@ def build_info_common(start,end,fasta_orig_chrm,nb_inds): ...@@ -530,7 +532,7 @@ def build_info_common(start,end,fasta_orig_chrm,nb_inds):
info['genotypes']=[] info['genotypes']=[]
for ind in range(0, nb_inds): for ind in range(0, nb_inds):
info['genotypes'].append(['0','0']) info['genotypes'].append(['0','0'])
seq_ori = fasta_orig_chrm[int(info['start']-1):int(info['end'])] seq_ori = fasta_orig_chrm[int(info['start']-1):int(info['end'])]
info['seq']=[seq_ori,seq_ori] info['seq']=[seq_ori,seq_ori]
return info return info
...@@ -558,28 +560,28 @@ def build_info_inversion_from_sv(sv,fasta_orig_chrm): ...@@ -558,28 +560,28 @@ def build_info_inversion_from_sv(sv,fasta_orig_chrm):
seq_ori = fasta_orig_chrm[int(sv['start']-1):int(sv['end'])] seq_ori = fasta_orig_chrm[int(sv['start']-1):int(sv['end'])]
info['seq']=[seq_ori,seq_ori.reverse_complement(id=seq_ori.id,name=seq_ori.name,description=seq_ori.description)] info['seq']=[seq_ori,seq_ori.reverse_complement(id=seq_ori.id,name=seq_ori.name,description=seq_ori.description)]
return info return info
def build_info_duplication_from_sv(sv,fasta_orig_chrm,reverse=False): def build_info_duplication_from_sv(sv,fasta_orig_chrm,reverse=False):
info={} info={}
info['type']='DUPLICATION' info['type']='DUPLICATION'
## The CNV will be represented by the target_base (base after which the duplicated sequence insert) ## The CNV will be represented by the target_base (base after which the duplicated sequence insert)
## 0 : target_base, 1 : target base + duplicated sequence ## 0 : target_base, 1 : target base + duplicated sequence
## beware1 : if target base > seq_length, the insertion will occur at the end of the sequence ## beware1 : if target base > seq_length, the insertion will occur at the end of the sequence
## beware2 : if multiple duplications are inserted at the end of the sequence, the target site will be duplicated !! ## beware2 : if multiple duplications are inserted at the end of the sequence, the target site will be duplicated !!
## => patch : check if multiple duplication insert in the same place and keep only once the target_base ## => patch : check if multiple duplication insert in the same place and keep only once the target_base
## => not a priority ## => not a priority
target_base_pos = int(sv['end']+sv['shift']) target_base_pos = int(sv['end']+sv['shift'])
seq_ori = fasta_orig_chrm[int(sv['start']-1):int(sv['end'])] seq_ori = fasta_orig_chrm[int(sv['start']-1):int(sv['end'])]
target_base = fasta_orig_chrm[target_base_pos-1:target_base_pos] target_base = fasta_orig_chrm[target_base_pos-1:target_base_pos]
info['start']=target_base_pos info['start']=target_base_pos
info['end']=target_base_pos info['end']=target_base_pos
info['size']=sv['end']-sv['start'] + 1 #size of duplicated sequence info['size']=sv['end']-sv['start'] + 1 #size of duplicated sequence
info['genotypes']=sv['genotypes'] info['genotypes']=sv['genotypes']
if reverse: if reverse:
info['seq']=[target_base,target_base + seq_ori.reverse_complement(id=seq_ori.id,name=seq_ori.name,description=seq_ori.description)] info['seq']=[target_base,target_base + seq_ori.reverse_complement(id=seq_ori.id,name=seq_ori.name,description=seq_ori.description)]
else: else:
...@@ -703,7 +705,7 @@ def confirm(deletions: dict, inversions: dict, duplications: dict,variants: dict ...@@ -703,7 +705,7 @@ def confirm(deletions: dict, inversions: dict, duplications: dict,variants: dict
def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, proba_inv, proba_dup, freq_del, freq_inv, freq_dup, def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, proba_inv, proba_dup, freq_del, freq_inv, freq_dup,
haploid, force_polymorphism, coverage, read_len, insert_len_mean, insert_len_sd, threads, genotypes=None, haploid, force_polymorphism, no_reads, coverage, read_len, insert_len_mean, insert_len_sd, threads, genotypes=None,
min_deletions=1, min_inversions=1, min_duplications=1, max_try=20, quiet=True, stdout=None, stderr=None): #MC min_deletions=1, min_inversions=1, min_duplications=1, max_try=20, quiet=True, stdout=None, stderr=None): #MC
printer = Print(stdout=stdout, stderr=stderr) printer = Print(stdout=stdout, stderr=stderr)
...@@ -827,9 +829,10 @@ def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, p ...@@ -827,9 +829,10 @@ def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, p
return 1 return 1
build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_inds_INV, genotypes_for_inds_DUP, haploid, output_dir, build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_inds_INV, genotypes_for_inds_DUP, haploid, output_dir,
nb_inds, printer) #MC nb_inds, printer) #MC
printer.out("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n") if not no_reads:
generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean, printer.out("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")
insert_len_sd, prg_path, threads, stdout, stderr) generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean,
insert_len_sd, prg_path, threads, stdout, stderr)
printer.out("DONE!\n") printer.out("DONE!\n")
except ExecException as e: except ExecException as e:
printer.err(e) printer.err(e)
......