Compare revisions

Thomas Faraut · ae01d898
--- a/build_pop.py
+++ b/build_pop.py
@@ -56,7 +56,7 @@ def parse_args():
    :return: argparse arguments object
    """
    prg_path = os.path.dirname(os.path.realpath(__file__))
    parser = argparse.ArgumentParser(description='Generate simulated populations with SV',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-n", "--nb-inds", help="Number of individuals to generate", type=int)
@@ -71,6 +71,8 @@ def parse_args():
                        const=True, default=False)
    parser.add_argument("-a", "--haploid", help="Make a haploid genome, instead of diploid one", action="store_const",
                        const=True, default=False)
+    parser.add_argument("--no-reads", help="Only simulate rearranged genomes, do not simulate reads", action="store_const",
+                        const=True, default=False)
    parser.add_argument("-pd", "--proba-del", help="Probabilty to have a deletion", type=float, default=0.000001)
    parser.add_argument("-pu", "--proba-dup", help="Probabilty to have a duplication", type=float, default=0.000001)
    parser.add_argument("-pi", "--proba-inv", help="Probablity to have an insertion", type=float, default=0.000001)
@@ -107,7 +109,7 @@ def parse_args():
    if args.proba_inv > 1 or args.proba_inv < 0:
        parser.error("Probability of inversions must be between 0 and 1")
    if args.proba_dup > 1 or args.proba_dup < 0:
        parser.error("Probability of duplication must be between 0 and 1")
@@ -198,7 +200,7 @@ def build_genotypes_vcf_list(deletions: dict, inversions: dict, duplications:dic
    :param prg_path: program path {str}
    :return: dictionary of genotypes for each variant for each dictionary {OrderedDict}
    """
    vcf_file = os.path.join(tmp_dir, "template.vcf")
    # Build VCF header:
@@ -255,7 +257,7 @@ def build_genotypes_vcf_list(deletions: dict, inversions: dict, duplications:dic
            vcf_record = vcf.model._Record(chrm, inversion["start"], inversion["name"], "N", [vcf.model._SV("INV")], ## -1 for 0-based? or already done by vcf model?
                                           ".", ".", info, "GT", [0], genotypes)
            records.append(vcf_record)
    # Duplications: <MC>
    genotypes_for_inds_DUP = OrderedDict()
    for chrm, the_duplications in duplications.items():
@@ -445,33 +447,33 @@ def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_in
                             printer):
    printer.out("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n")
    fasta_orig = SeqIO.index(reference, "fasta")
    ### Building the list of genome modification, with the sequence of allele 0 and allele 1
    fasta_modif_list = OrderedDict()
    for chrm, svs_infos in genotypes_for_inds_INV.items():
        if chrm not in fasta_modif_list.keys():
            fasta_modif_list[chrm]=[]
        for sv in svs_infos:
            fasta_modif_list[chrm].append(build_info_inversion_from_sv(svs_infos[sv],fasta_orig[chrm]))
    for chrm, svs_infos in genotypes_for_inds_DUP.items():
        if chrm not in fasta_modif_list.keys():
            fasta_modif_list[chrm]=[]
        for sv in svs_infos:
            fasta_modif_list[chrm].append(build_info_duplication_from_sv(svs_infos[sv],fasta_orig[chrm]))
    for chrm, svs_infos in genotypes_for_inds_DEL.items():
        if chrm not in fasta_modif_list.keys():
            fasta_modif_list[chrm]=[]
        for sv in svs_infos:
            fasta_modif_list[chrm].append(build_info_deletion_from_sv(svs_infos[sv],fasta_orig[chrm]))
    #print("List",fasta_modif_list)
    ###
    ### Building the complete sequence list, adding the common part between the modification and their correspondnig sequence
    ## TODO : do it in one step
    fasta_frag={}
@@ -491,18 +493,18 @@ def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_in
                else:
                    fasta_frag_chrm.append(sv)
                    previous_end=sv['end']
            if fasta_frag_chrm[-1]['end'] < last_nt:
                fasta_frag_chrm.append(build_info_common(fasta_frag_chrm[-1]['end']+1,last_nt,fasta_orig[chrm],nb_inds)) ## test_seq
        fasta_frag[chrm] = fasta_frag_chrm
        for t in fasta_frag[chrm]:
            print(t['type'],t['start'],t['end'],t['genotypes'])
    ###
-    ### fasta buildings 
+    ### fasta buildings
    ### use the complete list to create the haplotype sequence
-    ### for each haplotype, we append the common sequence and the allele sequence corresponding to the haplotype (0 or 1)        
+    ### for each haplotype, we append the common sequence and the allele sequence corresponding to the haplotype (0 or 1)
    ## TOSO : it's a bit slow, try to fasten the process
    for chrm in fasta_orig:
        for ind in range(0, nb_inds):
@@ -511,13 +513,13 @@ def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_in
                for sv in fasta_frag[chrm]:
                    geno_haplo = int(sv['genotypes'][ind][haplo])
                    seq_haplo_ind += sv['seq'][geno_haplo]
                seq_haplo_ind.id=fasta_orig[chrm].id
                seq_haplo_ind.name=fasta_orig[chrm].name
                seq_haplo_ind.description=fasta_orig[chrm].description
                output_handle = open(os.path.join(output_dir, "indiv" + str(ind + 1) + "_chr_" + str(haplo) + ".fasta"),"w")
                SeqIO.write(seq_haplo_ind, output_handle, "fasta")
    ###
@@ -530,7 +532,7 @@ def build_info_common(start,end,fasta_orig_chrm,nb_inds):
    info['genotypes']=[]
    for ind in range(0, nb_inds):
        info['genotypes'].append(['0','0'])
    seq_ori = fasta_orig_chrm[int(info['start']-1):int(info['end'])]
    info['seq']=[seq_ori,seq_ori]
    return info
@@ -558,28 +560,28 @@ def build_info_inversion_from_sv(sv,fasta_orig_chrm):
    seq_ori = fasta_orig_chrm[int(sv['start']-1):int(sv['end'])]
    info['seq']=[seq_ori,seq_ori.reverse_complement(id=seq_ori.id,name=seq_ori.name,description=seq_ori.description)]
    return info
 def build_info_duplication_from_sv(sv,fasta_orig_chrm,reverse=False):
    info={}
    info['type']='DUPLICATION'
    ## The CNV will be represented by the target_base (base after which the duplicated sequence insert)
    ## 0 : target_base, 1 : target base + duplicated sequence
    ## beware1 : if target base > seq_length, the insertion will occur at the end of the sequence
    ## beware2 : if multiple duplications are inserted at the end of the sequence, the target site will be duplicated !!
    ## => patch : check if multiple duplication insert in the same place and keep only once the target_base
    ## => not a priority
    target_base_pos = int(sv['end']+sv['shift'])
-    seq_ori = fasta_orig_chrm[int(sv['start']-1):int(sv['end'])] 
+    seq_ori = fasta_orig_chrm[int(sv['start']-1):int(sv['end'])]
    target_base = fasta_orig_chrm[target_base_pos-1:target_base_pos]
    info['start']=target_base_pos
    info['end']=target_base_pos
    info['size']=sv['end']-sv['start'] + 1 #size of duplicated sequence
    info['genotypes']=sv['genotypes']
    if reverse:
        info['seq']=[target_base,target_base + seq_ori.reverse_complement(id=seq_ori.id,name=seq_ori.name,description=seq_ori.description)]
    else:
@@ -703,7 +705,7 @@ def confirm(deletions: dict, inversions: dict, duplications: dict,variants: dict
 def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, proba_inv, proba_dup, freq_del, freq_inv, freq_dup,
-         haploid, force_polymorphism, coverage, read_len, insert_len_mean, insert_len_sd, threads, genotypes=None,
+         haploid, force_polymorphism, no_reads, coverage, read_len, insert_len_mean, insert_len_sd, threads, genotypes=None,
         min_deletions=1, min_inversions=1, min_duplications=1, max_try=20, quiet=True, stdout=None, stderr=None): #MC
    printer = Print(stdout=stdout, stderr=stderr)
@@ -827,9 +829,10 @@ def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, p
                    return 1
            build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_inds_INV, genotypes_for_inds_DUP, haploid, output_dir,
                                     nb_inds, printer) #MC
-            printer.out("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")
+            if not no_reads:
-            generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean,
+                printer.out("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")
-                                   insert_len_sd, prg_path, threads, stdout, stderr)
+                generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean,
+                                       insert_len_sd, prg_path, threads, stdout, stderr)
            printer.out("DONE!\n")
        except ExecException as e:
            printer.err(e)
No results found