Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
popsim
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
SVdetection
popsim
Commits
fa876d5b
Commit
fa876d5b
authored
7 years ago
by
Floreal Cabanettes
Browse files
Options
Downloads
Patches
Plain Diff
Refactoring of build_pop script
parent
8d6c7883
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
build_pop.py
+278
-172
278 additions, 172 deletions
build_pop.py
with
278 additions
and
172 deletions
build_pop.py
+
278
−
172
View file @
fa876d5b
#!/usr/bin/env python3
import
sys
import
os
import
random
import
argparse
...
...
@@ -8,192 +9,220 @@ import vcf
from
Bio
import
SeqIO
import
tempfile
from
lib.svrunner_utils
import
eprint
from
pysam
import
tabix_compress
,
tabix_index
parser
=
argparse
.
ArgumentParser
(
description
=
'
Generate simulated populations with SV
'
)
parser
.
add_argument
(
"
--nb-inds
"
,
help
=
"
Number of individuals to generate
"
,
required
=
True
,
type
=
int
)
parser
.
add_argument
(
"
--reference
"
,
help
=
"
Reference genome
"
,
required
=
True
)
parser
.
add_argument
(
"
--sv-list
"
,
help
=
"
File containing the SVs
"
,
required
=
True
)
parser
.
add_argument
(
"
--coverage
"
,
help
=
"
Coverage of reads (default: 15)
"
,
default
=
15
,
type
=
int
)
parser
.
add_argument
(
"
--output-directory
"
,
help
=
"
Output directory (default: res)
"
,
default
=
"
res
"
)
parser
.
add_argument
(
"
--force-polymorphism
"
,
help
=
"
Force polymorphism for each SV
"
,
action
=
'
store_const
'
,
const
=
True
,
default
=
False
)
parser
.
add_argument
(
"
--haploid
"
,
help
=
"
Make a haploid genome, instead of diploid one
"
,
action
=
"
store_const
"
,
const
=
True
,
default
=
False
)
parser
.
add_argument
(
"
-l
"
,
"
--read-len
"
,
help
=
"
Generate reads having a length of LEN
"
,
type
=
int
,
default
=
100
)
parser
.
add_argument
(
"
-m
"
,
"
--insert-len-mean
"
,
help
=
"
Generate inserts (fragments) having an average length of LEN
"
,
type
=
int
,
default
=
300
)
parser
.
add_argument
(
"
-v
"
,
"
--insert-len-sd
"
,
help
=
"
Set the standard deviation of the insert (fragment) length (%%)
"
,
type
=
int
,
default
=
30
)
args
=
parser
.
parse_args
()
nb_inds
=
args
.
nb_inds
if
nb_inds
<
2
:
raise
Exception
(
"
nb-inds must be at least 2
"
)
reference
=
args
.
reference
sv_list
=
args
.
sv_list
output_dir
=
args
.
output_directory
tmp_dir
=
tempfile
.
mkdtemp
()
haploid
=
args
.
haploid
if
not
os
.
path
.
isfile
(
reference
+
"
.fai
"
):
os
.
system
(
"
samtools faidx
"
+
reference
)
#############
# FUNCTIONS #
#############
def
allele
(
frequency
):
def
parse_args
():
"""
Parse script arguments
:return: argparse arguments object
"""
parser
=
argparse
.
ArgumentParser
(
description
=
'
Generate simulated populations with SV
'
)
parser
.
add_argument
(
"
--nb-inds
"
,
help
=
"
Number of individuals to generate
"
,
required
=
True
,
type
=
int
)
parser
.
add_argument
(
"
--reference
"
,
help
=
"
Reference genome
"
,
required
=
True
)
parser
.
add_argument
(
"
--sv-list
"
,
help
=
"
File containing the SVs
"
,
required
=
True
)
parser
.
add_argument
(
"
--coverage
"
,
help
=
"
Coverage of reads (default: 15)
"
,
default
=
15
,
type
=
int
)
parser
.
add_argument
(
"
--output-directory
"
,
help
=
"
Output directory (default: res)
"
,
default
=
"
res
"
)
parser
.
add_argument
(
"
--force-polymorphism
"
,
help
=
"
Force polymorphism for each SV
"
,
action
=
'
store_const
'
,
const
=
True
,
default
=
False
)
parser
.
add_argument
(
"
--haploid
"
,
help
=
"
Make a haploid genome, instead of diploid one
"
,
action
=
"
store_const
"
,
const
=
True
,
default
=
False
)
parser
.
add_argument
(
"
-l
"
,
"
--read-len
"
,
help
=
"
Generate reads having a length of LEN
"
,
type
=
int
,
default
=
100
)
parser
.
add_argument
(
"
-m
"
,
"
--insert-len-mean
"
,
help
=
"
Generate inserts (fragments) having an average length of LEN
"
,
type
=
int
,
default
=
300
)
parser
.
add_argument
(
"
-v
"
,
"
--insert-len-sd
"
,
help
=
"
Set the standard deviation of the insert (fragment) length (%%)
"
,
type
=
int
,
default
=
30
)
args
=
parser
.
parse_args
()
return
args
def
_allele
(
frequency
):
"""
Get randomly an allele, given a frequency
:param frequency: frequency of the allele {float}
:return: allele randomly choosen {0 or 1}
"""
return
1
if
random
.
uniform
(
0
,
1
)
<
frequency
else
0
def
get_genotypes_for_inds
():
def
_get_genotypes_for_inds
(
nb_inds
,
haploid
,
freq
):
"""
Get genotypes for each individual
:param nb_inds: number of individuals {int}
:param haploid: is the genome hamploid {bool}
:param freq: frequency of the allele {float}
:return: list of genotypes, vcf data {list}
"""
all_genotypes
=
[]
genotypes_row
=
[]
for
i
in
range
(
1
,
nb_inds
+
1
):
if
not
haploid
:
genotype
=
str
(
allele
(
freq
))
+
"
/
"
+
str
(
allele
(
freq
))
genotype
=
str
(
_
allele
(
freq
))
+
"
/
"
+
str
(
_
allele
(
freq
))
else
:
genotype
=
str
(
allele
(
freq
))
genotype
=
str
(
_
allele
(
freq
))
genotype_data
=
vcf
.
model
.
_Call
(
None
,
"
INDIV_
"
+
str
(
i
),
vcf
.
model
.
make_calldata_tuple
(
"
GT
"
)(
GT
=
genotype
))
genotypes_row
.
append
(
genotype_data
)
all_genotypes
.
append
(
genotype
)
return
all_genotypes
,
genotypes_row
def
svsort
(
sv
,
chr
):
def
_
svsort
(
sv
,
chr
m
,
genotypes_for_inds
):
"""
Function to sort regions
:param sv: the variant
:param chrm: chromosome {str}
:param genotypes_for_inds: dictionary of genotypes for each individual
"""
return
int
(
genotypes_for_inds
[
chr
][
sv
][
"
start
"
])
prg_path
=
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
))
if
not
os
.
path
.
isdir
(
output_dir
):
os
.
mkdir
(
output_dir
)
return
int
(
genotypes_for_inds
[
chrm
][
sv
][
"
start
"
])
###########################
# 1. Get random deletions #
###########################
print
(
"
GENERATE SV DELs...
\n
"
)
def
get_random_deletions
(
sv_list
,
reference
,
tmp_dir
,
prg_path
):
"""
Get random deletions
:param sv_list: file describing deletions (in SVsim format) {str}
:param reference: reference genome fasta file {str}
:param tmp_dir: temporary directory {str}
:param prg_path: program path {str}
"""
print
(
"
GENERATE SV DELs...
\n
"
)
os
.
system
(
os
.
path
.
join
(
prg_path
,
"
SVsim -W -i {0} -r {1} -o {2}{3}reference-sv
"
.
format
(
sv_list
,
reference
,
tmp_dir
,
os
.
path
.
sep
)))
os
.
system
(
os
.
path
.
join
(
prg_path
,
"
SVsim -W -i {0} -r {1} -o {2}{3}reference-sv
"
.
format
(
sv_list
,
reference
,
tmp_dir
,
os
.
path
.
sep
)))
###################################
# 2. Build BED files of deletions #
###################################
try
:
with
open
(
os
.
path
.
join
(
tmp_dir
,
"
reference-sv.bed
"
),
"
w
"
)
as
bed
:
try
:
with
open
(
os
.
path
.
join
(
tmp_dir
,
"
reference-sv.bedpe
"
),
"
r
"
)
as
bedpe
:
for
line
in
bedpe
:
freq
=
0.2
if
random
.
uniform
(
0
,
1
)
<
0.5
else
0.5
parts
=
line
.
split
(
"
\t
"
)
bed
.
write
(
"
\t
"
.
join
([
parts
[
0
],
parts
[
2
],
parts
[
4
],
parts
[
6
].
replace
(
"
::
"
,
"
_
"
),
str
(
freq
)])
+
"
\n
"
)
except
IOError
:
eprint
(
"
ERROR: SVSim failed to generate variants.
"
)
exit
(
1
)
except
IOError
:
eprint
(
"
ERROR: Unable to generate BED file
\"
{0}
\"
.
"
.
format
(
os
.
path
.
join
(
tmp_dir
,
"
reference-sv.bed
"
)))
exit
(
1
)
def
build_bed_deletions_file
(
bed_file
,
tmp_dir
):
"""
Build BED file of deletions
:param bed_file: bed full path name {str}
:param tmp_dir: temporary directory {str}
"""
try
:
with
open
(
bed_file
,
"
w
"
)
as
bed
:
try
:
with
open
(
os
.
path
.
join
(
tmp_dir
,
"
reference-sv.bedpe
"
),
"
r
"
)
as
bedpe
:
for
line
in
bedpe
:
freq
=
0.2
if
random
.
uniform
(
0
,
1
)
<
0.5
else
0.5
parts
=
line
.
split
(
"
\t
"
)
bed
.
write
(
"
\t
"
.
join
([
parts
[
0
],
parts
[
2
],
parts
[
4
],
parts
[
6
].
replace
(
"
::
"
,
"
_
"
),
str
(
freq
)])
+
"
\n
"
)
except
IOError
:
eprint
(
"
ERROR: SVSim failed to generate variants.
"
)
exit
(
1
)
except
IOError
:
eprint
(
"
ERROR: Unable to generate BED file
\"
{0}
\"
.
"
.
format
(
os
.
path
.
join
(
tmp_dir
,
"
reference-sv.bed
"
)))
exit
(
1
)
###############################################################################
# 3. Build VCF files containing genotypes for the given number of individuals #
###############################################################################
# VCF file is a result file, not used by this script
print
(
"
GENERATE SUMMARY VCF FILE...
\n
"
)
genotypes_for_inds
=
OrderedDict
()
# { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}
# Build VCF header:
try
:
with
open
(
os
.
path
.
join
(
prg_path
,
"
template.vcf
"
),
"
r
"
)
as
template
:
try
:
with
open
(
os
.
path
.
join
(
tmp_dir
,
"
template.vcf
"
),
"
w
"
)
as
my_template
:
for
line
in
template
:
if
line
[:
6
]
==
"
#CHROM
"
:
line
=
line
.
replace
(
"
\n
"
,
""
)
for
i
in
range
(
0
,
nb_inds
):
line
+=
"
\t
INDIV_
"
+
str
(
i
+
1
)
line
+=
"
\n
"
my_template
.
write
(
line
)
except
IOError
:
eprint
(
"
ERROR: unable to create template file
\"
{0}
\"
in temp dir.
"
.
format
(
os
.
path
.
join
(
tmp_dir
,
"
template.vcf
"
)))
exit
(
1
)
except
IOError
:
eprint
(
"
ERROR: template file not found in program directory.
"
)
exit
(
1
)
try
:
with
open
(
os
.
path
.
join
(
tmp_dir
,
"
reference-sv.bed
"
),
"
r
"
)
as
bed
:
vcf_reader
=
vcf
.
Reader
(
filename
=
os
.
path
.
join
(
tmp_dir
,
'
template.vcf
'
))
output_vcf
=
os
.
path
.
join
(
output_dir
,
"
genotypes.vcf
"
)
vcf_writer
=
vcf
.
Writer
(
open
(
output_vcf
,
"
w
"
),
vcf_reader
)
for
line
in
bed
:
parts
=
line
.
replace
(
"
\n
"
,
""
).
split
(
"
\t
"
)
freq
=
float
(
parts
[
4
])
if
parts
[
0
]
not
in
genotypes_for_inds
:
genotypes_for_inds
[
parts
[
0
]]
=
{}
genotypes_for_inds
[
parts
[
0
]][
parts
[
3
]]
=
{
"
start
"
:
int
(
parts
[
1
]),
"
end
"
:
int
(
parts
[
2
]),
"
genotypes
"
:
[]}
# Get genotypes:
all_genotypes
,
genotypes
=
[],
[]
if
args
.
force_polymorphism
:
polymorph
=
False
while
not
polymorph
:
all_genotypes
,
genotypes
=
get_genotypes_for_inds
()
polymorph
=
len
(
set
(
all_genotypes
))
>
1
else
:
all_genotypes
,
genotypes
=
get_genotypes_for_inds
()
genotypes_for_inds
[
parts
[
0
]][
parts
[
3
]][
"
genotypes
"
]
=
[
x
.
split
(
"
/
"
)
for
x
in
all_genotypes
]
info
=
{
"
END
"
:
int
(
parts
[
2
]),
"
AF
"
:
freq
}
vcf_record
=
vcf
.
model
.
_Record
(
parts
[
0
],
int
(
parts
[
1
]),
parts
[
3
],
"
N
"
,
[
vcf
.
model
.
_SV
(
"
DEL
"
)],
"
.
"
,
"
.
"
,
info
,
"
GT
"
,
[
0
],
genotypes
)
vcf_writer
.
write_record
(
vcf_record
)
vcf_writer
.
close
()
# Bgzip + tabix:
os
.
system
(
"
bgzip -c
"
+
output_vcf
+
"
>
"
+
output_vcf
+
"
.gz
"
)
os
.
unlink
(
output_vcf
)
output_vcf
+=
"
.gz
"
os
.
system
(
"
tabix -p vcf
"
+
output_vcf
)
except
IOError
:
eprint
(
"
ERROR: Unable to load bed file
\"
{0}
\"
: file not found.
"
.
format
(
os
.
path
.
join
(
tmp_dir
,
"
reference-sv.bed
"
)))
exit
(
1
)
def
_build_vcf_header
(
vcf_file
,
prg_path
,
tmp_dir
,
nb_inds
):
"""
Build header of the VCF file
:param vcf_file: vcf file full path name {str}
:param prg_path: program path {str}
:param tmp_dir: temporary directory {str}
:param nb_inds: number of individuals {int}
"""
try
:
with
open
(
os
.
path
.
join
(
prg_path
,
"
template.vcf
"
),
"
r
"
)
as
template
:
try
:
with
open
(
vcf_file
,
"
w
"
)
as
my_template
:
for
line
in
template
:
if
line
[:
6
]
==
"
#CHROM
"
:
line
=
line
.
replace
(
"
\n
"
,
""
)
for
i
in
range
(
0
,
nb_inds
):
line
+=
"
\t
INDIV_
"
+
str
(
i
+
1
)
line
+=
"
\n
"
my_template
.
write
(
line
)
except
IOError
:
eprint
(
"
ERROR: unable to create template file
\"
{0}
\"
in temp dir.
"
.
format
(
os
.
path
.
join
(
tmp_dir
,
"
template.vcf
"
)))
exit
(
1
)
except
IOError
:
eprint
(
"
ERROR: template file not found in program directory.
"
)
exit
(
1
)
def
build_genotypes_vcf_list
(
bed_file
,
output_vcf
,
haploid
,
force_polymorphism
,
nb_inds
,
tmp_dir
,
prg_path
):
"""
Build VCF file describing genotypes for each individual (and the associated python dictionary)
:param bed_file: bed file describing deletions {str}
:param output_vcf: output VCF file full path name {str}
:param haploid: is haploid {bool}
:param force_polymorphism: force polymorphism {bool}
:param output_dir: output directory {str}
:param nb_inds: number of individuals {int}
:param tmp_dir: temporary directory {str}
:param prg_path: program path {str}
:return: dictionary of genotypes for each variant for each dictionary {OrderedDict}
"""
print
(
"
GENERATE SUMMARY VCF FILE...
\n
"
)
genotypes_for_inds
=
OrderedDict
()
# { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}
vcf_file
=
os
.
path
.
join
(
tmp_dir
,
"
template.vcf
"
)
# Build VCF header:
_build_vcf_header
(
vcf_file
,
prg_path
,
tmp_dir
,
nb_inds
)
try
:
with
open
(
bed_file
,
"
r
"
)
as
bed
:
vcf_reader
=
vcf
.
Reader
(
filename
=
vcf_file
)
vcf_writer
=
vcf
.
Writer
(
open
(
output_vcf
,
"
w
"
),
vcf_reader
)
for
line
in
bed
:
parts
=
line
.
replace
(
"
\n
"
,
""
).
split
(
"
\t
"
)
freq
=
float
(
parts
[
4
])
if
parts
[
0
]
not
in
genotypes_for_inds
:
genotypes_for_inds
[
parts
[
0
]]
=
{}
genotypes_for_inds
[
parts
[
0
]][
parts
[
3
]]
=
{
"
start
"
:
int
(
parts
[
1
]),
"
end
"
:
int
(
parts
[
2
]),
"
genotypes
"
:
[]}
# Get genotypes:
all_genotypes
,
genotypes
=
[],
[]
if
force_polymorphism
:
polymorph
=
False
while
not
polymorph
:
all_genotypes
,
genotypes
=
_get_genotypes_for_inds
(
nb_inds
,
haploid
,
freq
)
polymorph
=
len
(
set
(
all_genotypes
))
>
1
else
:
all_genotypes
,
genotypes
=
_get_genotypes_for_inds
(
nb_inds
,
haploid
,
freq
)
genotypes_for_inds
[
parts
[
0
]][
parts
[
3
]][
"
genotypes
"
]
=
[
x
.
split
(
"
/
"
)
for
x
in
all_genotypes
]
info
=
{
"
END
"
:
int
(
parts
[
2
]),
"
AF
"
:
freq
}
vcf_record
=
vcf
.
model
.
_Record
(
parts
[
0
],
int
(
parts
[
1
]),
parts
[
3
],
"
N
"
,
[
vcf
.
model
.
_SV
(
"
DEL
"
)],
"
.
"
,
"
.
"
,
info
,
"
GT
"
,
[
0
],
genotypes
)
vcf_writer
.
write_record
(
vcf_record
)
vcf_writer
.
close
()
tabix_compress
(
output_vcf
,
output_vcf
+
"
.gz
"
,
True
)
tabix_index
(
output_vcf
+
"
.gz
"
,
force
=
True
,
preset
=
"
vcf
"
)
except
IOError
:
eprint
(
"
ERROR: Unable to load bed file
\"
{0}
\"
: file not found.
"
.
format
(
bed_file
))
exit
(
1
)
return
genotypes_for_inds
###############################################
# Build fasta chromosomes for each individual #
###############################################
print
(
"
BUILD FASTA GENOME FOR EACH INDIVIDUAL...
\n
"
)
fasta_orig
=
SeqIO
.
index
(
reference
,
"
fasta
"
)
for
chr
,
svs_infos
in
genotypes_for_inds
.
items
():
print
(
"
PROCESSING CHROMOSOME {0}...
\n
"
.
format
(
chr
))
svs
=
list
(
svs_infos
.
keys
())
svs
=
sorted
(
svs
,
key
=
lambda
x
:
svsort
(
x
,
chr
))
fasta_orig_chr
=
fasta_orig
[
chr
]
last_nt
=
len
(
fasta_orig_chr
)
-
1
# Compute keeped genomic regions for each diploid chromosome:
def
_compute_keeped_genomic_regions
(
svs
,
svs_infos
,
haploid
):
"""
Get list of all regions keeped (not deleted)
:param svs: list of variants (deletions) {list}
:param svs_infos: infos of variants {dict}
:param haploid: is haploid {bool}
:return: regions for each individuals {OrderedDict}, the last position for each region {OrderedDict}
"""
regions
=
OrderedDict
()
current_region_pointer
=
OrderedDict
()
for
svs_i
in
svs
:
...
...
@@ -203,7 +232,7 @@ for chr, svs_infos in genotypes_for_inds.items():
regions
[
i
]
=
{}
current_region_pointer
[
i
]
=
{}
for
j
in
range
(
0
,
2
if
not
haploid
else
1
):
# For each chromosome of the diploid genome, or the chromosome
# of the haploid genome
# of the haploid genome
if
j
not
in
regions
[
i
]:
regions
[
i
][
j
]
=
[]
current_region_pointer
[
i
][
j
]
=
"
0
"
...
...
@@ -211,18 +240,31 @@ for chr, svs_infos in genotypes_for_inds.items():
regions
[
i
][
j
].
append
([
current_region_pointer
[
i
][
j
],
svs_infos
[
svs_i
][
"
start
"
]])
current_region_pointer
[
i
][
j
]
=
svs_infos
[
svs_i
][
"
end
"
]
i
+=
1
return
regions
,
current_region_pointer
# Build FASTA of each diploid/haploid chromosome:
def
_build_fastas
(
chrm
,
regions
,
current_region_pointer
,
output_dir
,
fasta_orig_chr
,
last_nt
):
"""
Build fasta files
:param chrm:
:param regions:
:param current_region_pointer:
:param output_dir:
:param fasta_orig_chr:
:param last_nt:
:return:
"""
for
indiv
,
chrs_dips
in
regions
.
items
():
id_chrs
=
list
(
chrs_dips
.
keys
())
id_chrs
=
sorted
(
id_chrs
)
for
id_chr
in
id_chrs
:
chr_dip
=
chrs_dips
[
id_chr
]
# SVs for each diploid chromosome
logs_regions
=
[]
# Store logs
chr_dip
=
chrs_dips
[
id_chr
]
# SVs for each diploid chromosome
logs_regions
=
[]
# Store logs
# Build FASTA and store logs:
try
:
with
open
(
os
.
path
.
join
(
output_dir
,
"
INDIV_
"
+
str
(
indiv
+
1
)
+
"
_chr_
"
+
str
(
id_chr
)
+
"
.fasta
"
),
"
a
"
)
as
output_handle
:
with
open
(
os
.
path
.
join
(
output_dir
,
"
INDIV_
"
+
str
(
indiv
+
1
)
+
"
_chr_
"
+
str
(
id_chr
)
+
"
.fasta
"
),
"
a
"
)
as
output_handle
:
fasta
=
""
last_one
=
0
for
chr_region
in
chr_dip
:
...
...
@@ -235,39 +277,103 @@ for chr, svs_infos in genotypes_for_inds.items():
SeqIO
.
write
(
fasta
,
output_handle
,
"
fasta
"
)
except
IOError
:
eprint
(
"
ERROR: unable to write
\"
{0}
\"
file.
"
.
format
(
os
.
path
.
join
(
output_dir
,
"
INDIV_
"
+
str
(
indiv
+
1
)
+
"
_chr_
"
+
str
(
id_chr
)
+
"
.fasta
"
)))
format
(
os
.
path
.
join
(
output_dir
,
"
INDIV_
"
+
str
(
indiv
+
1
)
+
"
_chr_
"
+
str
(
id_chr
)
+
"
.fasta
"
)))
exit
(
1
)
# Write logs
try
:
with
open
(
os
.
path
.
join
(
output_dir
,
"
INDIV_
"
+
str
(
indiv
+
1
)
+
"
.regions.log
"
),
"
a
"
)
as
log_handle
:
log_handle
.
write
(
chr
+
"
_
"
+
str
(
id_chr
)
+
"
\t
"
)
with
open
(
os
.
path
.
join
(
output_dir
,
"
INDIV_
"
+
str
(
indiv
+
1
)
+
"
.regions.log
"
),
"
a
"
)
as
log_handle
:
log_handle
.
write
(
chr
m
+
"
_
"
+
str
(
id_chr
)
+
"
\t
"
)
log_handle
.
write
(
"
\n\t
"
.
join
(
logs_regions
))
log_handle
.
write
(
"
\n
"
)
except
IOError
:
eprint
(
"
ERROR: unable to write log file:
\"
{0}
\"
"
.
format
(
os
.
path
.
join
(
output_dir
,
"
INDIV_
"
+
str
(
indiv
+
1
)
+
"
.regions.log
"
)))
print
(
"
GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...
\n
"
)
# Generate reads for all individuals:
if
not
haploid
:
cmd
=
str
(
"
{4}pirs/pirs simulate -z -x {3} -d -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz
"
"
-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7}
"
"
-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1} {2}
"
)
else
:
cmd
=
str
(
"
{4}pirs/pirs simulate -z -x {3} -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz
"
"
-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7}
"
"
-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1}
"
)
for
i
in
range
(
1
,
nb_inds
+
1
):
prefix
=
os
.
path
.
join
(
output_dir
,
"
INDIV_
"
+
str
(
i
))
chr0
=
prefix
+
"
_chr_0.fasta
"
chr1
=
prefix
+
"
_chr_1.fasta
"
format
(
os
.
path
.
join
(
output_dir
,
"
INDIV_
"
+
str
(
indiv
+
1
)
+
"
.regions.log
"
)))
def
build_fastas_chromosomes
(
reference
,
genotypes_for_inds
,
haploid
,
output_dir
):
print
(
"
BUILD FASTA GENOME FOR EACH INDIVIDUAL...
\n
"
)
fasta_orig
=
SeqIO
.
index
(
reference
,
"
fasta
"
)
for
chrm
,
svs_infos
in
genotypes_for_inds
.
items
():
print
(
"
PROCESSING CHROMOSOME {0}...
\n
"
.
format
(
chrm
))
svs
=
list
(
svs_infos
.
keys
())
svs
=
sorted
(
svs
,
key
=
lambda
x
:
_svsort
(
x
,
chrm
,
genotypes_for_inds
))
fasta_orig_chr
=
fasta_orig
[
chrm
]
last_nt
=
len
(
fasta_orig_chr
)
-
1
# Compute keeped genomic regions for each diploid chromosome:
regions
,
current_region_pointer
=
_compute_keeped_genomic_regions
(
svs
,
svs_infos
,
haploid
)
# Build FASTA of each diploid/haploid chromosome:
_build_fastas
(
chrm
,
regions
,
current_region_pointer
,
output_dir
,
fasta_orig_chr
,
last_nt
)
def
generate_samples_fastq
(
haploid
,
nb_inds
,
output_dir
,
coverage
,
read_len
,
insert_len_mean
,
insert_len_sd
,
prg_path
):
print
(
"
GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...
\n
"
)
# Generate reads for all individuals:
if
not
haploid
:
os
.
system
(
cmd
.
format
(
prefix
,
chr0
,
chr1
,
args
.
coverage
,
prg_path
+
os
.
path
.
sep
,
args
.
read_len
,
args
.
insert_len_mean
,
args
.
insert_len_sd
))
cmd
=
str
(
"
{4}pirs/pirs simulate -z -x {3} -d -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz
"
"
-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7}
"
"
-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1} {2}
"
)
else
:
os
.
system
(
cmd
.
format
(
prefix
,
chr0
,
""
,
args
.
coverage
,
prg_path
+
os
.
path
.
sep
,
args
.
read_len
,
args
.
insert_len_mean
,
args
.
insert_len_sd
))
print
(
"
DONE!
\n
"
)
cmd
=
str
(
"
{4}pirs/pirs simulate -z -x {3} -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz
"
"
-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7}
"
"
-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1}
"
)
for
i
in
range
(
1
,
nb_inds
+
1
):
prefix
=
os
.
path
.
join
(
output_dir
,
"
INDIV_
"
+
str
(
i
))
chr0
=
prefix
+
"
_chr_0.fasta
"
chr1
=
prefix
+
"
_chr_1.fasta
"
if
not
haploid
:
os
.
system
(
cmd
.
format
(
prefix
,
chr0
,
chr1
,
coverage
,
prg_path
+
os
.
path
.
sep
,
read_len
,
insert_len_mean
,
insert_len_sd
))
else
:
os
.
system
(
cmd
.
format
(
prefix
,
chr0
,
""
,
coverage
,
prg_path
+
os
.
path
.
sep
,
read_len
,
insert_len_mean
,
insert_len_sd
))
def
main
():
args
=
parse_args
()
reference
=
args
.
reference
sv_list
=
args
.
sv_list
output_dir
=
args
.
output_directory
tmp_dir
=
tempfile
.
mkdtemp
()
haploid
=
args
.
haploid
nb_inds
=
args
.
nb_inds
if
nb_inds
<
2
:
raise
Exception
(
"
nb-inds must be at least 2
"
)
if
not
os
.
path
.
isfile
(
reference
+
"
.fai
"
):
os
.
system
(
"
samtools faidx
"
+
reference
)
prg_path
=
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
))
if
not
os
.
path
.
isdir
(
output_dir
):
os
.
mkdir
(
output_dir
)
################
# Launch steps #
################
get_random_deletions
(
sv_list
,
reference
,
tmp_dir
,
prg_path
)
bed_file
=
os
.
path
.
join
(
tmp_dir
,
"
reference-sv.bed
"
)
build_bed_deletions_file
(
bed_file
,
tmp_dir
)
output_vcf
=
os
.
path
.
join
(
output_dir
,
"
genotypes.vcf
"
)
genotypes_for_inds
=
build_genotypes_vcf_list
(
bed_file
,
output_vcf
,
haploid
,
args
.
force_polymorphism
,
nb_inds
,
tmp_dir
,
prg_path
)
build_fastas_chromosomes
(
reference
,
genotypes_for_inds
,
haploid
,
output_dir
)
generate_samples_fastq
(
haploid
,
nb_inds
,
output_dir
,
args
.
coverage
,
args
.
read_len
,
args
.
insert_len_mean
,
args
.
insert_len_sd
,
prg_path
)
print
(
"
DONE!
\n
"
)
if
__name__
==
'
__main__
'
:
try
:
sys
.
exit
(
main
())
except
Exception
as
e
:
print
(
e
)
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment