Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
popsim
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Admin message
A compter du 1er avril, attention à vos pipelines :
Nouvelles limitations de Docker Hub
Show more breadcrumbs
SVdetection
popsim
Commits
b654d97e
Commit
b654d97e
authored
8 years ago
by
Floreal Cabanettes
Browse files
Options
Downloads
Patches
Plain Diff
Make sure there is always polymorph
parent
e8bcd0a4
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
build_pop.py
+29
-17
29 additions, 17 deletions
build_pop.py
with
29 additions
and
17 deletions
build_pop.py
+
29
−
17
View file @
b654d97e
#!/usr/bin/env python3
import
os
import
shutil
import
random
import
argparse
from
collections
import
OrderedDict
...
...
@@ -27,8 +26,9 @@ tmp_dir = args.tmp_directory
if
not
os
.
path
.
isfile
(
reference
+
"
.fai
"
):
os
.
system
(
"
samtools faidx
"
+
reference
)
def
allele
(
freq
):
return
1
if
random
.
uniform
(
0
,
1
)
<
freq
else
0
def
allele
(
frequency
):
return
1
if
random
.
uniform
(
0
,
1
)
<
frequency
else
0
prg_path
=
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
))
...
...
@@ -48,7 +48,7 @@ os.system(os.path.join(prg_path, "SVsim -W -i {0} -r {1} -o {2}{3}reference-sv".
###################################
#
2. Build BED files of deletions #
#
2. Build BED files of deletions #
###################################
with
open
(
os
.
path
.
join
(
tmp_dir
,
"
reference-sv.bed
"
),
"
w
"
)
as
bed
:
...
...
@@ -60,20 +60,20 @@ with open(os.path.join(tmp_dir, "reference-sv.bed"), "w") as bed:
###############################################################################
#
3. Build VCF files containing genotypes for the given number of individuals #
#
3. Build VCF files containing genotypes for the given number of individuals #
###############################################################################
#
VCF file is a result file, not used by this script
#
VCF file is a result file, not used by this script
print
(
"
GENERATE SUMMARY VCF FILE...
\n
"
)
genotypes_for_inds
=
OrderedDict
()
# { chr: id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...]}}
genotypes_for_inds
=
OrderedDict
()
# { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}
# Build VCF header:
with
open
(
os
.
path
.
join
(
prg_path
,
"
template.vcf
"
),
"
r
"
)
as
template
:
with
open
(
os
.
path
.
join
(
tmp_dir
,
"
template.vcf
"
),
"
w
"
)
as
my_template
:
for
line
in
template
:
if
line
[:
6
]
==
"
#CHROM
"
:
#line += "\t" + "\t".join(list((("INDIV_" + str(i)) for i in range(0, int(args.nb_inds)))))
line
=
line
.
replace
(
"
\n
"
,
""
)
for
i
in
range
(
0
,
int
(
args
.
nb_inds
)):
line
+=
"
\t
INDIV_
"
+
str
(
i
+
1
)
...
...
@@ -90,11 +90,22 @@ with open(os.path.join("tmp", "reference-sv.bed"), "r") as bed:
genotypes_for_inds
[
parts
[
0
]]
=
{}
genotypes_for_inds
[
parts
[
0
]][
parts
[
3
]]
=
{
"
start
"
:
int
(
parts
[
1
]),
"
end
"
:
int
(
parts
[
2
]),
"
genotypes
"
:
[]}
genotypes
=
[]
for
i
in
range
(
1
,
int
(
args
.
nb_inds
)
+
1
):
genotype
=
str
(
allele
(
freq
))
+
"
/
"
+
str
(
allele
(
freq
))
genotypes_for_inds
[
parts
[
0
]][
parts
[
3
]][
"
genotypes
"
].
append
(
genotype
.
split
(
"
/
"
))
genotype_data
=
vcf
.
model
.
_Call
(
None
,
"
INDIV_
"
+
str
(
i
),
vcf
.
model
.
make_calldata_tuple
(
"
GT
"
)(
GT
=
genotype
))
genotypes
.
append
(
genotype_data
)
polymorph
=
False
while
not
polymorph
:
unique_genotypes
=
set
()
all_genotypes
=
[]
genotypes_tmp
=
[]
for
i
in
range
(
1
,
int
(
args
.
nb_inds
)
+
1
):
genotype
=
str
(
allele
(
freq
))
+
"
/
"
+
str
(
allele
(
freq
))
genotype_data
=
vcf
.
model
.
_Call
(
None
,
"
INDIV_
"
+
str
(
i
),
vcf
.
model
.
make_calldata_tuple
(
"
GT
"
)(
GT
=
genotype
))
genotypes_tmp
.
append
(
genotype_data
)
unique_genotypes
.
add
(
genotype
)
all_genotypes
.
append
(
genotype
)
polymorph
=
len
(
unique_genotypes
)
>
1
if
polymorph
:
genotypes
+=
genotypes_tmp
genotypes_for_inds
[
parts
[
0
]][
parts
[
3
]][
"
genotypes
"
]
=
[
x
.
split
(
"
/
"
)
for
x
in
all_genotypes
]
info
=
{
"
END
"
:
int
(
parts
[
2
]),
"
AF
"
:
freq
}
vcf_record
=
vcf
.
model
.
_Record
(
parts
[
0
],
int
(
parts
[
1
]),
parts
[
3
],
"
N
"
,
[
vcf
.
model
.
_SV
(
"
DEL
"
)],
"
.
"
,
"
.
"
,
info
,
"
GT
"
,
[
0
],
genotypes
)
vcf_writer
.
write_record
(
vcf_record
)
...
...
@@ -107,6 +118,7 @@ with open(os.path.join("tmp", "reference-sv.bed"), "r") as bed:
print
(
"
BUILD FASTA GENOME FOR EACH INDIVIDUAL...
\n
"
)
def
svsort
(
sv
,
chr
):
"""
Function to sort regions
...
...
@@ -119,7 +131,7 @@ fasta_orig = SeqIO.index(reference, "fasta")
for
chr
,
svs_infos
in
genotypes_for_inds
.
items
():
print
(
"
PROCESSING CHROMOSOME {0}...
\n
"
.
format
(
chr
))
svs
=
list
(
svs_infos
.
keys
())
svs
=
sorted
(
svs
,
key
=
lambda
x
:
svsort
(
x
,
chr
))
...
...
@@ -149,10 +161,10 @@ for chr, svs_infos in genotypes_for_inds.items():
id_chrs
=
list
(
chrs_dips
.
keys
())
id_chrs
=
sorted
(
id_chrs
)
for
id_chr
in
id_chrs
:
chr_dip
=
chrs_dips
[
id_chr
]
#
SVs for each diploid chromosome
chr_dip
=
chrs_dips
[
id_chr
]
#
SVs for each diploid chromosome
logs_regions
=
[]
# Store logs
#
Build FASTA and store logs:
#
Build FASTA and store logs:
with
open
(
os
.
path
.
join
(
output_dir
,
"
INDIV_
"
+
str
(
indiv
+
1
)
+
"
_chr_
"
+
str
(
id_chr
)
+
"
.fasta
"
),
"
a
"
)
as
output_handle
:
fasta
=
""
last_one
=
0
...
...
@@ -173,7 +185,7 @@ for chr, svs_infos in genotypes_for_inds.items():
print
(
"
GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...
\n
"
)
#
Generate reads for all individuals:
#
Generate reads for all individuals:
cmd
=
"
{4}pirs/pirs simulate -z -x {3} -d -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz -I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1} {2}
"
for
i
in
range
(
1
,
int
(
args
.
nb_inds
)
+
1
):
prefix
=
os
.
path
.
join
(
output_dir
,
"
INDIV_
"
+
str
(
i
))
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment