Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
W
workflows
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Admin message
A compter du 1er avril, attention à vos pipelines :
Nouvelles limitations de Docker Hub
Show more breadcrumbs
Cervin Guyomar
workflows
Commits
ec072ab9
Commit
ec072ab9
authored
6 years ago
by
mariabernard
Browse files
Options
Downloads
Patches
Plain Diff
improve filter_multi_sample to take into account GATK specificity
parent
756ae24e
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Snakemake/IMAGE_vqsr/script/filter_multi_sample_vcf.py
+66
-53
66 additions, 53 deletions
Snakemake/IMAGE_vqsr/script/filter_multi_sample_vcf.py
with
66 additions
and
53 deletions
Snakemake/IMAGE_vqsr/script/filter_multi_sample_vcf.py
+
66
−
53
View file @
ec072ab9
#!/usr/bin/env python3
## Martijn Derks
## Martijn Derks
from Wageningen University, Netherlands and Maria Bernard from INRA, France
## Filter on average depth per sample set genotypes to missing
from
collections
import
OrderedDict
import
sys
import
gzip
import
vcf
import
argparse
parser
=
argparse
.
ArgumentParser
(
description
=
'
Script to set genotype calls to missing if they do not contain coverage requirements
'
)
parser
.
add_argument
(
"
-v
"
,
"
--vcf_file
"
,
help
=
"
VCF_file (compressed)
"
,
nargs
=
1
)
parser
.
add_argument
(
"
-b
"
,
"
--bam_coverage
"
,
help
=
"
Tab delimited file with average depth per sample
"
,
nargs
=
1
)
parser
.
add_argument
(
"
-o
"
,
"
--output_vcf
"
,
help
=
"
Output_VCF (gzipped)
"
,
nargs
=
1
)
class
FILTER_VCF
:
def
avg_depth_per_sample
(
self
,
table
):
self
.
sample_avg_depth_dic
=
{}
depth_file
=
open
(
table
,
"
r
"
)
for
sample
in
depth_file
:
sample_id
,
sample_avg_depth
=
sample
.
strip
().
split
()[
0
],
sample
.
split
()[
1
]
self
.
sample_avg_depth_dic
[
sample_id
]
=
float
(
sample_avg_depth
.
replace
(
'
X
'
,
''
))
## Dictionary with avg depth per sample
def
avg_depth_per_sample
(
self
,
table
):
self
.
sample_avg_depth_dic
=
{}
depth_file
=
open
(
table
,
"
r
"
)
for
sample
in
depth_file
:
sample_id
,
sample_avg_depth
=
sample
.
strip
().
split
()[
0
],
sample
.
split
()[
1
]
self
.
sample_avg_depth_dic
[
sample_id
]
=
float
(
sample_avg_depth
.
replace
(
'
X
'
,
''
))
## Dictionary with avg depth per sample
def
filter_depth_per_sample
(
self
,
vcf
,
outfile
):
vcf_outfile
=
gzip
.
open
(
outfile
,
"
wb
"
)
total_set_to_missing
=
0
nsites
=
0
with
gzip
.
open
(
vcf
,
'
r
'
)
as
vcf_reader
:
for
record
in
vcf_reader
:
filtered_call
=
record
.
strip
().
split
(
"
\t
"
)
if
not
record
.
startswith
(
"
#
"
):
record
=
record
.
strip
().
split
(
"
\t
"
)
filtered_call
=
record
[
0
:
9
]
sample_count
=
0
nsites
+=
1
for
call
in
record
[
9
:]:
if
not
call
.
startswith
(
"
.
"
):
# if not call.startswith(".:."):
sample
=
samples
[
sample_count
]
depth
=
call
.
split
(
"
:
"
)[
1
]
avg_depth_times_2
=
float
(
self
.
sample_avg_depth_dic
[
sample
])
*
2.5
if
int
(
depth
)
<
4
:
call
=
"
.:.:.
"
total_set_to_missing
+=
1
if
float
(
depth
)
>
avg_depth_times_2
:
## Set to missing if depth > avg depth*3
call
=
"
.:.:.
"
total_set_to_missing
+=
1
else
:
pass
#missing
filtered_call
.
append
(
call
)
sample_count
+=
1
elif
record
.
startswith
(
"
#CHROM
"
):
samples
=
record
.
split
()[
9
:]
vcf_outfile
.
write
(
"
\t
"
.
join
(
filtered_call
)
+
"
\n
"
)
print
(
vcf
,
"
: Avg set to missing per site:
"
,
total_set_to_missing
/
nsites
)
def
filter_depth_per_sample
(
self
,
vcf
,
outfile
):
vcf_outfile
=
gzip
.
open
(
outfile
,
"
wt
"
)
total_set_to_missing_un
=
0
total_set_to_missing_low
=
0
total_set_to_missing_high
=
0
nsites
=
0
format_idx
=
None
depth_idx
=
None
with
gzip
.
open
(
vcf
,
'
rt
'
)
as
vcf_reader
:
for
record
in
vcf_reader
:
filtered_call
=
record
.
strip
().
split
(
"
\t
"
)
# variants line parsing
if
not
record
.
startswith
(
"
#
"
):
record
=
record
.
strip
().
split
(
"
\t
"
)
filtered_call
=
record
[
0
:
9
]
sample_count
=
0
nsites
+=
1
# identify depth in genotype format
if
depth_idx
is
None
:
depth_idx
=
record
[
format_idx
].
split
(
"
:
"
).
index
(
"
DP
"
)
# parse samples genotype calling
for
call
in
record
[
9
:]:
# if known
if
not
call
.
startswith
(
"
.
"
):
sample
=
samples
[
sample_count
]
depth
=
call
.
split
(
"
:
"
)[
depth_idx
]
avg_depth_times_2
=
float
(
self
.
sample_avg_depth_dic
[
sample
])
*
2.5
if
depth
==
"
.
"
:
call
=
"
.:.:.
"
total_set_to_missing_un
+=
1
elif
int
(
depth
)
<
4
:
call
=
"
.:.:.
"
total_set_to_missing_low
+=
1
elif
float
(
depth
)
>
avg_depth_times_2
:
## Set to missing if depth > avg depth*3
call
=
"
.:.:.
"
total_set_to_missing_high
+=
1
else
:
pass
#missing
filtered_call
.
append
(
call
)
sample_count
+=
1
# parsing genotype header line
elif
record
.
startswith
(
"
#CHROM
"
):
samples
=
record
.
split
()[
9
:]
format_idx
=
record
.
split
(
"
\t
"
).
index
(
"
FORMAT
"
)
vcf_outfile
.
write
(
"
\t
"
.
join
(
filtered_call
)
+
"
\n
"
)
print
(
vcf
,
"
: Avg set to missing per site:
"
,
(
total_set_to_missing_un
+
total_set_to_missing_low
+
total_set_to_missing_high
)
/
nsites
,
"
unknown depth :
"
,
total_set_to_missing_un
,
"
low depth :
"
,
total_set_to_missing_low
,
"
high depth :
"
,
total_set_to_missing_high
)
if
__name__
==
"
__main__
"
:
F
=
FILTER_VCF
()
args
=
parser
.
parse_args
()
vcf
=
args
.
vcf_file
[
0
]
coverage_table
=
args
.
bam_coverage
[
0
]
out
=
args
.
output_vcf
[
0
]
F
.
avg_depth_per_sample
(
coverage_table
)
F
.
filter_depth_per_sample
(
vcf
,
out
)
parser
=
argparse
.
ArgumentParser
(
description
=
'
Script to set genotype calls to missing if they do not contain coverage requirements
'
)
parser
.
add_argument
(
"
-v
"
,
"
--vcf_file
"
,
help
=
"
VCF_file (compressed)
"
)
parser
.
add_argument
(
"
-b
"
,
"
--bam_coverage
"
,
help
=
"
Tab delimited file with average depth per sample
"
)
parser
.
add_argument
(
"
-o
"
,
"
--output_vcf
"
,
help
=
"
Output_VCF (gzipped)
"
)
args
=
parser
.
parse_args
()
F
=
FILTER_VCF
()
F
.
avg_depth_per_sample
(
args
.
bam_coverage
)
F
.
filter_depth_per_sample
(
args
.
vcf_file
,
args
.
output_vcf
)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment