Skip to content
Snippets Groups Projects
Commit 49a6acef authored by Maria Bernard's avatar Maria Bernard
Browse files

add genotype filtering step

parent 2fa878be
No related branches found
No related tags found
No related merge requests found
......@@ -33,9 +33,10 @@ for rule in rule_resources:
for resource in rule_resources[rule]:
check_param(rule,resource)
# add tabix in PATH
# add tabix and filter_multi_sample_vcf.py in PATH
tabix_dir=os.path.dirname(config['bin']['tabix'])
os.environ['PATH'] = tabix_dir + os.pathsep + os.environ['PATH']
script_dir=os.path.abspath('script')
##########################################################
### workflows options
......@@ -67,6 +68,7 @@ include : "rules/intersect_vcf.smk"
include : "rules/split_vcf.smk"
include : "rules/hardfiltering.smk"
include : "rules/vqsr.smk"
include : "rules/genoFilter.smk"
localrules: all
......@@ -82,6 +84,8 @@ if gatk_prefix.endswith('vcf') :
gatk_prefix = os.path.splitext(gatk_prefix)[0]
final_outputs.append("results/VQSR/" + gatk_prefix +"_SNP_filtered.vcf.gz")
final_outputs.append("results/VQSR/" + gatk_prefix +"_INDEL_filtered.vcf.gz")
final_outputs.append("results/genoFilter/" + gatk_prefix +"vqsr_SNP_genFiltered.vcf.gz")
final_outputs.append("results/genoFilter/" + gatk_prefix +"vqsr_INDEL_genFiltered.vcf.gz")
#~ print(final_outputs)
......
......@@ -12,6 +12,15 @@ resources: "resources_SLURM.yaml"
# Fasta reference genome
reference_genome: data/ensembl_gallus_gallus_1.fasta
# Tab delimited file with average depth per sample.
# If you used IMAGE_calling pipeline, for the mapping step, you could use the following command line to obtain it:
# for sample in `ls results/qualimap/*`
# do
# cov=`grep "mean coverageData" results/qualimap/$sample/genome_results.txt | awk '{print $NF}'`
# echo -e $sample"\t"$cov >> sample_mean_cov.tsv
# done
sample_mean_cov : data/sample_mean_cov.tsv
# Freebayes calling, vcf.gz, tabix index must be available
Freebayes_variants : data/variants_freebayes.vcf.gz
......
# coding: utf-8
__author__ = "Kenza BAZI KABBAJ / Maria BERNARD- Sigenae"
__version__ = "1.0.0"
"""
Rules to filtered genotypes called from tool little or too much reads
"""
rule genoFilter:
input:
sample_cov = config["sample_mean_cov"],
vcf = "results/VQSR/{GATK_input_prefix}_{var}_filtered.vcf.gz"
output:
vcf = temp("results/VQSR/{GATK_input_prefix}_vqsr_{var}_genFiltered.vcf.gz")
shell:
"""
filter_multi_sample_vcf.py --vcf_file {input.vcf} --bam_coverage {input.sample_cov} --output_vcf {output.vcf}
"""
\ No newline at end of file
#!/usr/bin/env python3
## Martijn Derks
## Filter on average depth per sample set genotypes to missing
from collections import OrderedDict
import sys
import gzip
import vcf
import argparse
parser = argparse.ArgumentParser( description='Script to set genotype calls to missing if they do not contain coverage requirements')
parser.add_argument("-v", "--vcf_file", help="VCF_file (compressed)", nargs=1)
parser.add_argument("-b", "--bam_coverage", help="Tab delimited file with average depth per sample", nargs=1)
parser.add_argument("-o", "--output_vcf", help="Output_VCF (gzipped)", nargs=1)
class FILTER_VCF:
def avg_depth_per_sample(self, table):
self.sample_avg_depth_dic = {}
depth_file = open(table,"r")
for sample in depth_file:
sample_id, sample_avg_depth = sample.strip().split()[0], sample.split()[1]
self.sample_avg_depth_dic[sample_id] = float(sample_avg_depth.replace('X','')) ## Dictionary with avg depth per sample
def filter_depth_per_sample(self, vcf, outfile):
vcf_outfile = gzip.open(outfile,"wb")
total_set_to_missing = 0
nsites = 0
with gzip.open(vcf,'r') as vcf_reader:
for record in vcf_reader:
filtered_call = record.strip().split("\t")
if not record.startswith("#"):
record = record.strip().split("\t")
filtered_call = record[0:9]
sample_count = 0
nsites += 1
for call in record[9:]:
if not call.startswith("."):
# if not call.startswith(".:."):
sample = samples[sample_count]
depth = call.split(":")[1]
avg_depth_times_2 = float(self.sample_avg_depth_dic[sample])*2.5
if int(depth) < 4:
call = ".:.:."
total_set_to_missing += 1
if float(depth) > avg_depth_times_2: ## Set to missing if depth > avg depth*3
call = ".:.:."
total_set_to_missing += 1
else:
pass #missing
filtered_call.append(call)
sample_count += 1
elif record.startswith("#CHROM"):
samples = record.split()[9:]
vcf_outfile.write("\t".join(filtered_call)+"\n")
print( vcf, ": Avg set to missing per site: ", total_set_to_missing/nsites)
if __name__=="__main__":
F=FILTER_VCF()
args = parser.parse_args()
vcf = args.vcf_file[0]
coverage_table = args.bam_coverage[0]
out = args.output_vcf[0]
F.avg_depth_per_sample(coverage_table)
F.filter_depth_per_sample(vcf, out)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment