Skip to content
Snippets Groups Projects
Commit ec072ab9 authored by mariabernard's avatar mariabernard
Browse files

improve filter_multi_sample to take into account GATK specificity

parent 756ae24e
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
## Martijn Derks
## Martijn Derks from Wageningen University, Netherlands and Maria Bernard from INRA, France
## Filter on average depth per sample set genotypes to missing
from collections import OrderedDict
import sys
import gzip
import vcf
import argparse
parser = argparse.ArgumentParser( description='Script to set genotype calls to missing if they do not contain coverage requirements')
parser.add_argument("-v", "--vcf_file", help="VCF_file (compressed)", nargs=1)
parser.add_argument("-b", "--bam_coverage", help="Tab delimited file with average depth per sample", nargs=1)
parser.add_argument("-o", "--output_vcf", help="Output_VCF (gzipped)", nargs=1)
class FILTER_VCF:
def avg_depth_per_sample(self, table):
self.sample_avg_depth_dic = {}
depth_file = open(table,"r")
for sample in depth_file:
sample_id, sample_avg_depth = sample.strip().split()[0], sample.split()[1]
self.sample_avg_depth_dic[sample_id] = float(sample_avg_depth.replace('X','')) ## Dictionary with avg depth per sample
def avg_depth_per_sample(self, table):
self.sample_avg_depth_dic = {}
depth_file = open(table,"r")
for sample in depth_file:
sample_id, sample_avg_depth = sample.strip().split()[0], sample.split()[1]
self.sample_avg_depth_dic[sample_id] = float(sample_avg_depth.replace('X','')) ## Dictionary with avg depth per sample
def filter_depth_per_sample(self, vcf, outfile):
vcf_outfile = gzip.open(outfile,"wb")
total_set_to_missing = 0
nsites = 0
with gzip.open(vcf,'r') as vcf_reader:
for record in vcf_reader:
filtered_call = record.strip().split("\t")
if not record.startswith("#"):
record = record.strip().split("\t")
filtered_call = record[0:9]
sample_count = 0
nsites += 1
for call in record[9:]:
if not call.startswith("."):
# if not call.startswith(".:."):
sample = samples[sample_count]
depth = call.split(":")[1]
avg_depth_times_2 = float(self.sample_avg_depth_dic[sample])*2.5
if int(depth) < 4:
call = ".:.:."
total_set_to_missing += 1
if float(depth) > avg_depth_times_2: ## Set to missing if depth > avg depth*3
call = ".:.:."
total_set_to_missing += 1
else:
pass #missing
filtered_call.append(call)
sample_count += 1
elif record.startswith("#CHROM"):
samples = record.split()[9:]
vcf_outfile.write("\t".join(filtered_call)+"\n")
print( vcf, ": Avg set to missing per site: ", total_set_to_missing/nsites)
def filter_depth_per_sample(self, vcf, outfile):
vcf_outfile = gzip.open(outfile,"wt")
total_set_to_missing_un = 0
total_set_to_missing_low = 0
total_set_to_missing_high = 0
nsites = 0
format_idx = None
depth_idx = None
with gzip.open(vcf,'rt') as vcf_reader:
for record in vcf_reader:
filtered_call = record.strip().split("\t")
# variants line parsing
if not record.startswith("#"):
record = record.strip().split("\t")
filtered_call = record[0:9]
sample_count = 0
nsites += 1
# identify depth in genotype format
if depth_idx is None:
depth_idx = record[format_idx].split(":").index("DP")
# parse samples genotype calling
for call in record[9:]:
# if known
if not call.startswith("."):
sample = samples[sample_count]
depth = call.split(":")[depth_idx]
avg_depth_times_2 = float(self.sample_avg_depth_dic[sample])*2.5
if depth == "." :
call = ".:.:."
total_set_to_missing_un += 1
elif int(depth) < 4 :
call = ".:.:."
total_set_to_missing_low += 1
elif float(depth) > avg_depth_times_2: ## Set to missing if depth > avg depth*3
call = ".:.:."
total_set_to_missing_high += 1
else:
pass #missing
filtered_call.append(call)
sample_count += 1
# parsing genotype header line
elif record.startswith("#CHROM"):
samples = record.split()[9:]
format_idx = record.split("\t").index("FORMAT")
vcf_outfile.write("\t".join(filtered_call)+"\n")
print( vcf, ": Avg set to missing per site: ", (total_set_to_missing_un + total_set_to_missing_low + total_set_to_missing_high)/nsites, "unknown depth : ", total_set_to_missing_un, "low depth : ", total_set_to_missing_low, "high depth : ", total_set_to_missing_high)
if __name__=="__main__":
F=FILTER_VCF()
args = parser.parse_args()
vcf = args.vcf_file[0]
coverage_table = args.bam_coverage[0]
out = args.output_vcf[0]
F.avg_depth_per_sample(coverage_table)
F.filter_depth_per_sample(vcf, out)
parser = argparse.ArgumentParser( description='Script to set genotype calls to missing if they do not contain coverage requirements')
parser.add_argument("-v", "--vcf_file", help="VCF_file (compressed)")
parser.add_argument("-b", "--bam_coverage", help="Tab delimited file with average depth per sample")
parser.add_argument("-o", "--output_vcf", help="Output_VCF (gzipped)")
args = parser.parse_args()
F=FILTER_VCF()
F.avg_depth_per_sample(args.bam_coverage)
F.filter_depth_per_sample(args.vcf_file, args.output_vcf)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment