From ec072ab9bd56a44c5daad50f9ef7df0483bd882a Mon Sep 17 00:00:00 2001 From: mariabernard <maria.bernard@jouy.inra.fr> Date: Mon, 1 Apr 2019 10:43:59 +0200 Subject: [PATCH] improve filter_multi_sample to take into account GATK specificity --- .../script/filter_multi_sample_vcf.py | 119 ++++++++++-------- 1 file changed, 66 insertions(+), 53 deletions(-) diff --git a/Snakemake/IMAGE_vqsr/script/filter_multi_sample_vcf.py b/Snakemake/IMAGE_vqsr/script/filter_multi_sample_vcf.py index ad4a715..f4d410b 100755 --- a/Snakemake/IMAGE_vqsr/script/filter_multi_sample_vcf.py +++ b/Snakemake/IMAGE_vqsr/script/filter_multi_sample_vcf.py @@ -1,68 +1,81 @@ #!/usr/bin/env python3 -## Martijn Derks +## Martijn Derks from Wageningen University, Netherlands and Maria Bernard from INRA, France ## Filter on average depth per sample set genotypes to missing from collections import OrderedDict import sys import gzip -import vcf import argparse -parser = argparse.ArgumentParser( description='Script to set genotype calls to missing if they do not contain coverage requirements') -parser.add_argument("-v", "--vcf_file", help="VCF_file (compressed)", nargs=1) -parser.add_argument("-b", "--bam_coverage", help="Tab delimited file with average depth per sample", nargs=1) -parser.add_argument("-o", "--output_vcf", help="Output_VCF (gzipped)", nargs=1) - class FILTER_VCF: - def avg_depth_per_sample(self, table): - self.sample_avg_depth_dic = {} - depth_file = open(table,"r") - for sample in depth_file: - sample_id, sample_avg_depth = sample.strip().split()[0], sample.split()[1] - self.sample_avg_depth_dic[sample_id] = float(sample_avg_depth.replace('X','')) ## Dictionary with avg depth per sample + def avg_depth_per_sample(self, table): + self.sample_avg_depth_dic = {} + depth_file = open(table,"r") + for sample in depth_file: + sample_id, sample_avg_depth = sample.strip().split()[0], sample.split()[1] + self.sample_avg_depth_dic[sample_id] = float(sample_avg_depth.replace('X','')) ## Dictionary with avg depth per sample - def filter_depth_per_sample(self, vcf, outfile): - vcf_outfile = gzip.open(outfile,"wb") - total_set_to_missing = 0 - nsites = 0 - with gzip.open(vcf,'r') as vcf_reader: - for record in vcf_reader: - filtered_call = record.strip().split("\t") - if not record.startswith("#"): - record = record.strip().split("\t") - filtered_call = record[0:9] - sample_count = 0 - nsites += 1 - for call in record[9:]: - if not call.startswith("."): -# if not call.startswith(".:."): - sample = samples[sample_count] - depth = call.split(":")[1] - avg_depth_times_2 = float(self.sample_avg_depth_dic[sample])*2.5 - if int(depth) < 4: - call = ".:.:." - total_set_to_missing += 1 - if float(depth) > avg_depth_times_2: ## Set to missing if depth > avg depth*3 - call = ".:.:." - total_set_to_missing += 1 - else: - pass #missing - filtered_call.append(call) - sample_count += 1 - elif record.startswith("#CHROM"): - samples = record.split()[9:] - vcf_outfile.write("\t".join(filtered_call)+"\n") - print( vcf, ": Avg set to missing per site: ", total_set_to_missing/nsites) + def filter_depth_per_sample(self, vcf, outfile): + vcf_outfile = gzip.open(outfile,"wt") + total_set_to_missing_un = 0 + total_set_to_missing_low = 0 + total_set_to_missing_high = 0 + nsites = 0 + format_idx = None + depth_idx = None + + with gzip.open(vcf,'rt') as vcf_reader: + for record in vcf_reader: + filtered_call = record.strip().split("\t") + # variants line parsing + if not record.startswith("#"): + record = record.strip().split("\t") + filtered_call = record[0:9] + sample_count = 0 + nsites += 1 + # identify depth in genotype format + if depth_idx is None: + depth_idx = record[format_idx].split(":").index("DP") + + # parse samples genotype calling + for call in record[9:]: + # if known + if not call.startswith("."): + sample = samples[sample_count] + depth = call.split(":")[depth_idx] + avg_depth_times_2 = float(self.sample_avg_depth_dic[sample])*2.5 + if depth == "." : + call = ".:.:." + total_set_to_missing_un += 1 + elif int(depth) < 4 : + call = ".:.:." + total_set_to_missing_low += 1 + elif float(depth) > avg_depth_times_2: ## Set to missing if depth > avg depth*3 + call = ".:.:." + total_set_to_missing_high += 1 + else: + pass #missing + filtered_call.append(call) + sample_count += 1 + # parsing genotype header line + elif record.startswith("#CHROM"): + samples = record.split()[9:] + format_idx = record.split("\t").index("FORMAT") + + vcf_outfile.write("\t".join(filtered_call)+"\n") + print( vcf, ": Avg set to missing per site: ", (total_set_to_missing_un + total_set_to_missing_low + total_set_to_missing_high)/nsites, "unknown depth : ", total_set_to_missing_un, "low depth : ", total_set_to_missing_low, "high depth : ", total_set_to_missing_high) if __name__=="__main__": - F=FILTER_VCF() - args = parser.parse_args() - vcf = args.vcf_file[0] - coverage_table = args.bam_coverage[0] - out = args.output_vcf[0] - - F.avg_depth_per_sample(coverage_table) - F.filter_depth_per_sample(vcf, out) + + parser = argparse.ArgumentParser( description='Script to set genotype calls to missing if they do not contain coverage requirements') + parser.add_argument("-v", "--vcf_file", help="VCF_file (compressed)") + parser.add_argument("-b", "--bam_coverage", help="Tab delimited file with average depth per sample") + parser.add_argument("-o", "--output_vcf", help="Output_VCF (gzipped)") + args = parser.parse_args() + F=FILTER_VCF() + + F.avg_depth_per_sample(args.bam_coverage) + F.filter_depth_per_sample(args.vcf_file, args.output_vcf) -- GitLab