diff --git a/Snakemake/IMAGE_vqsr/Snakefile b/Snakemake/IMAGE_vqsr/Snakefile index c8840a03f05025b8c30f121fbdd450a8d59f1cac..0a77d9c9b77090dd74920a6d1f79d95bc60e4740 100644 --- a/Snakemake/IMAGE_vqsr/Snakefile +++ b/Snakemake/IMAGE_vqsr/Snakefile @@ -33,9 +33,10 @@ for rule in rule_resources: for resource in rule_resources[rule]: check_param(rule,resource) -# add tabix in PATH +# add tabix and filter_multi_sample_vcf.py in PATH tabix_dir=os.path.dirname(config['bin']['tabix']) os.environ['PATH'] = tabix_dir + os.pathsep + os.environ['PATH'] +script_dir=os.path.abspath('script') ########################################################## ### workflows options @@ -67,6 +68,7 @@ include : "rules/intersect_vcf.smk" include : "rules/split_vcf.smk" include : "rules/hardfiltering.smk" include : "rules/vqsr.smk" +include : "rules/genoFilter.smk" localrules: all @@ -82,6 +84,8 @@ if gatk_prefix.endswith('vcf') : gatk_prefix = os.path.splitext(gatk_prefix)[0] final_outputs.append("results/VQSR/" + gatk_prefix +"_SNP_filtered.vcf.gz") final_outputs.append("results/VQSR/" + gatk_prefix +"_INDEL_filtered.vcf.gz") +final_outputs.append("results/genoFilter/" + gatk_prefix +"vqsr_SNP_genFiltered.vcf.gz") +final_outputs.append("results/genoFilter/" + gatk_prefix +"vqsr_INDEL_genFiltered.vcf.gz") #~ print(final_outputs) diff --git a/Snakemake/IMAGE_vqsr/config.yaml b/Snakemake/IMAGE_vqsr/config.yaml index c9d5d4bdc24dfc63a62a6f172b7808f415359f94..319c20cc1e02feadd988f35b413e9f7a8078459b 100644 --- a/Snakemake/IMAGE_vqsr/config.yaml +++ b/Snakemake/IMAGE_vqsr/config.yaml @@ -12,6 +12,15 @@ resources: "resources_SLURM.yaml" # Fasta reference genome reference_genome: data/ensembl_gallus_gallus_1.fasta +# Tab delimited file with average depth per sample. +# If you used IMAGE_calling pipeline, for the mapping step, you could use the following command line to obtain it: +# for sample in `ls results/qualimap/*` +# do +# cov=`grep "mean coverageData" results/qualimap/$sample/genome_results.txt | awk '{print $NF}'` +# echo -e $sample"\t"$cov >> sample_mean_cov.tsv +# done +sample_mean_cov : data/sample_mean_cov.tsv + # Freebayes calling, vcf.gz, tabix index must be available Freebayes_variants : data/variants_freebayes.vcf.gz diff --git a/Snakemake/IMAGE_vqsr/rules/genoFilter.smk b/Snakemake/IMAGE_vqsr/rules/genoFilter.smk new file mode 100644 index 0000000000000000000000000000000000000000..c830f45ba4a8e69dc49c756f7bb01a210daad59d --- /dev/null +++ b/Snakemake/IMAGE_vqsr/rules/genoFilter.smk @@ -0,0 +1,19 @@ +# coding: utf-8 + +__author__ = "Kenza BAZI KABBAJ / Maria BERNARD- Sigenae" +__version__ = "1.0.0" + +""" +Rules to filtered genotypes called from tool little or too much reads +""" + +rule genoFilter: + input: + sample_cov = config["sample_mean_cov"], + vcf = "results/VQSR/{GATK_input_prefix}_{var}_filtered.vcf.gz" + output: + vcf = temp("results/VQSR/{GATK_input_prefix}_vqsr_{var}_genFiltered.vcf.gz") + shell: + """ + filter_multi_sample_vcf.py --vcf_file {input.vcf} --bam_coverage {input.sample_cov} --output_vcf {output.vcf} + """ \ No newline at end of file diff --git a/Snakemake/IMAGE_vqsr/script/filter_multi_sample_vcf.py b/Snakemake/IMAGE_vqsr/script/filter_multi_sample_vcf.py new file mode 100755 index 0000000000000000000000000000000000000000..ad4a715281764463633f15b0dd2ea288532e989b --- /dev/null +++ b/Snakemake/IMAGE_vqsr/script/filter_multi_sample_vcf.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +## Martijn Derks +## Filter on average depth per sample set genotypes to missing + +from collections import OrderedDict +import sys +import gzip +import vcf +import argparse + +parser = argparse.ArgumentParser( description='Script to set genotype calls to missing if they do not contain coverage requirements') +parser.add_argument("-v", "--vcf_file", help="VCF_file (compressed)", nargs=1) +parser.add_argument("-b", "--bam_coverage", help="Tab delimited file with average depth per sample", nargs=1) +parser.add_argument("-o", "--output_vcf", help="Output_VCF (gzipped)", nargs=1) + +class FILTER_VCF: + + def avg_depth_per_sample(self, table): + self.sample_avg_depth_dic = {} + depth_file = open(table,"r") + for sample in depth_file: + sample_id, sample_avg_depth = sample.strip().split()[0], sample.split()[1] + self.sample_avg_depth_dic[sample_id] = float(sample_avg_depth.replace('X','')) ## Dictionary with avg depth per sample + + def filter_depth_per_sample(self, vcf, outfile): + vcf_outfile = gzip.open(outfile,"wb") + total_set_to_missing = 0 + nsites = 0 + with gzip.open(vcf,'r') as vcf_reader: + for record in vcf_reader: + filtered_call = record.strip().split("\t") + if not record.startswith("#"): + record = record.strip().split("\t") + filtered_call = record[0:9] + sample_count = 0 + nsites += 1 + for call in record[9:]: + if not call.startswith("."): +# if not call.startswith(".:."): + sample = samples[sample_count] + depth = call.split(":")[1] + avg_depth_times_2 = float(self.sample_avg_depth_dic[sample])*2.5 + if int(depth) < 4: + call = ".:.:." + total_set_to_missing += 1 + if float(depth) > avg_depth_times_2: ## Set to missing if depth > avg depth*3 + call = ".:.:." + total_set_to_missing += 1 + else: + pass #missing + filtered_call.append(call) + sample_count += 1 + elif record.startswith("#CHROM"): + samples = record.split()[9:] + vcf_outfile.write("\t".join(filtered_call)+"\n") + print( vcf, ": Avg set to missing per site: ", total_set_to_missing/nsites) + +if __name__=="__main__": + F=FILTER_VCF() + args = parser.parse_args() + vcf = args.vcf_file[0] + coverage_table = args.bam_coverage[0] + out = args.output_vcf[0] + + F.avg_depth_per_sample(coverage_table) + F.filter_depth_per_sample(vcf, out) + +