Newer
Older
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from svrunner_utils import eprint
from svreader.vcfwrapper import VCFReader, VCFWriter
from svfilter import GenomeSTRIPLikeRedundancyAnnotator
from svfilter import GenomeSTRIPLikefiltering
def add_metadata(reader, add_infos=True):
if add_infos:
# INFOS
# Adding DEPTHRATIO info (equivalent to GSDEPTHRATIO)
reader.addInfo("DEPTHRATIO", 1, "Float", "Read depth ratio between "
"cases and controls")
# Adding DEPTHCALLTHRESHOLD (equivalent to GSDEPTHCALLTHRESHOLD)
reader.addInfo("DEPTHCALLTHRESHOLD", 1, "Float",
"Read depth ratio between cases and controls")
# Adding DUPLICATESCORE info (equivalent to GSDUPLICATESCORE)
reader.addInfo("DUPLICATESCORE", 1, "Float",
"LOD score that the events are distinct based on the "
"genotypes of the most discordant sample")
# Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
reader.addInfo("DUPLICATEOVERLAP", 1, "Float",
"Highest overlap with a duplicate event")
# Adding DUPLICATEOVERLAP info (equivalent to GSDUPLICATEOVERLAP)
reader.addInfo("DUPLICATES", ".", "String",
"List of duplicate events preferred to this one")
# Adding NONDUPLICATEOVERLAP
reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float",
"Amount of overlap with a non-duplicate")
# FILTERS
# Adding specific filters
reader.addFilter("CALLRATE", "Call rate <0.75")
reader.addFilter("POLYMORPH", "All samples have the same genotype")
reader.addFilter("ALIGNLENGTH", "GSELENGTH < 200")
reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0")
reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7")
reader.addFilter("NONVARIANT", "GSNONVARSCORE>13")
reader.addFilter("DDEPTH", "DEPTHRATIO>0.8 (genomestrip-like)")
reader.addFilter("CCOVERAGE", "DEPTHCALLTHRESHOLD>1 (genomestrip-like)")
reader.addFilter("CLUSTERSEP", "GSM1<=-0.5 || GSM1>=2 "
"(genomestrip best practice)")
def filtering(inputfile, outputfile, genotyper, add_infos=True, overlap_cutoff=0.5):
""" Filtering the candidate CNVs according to the following criteria
- non duplicate sites
- variant sites
- call rate > 0.8
- at least one variant (homozygous or heterozygote) has a
genotype quality > 20
- the variant is not everywhere heterozygote or homozygote
(use NONVARIANTSCORE in both cases)
- sv with no overlap with CNVR filtered out by genomeSTRIP
"""
# Reading the vcf file
SVSet = []
eprint(" Reading file %s" % (inputfile))
reader = VCFReader(inputfile, "merge")
for record in reader:
SVSet.append(record)
eprint("Working with " + str(len(SVSet)) + " records")
UnifiedPassAnnotation(SVSet)
# Redundancy annotation
GenomeSTRIPLikeRedundancyAnnotator(SVSet, genotyper=genotyper)
# Now genomeSTRIP inspired variant filtration
if reader.numSamples() > 0:
GenomeSTRIPLikefiltering(SVSet)
VCFout = VCFWriter(outputfile, reader)
for record in sorted(SVSet, key=lambda k: k.start):
VCFout.write(record)
VCFout.close()
def UnifiedPassAnnotation(SVSet):
"""
All records passing the filters (PASS, .) ar now labelled PASS
"""
for record in SVSet:
filters = [f for f in record.filter]
# We make the assumption when a "." is present no other filter
# are present
# if no filter or "." is present in filters
if len(filters) == 0 or "." in filters:
record.filter.clear()
record.filter.add("PASS")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
prog="filter.py",
description="Filter by applying flags to variants")
parser.add_argument("-i", "--input-vcf", required=True,
help="Genotypes vcf input file")
parser.add_argument("-o", "--output-vcf", required=True,
help="Filtered vcf output file")
parser.add_argument("-g", "--genotyper", required=True,
help="Genotyper tool")
parser.add_argument("-k", "--keep-infos", type=bool, const=True, nargs="?",
required=False, default=False,
help="Don't add infos to VCF, as they already exists")
args = parser.parse_args()
filtering(inputfile=args.input_vcf,
outputfile=args.output_vcf,
genotyper=args.genotyper,
add_infos=not args.keep_infos)