filter.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys

from svrunner_utils import eprint

from svreader.vcfwrapper import VCFReader, VCFWriter

from svfilter import GenomeSTRIPLikeRedundancyAnnotator
from svfilter import GenomeSTRIPLikefiltering


def add_metadata(reader, add_infos=True):

    if add_infos:

        # INFOS
        # Adding DEPTHRATIO info (equivalent to GSDEPTHRATIO)
        reader.addInfo("DEPTHRATIO", 1, "Float", "Read depth ratio between "
                                                 "cases and controls")
        # Adding  DEPTHCALLTHRESHOLD (equivalent to GSDEPTHCALLTHRESHOLD)
        reader.addInfo("DEPTHCALLTHRESHOLD", 1, "Float",
                       "Read depth ratio between cases and controls")
        # Adding DUPLICATESCORE info (equivalent to GSDUPLICATESCORE)
        reader.addInfo("DUPLICATESCORE", 1, "Float",
                       "LOD score that the events are distinct based on the "
                       "genotypes of the most discordant sample")
        # Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP)
        reader.addInfo("DUPLICATEOVERLAP", 1, "Float",
                       "Highest overlap with a duplicate event")
        # Adding DUPLICATEOVERLAP info (equivalent to GSDUPLICATEOVERLAP)
        reader.addInfo("DUPLICATES", ".", "String",
                       "List of duplicate events preferred to this one")
        # Adding NONDUPLICATEOVERLAP
        reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float",
                       "Amount of overlap with a non-duplicate")

        # FILTERS
        # Adding specific filters
        reader.addFilter("CALLRATE", "Call rate <0.75")
        reader.addFilter("POLYMORPH", "All samples have the same genotype")
        reader.addFilter("ALIGNLENGTH", "GSELENGTH < 200")
        reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0")
        reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7")
        reader.addFilter("NONVARIANT", "GSNONVARSCORE>13")
        reader.addFilter("DDEPTH", "DEPTHRATIO>0.8 (genomestrip-like)")
        reader.addFilter("CCOVERAGE", "DEPTHCALLTHRESHOLD>1 (genomestrip-like)")
        reader.addFilter("CLUSTERSEP", "GSM1<=-0.5 || GSM1>=2 "
                         "(genomestrip best practice)")


def filtering(inputfile, outputfile, genotyper, add_infos=True, overlap_cutoff=0.5):
    """ Filtering the candidate CNVs according to the following criteria
          - non duplicate sites
          - variant sites
          - call rate > 0.8
          - at least one variant (homozygous or heterozygote) has a
            genotype quality > 20
          - the variant is not everywhere heterozygote or homozygote
            (use NONVARIANTSCORE in both cases)
          - sv with no overlap with CNVR filtered out by genomeSTRIP
    """
    # Reading the vcf file
    SVSet = []
    eprint(" Reading file %s" % (inputfile))
    reader = VCFReader(inputfile, "merge")

    add_metadata(reader, add_infos)

    for record in reader:
        SVSet.append(record)
    eprint("Working with " + str(len(SVSet)) + " records")

    # PASS and "." will be now marked PASS
    UnifiedPassAnnotation(SVSet)

    # Redundancy annotation
    GenomeSTRIPLikeRedundancyAnnotator(SVSet, genotyper=genotyper)

    # Now genomeSTRIP inspired variant filtration
    if reader.numSamples() > 0:
        GenomeSTRIPLikefiltering(SVSet)

    VCFout = VCFWriter(outputfile, reader)
    for record in sorted(SVSet, key=lambda k: k.start):
        VCFout.write(record)
    VCFout.close()


def UnifiedPassAnnotation(SVSet):
    """
       All records passing the filters (PASS, .) ar now labelled PASS
    """
    for record in SVSet:
        filters = [f for f in record.filter]
        # We make the assumption when a "." is present no other filter
        # are present
        # if no filter or "." is present in filters
        if len(filters) == 0 or "." in filters:
            record.filter.clear()
            record.filter.add("PASS")

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        prog="filter.py",
        description="Filter by applying flags to variants")
    parser.add_argument("-i", "--input-vcf", required=True,
                        help="Genotypes vcf input file")
    parser.add_argument("-o", "--output-vcf", required=True,
                        help="Filtered vcf output file")
    parser.add_argument("-g", "--genotyper", required=True,
                        help="Genotyper tool")
    parser.add_argument("-k", "--keep-infos", type=bool, const=True, nargs="?",
                        required=False, default=False,
                        help="Don't add infos to VCF, as they already exists")

    args = parser.parse_args()

    filtering(inputfile=args.input_vcf,
              outputfile=args.output_vcf,
              genotyper=args.genotyper,
              add_infos=not args.keep_infos)