new filters for annotation

18c31896 · Thomas Faraut · c1c226a5 · 18c31896 · 18c31896 · 18c31896
Commit 18c31896 authored 5 years ago by Thomas Faraut
--- a/snakecnv/bin/annotate.py
+++ b/snakecnv/bin/annotate.py
@@ -10,12 +10,26 @@ from svfilter import GenomeSTRIPLikefiltering
 from svfilter import AnnotateSupport


+def add_metadata(reader):  # calls reader.addInfo / reader.addFilter for SOURCEID (INFO) and LOWSUPPORT (FILTER); presumably header declarations — TODO confirm
+    reader.addInfo("SOURCEID", 1, "String",
+                   "The source sv identifier")
+    reader.addFilter("LOWSUPPORT", "total supp reads < 5 or supp samples < 2")  # NOTE(review): text hard-codes the FilteringBySupport defaults (5 reads, 2 samples)
+
+
 def setNewId(record, identifier):
    record.record.info['SOURCEID'] = record.id
-    record.id = id
+    record.id = identifier  # fix: the removed line assigned `id` (not a parameter here; the builtin unless rebound) instead of `identifier`
+

+def FilteringBySupport(SVSet, minsupp_reads=5, min_supp_sampes=2):  # adds the LOWSUPPORT filter to every sv below either threshold
+    for sv in SVSet:
+        info = sv.record.info  # assumes SUPP_READS and NUM_SUPP_SAMPLES are already set (presumably by AnnotateSupport) — TODO confirm
+        if (info["SUPP_READS"] < minsupp_reads or  # strict '<': a value equal to the minimum is not flagged
+                info["NUM_SUPP_SAMPLES"] < min_supp_sampes):  # NOTE(review): "sampes" looks like a typo for "samples"; renaming would change the keyword name
+            sv.filter.add("LOWSUPPORT")

-def annotate(inputfile, outputfile, genotyper, add_infos=True,
+
+def annotate(inputfile, outputfile, genotyper, chrom, add_infos=True,  # NOTE(review): chrom is inserted before add_infos — positional callers of the old signature must be updated
             overlap_cutoff=0.5):
    """ Filtering the candidate CNVs according to the following criteria
          - non duplicate sites
@@ -32,20 +46,22 @@ def annotate(inputfile, outputfile, genotyper, add_infos=True,
    eprint(" Reading file %s" % (inputfile))
    reader = VCFReader(inputfile, "merge")

-    # Adding metadata : INFO and FILTER
-    add_metadata(reader, add_infos)
+    add_metadata(reader)  # add_infos is no longer forwarded here

+    num = 1  # 1-based running index used as the last component of each new id
    for record in reader:
-        setNewId(record)
+        ident = "_".join(["svpipeline_", chrom, record.svtype, str(num)])  # NOTE(review): trailing "_" in the prefix plus the "_" separator gives "svpipeline__<chrom>_..." — confirm the double underscore is intended
+        setNewId(record, ident)
        SVSet.append(record)
+        num += 1
    eprint("Working with " + str(len(SVSet)) + " records")

-    # Support annotation
-    AnnotateSupport(SVSet)
-
    # PASS and "." will be now marked PASS
    UnifiedPassAnnotation(SVSet)

+    # Support annotation: now runs after UnifiedPassAnnotation and also receives the reader
+    AnnotateSupport(SVSet, reader)
+
    # Redundancy annotation
    GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader, genotyper=genotyper)

@@ -53,6 +69,9 @@ def annotate(inputfile, outputfile, genotyper, add_infos=True,
    if reader.numSamples() > 0:
        GenomeSTRIPLikefiltering(SVSet, reader)

+    # Flag records with too few supporting reads or samples (adds LOWSUPPORT, default thresholds)
+    FilteringBySupport(SVSet)
+
    VCFout = VCFWriter(outputfile, reader)
    for record in sorted(SVSet, key=lambda k: k.start):
        VCFout.write(record)

--- a/snakecnv/common.smk
+++ b/snakecnv/common.smk
@@ -505,7 +505,10 @@ class MergeBatches(Detection):

    def get_all_outputs(self, wildcards):
        outputs = ["Summarized_results.html"]
-        outputs += expand("filtered/{svtype}/svtyper_{chrom}_{svtype}_genotypes_filtered.vcf.gz",
+        # Requested outputs used to be the genotype-filtered VCFs:
+        #   filtered/{svtype}/svtyper_{chrom}_{svtype}_genotypes_filtered.vcf.gz
+        # They are replaced by the annotated *_final VCFs (same chrom / non-mCNV svtype expansion).
+        outputs += expand("annotated/{svtype}/svtyper_{chrom}_{svtype}_final.vcf.gz",
                          chrom=self.chromosomes,
                          svtype=[x for x in self.varianttypes if x != "mCNV"])
        return outputs

--- a/snakecnv/rules/mergingbatches.smk
+++ b/snakecnv/rules/mergingbatches.smk
@@ -185,7 +185,7 @@ def get_root_path():

 rule summary:
    input:
-        expand("filtered/{svtype}/svtyper_{chrom}_{svtype}_genotypes_filtered.vcf.gz",
+        expand("annotated/{svtype}/svtyper_{chrom}_{svtype}_final.vcf.gz",
               chrom=cmd.chromosomes, svtype=variant_types)
    params:
        tpl = os.path.join(get_root_path(), "full.tpl"),