From 4645bf7cfb76bd799090380f23c444e7620a9af4 Mon Sep 17 00:00:00 2001 From: Thomas Faraut <Thomas.Faraut@inra.fr> Date: Wed, 27 Nov 2019 14:23:30 +0100 Subject: [PATCH] modified annotation --- svfilter.py | 50 +++++++++++++++---------- svreader/Annotate.py | 87 ++++++++++++++++++++++++++++++++++++++++---- svreader/__init__.py | 29 +-------------- 3 files changed, 111 insertions(+), 55 deletions(-) diff --git a/svfilter.py b/svfilter.py index abd885f..4a568eb 100644 --- a/svfilter.py +++ b/svfilter.py @@ -253,12 +253,30 @@ def setLikeliTag(genotyper): exit(1) -def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2, +def add_redundancy_infos_header(reader): + # Adding DUPLICATESCORE info (equivalent to GSDUPLICATESCORE) + reader.addInfo("DUPLICATESCORE", 1, "Float", + "LOD score that the events are distinct based on the " + "genotypes of the most discordant sample") + # Adding DUPLICATES info (equivalent to GSDUPLICATEOVERLAP) + reader.addInfo("DUPLICATEOVERLAP", 1, "Float", + "Highest overlap with a duplicate event") + # Adding DUPLICATEOVERLAP info (equivalent to GSDUPLICATEOVERLAP) + reader.addInfo("DUPLICATES", ".", "String", + "List of duplicate events preferred to this one") + # Adding NONDUPLICATEOVERLAP + reader.addInfo("NONDUPLICATEOVERLAP", 1, "Float", + "Amount of overlap with a non-duplicate") + + +def GenomeSTRIPLikeRedundancyAnnotator(SVSet, reader, + duplicatescore_threshold=-2, genotyper="svtyper"): """ Annotating duplicate candidates based on the genotype likelihoods - genotype likelihoods can be provided by svtyper or genomestrip """ + add_redundancy_infos_header(reader) setLikeliTag(genotyper) variants = defaultdict() @@ -344,7 +362,16 @@ def add_overlap_info_sv(sv, overlap, duplicatescore): sv.record.info['NONDUPLICATEOVERLAP'] = overlap -def GenomeSTRIPLikefiltering(SVSet): +def add_filter_infos_header(reader): + # FILTERS + # Adding specific filters + reader.addFilter("CALLRATE", "Call rate <0.75") + reader.addFilter("MONOMORPH", "All samples have the same genotype") + reader.addFilter("DUPLICATE", "GSDUPLICATESCORE>0") + reader.addFilter("OVERLAP", "NONDUPLICATEOVERLAP>0.7") + + +def GenomeSTRIPLikefiltering(SVSet, reader): """ Filtering the candidate CNVs according to the following criteria - non duplicate sites - variant sites @@ -355,30 +382,15 @@ def GenomeSTRIPLikefiltering(SVSet): (use NONVARIANTSCORE in both cases) """ + add_filter_infos_header(reader) + for sv in SVSet: info = sv.record.info if callRate(sv) < 0.75: sv.filter.add("CALLRATE") if not Polymorph(sv): sv.filter.add("MONOMORPH") - # if info['GSDUPLICATESCORE'] and info['GSDUPLICATESCORE']>0: - # sv.filter.add("DUPLICATE") - # DUPLICATE if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7: sv.filter.add("OVERLAP") - if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2: sv.filter.add("DUPLICATE") - # NONVARIANT - if 'GSNONVARSCORE' in info and info['GSNONVARSCORE'] and info['GSNONVARSCORE'] > 13: - sv.filter.add("NONVARIANT") - # GTDEPTH - if 'GSDEPTHRATIO' in info and (info['GSDEPTHRATIO'] is None or info['GSDEPTHRATIO'] > 0.8): - sv.filter.add("DEPTH") - if 'DEPTHRATIO' in info and (isnan(info['DEPTHRATIO']) or info['DEPTHRATIO'] > 0.8): - sv.filter.add("DDEPTH") - if 'DEPTHCALLTHRESHOLD' in info and info['DEPTHCALLTHRESHOLD'] > 1: - sv.filter.add("CCOVERAGE") - # CLUSTERSEP - if 'GSM1' in info and (info['GSM1'] <= -0.5 or info['GSM1'] >= 2.0): - sv.filter.add("CLUSTERSEP") diff --git a/svreader/Annotate.py b/svreader/Annotate.py index d6bb22a..66d6813 100644 --- a/svreader/Annotate.py +++ b/svreader/Annotate.py @@ -1,6 +1,7 @@ - -from svreader import SVRecord, SVReader, SVWriter +import sys +from svrunner_utils import eprint +from svreader import SVReader, SVWriter from pysam import VariantFile @@ -8,6 +9,7 @@ HOM_REF = (0, 0) HET_VAR = (0, 1) HOM_VAR = (1, 1) + class AnnotatedRecord(object): """ A lightweight object to annotated the final records @@ -56,7 +58,49 @@ class AnnotatedRecord(object): @property def samples(self): - return self.record.samples + return [AnnotatedRecordSample(s) for s in self.record.samples.values()] + + def setNewId(self, identifier): + try: + self.record.info['SOURCEID'] = self.id + except KeyError: + eprint("SOURCEID absent from record info keys,") + sys.exit(1) + self.id = identifier + + def num_variant_samples(self): + return sum([s.isVariant for s in self.samples]) + + def variant_read_support(self): + return sum([s.AltSupport() for s in self.samples]) + + def qual(self): + return sum([s.SQ_score() for s in self.samples if s.isVariant]) + + def setQual(self): + self.record.qual = self.qual() + + def AddSupportInfos(self): + supp_reads = self.variant_read_support() + num_supp_samples = self.num_variant_samples() + try: + self.record.info['SUPP_READS'] = supp_reads + self.record.info['NUM_SUPP_SAMPLES'] = num_supp_samples + except KeyError: + eprint("SUPP_READS or NUM_SUPP_SAMPLES absent from record info keys") + sys.exit(1) + + def UnifiedPass(self): + """ + All records passing the filters (PASS, .) ar now labelled PASS + """ + record = self.record + filters = [f for f in record.filter] + # We make the assumption when a "." is present no other filter + # are present + if len(filters) == 0 or "." in filters: + record.filter.clear() + record.filter.add("PASS") class AnnotatedRecordSample(object): @@ -66,25 +110,41 @@ class AnnotatedRecordSample(object): def __init__(self, variantsample): self.variantsample = variantsample + def genotype(self): + return self.variantsample.get('GT') + + def SQ_score(self): + return self.variantsample.get('SQ') + + def AltSupport(self): + return self.variantsample.get('AO')[0] + + @property + def isVariant(self): + if self.genotype() in [HET_VAR, HOM_VAR]: + return True + else: + return False + class VCFReader(SVReader): def __init__(self, file_name, sv_to_report=None): - super(VCFReader, self).__init__(file_name, - generic_name, - reference_handle) + super(VCFReader, self).__init__(file_name) self.vcf_reader = VariantFile(file_name) self.filename = file_name - self.sv_to_report =sv_to_report + self.sv_to_report = sv_to_report self.vcf_reader = VariantFile(file_name) + self.add_metadata() + def __iter__(self): return self def __next__(self): while True: raw_record = next(self.vcf_reader) - record = (raw_record) + record = AnnotatedRecord(raw_record) return record def getHeader(self): @@ -102,6 +162,17 @@ class VCFReader(SVReader): type=None, description=description) + def add_metadata(self): + self.addInfo("SOURCEID", 1, "String", + "The source sv identifier") + self.addInfo("SUPP_READS", 1, "Integer", + "Number of supporting reads") + self.addInfo("NUM_SUPP_SAMPLES", 1, "Integer", + "Number of supporting samples") + self.addFilter("LOWSUPPORT", + "total supp reads < 5 or supp samples < 2") + + def getOrderedSamples(self): samples = self.vcf_reader.header.samples sample_names = [sample.rsplit('.')[0] for sample in samples] diff --git a/svreader/__init__.py b/svreader/__init__.py index be3e22c..2309f13 100644 --- a/svreader/__init__.py +++ b/svreader/__init__.py @@ -228,34 +228,7 @@ class SVReader(object): if excluded_regions is not None: vcf_records = self._filter_for_gaps(vcf_records, excluded_regions, cutoff_proximity) - # Convert vcf to intervals - # eprint("Converting to bedtools intervals") - # variants = vcf_to_pybed(vcf_records, excluded_regions, - # cutoff_proximity) - # - # # Identify SV overlapping regions to exclude - # eprint("Loading exlusion intervals ") - # eprint(excluded_regions) - # toexclude = BedTool(excluded_regions).sort() - # excluded = dict() - # if len(toexclude): - # eprint("Computing distance with gaps: %d gaps" % len(toexclude)) - # # removing cnv in a close proximity to gaps in the assembly - # # -d=True In addition to the closest feature in B, - # # report its distance to A as an extra column - # # TODO this is questionnable ??? (remove this for genomestrip ?) - # for sv in variants.closest(toexclude, d=True): - # if (sv[4] != "." - # and sv[4] != "-1" - # and float(sv[-1]) < cutoff_proximity): - # excluded[sv[3]] = 1 - # - # vcf_records_filtered = [] - # for record in vcf_records: - # if record.id not in excluded: - # vcf_records_filtered.append(record) - # vcf_records = vcf_records_filtered - + return vcf_records def GeneralFilterPass(self, record, minlen, maxlen): -- GitLab