Skip to content
Snippets Groups Projects
Commit 785c7324 authored by Thomas Faraut's avatar Thomas Faraut
Browse files

- new criteria for selecting the representative of duplicate calls (based on SQ)

- remove sorting contig based on contig names when adding contigs info to vcf file
parent 2f124943
No related branches found
No related tags found
No related merge requests found
......@@ -116,6 +116,13 @@ def gstrength(u):
return np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()])
def variantstrength(u):
# maximum SQ, where SQ stands for
# Phred-scaled probability that this site is variant (non-reference
# in this sample)
return max([s['SQ'] if s['SQ'] is not None else 0 for s in u.samples.values()])
def ondiagonal(s, u, v):
# Probability that, for that individual, the two SVs are identically
# genotyped P1(0/0)*P2(0/0) + ... P1(1/1)*P2(1/1)
......@@ -168,7 +175,7 @@ def duplicatescore(u, v):
return computed
def getduplicates(u, v):
def getduplicatesOld(u, v):
# select the prefered duplicate
# returns prefered, discarded, strength of both
if gstrength(u) > gstrength(v):
......@@ -177,6 +184,15 @@ def getduplicates(u, v):
return (v, u, gstrength(v), gstrength(u))
def getduplicates(u, v):
# select the prefered duplicate
# returns prefered, discarded, strength of both
if variantstrength(u) > variantstrength(v):
return (u, v, variantstrength(u), variantstrength(v))
else:
return (v, u, variantstrength(v), variantstrength(u))
def GenotypeQuals(u):
# the genotype qualities as an array
return [u.samples[s]['GQ'] if u.samples[s]['GQ'] is not None else 0
......@@ -247,6 +263,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2,
seen = defaultdict(tuple)
duplicates = defaultdict(list)
overlapping = defaultdict(tuple)
reference = defaultdict()
for o in self_overlap:
if o[3] == o[7]:
continue
......@@ -259,6 +276,7 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2,
score = duplicatescore(u, v)
if score > duplicatescore_threshold:
ref, dupli, s1, s2 = getduplicates(u, v)
reference[ref] = 1
overlap_size = int(o[-1])
overlap = getoverlap(dupli, overlap_size)
if maxGQ(ref) > 0 and passed(dupli):
......@@ -286,8 +304,10 @@ def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2,
for o in overlapping:
info = overlapping[o]
u, v = o
add_overlap_info_sv(u, info['overlap_left'], info['dupli_score'])
add_overlap_info_sv(v, info['overlap_right'], info['dupli_score'])
if u not in reference:
add_overlap_info_sv(u, info['overlap_left'], info['dupli_score'])
if v not in reference:
add_overlap_info_sv(v, info['overlap_right'], info['dupli_score'])
def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates):
......
......@@ -11,6 +11,7 @@ from collections import defaultdict
from pybedtools import BedTool, create_interval_from_list
from pysam import AlignmentFile
import numpy as np
import natsort
VERBOSE = True
......@@ -129,6 +130,7 @@ def addContigInfosTovcf(vcffile, outvcffile, reference):
"""
# compute contig information
contigs = get_contigs(reference)
filename, extension = os.path.splitext(outvcffile)
if extension == ".gz":
outputfile = filename
......@@ -142,9 +144,12 @@ def addContigInfosTovcf(vcffile, outvcffile, reference):
if re.search(r"^##contig", line) is not None:
continue
if re.search(r"^#CHROM", line):
for ctg in sorted(contigs, key=lambda x: chromNum(x.name)):
for ctg in contigs:
fout.write('##contig=<ID=%s,length=%d>\n'
% (ctg.name, ctg.length))
# for ctg in sorted(contigs,
# key=lambda x: chromNum(x.name)):
fout.write(line)
# bgzip file
if gzip:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment