Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import logging
import os
from collections import defaultdict
from pysam import VariantFile
from svreader import SVRecord, SVReader, SVWriter
from svrunner_utils import eprint, vcf_to_pybed
logger = logging.getLogger(__name__)
mydir = os.path.dirname(os.path.realpath(__file__))
'''
Delly output is a vcf file, fields are described in the header
#using awk
cat vcf |
awk '/##FORMAT/{ last=index($0,","); id=index($0,"ID"); \
desc=substr($0,match($0,"Description")+12) ; gsub(/"/, "", desc); \
printf "%-30s %s\n", substr($0,id+3,last-id-3), \
substr(desc,0,length(desc)-1)}'
FILTER
PASS All filters passed
LowQual PE/SR support below 3 or mapping quality below 20.
INFO
CIEND PE confidence interval around END
CIPOS PE confidence interval around POS
CHR2 Chromosome for END coordinate in case of a translocation
END End position of the structural variant
PE Paired-end support of the structural variant
MAPQ Median mapping quality of paired-ends
SR Split-read supportSpecificFilterPass
SRQ Split-read consensus alignment quality
CONSENSUS Split-read consensus sequence
CE Consensus sequence entropy
CT Paired-end signature induced connection type
IMPRECISE Imprecise structural variation
PRECISE Precise structural variation
SVTYPE Type of structural variant
SVMETHOD Type of approach used to detect SV
INSLEN Predicted length of the insertion
HOMLEN Predicted microhomology length using a max. edit distance of 2
RDRATIO Read-depth ratio of het. SV carrier vs. non-carrier.
FORMAT
GT Genotype
GL Log10-scaled genotype likelihoods for RR,RA,AA genotypes
GQ Genotype Quality
FT Per-sample genotype filter
RC Raw high-quality read counts for the SV
RCL Raw high-quality read counts for the left control region
RCR Raw high-quality read counts for the right control region
CN Read-depth based copy-number estimate for autosomal sites
DR high-quality reference pairs
DV high-quality variant pairs
RR high-quality reference junction reads
RV high-quality variant junction reads
'''
delly_name = "delly"
delly_source = set([delly_name])
class DellyRecord(SVRecord):
def __init__(self, record, sample_order, reference_handle=None):
# Custom id
record.id = "_".join([delly_name, record.chrom, record.id])
super(DellyRecord, self).__init__(delly_name,
record.chrom,
record.pos,
record.stop,
record.id,
ref=record.ref,
qual=record.qual,
filter=record.filter)
self.__record = record
self._sv_type = record.info["SVTYPE"]
self.__pe = record.info["PE"]
self.__sr = record.info["SR"] if "SR" in list(record.info.keys()) else 0
self.__ct = record.info["CT"]
self.__record.info['SR'] = self.__sr
sv = {}
for sample in record.samples:
sample_name = sample.rsplit('.')[0]
sv[sample_name] = record.samples[sample]
self.sv = sv
self.__record.info['VSAMPLES'] = ",".join(self.variantSamples())
@property
def record(self):
return self.__record
@property
def start(self):
return self.record.pos
@property
def end(self):
return self.record.stop
@property
def pe(self):
return self.__pe
@property
def sr(self):
return self.__sr
@property
def ct(self):
return self.__ct
def update(self, start, end, pe_supp):
self.record.pos = start
self.record.stop = end
self.record.info['PE'] = pe_supp
def variantSample(self, sample):
if self.sv[sample]["DV"] > 0 or self.sv[sample]["RV"] > 0:
return True
else:
return False
def variantSamples(self):
variant_samples = []
for sample in self.sv:
if self.variantSample(sample):
variant_samples.append(sample)
return sorted(variant_samples)
def MaxIndSupportingRP(self):
''' Maximum number of supporting PE among individuals.
'''
max_ind_supp = 0
su = 0
rep_sample = ""
for sample in self.sv:
dv = self.sv[sample]["DV"]
su += dv
if dv > max_ind_supp:
rep_sample = sample
max_ind_supp = dv
if max_ind_supp == 0:
ratio = 0
else:
total = self.sv[rep_sample]["DV"] + self.sv[rep_sample]["DR"]
ratio = float(self.sv[rep_sample]["DV"])/total
return su, max_ind_supp, ratio
def to_svcf_record(self):
return self.record
class DellyReader(SVReader):
svs_supported = set(["DEL", "INS", "DUP", "INV"])
def __init__(self, file_name, reference_handle=None, svs_to_report=None):
super(DellyReader, self).__init__(file_name,
delly_name,
reference_handle)
self.vcf_reader = VariantFile(file_name)
if 'VSAMPLES' not in self.vcf_reader.header.info.keys():
self.vcf_reader.header.info.add('VSAMPLES',
number='.',
type='String',
description='Samples with variant alleles')
self.svs_supported = DellyReader.svs_supported
if svs_to_report is not None:
self.svs_supported &= set(svs_to_report)
def __iter__(self):
return self
def __next__(self):
while True:
delly_record = next(self.vcf_reader)
record = DellyRecord(delly_record, self.reference_handle)
if record.sv_type in self.svs_supported:
return record
def getHeader(self):
return self.vcf_reader.header
def getOrderedSamples(self):
samples = self.vcf_reader.header.samples
sample_names = [sample.rsplit('.')[0] for sample in samples]
return sample_names
def SpecificFilterPass(self, record):
# Filtering criteria more than 5 PE or more than SR and PE > 0
# see http://bcb.io/2014/08/12/validated-whole-genome-structural-variation-detection-using-multiple-callers
# TODO : should we modifiy this in order to account for individual
# specific filtering (e.g pe > 5 at least for an individual)
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
def bnd_merge(sef, svtype, records):
"""
Merging delly two lines inversion format according to the
CT info field (distal connection)
3to3 tail-to-tail
5to5 head-to-head
CT types are separated and for each 3to3, the corresponding
5to5 is searched for (bedtools intersect with >90% overlap)
A single line corresponding to the intersection of both
signature is kept
"""
if svtype != "INV":
return records
# a dictionnary of records with id as key
records_dict = defaultdict()
records_CT = defaultdict(list)
for r in records:
records_dict[r.id] = r
records_CT[r.ct].append(r)
eprint("Converting to bedtools intervals")
tail2tail = vcf_to_pybed(records_CT["3to3"])
head2head = vcf_to_pybed(records_CT["5to5"])
# TODO check if the following code really recover balanced inversions
balanced = tail2tail.intersect(head2head, r=True, f=0.95, wo=True)
confirmed = []
seen = defaultdict()
for b in balanced:
start = int(b[5]) # tail
end = int(b[2]) # head
t2t = records_dict[b[3]]
h2h = records_dict[b[7]]
repre = t2t
if repre in seen:
continue
seen[repre] = 1
pe_supp = t2t.record.info['PE'] + h2h.record.info['PE']
print("merging %s and %s" % (t2t, h2h))
repre.update(start, end, pe_supp)
confirmed.append(repre)
return confirmed
class DellyWriter(SVWriter):
def __init__(self, file_name, reference_contigs, template_reader):
super(DellyWriter, self).__init__(file_name,
template_reader.tool_name,
template_reader)
def _open(self):
self.vcf_writer = VariantFile(self.filename, 'w',
header=self.template_reader.getHeader())
self._isopen = True
def _write(self, record):
self.vcf_writer.write(record.record)
def _close(self):
if self._isopen:
self.vcf_writer.close()
else: # nothing was written
self._dumpemptyvcf()