import logging
import sys
import os
import gzip
import vcf
from svrunner_utils import smart_open
from svreader.vcf_utils import get_template, get_template_header
from svreader import SVRecord, SVReader, SVWriter
from pysam import VariantFile, VariantHeader
logger = logging.getLogger(__name__)
mydir = os.path.dirname(os.path.realpath(__file__))
'''
from http://gmt.genome.wustl.edu/packages/pindel/user-manual.html
There is a head line for each variant reported, followed by the alignment of supporting reads to the reference on the
second line. The example variants are a 1671bp deletion and a 10bp insertion on chr20. The breakpoints are specified
after "BP". Due to microhomology around the breakpoints, the breakpoint coordinates may shift both upstream and
downstream; 'BP_range' is used to indicate the range of this shift. The header line contains the following data:
1) The index of the indel/SV (57 means that 57 insertions precede this insertion in the file)
2) The type of indel/SV: I for insertion, D for deletion, INV for inversion, TD for tandem duplication
3) The length of the SV
4) "NT" (to indicate that the next number is the length of non-template sequences inserted; insertions are fully covered
by the NT-fields, deletions can have NT bases if the deletion is not 'pure', meaning that while bases have been
deleted, some bases have been inserted between the breakpoints)
5) the length(s) of the NT fragment(s)
6) the sequence(s) of the NT fragment(s)
7-8) the identifier of the chromosome the read was found on
9-10-11) BP: the start and end positions of the SV
12-13-14) BP_range: if the exact position of the SV is unclear since bases at the edge of one read-half could equally
well be appended to the other read-half. In the deletion example, ACA could be on any side of the gap, so the original
deletion could have been between 1337143 and 1338815, between 1337144 and 1338816, between 1337145 and 1338817, or
between 1337146 and 1338818. BP_range is used to indicate this range.
15) "Supports": announces that the total count of reads supporting the SV follow.
16) The number of reads supporting the SV
17) The number of unique reads supporting the SV (so not counting duplicate reads)
18) +: supports from reads whose anchors are upstream of the SV
19-20) total number of supporting reads and unique number of supporting reads whose anchors are upstream of the SV.
21) -: supports from reads whose anchors are downstream of the SV
22-23) total number of supporting reads and unique number of supporting reads whose anchors are downstream of the SV
24-25) S1: a simple score, ("# +" + 1) * ("# -" + 1);
26-27) SUM_MS: sum of mapping qualities of anchor reads. Reads carrying the variant (or unmapped) are called split
reads; their mates are called anchor reads. Anchor reads are used to narrow down the search space, to speed up the
search and increase sensitivity;
28) the number of different samples scanned
29-30-31) NumSupSamples?: the number of samples supporting the SV, as well as the number of samples having unique
reads supporting the SV (in practice, these numbers are the same)
32+) Per sample: the sample name, followed by the total number of supporting reads whose anchors are upstream, the
total number of unique supporting reads whose anchors are upstream, the total number of supporting reads whose anchors
are downstream, and finally the total number of unique supporting reads whose anchors are downstream.
WARNING see below
----
The reported larger insertion (LI) record is rather different from the other variant types. Here is the format:
1) index,
2) type(LI),
3) "ChrID",
4) chrName,
5) left breakpoint,
6) "+"
7) number of supporting reads for the left coordinate,
8) right breakpoint,
9) "-"
10) number of supporting reads for the right coordinate.
The following lines are repeated for each sample:
11) Sample name
12) "+"
13) upstream supporting reads
14) "-"
15) downstream supporting reads
'''
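# Illustrative mapping of the 0-based column indices used by PindelRecord
# below onto the fields documented above:
#
#   fields = head_line.split()
#   fields[0]               -> index of the indel/SV
#   fields[1]               -> type ("I", "D", "INV", "TD", ...)
#   fields[2]               -> SV length
#   fields[4]               -> length of the inserted non-template sequence
#   fields[7]               -> chromosome name (after "ChrID")
#   fields[9], fields[10]   -> start and end positions (after "BP")
#   fields[12], fields[13]  -> BP_range bounds
#   fields[15], fields[16]  -> total and unique supporting read counts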
'''
Newer versions of pindel change the output starting from field 32+.
Check https://trac.nbic.nl/pipermail/pindel-users/2013-March/000229.html
hi mike,
This is the output format for the new version where the number of reads
supporting the reference allele is also counted on the left and right
breakpoints of the variants.
TS1_3051 164 167 0 0 2 2 TS1_3054 117 118 0 0 0 0
For example, 164 and 167 reads supporting the reference for sample TS1_3051
but only 2 reads and 2 unique reads from - strand support the alternative.
It is less likely that this is a true germline variant. with the read count
for the reference allele, you may filter the calls in the same way as snps.
'''
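# Illustrative decoding of the per-sample block quoted above (7 columns per
# sample in pindel 0.2.4u and later, as parsed by PindelRecord below):
#
#   TS1_3051 164 167 0 0 2 2
#   name, up_ref, down_ref, up_var, up_var_uniq, down_var, down_var_uniq
#   -> 164/167 reads support the reference upstream/downstream of the
#      breakpoints, while only 2 reads (2 unique) support the variant.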
pindel_name = "pindel"
pindel_source = set([pindel_name])
min_coverage = 10
het_cutoff = 0.2
hom_cutoff = 0.8
GT_REF = "0/0"
GT_HET = "0/1"
GT_HOM = "1/1"
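# A minimal sketch of how such genotyping cutoffs are conventionally applied
# (assumption: nothing in this module as excerpted calls them; the helper
# below is illustrative only and not part of the original API):
def _genotype_from_allele_fraction(var_reads, total_reads):
    """Illustrative genotyping rule based on the cutoffs above."""
    if total_reads < min_coverage:
        return GT_REF  # too few reads to call a genotype confidently
    fraction = var_reads / total_reads
    if fraction >= hom_cutoff:    # >= 80% of reads support the variant
        return GT_HOM
    if fraction >= het_cutoff:    # >= 20% of reads support the variant
        return GT_HET
    return GT_REF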
PINDEL_TO_SV_TYPE = {"I": "INS", "D": "DEL", "LI": "INS",
"TD": "DUP", "INV": "INV", "RPL":"RPL"}
class PindelHeader:
def __init__(self, fd):
self.header_dict = {}
self.header_fields = []
start = fd.tell()
        # Parse the header: read lines until the first record head line
        # (containing "ChrID") is encountered, then stop
while True:
try:
line = next(fd)
if line.find("ChrID") >= 1:
self.parse_firstrecord_line(line)
break
except StopIteration:
break
# We return to the start of the file
fd.seek(start)
def parse_firstrecord_line(self, line):
fields = line.split()
num_samples = int(fields[27])
pindel024u_or_later = len(fields) > 31 + 5 * num_samples
if pindel024u_or_later:
self.samples = [fields[i] for i in range(31, len(fields), 7)]
else:
self.samples = [fields[i] for i in range(31, len(fields), 5)]
logger.info("\t".join(self.samples))
def getOrderedSamples(self):
# sample order is given by the sample order in the pindel output
if not hasattr(self, "samples"):
return []
return self.samples
def __str__(self):
return str(self.__dict__)
def pysam_header(pindel_header):
    # Uses the newer pysam interface for creating records and headers.
    # It is not implemented in the Python 3.4 build of pysam (0.10.0);
    # pysam >= 0.11.2.2 is required.
header = VariantHeader()
vcf_header = get_template_header("pindel")
with open(vcf_header) as fin:
for line in fin:
if (line.startswith("##INFO")
or line.startswith("##FORMAT")
or line.startswith("##FILTER")):
header.add_line(line.rstrip())
for sample in pindel_header.getOrderedSamples():
header.add_sample(sample)
return header
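# Usage sketch (assumes "pindel_output_D" is a raw pindel output file; all
# names below are defined in this module):
#
#   fd = smart_open("pindel_output_D")
#   pindel_header = PindelHeader(fd)
#   header = pysam_header(pindel_header)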
class PindelRecord(SVRecord):
counter = {"I": 0, "D": 0, "LI": 0, "TD": 0, "INV": 0, "RPL": 0}
# Only DEL, INV and DUP for the moment
svs_supported = set(["DEL", "DUP", "INV"])
def __init__(self, record_string, reference_handle=None):
self.name = pindel_name
fields = record_string.split()
index = fields[0]
sv_type = fields[1]
if PINDEL_TO_SV_TYPE[sv_type] not in PindelRecord.svs_supported:
print(("Unsupported structural variation " + sv_type))
exit(1)
sv_len = int(fields[2])
num_nt_added = fields[4]
# nt_added = fields[5]
chrom = fields[7]
start_pos = int(fields[9])
end_pos = int(fields[10])
bp_range = (int(fields[12]), int(fields[13]))
read_supp = int(fields[15])
uniq_read_supp = int(fields[16])
up_read_supp = int(fields[18])
up_uniq_read_supp = int(fields[19])
down_read_supp = int(fields[21])
down_uniq_read_supp = int(fields[22])
# score = int(fields[24])
# nums_samples = int(fields[27])
num_sample_supp = int(fields[29])
# num_sample_uniq_supp = int(fields[30])
sv = {}
samples = []
for i in range(31, len(fields), 7):
sv[fields[i]] = {"name": fields[i],
"up_ref_read_supp": int(fields[i + 1]),
"down_ref_read_supp": int(fields[i + 2]),
"up_var_read_supp": int(fields[i + 3]),
"up_var_uniq_read_supp": int(fields[i + 4]),
"down_var_read_supp": int(fields[i + 5]),
"down_var_uniq_read_supp": int(fields[i + 6])
}
samples.append(sv[fields[i]])
self.pindel_sv_type = sv_type
self.up_read_supp = up_read_supp
self.down_read_supp = down_read_supp
self.up_uniq_read_supp = up_uniq_read_supp
self.down_uniq_read_supp = down_uniq_read_supp
self.uniq_read_supp = uniq_read_supp
# Computing CIPOS and CIEND
pos_conf = max(1, abs(int((bp_range[0] - start_pos)/2)))
end_conf = max(1, abs(int((bp_range[1] - end_pos)/2)))
self.cipos = (-pos_conf, pos_conf)
self.ciend = (-end_conf, end_conf)
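        # Worked example (values taken from the deletion example in the
        # format notes above): with start_pos = 1337143 and
        # bp_range[0] = 1337146, pos_conf = max(1, abs(int(3 / 2))) = 1,
        # so CIPOS = (-1, 1); CIEND is derived from bp_range[1] the same way.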
PindelRecord.counter[sv_type] += 1
id = "_".join([pindel_name, chrom, sv_type,
str(PindelRecord.counter[sv_type])])
super(PindelRecord, self).__init__(pindel_name, chrom,
start_pos, end_pos, id)
self._sv_type = PINDEL_TO_SV_TYPE[sv_type]
self.__sv_len = sv_len
self.__samples = samples
self.__sv = sv
self.Modinfo = {
"END": end_pos,
"SU": read_supp,
"INDEX": index,
"NTLEN": num_nt_added,
"CIPOS": ",".join([str(x) for x in self.cipos]),
"CIEND": ",".join([str(x) for x in self.ciend]),
"NUM_SUPP_SAMPLES": num_sample_supp
}
def addbatch2Id(self, batch=None):
if batch:
self.id += "_" + batch
    def variantSample(self, sample):
        return (self.__sv[sample]["up_var_read_supp"] > 0
                or self.__sv[sample]["down_var_read_supp"] > 0)
def variantSamples(self):
variant_samples = []
for sample in self.__sv:
if self.variantSample(sample):
variant_samples.append(sample)
return sorted(variant_samples)
def getOrderedSamples(self):
# sample order is given by the sample order in the pindel output
return self.__samples
def getSampleCalls(self):
        # TODO make sure that the order of the samples conforms to
        # the order of the samples in the VCF header
calls = []
for s in self.getOrderedSamples():
sample = s["name"]
ad = s["up_var_read_supp"] + s["down_var_read_supp"]
called_value = vcf.model.make_calldata_tuple("AD")(AD=ad)
calldata = vcf.model._Call(None, sample, called_value)
calls.append(calldata)
return calls
def MaxIndSupportingRP(self):
max_ind_supp = 0
for s in self.__samples:
ad = s["up_var_read_supp"] + s["down_var_read_supp"]
max_ind_supp = max(ad, max_ind_supp)
return max_ind_supp
def to_vcf_record(self):
alt = [vcf.model._SV(self.sv_type)]
info = {"SVLEN": self.sv_len, "SVTYPE": self.sv_type}
info["MAX_IND_SU"] = self.MaxIndSupportingRP()
info["VSAMPLES"] = ",".join(self.variantSamples())
info.update(self.Modinfo)
calls = self.getSampleCalls()
vcf_record = vcf.model._Record(self.chrom,
self.start,
self.id,
"N",
alt,
".",
"PASS",
info,
"AD",
[0],
calls)
return vcf_record
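    # Illustrative shape of the serialized VCF line produced from the record
    # above (values are hypothetical):
    #   chr20  1337143  pindel_chr20_D_1  N  <DEL>  .  PASS
    #       SVTYPE=DEL;SVLEN=1671;END=1338815;SU=5;...  AD  5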
class PindelReader(SVReader):
svs_supported = set(["DEL", "INS", "DUP", "INV"])
def __init__(self, file_name, reference_handle=None, svs_to_report=None):
super(PindelReader, self).__init__(file_name, pindel_name,
reference_handle)
self.file_fd = smart_open(file_name)
self.header = PindelHeader(self.file_fd)
self.pysamheader = pysam_header(self.header)
self.svs_supported = PindelReader.svs_supported
if svs_to_report is not None:
self.svs_supported &= set(svs_to_report)
def __iter__(self):
return self
def __next__(self):
while True:
line = next(self.file_fd)
if line.find("ChrID") >= 1:
record = PindelRecord(line.strip(), self.reference_handle)
if record.sv_type in self.svs_supported:
return record
    def AreSamplesSpecified(self):
        return True
def getOrderedSamples(self):
return self.header.getOrderedSamples()
# def SpecificFilterPass(self, record):
# # Filtering criteria more than 4 PE
# # see
# # http://bcb.io/2014/08/12/validated-whole-genome-structural-variation-detection-using-multiple-callers
# return (record.length() > 60)
    def SpecificFilterPass(self, record):
        if (record.MaxIndSupportingRP() <= 3):
            return False
        else:
            return True
    def postprocess(self, records):
        """
        Returns a vector of records where duplicates were removed
        """
        vcf_records_without_duplicate = self.RemoveDuplicates(records)
        return vcf_records_without_duplicate
    def RemoveDuplicates(self, records, cutoff=1):
        """
        An analysis of all variants to identify the duplicates
        that share exactly the same breakpoints (cutoff = 1)
        or slightly different breakpoints controlled by cutoff
        """
        # Sorting the vcf records by position, then by decreasing support
        records.sort(key=lambda x: (x.chrom, x.start, x.end, -x.su))
        vcf_records_without_duplicate = []
        for r in records:
            if self.IsDuplicate(vcf_records_without_duplicate, r, cutoff):
                print("removed entry %s identified as exact duplicate " % r)
                continue
            vcf_records_without_duplicate.append(r)
        return vcf_records_without_duplicate
def IsDuplicate(self, valid_records, record, cutoff):
if len(valid_records) == 0:
return False
last_included = valid_records[-1]
start_offset = abs(record.start - last_included.start)
end_offset = abs(record.end - last_included.end)
if start_offset + end_offset < cutoff:
return True
return False
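    # Example: with the default cutoff of 1, a record is flagged as a
    # duplicate only when both its start and end match the previously kept
    # record exactly (offsets sum to 0, and 0 < 1); a larger cutoff also
    # collapses records whose breakpoints differ by a few bases.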
class PindelWriter(SVWriter):
def __init__(self, file_name, reference_contigs, template_reader):
super(PindelWriter, self).__init__(file_name,
template_reader.tool_name,
template_reader)
vcf_template_reader = get_template(template_reader.tool_name)
vcf_template_reader.samples = template_reader.getOrderedSamples()
self.vcf_writer = vcf.Writer(open(self.filename, 'w'),
vcf_template_reader)
def write(self, record):
self.vcf_writer.write_record(record.to_vcf_record())
def _close(self):
self.vcf_writer.close()
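# Usage sketch of the reader/writer round trip (assumes "pindel_output_D"
# exists and that the "pindel" VCF template is installed; reference_contigs
# is accepted but unused by PindelWriter.__init__ as written):
#
#   reader = PindelReader("pindel_output_D")
#   writer = PindelWriter("pindel_output_D.vcf", None, reader)
#   for record in reader:
#       writer.write(record)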