Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import logging
import os
import re
from pysam import VariantFile
from svreader import SVRecord, SVReader, SVWriter
logger = logging.getLogger(__name__)
mydir = os.path.dirname(os.path.realpath(__file__))
'''
An ultra lightweight vcf reader for lumpy
INFO:
SVTYPE Type of structural variant
SVLEN Difference in length between REF and ALT alleles
END End position of the variant described in this record
STRANDS Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)
IMPRECISE Imprecise structural variation
CIPOS Confidence interval around POS for imprecise variants
CIEND Confidence interval around END for imprecise variants
CIPOS95 Confidence interval (95%) around POS for imprecise variants
CIEND95 Confidence interval (95%) around END for imprecise variants
MATEID ID of mate breakends
EVENT ID of event associated to breakend
SECONDARY Secondary breakend in a multi-line variants
SU Number of pieces of evidence supporting the variant across all samples
PE Number of paired-end reads supporting the variant across all samples
SR Number of split reads supporting the variant across all samples
EV Type of LUMPY evidence contributing to the variant call
PRPOS LUMPY probability curve of the POS breakend
PREND LUMPY probability curve of the END breakend
FORMAT:
GT Genotype
SU Number of pieces of evidence supporting the variant
PE Number of paired-end reads supporting the variant
SR Number of split reads supporting the variant
BD Amount of BED evidence supporting the variant
RO Reference allele observation count, with partial observations recorded fractionally
AO Alternate allele observations, with partial observations recorded fractionally
QR Sum of quality of reference observations
QA Sum of quality of alternate observations
RS Reference allele split-read observation count, with partial observations recorded fractionally
AS Alternate allele split-read observation count, with partial observations recorded fractionally
RP Reference allele paired-end observation count, with partial observations recorded fractionally
AP Alternate allele paired-end observation count, with partial observations recorded fractionally
AB Allele balance, fraction of observations from alternate allele, QA/(QR+QA)
'''
lumpy_name = "lumpy"
lumpy_source = set([lumpy_name])
class LumpyRecord(SVRecord):
def __init__(self, record, reference_handle=None):
record.id = "_".join(
[lumpy_name,
record.chrom,
record.info['SVTYPE'],
record.id])
super(LumpyRecord, self).__init__(lumpy_name,
record.chrom,
record.pos,
record.stop,
record.id,
ref=record.ref,
qual=record.qual,
filter=record.filter)
self.__record = record
sv = {}
for sample in record.samples:
sample_name = sample.rsplit('.')[0]
sv[sample_name] = record.samples[sample]
self.sv = sv
# self.__record.info["MAX_IND_SU"] = self.MaxIndSupportingRP()
self._sv_type = self.__record.info['SVTYPE']
self.__record.info['VSAMPLES'] = ",".join(self.variantSamples())
@property
def svlen(self):
svlen = 0
if self.__record.INFO["SVTYPE"] == "BND":
chrom, end = self. getAltBND()
svlen = end - self.start + 1
else:
svlen = abs(int(self.__record.INFO["SVLEN"][0]))
return svlen
@property
def record(self):
return self.__record
@property
def sr(self):
return self.__record.info['SR']
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
def getAltBND(self):
''' extract second breakend chr,pos from ALT field '''
alt = self.__record.ALT[0]
m = re.search(r"[\[\]]([\w\.:0-9]+)[\[\]]", str(alt))
if m:
return m.group(1).split(":")
else:
return 0
def variantSampleOld(self, sample):
if (max(self.sv[sample]["AS"]) > 0
or max(self.sv[sample]["AP"])
or max(self.sv[sample]["ASC"]) > 0):
return True
else:
return False
def variantSample(self, sample):
if self.sv[sample]["SU"] > 0:
return True
else:
return False
def variantSamples(self):
variant_samples = []
for sample in self.sv:
if self.variantSample(sample):
variant_samples.append(sample)
return sorted(variant_samples)
def MaxIndSupportingRP(self):
sv = self.sv
max_ind_supp = 0
for sample in sv:
max_ind_supp = max(max_ind_supp, sv[sample]['SU'])
return max_ind_supp
class LumpyReader(SVReader):
svs_supported = set(["DEL", "INS", "DUP", "INV"])
def __init__(self, file_name, reference_handle=None, svs_to_report=None):
super(LumpyReader, self).__init__(file_name,
lumpy_name,
reference_handle)
self.vcf_reader = VariantFile(file_name)
self.vcf_reader.header.info.add(
'VSAMPLES',
number='.',
type='String',
description='Samples with variant alleles')
if svs_to_report is not None:
self.svs_supported &= set(svs_to_report)
def __iter__(self):
return self
def __next__(self):
while True:
lumpy_record = next(self.vcf_reader)
# BND not supported
if lumpy_record.info['SVTYPE'] == "BND":
continue
record = LumpyRecord(lumpy_record, self.reference_handle)
if record.sv_type in self.svs_supported:
return record
def getHeader(self):
return self.vcf_reader.header
def AreSamplesSpecified(self):
return 1
def getOrderedSamples(self):
if not hasattr(self.vcf_reader, "samples"):
return []
return self.vcf_reader.samples
def SpecificFilterPass(self, record):
# Filtering criteria more than 4 PE
# see
# http://bcb.io/2014/08/12/validated-whole-genome-structural-variation-detection-using-multiple-callers
return record.MaxIndSupportingRP() > 4 and record.sr[0] >0
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
class LumpyWriter(SVWriter):
def __init__(self, file_name, reference_contigs, template_reader):
super(
LumpyWriter,
self).__init__(file_name,
template_reader.tool_name,
template_reader)
def _open(self):
self.vcf_writer = VariantFile(
self.filename,
'w',
header=self.template_reader.getHeader())
self._isopen = True
def _write(self, record):
self.vcf_writer.write(record.record)
def _close(self):
if self._isopen:
self.vcf_writer.close()
else: # nothing was written
self._dumpemptyvcf()