Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import logging
import os
import re
from pysam import VariantFile
from svreader import SVRecord, SVReader, SVWriter
logger = logging.getLogger(__name__)
mydir = os.path.dirname(os.path.realpath(__file__))
'''
An ultra lightweight vcf reader for lumpy
INFO:
SVTYPE Type of structural variant
SVLEN Difference in length between REF and ALT alleles
END End position of the variant described in this record
STRANDS Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)
IMPRECISE Imprecise structural variation
CIPOS Confidence interval around POS for imprecise variants
CIEND Confidence interval around END for imprecise variants
CIPOS95 Confidence interval (95%) around POS for imprecise variants
CIEND95 Confidence interval (95%) around END for imprecise variants
MATEID ID of mate breakends
EVENT ID of event associated to breakend
SECONDARY Secondary breakend in a multi-line variants
SU Number of pieces of evidence supporting the variant across all samples
PE Number of paired-end reads supporting the variant across all samples
SR Number of split reads supporting the variant across all samples
EV Type of LUMPY evidence contributing to the variant call
PRPOS LUMPY probability curve of the POS breakend
PREND LUMPY probability curve of the END breakend
FORMAT:
GT Genotype
SU Number of pieces of evidence supporting the variant
PE Number of paired-end reads supporting the variant
SR Number of split reads supporting the variant
BD Amount of BED evidence supporting the variant
RO Reference allele observation count, with partial observations recorded fractionally
AO Alternate allele observations, with partial observations recorded fractionally
QR Sum of quality of reference observations
QA Sum of quality of alternate observations
RS Reference allele split-read observation count, with partial observations recorded fractionally
AS Alternate allele split-read observation count, with partial observations recorded fractionally
RP Reference allele paired-end observation count, with partial observations recorded fractionally
AP Alternate allele paired-end observation count, with partial observations recorded fractionally
AB Allele balance, fraction of observations from alternate allele, QA/(QR+QA)
'''
lumpy_name = "lumpy"
lumpy_source = set([lumpy_name])
class LumpyRecord(SVRecord):
def __init__(self, record, reference_handle=None):
record.id = "_".join(
[lumpy_name,
record.chrom,
record.info['SVTYPE'],
record.id])
super(LumpyRecord, self).__init__(lumpy_name,
record.chrom,
record.pos,
record.stop,
record.id,
ref=record.ref,
qual=record.qual,
filter=record.filter)
self.__record = record
sv = {}
for sample in record.samples:
sample_name = sample.rsplit('.')[0]
sv[sample_name] = record.samples[sample]
self.sv = sv
# self.__record.info["MAX_IND_SU"] = self.MaxIndSupportingRP()
self._sv_type = self.__record.info['SVTYPE']
self.__record.info['VSAMPLES'] = ",".join(self.variantSamples())
@property
def svlen(self):
svlen = 0
if self.__record.INFO["SVTYPE"] == "BND":
chrom, end = self. getAltBND()
svlen = end - self.start + 1
else:
svlen = abs(int(self.__record.INFO["SVLEN"][0]))
return svlen
@property
def record(self):
return self.__record
@property
def sr(self):
return self.__record.info['SR']
def addbatch2Id(self, batch=None):
if batch:
self.__record.id += "_" + batch
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
def getAltBND(self):
''' extract second breakend chr,pos from ALT field '''
alt = self.__record.ALT[0]
m = re.search(r"[\[\]]([\w\.:0-9]+)[\[\]]", str(alt))
if m:
return m.group(1).split(":")
else:
return 0
def variantSampleOld(self, sample):
if (max(self.sv[sample]["AS"]) > 0
or max(self.sv[sample]["AP"])
or max(self.sv[sample]["ASC"]) > 0):
return True
else:
return False
def variantSample(self, sample):
if self.sv[sample]["SU"] > 0:
return True
else:
return False
def variantSamples(self):
variant_samples = []
for sample in self.sv:
if self.variantSample(sample):
variant_samples.append(sample)
return sorted(variant_samples)
def MaxIndSupportingRP(self):
# Among all the individuals returns the support of the individual with
# the maximum support
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
sv = self.sv
max_ind_supp = 0
for sample in sv:
max_ind_supp = max(max_ind_supp, sv[sample]['SU'])
return max_ind_supp
class LumpyReader(SVReader):
svs_supported = set(["DEL", "INS", "DUP", "INV"])
def __init__(self, file_name, reference_handle=None, svs_to_report=None):
super(LumpyReader, self).__init__(file_name,
lumpy_name,
reference_handle)
self.vcf_reader = VariantFile(file_name)
self.vcf_reader.header.info.add(
'VSAMPLES',
number='.',
type='String',
description='Samples with variant alleles')
if svs_to_report is not None:
self.svs_supported &= set(svs_to_report)
def __iter__(self):
return self
def __next__(self):
while True:
lumpy_record = next(self.vcf_reader)
# BND not supported
if lumpy_record.info['SVTYPE'] == "BND":
continue
record = LumpyRecord(lumpy_record, self.reference_handle)
if record.sv_type in self.svs_supported:
return record
def getHeader(self):
return self.vcf_reader.header
def AreSamplesSpecified(self):
return 1
# def getOrderedSamples(self):
# if not hasattr(self.vcf_reader, "samples"):
# return []
# return self.vcf_reader.samples
samples = self.vcf_reader.header.samples
sample_names = [sample.rsplit('.')[0] for sample in samples]
return sample_names
# First try after sensibility analysis with Mathieu
# return record.sr[0] > 0
# Filtering criteria more than 4 PE for at least one invididual
# or sr > 0
# see
# http://bcb.io/2014/08/12/validated-whole-genome-structural-variation-detection-using-multiple-callers
return record.MaxIndSupportingRP() > 4 or record.sr[0] > 0
class LumpyWriter(SVWriter):
def __init__(self, file_name, reference_contigs, template_reader):
super(
LumpyWriter,
self).__init__(file_name,
template_reader.tool_name,
template_reader)
def _open(self):
self.vcf_writer = VariantFile(
self.filename,
'w',
header=self.template_reader.getHeader())
self._isopen = True
def _write(self, record):
self.vcf_writer.write(record.record)
def _close(self):
if self._isopen:
self.vcf_writer.close()
else: # nothing was written
# Make sure appropriate samples are added to the vcf