import logging import os import re from pysam import VariantFile from svreader import SVRecord, SVReader, SVWriter logger = logging.getLogger(__name__) mydir = os.path.dirname(os.path.realpath(__file__)) ''' An ultra lightweight vcf reader for lumpy INFO: SVTYPE Type of structural variant SVLEN Difference in length between REF and ALT alleles END End position of the variant described in this record STRANDS Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--) IMPRECISE Imprecise structural variation CIPOS Confidence interval around POS for imprecise variants CIEND Confidence interval around END for imprecise variants CIPOS95 Confidence interval (95%) around POS for imprecise variants CIEND95 Confidence interval (95%) around END for imprecise variants MATEID ID of mate breakends EVENT ID of event associated to breakend SECONDARY Secondary breakend in a multi-line variants SU Number of pieces of evidence supporting the variant across all samples PE Number of paired-end reads supporting the variant across all samples SR Number of split reads supporting the variant across all samples EV Type of LUMPY evidence contributing to the variant call PRPOS LUMPY probability curve of the POS breakend PREND LUMPY probability curve of the END breakend FORMAT: GT Genotype SU Number of pieces of evidence supporting the variant PE Number of paired-end reads supporting the variant SR Number of split reads supporting the variant BD Amount of BED evidence supporting the variant RO Reference allele observation count, with partial observations recorded fractionally AO Alternate allele observations, with partial observations recorded fractionally QR Sum of quality of reference observations QA Sum of quality of alternate observations RS Reference allele split-read observation count, with partial observations recorded fractionally AS Alternate allele split-read observation count, with partial observations recorded fractionally RP Reference allele paired-end observation count, with partial observations recorded fractionally AP Alternate allele paired-end observation count, with partial observations recorded fractionally AB Allele balance, fraction of observations from alternate allele, QA/(QR+QA) ''' lumpy_name = "lumpy" lumpy_source = set([lumpy_name]) class LumpyRecord(SVRecord): def __init__(self, record, reference_handle=None): record.id = "_".join( [lumpy_name, record.chrom, record.info['SVTYPE'], record.id]) super(LumpyRecord, self).__init__(lumpy_name, record.chrom, record.pos, record.stop, record.id, ref=record.ref, qual=record.qual, filter=record.filter) self.__record = record sv = {} for sample in record.samples: sample_name = sample.rsplit('.')[0] sv[sample_name] = record.samples[sample] self.sv = sv # self.__record.info["MAX_IND_SU"] = self.MaxIndSupportingRP() self._sv_type = self.__record.info['SVTYPE'] self.__record.info['VSAMPLES'] = ",".join(self.variantSamples()) @property def svlen(self): svlen = 0 if self.__record.INFO["SVTYPE"] == "BND": chrom, end = self. getAltBND() svlen = end - self.start + 1 else: svlen = abs(int(self.__record.INFO["SVLEN"][0])) return svlen @property def record(self): return self.__record @property def sr(self): return self.__record.info['SR'] def getAltBND(self): ''' extract second breakend chr,pos from ALT field ''' alt = self.__record.ALT[0] m = re.search(r"[\[\]]([\w\.:0-9]+)[\[\]]", str(alt)) if m: return m.group(1).split(":") else: return 0 def variantSampleOld(self, sample): if (max(self.sv[sample]["AS"]) > 0 or max(self.sv[sample]["AP"]) or max(self.sv[sample]["ASC"]) > 0): return True else: return False def variantSample(self, sample): if self.sv[sample]["SU"] > 0: return True else: return False def variantSamples(self): variant_samples = [] for sample in self.sv: if self.variantSample(sample): variant_samples.append(sample) return sorted(variant_samples) def MaxIndSupportingRP(self): sv = self.sv max_ind_supp = 0 for sample in sv: max_ind_supp = max(max_ind_supp, sv[sample]['SU']) return max_ind_supp class LumpyReader(SVReader): svs_supported = set(["DEL", "INS", "DUP", "INV"]) def __init__(self, file_name, reference_handle=None, svs_to_report=None): super(LumpyReader, self).__init__(file_name, lumpy_name, reference_handle) self.vcf_reader = VariantFile(file_name) self.vcf_reader.header.info.add( 'VSAMPLES', number='.', type='String', description='Samples with variant alleles') if svs_to_report is not None: self.svs_supported &= set(svs_to_report) def __iter__(self): return self def __next__(self): while True: lumpy_record = next(self.vcf_reader) # BND not supported if lumpy_record.info['SVTYPE'] == "BND": continue record = LumpyRecord(lumpy_record, self.reference_handle) if record.sv_type in self.svs_supported: return record def getHeader(self): return self.vcf_reader.header def AreSamplesSpecified(self): return 1 def getOrderedSamples(self): if not hasattr(self.vcf_reader, "samples"): return [] return self.vcf_reader.samples def SpecificFilterPass(self, record): # Filtering criteria more than 4 PE # see # http://bcb.io/2014/08/12/validated-whole-genome-structural-variation-detection-using-multiple-callers return record.sr[0] > 0 # return record.MaxIndSupportingRP() > 4 and record.sr[0] > 0 class LumpyWriter(SVWriter): def __init__(self, file_name, reference_contigs, template_reader): super( LumpyWriter, self).__init__(file_name, template_reader.tool_name, template_reader) def _open(self): self.vcf_writer = VariantFile( self.filename, 'w', header=self.template_reader.getHeader()) self._isopen = True def _write(self, record): self.vcf_writer.write(record.record) def _close(self): if self._isopen: self.vcf_writer.close() else: # nothing was written self._dumpemptyvcf()