import logging import os from collections import defaultdict from pysam import VariantFile from svreader import SVRecord, SVReader, SVWriter from svrunner_utils import eprint, vcf_to_pybed logger = logging.getLogger(__name__) mydir = os.path.dirname(os.path.realpath(__file__)) ''' Delly output is a vcf file, fields are described in the header #using awk cat vcf | awk '/##FORMAT/{ last=index($0,","); id=index($0,"ID"); \ desc=substr($0,match($0,"Description")+12) ; gsub(/"/, "", desc); \ printf "%-30s %s\n", substr($0,id+3,last-id-3), \ substr(desc,0,length(desc)-1)}' FILTER PASS All filters passed LowQual PE/SR support below 3 or mapping quality below 20. INFO CIEND PE confidence interval around END CIPOS PE confidence interval around POS CHR2 Chromosome for END coordinate in case of a translocation END End position of the structural variant PE Paired-end support of the structural variant MAPQ Median mapping quality of paired-ends SR Split-read supportSpecificFilterPass SRQ Split-read consensus alignment quality CONSENSUS Split-read consensus sequence CE Consensus sequence entropy CT Paired-end signature induced connection type IMPRECISE Imprecise structural variation PRECISE Precise structural variation SVTYPE Type of structural variant SVMETHOD Type of approach used to detect SV INSLEN Predicted length of the insertion HOMLEN Predicted microhomology length using a max. edit distance of 2 RDRATIO Read-depth ratio of het. SV carrier vs. non-carrier. FORMAT GT Genotype GL Log10-scaled genotype likelihoods for RR,RA,AA genotypes GQ Genotype Quality FT Per-sample genotype filter RC Raw high-quality read counts for the SV RCL Raw high-quality read counts for the left control region RCR Raw high-quality read counts for the right control region CN Read-depth based copy-number estimate for autosomal sites DR high-quality reference pairs DV high-quality variant pairs RR high-quality reference junction reads RV high-quality variant junction reads ''' delly_name = "delly" delly_source = set([delly_name]) class DellyRecord(SVRecord): def __init__(self, record, sample_order, reference_handle=None): # Custom id record.id = "_".join([delly_name, record.chrom, record.id]) super(DellyRecord, self).__init__(delly_name, record.chrom, record.pos, record.stop, record.id, ref=record.ref, qual=record.qual, filter=record.filter) self.__record = record self._sv_type = record.info["SVTYPE"] self.__pe = record.info["PE"] self.__sr = record.info["SR"] if "SR" in list(record.info.keys()) else 0 self.__ct = record.info["CT"] self.__record.info['SR'] = self.__sr sv = {} for sample in record.samples: sample_name = sample.rsplit('.')[0] sv[sample_name] = record.samples[sample] self.sv = sv self.__record.info['VSAMPLES'] = ",".join(self.variantSamples()) @property def record(self): return self.__record @property def start(self): return self.record.pos @property def end(self): return self.record.stop @property def pe(self): return self.__pe @property def sr(self): return self.__sr @property def ct(self): return self.__ct def update(self, start, end, pe_supp): self.record.pos = start self.record.stop = end self.record.info['PE'] = pe_supp def variantSample(self, sample): if self.sv[sample]["DV"] > 0 or self.sv[sample]["RV"] > 0: return True else: return False def variantSamples(self): variant_samples = [] for sample in self.sv: if self.variantSample(sample): variant_samples.append(sample) return sorted(variant_samples) def MaxIndSupportingRP(self): ''' Maximum number of supporting PE among individuals. ''' max_ind_supp = 0 su = 0 rep_sample = "" for sample in self.sv: dv = self.sv[sample]["DV"] su += dv if dv > max_ind_supp: rep_sample = sample max_ind_supp = dv if max_ind_supp == 0: ratio = 0 else: total = self.sv[rep_sample]["DV"] + self.sv[rep_sample]["DR"] ratio = float(self.sv[rep_sample]["DV"])/total return su, max_ind_supp, ratio def to_svcf_record(self): return self.record class DellyReader(SVReader): svs_supported = set(["DEL", "INS", "DUP", "INV"]) def __init__(self, file_name, reference_handle=None, svs_to_report=None): super(DellyReader, self).__init__(file_name, delly_name, reference_handle) self.vcf_reader = VariantFile(file_name) if 'VSAMPLES' not in self.vcf_reader.header.info.keys(): self.vcf_reader.header.info.add('VSAMPLES', number='.', type='String', description='Samples with variant alleles') self.svs_supported = DellyReader.svs_supported if svs_to_report is not None: self.svs_supported &= set(svs_to_report) def __iter__(self): return self def __next__(self): while True: delly_record = next(self.vcf_reader) record = DellyRecord(delly_record, self.reference_handle) if record.sv_type in self.svs_supported: return record def getHeader(self): return self.vcf_reader.header def getOrderedSamples(self): samples = self.vcf_reader.header.samples sample_names = [sample.rsplit('.')[0] for sample in samples] return sample_names def SpecificFilterPass(self, record): # Filtering criteria more than 5 PE or more than SR and PE > 0 # see http://bcb.io/2014/08/12/validated-whole-genome-structural-variation-detection-using-multiple-callers # TODO : should we modifiy this in order to account for individual # specific filtering (e.g pe > 5 at least for an individual) return record.pe > 5 or record.sr > 0 def bnd_merge(sef, svtype, records): """ Merging delly two lines inversion format according to the CT info field (distal connection) 3to3 tail-to-tail 5to5 head-to-head CT types are separated and for each 3to3, the corresponding 5to5 is searched for (bedtools intersect with >90% overlap) A single line corresponding to the intersection of both signature is kept """ if svtype != "INV": return records # a dictionnary of records with id as key records_dict = defaultdict() records_CT = defaultdict(list) for r in records: records_dict[r.id] = r records_CT[r.ct].append(r) eprint("Converting to bedtools intervals") tail2tail = vcf_to_pybed(records_CT["3to3"]) head2head = vcf_to_pybed(records_CT["5to5"]) # TODO check if the following code really recover balanced inversions balanced = tail2tail.intersect(head2head, r=True, f=0.95, wo=True) confirmed = [] seen = defaultdict() for b in balanced: start = int(b[5]) # tail end = int(b[2]) # head t2t = records_dict[b[3]] h2h = records_dict[b[7]] repre = t2t if repre in seen: continue seen[repre] = 1 pe_supp = t2t.record.info['PE'] + h2h.record.info['PE'] print("merging %s and %s" % (t2t, h2h)) repre.update(start, end, pe_supp) confirmed.append(repre) return confirmed class DellyWriter(SVWriter): def __init__(self, file_name, reference_contigs, template_reader): super(DellyWriter, self).__init__(file_name, template_reader.tool_name, template_reader) def _open(self): self.vcf_writer = VariantFile(self.filename, 'w', header=self.template_reader.getHeader()) self._isopen = True def _write(self, record): self.vcf_writer.write(record.record) def _close(self): if self._isopen: self.vcf_writer.close() else: # nothing was written self._dumpemptyvcf()