import logging import sys import os import gzip import vcf from svrunner_utils import smart_open from svreader.vcf_utils import get_template, get_template_header from svreader import SVRecord, SVReader, SVWriter from pysam import VariantFile, VariantHeader logger = logging.getLogger(__name__) mydir = os.path.dirname(os.path.realpath(__file__)) ''' from http://gmt.genome.wustl.edu/packages/pindel/user-manual.html There is a head line for each variant reported, followed by the alignment of supporting reads to the reference on the second line. The example variants are a 1671bp deletion and a 10bp insertion on chr20. The breakpoints are specified after "BP". Due to microhomology around the breakpoints, the breakpoint coordinates may shift both upstream and downstream,'BP_range' is used to indicate the range of this shift. The header line contains the following data: 1) The index of the indel/SV (57 means that 57 insertions precede this insertion in the file) 2) The type of indel/SV: I for insertion, D for deletion, INV for inversion, TD for tandem duplication 3) The length of the SV 4) "NT" (to indicate that the next number is the length of non-template sequences inserted; insertions are fully covered by the NT-fields, deletions can have NT bases if the deletion is not 'pure', meaning that while bases have been deleted, some bases have been inserted between the breakpoints) 5) the length(s) of the NT fragment(s) 6) the sequence(s) of the NT fragment(s) 7-8) the identifier of the chromosome the read was found on 9-10-11) BP: the start and end positions of the SV 12-13-14) BP_range: if the exact position of the SV is unclear since bases at the edge of one read-half could equally well be appended to the other read-half. In the deletion example, ACA could be on any side of the gap, so the original deletion could have been between 1337143 and 1338815, between 1337144 and 1338816, or between 1337145 and 133817, or between 1337146 and 133818. BP-range is used to indicate this range. 15) "Supports": announces that the total count of reads supporting the SV follow. 16) The number of reads supporting the SV 17) The number of unique reads supporting the SV (so not counting duplicate reads) 18) +: supports from reads whose anchors are upstream of the SV 19-20) total number of supporting reads and unique number of supporting reads whose anchors are upstream of the SV. 21) -: supports from reads whose anchors are downstream of the SV 22-23) total number of supporting reads and unique number of supporting reads whose anchors are downstream of the SV 24-25) S1: a simple score, ("# +" + 1)* ("# -" + 1) ; 26-27) SUM_MS: sum of mapping qualities of anchor reads, The reads with variants or unmapped are called split-read, whose mate is called anchor reads. We use anchor reads to narrow down the search space to speed up and increase sensitivity; 28) the number of different samples scanned 29-30-31) NumSupSamples?: the number of samples supporting the SV, as well as the number of samples having unique reads supporting the SV (in practice, these numbers are the same) 32+) Per sample: the sample name, followed by the total number of supporting reads whose anchors are upstream, the total number of unique supporting reads whose anchors are upstream, the total number of supporting reads whose anchors are downstream, and finally the total number of unique supporting reads whose anchors are downstream. WARNING see below ---- The reported larger insertion (LI) record is rather different than other types of variants. Here is the format: 1) index, 2) type(LI), 3) "ChrID", 4) chrName, 5) left breakpoint, 6) "+" 7) number of supporting reads for the left coordinate, 8) right breakpoint, 9) "-" 10) number of supporting reads for the right coordinate. Following lines are repeated for each sample 11) Sample name 12) "+" 13) upstream supporting reads 14) "-" 15) downstream supporting reads ''' ''' New version of pindel changes the output starting from 32+ Check https://trac.nbic.nl/pipermail/pindel-users/2013-March/000229.html hi mike, This is the output format for the new version where the number of reads supporting the reference allele is also counted on the left and right breakpoints of the variants. TS1_3051 164 167 0 0 2 2 TS1_3054 117 118 0 0 0 0 For example, 164 and 167 reads supporting the reference for sample TS1_3051 but only 2 reads and 2 unique reads from - strand support the alternative. It is less likely that this is a true germline variant. with the read count for the reference allele, you may filter the calls in the same way as snps. ''' pindel_name = "pindel" pindel_source = set([pindel_name]) min_coverage = 10 het_cutoff = 0.2 hom_cutoff = 0.8 GT_REF = "0/0" GT_HET = "0/1" GT_HOM = "1/1" PINDEL_TO_SV_TYPE = {"I": "INS", "D": "DEL", "LI": "INS", "TD": "DUP", "INV": "INV", "RPL":"RPL"} class PindelHeader: def __init__(self, fd): self.header_dict = {} self.header_fields = [] start = fd.tell() # Parse header : read the header until the last line is # encoutered (^#Chr1) break before reading the next line while True: try: line = next(fd) if line.find("ChrID") >= 1: self.parse_firstrecord_line(line) break except StopIteration: break # We return to the start of the file fd.seek(start) def parse_firstrecord_line(self, line): fields = line.split() num_samples = int(fields[27]) pindel024u_or_later = len(fields) > 31 + 5 * num_samples if pindel024u_or_later: self.samples = [fields[i] for i in range(31, len(fields), 7)] else: self.samples = [fields[i] for i in range(31, len(fields), 5)] logger.info("\t".join(self.samples)) def getOrderedSamples(self): # sample order is given by the sample order in the pindel output if not hasattr(self, "samples"): return [] return self.samples def __str__(self): return str(self.__dict__) def pysam_header(pindel_header): # Using the new pysam interface for creating records and header # not implemented in the python3.4 version of pysam (0.10.0) # have to wait at least for the 0.11.2.2 pysam version header = VariantHeader() vcf_header = get_template_header("pindel") with open(vcf_header) as fin: for line in fin: if (line.startswith("##INFO") or line.startswith("##FORMAT") or line.startswith("##FILTER")): header.add_line(line.rstrip()) for sample in pindel_header.getOrderedSamples(): header.add_sample(sample) return header class PindelRecord(SVRecord): counter = {"I": 0, "D": 0, "LI": 0, "TD": 0, "INV": 0, "RPL": 0} # Only DEL, INV and DUP for the moment svs_supported = set(["DEL", "DUP", "INV"]) def __init__(self, record_string, reference_handle=None): self.name = pindel_name fields = record_string.split() index = fields[0] sv_type = fields[1] if PINDEL_TO_SV_TYPE[sv_type] not in PindelRecord.svs_supported: print(("Unsupported structural variation " + sv_type)) exit(1) sv_len = int(fields[2]) num_nt_added = fields[4] # nt_added = fields[5] chrom = fields[7] start_pos = int(fields[9]) end_pos = int(fields[10]) bp_range = (int(fields[12]), int(fields[13])) read_supp = int(fields[15]) uniq_read_supp = int(fields[16]) up_read_supp = int(fields[18]) up_uniq_read_supp = int(fields[19]) down_read_supp = int(fields[21]) down_uniq_read_supp = int(fields[22]) # score = int(fields[24]) # nums_samples = int(fields[27]) num_sample_supp = int(fields[29]) # num_sample_uniq_supp = int(fields[30]) sv = {} samples = [] for i in range(31, len(fields), 7): sv[fields[i]] = {"name": fields[i], "up_ref_read_supp": int(fields[i + 1]), "down_ref_read_supp": int(fields[i + 2]), "up_var_read_supp": int(fields[i + 3]), "up_var_uniq_read_supp": int(fields[i + 4]), "down_var_read_supp": int(fields[i + 5]), "down_var_uniq_read_supp": int(fields[i + 6]) } samples.append(sv[fields[i]]) self.su = read_supp self.pindel_sv_type = sv_type self.up_read_supp = up_read_supp self.down_read_supp = down_read_supp self.up_uniq_read_supp = up_uniq_read_supp self.down_uniq_read_supp = down_uniq_read_supp self.uniq_read_supp = uniq_read_supp # Computing CIPOS and CIEND pos_conf = max(1, abs(int((bp_range[0] - start_pos)/2))) end_conf = max(1, abs(int((bp_range[1] - end_pos)/2))) self.cipos = (-pos_conf, pos_conf) self.ciend = (-end_conf, end_conf) PindelRecord.counter[sv_type] += 1 id = "_".join([pindel_name, chrom, sv_type, str(PindelRecord.counter[sv_type])]) super(PindelRecord, self).__init__(pindel_name, chrom, start_pos, end_pos, id) self._sv_type = PINDEL_TO_SV_TYPE[sv_type] self.__sv_len = sv_len self.__samples = samples self.__sv = sv self.Modinfo = { "END": end_pos, "SU": read_supp, "INDEX": index, "NTLEN": num_nt_added, "CIPOS": ",".join([str(x) for x in self.cipos]), "CIEND": ",".join([str(x) for x in self.ciend]), "NUM_SUPP_SAMPLES": num_sample_supp } def addbatch2Id(self, batch=None): if batch: self.id += "_" + batch def variantSample(self, sample): if (self.__sv[sample]["up_var_read_supp"] > 0 or self.__sv[sample]["down_var_read_supp"] > 0): return True else: return False def variantSamples(self): variant_samples = [] for sample in self.__sv: if self.variantSample(sample): variant_samples.append(sample) return sorted(variant_samples) def getOrderedSamples(self): # sample order is given by the sample order in the pindel output return self.__samples def getSampleCalls(self): # TODO make sure that the order of the sample conform to # the order of the vcf header !! calls = [] for s in self.getOrderedSamples(): sample = s["name"] ad = s["up_var_read_supp"] + s["down_var_read_supp"] called_value = vcf.model.make_calldata_tuple("AD")(AD=ad) calldata = vcf.model._Call(None, sample, called_value) calls.append(calldata) return calls def MaxIndSupportingRP(self): max_ind_supp = 0 for s in self.__samples: ad = s["up_var_read_supp"] + s["down_var_read_supp"] max_ind_supp = max(ad, max_ind_supp) return max_ind_supp def to_vcf_record(self): alt = [vcf.model._SV(self.sv_type)] info = {"SVLEN": self.sv_len, "SVTYPE": self.sv_type} info["MAX_IND_SU"] = self.MaxIndSupportingRP() info["VSAMPLES"] = ",".join(self.variantSamples()) info.update(self.Modinfo) calls = self.getSampleCalls() vcf_record = vcf.model._Record(self.chrom, self.start, self.id, "N", alt, ".", "PASS", info, "AD", [0], calls) return vcf_record class PindelReader(SVReader): svs_supported = set(["DEL", "INS", "DUP", "INV"]) def __init__(self, file_name, reference_handle=None, svs_to_report=None): super(PindelReader, self).__init__(file_name, pindel_name, reference_handle) self.file_fd = smart_open(file_name) self.header = PindelHeader(self.file_fd) self.pysamheader = pysam_header(self.header) self.svs_supported = PindelReader.svs_supported if svs_to_report is not None: self.svs_supported &= set(svs_to_report) def __iter__(self): return self def __next__(self): while True: line = next(self.file_fd) if line.find("ChrID") >= 1: record = PindelRecord(line.strip(), self.reference_handle) if record.sv_type in self.svs_supported: return record def AreSamplesSpecified(self): return 1 def getOrderedSamples(self): return self.header.getOrderedSamples() # def SpecificFilterPass(self, record): # # Filtering criteria more than 4 PE # # see # # http://bcb.io/2014/08/12/validated-whole-genome-structural-variation-detection-using-multiple-callers # return (record.length() > 60) def SpecificFilterPass(self, record): # fILTER if (abs(record.start-record.end+1) >= 2000 or record.MaxIndSupportingRP() <= 3): return False else: return True def remove_duplicate(self, records): """ returns a vector of records where duplicates were removed """ vcf_records_without_duplicate = self.RemoveDuplicates(records) return vcf_records_without_duplicate def RemoveDuplicates(self, records, cutoff=1): """ n analysis of all variants to identify the duplicates that share excatly the same breakpoints (cutoff = 1) or slightly different breakpoints controlled by cutoff """ # Sorting the vcf records records.sort(key=lambda x: (x.chrom, x.start, x.end, -x.su)) vcf_records_without_duplicate = [] for r in records: if self.IsDuplicate(vcf_records_without_duplicate, r, cutoff): print("removed entry %s identified as exact duplicate " % r) continue vcf_records_without_duplicate.append(r) return vcf_records_without_duplicate def IsDuplicate(self, valid_records, record, cutoff): if len(valid_records) == 0: return False last_included = valid_records[-1] start_offset = abs(record.start - last_included.start) end_offset = abs(record.end - last_included.end) if start_offset + end_offset < cutoff: return True return False class PindelWriter(SVWriter): def __init__(self, file_name, reference_contigs, template_reader): super(PindelWriter, self).__init__(file_name, template_reader.tool_name, template_reader) vcf_template_reader = get_template(template_reader.tool_name) vcf_template_reader.samples = template_reader.getOrderedSamples() self.vcf_writer = vcf.Writer(open(self.filename, 'w'), vcf_template_reader) def write(self, record): self.vcf_writer.write_record(record.to_vcf_record()) def _close(self): self.vcf_writer.close()