From 41296a03136b0ae5344d18783a48427c22fafe6d Mon Sep 17 00:00:00 2001 From: Floreal Cabanettes <floreal.cabanettes@inra.fr> Date: Fri, 1 Jun 2018 11:49:04 +0200 Subject: [PATCH] Make snakecnv/lib as submodule + move files --- .gitmodules | 3 + snakecnv/{lib => }/align_snakemake_utils.py | 0 snakecnv/lib | 1 + snakecnv/lib/svfilter.py | 353 -------------- snakecnv/lib/svreader/__init__.py | 274 ----------- snakecnv/lib/svreader/_version.py | 1 - snakecnv/lib/svreader/defaults.py | 68 --- snakecnv/lib/svreader/delly.py | 272 ----------- snakecnv/lib/svreader/genomestrip.py | 214 -------- snakecnv/lib/svreader/lumpy.py | 209 -------- snakecnv/lib/svreader/pindel.py | 389 --------------- snakecnv/lib/svreader/resources/b37.gaps.bed | 457 ------------------ snakecnv/lib/svreader/resources/hg19.gaps.bed | 457 ------------------ snakecnv/lib/svreader/resources/template.vcf | 2 - .../resources/template_breakdancer.vcf | 37 -- .../svreader/resources/template_cnvnator.vcf | 30 -- .../lib/svreader/resources/template_merge.vcf | 154 ------ .../svreader/resources/template_pindel.vcf | 22 - snakecnv/lib/svreader/vcf_utils.py | 32 -- snakecnv/lib/svreader/vcfwrapper.py | 121 ----- snakecnv/lib/svrunner_utils.py | 230 --------- snakecnv/{lib => }/svsnakemake_utils.py | 0 22 files changed, 4 insertions(+), 3322 deletions(-) rename snakecnv/{lib => }/align_snakemake_utils.py (100%) create mode 160000 snakecnv/lib delete mode 100644 snakecnv/lib/svfilter.py delete mode 100644 snakecnv/lib/svreader/__init__.py delete mode 100644 snakecnv/lib/svreader/_version.py delete mode 100644 snakecnv/lib/svreader/defaults.py delete mode 100644 snakecnv/lib/svreader/delly.py delete mode 100644 snakecnv/lib/svreader/genomestrip.py delete mode 100644 snakecnv/lib/svreader/lumpy.py delete mode 100644 snakecnv/lib/svreader/pindel.py delete mode 100644 snakecnv/lib/svreader/resources/b37.gaps.bed delete mode 100644 snakecnv/lib/svreader/resources/hg19.gaps.bed delete mode 100644 snakecnv/lib/svreader/resources/template.vcf delete mode 100644 snakecnv/lib/svreader/resources/template_breakdancer.vcf delete mode 100644 snakecnv/lib/svreader/resources/template_cnvnator.vcf delete mode 100644 snakecnv/lib/svreader/resources/template_merge.vcf delete mode 100644 snakecnv/lib/svreader/resources/template_pindel.vcf delete mode 100644 snakecnv/lib/svreader/vcf_utils.py delete mode 100644 snakecnv/lib/svreader/vcfwrapper.py delete mode 100644 snakecnv/lib/svrunner_utils.py rename snakecnv/{lib => }/svsnakemake_utils.py (100%) diff --git a/.gitmodules b/.gitmodules index 0fb765c..02663c6 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "popsim"] path = popsim url = git@forgemia.inra.fr:genotoul-bioinfo/popsim.git +[submodule "snakecnv/lib"] + path = snakecnv/lib + url = git@forgemia.inra.fr:genotoul-bioinfo/svlib.git diff --git a/snakecnv/lib/align_snakemake_utils.py b/snakecnv/align_snakemake_utils.py similarity index 100% rename from snakecnv/lib/align_snakemake_utils.py rename to snakecnv/align_snakemake_utils.py diff --git a/snakecnv/lib b/snakecnv/lib new file mode 160000 index 0000000..765e872 --- /dev/null +++ b/snakecnv/lib @@ -0,0 +1 @@ +Subproject commit 765e87245fcc733e09f9c3471bfa6fd42106f941 diff --git a/snakecnv/lib/svfilter.py b/snakecnv/lib/svfilter.py deleted file mode 100644 index 68c41af..0000000 --- a/snakecnv/lib/svfilter.py +++ /dev/null @@ -1,353 +0,0 @@ - - -from math import isnan -import re - -from collections import defaultdict - -import numpy as np - -from svrunner_utils import warn, vcf_to_pybed - -HOM_REF = (0, 0) -HET_VAR = (0, 1) -HOM_VAR = (1, 1) - -SVTYPER_GENO_LIKELI_TAG = "GL" -GENOMESTRIP_GENO_LIKELI_TAG = "GP" - -Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG - - -def getNonVariantSamples(sv, variants, samples): - return set([s for s in samples if s not in variants]) - - -def getVariantSamples(sv): - if re.search(r"genomestrip", sv.id) is not None: - return set(sv.record.info['GSSAMPLES']) - else: - return set(sv.record.info['VSAMPLES']) - - -def passed_variant(record): - """Did this variant pass?""" - return (record.filter is None or - len(record.filter) == 0 or - "PASS" in record.filter) - - -def Called(record): - called = 0 - for sample in record.samples: - if getGQ(record.samples[sample]) > 13: - called += 1 - # if (record.samples[sample]["GQ"] is not None and - # record.samples[sample]["GQ"]>13): - # called += 1 - return called - - -def getGQ(sample): - return sample["GQ"] if sample["GQ"] is not None else 0 - - -def callRate(sv): - record = sv.record - return float(Called(record))/len(record.samples) - - -def MaxGQ(record, geno): - maxGQ = 0 - for sample in record.samples: - if (record.samples[sample]["GT"] == geno - and record.samples[sample]["GQ"] > maxGQ): - maxGQ = record.samples[sample]["GQ"] - return maxGQ - - -def Polymorph(sv): - """ - At least two different genotypes - """ - record = sv.record - MaxHomRefGQ = MaxGQ(record, HOM_REF) - MaxHomVarGQ = MaxGQ(record, HOM_VAR) - MaxHetVarGQ = MaxGQ(record, HET_VAR) - return MaxHomRefGQ*MaxHomVarGQ + MaxHomRefGQ*MaxHetVarGQ + MaxHomVarGQ*MaxHetVarGQ - - -def NumGeno(record, geno): - numgeno = 0 - for sample in record.samples: - if record.samples[sample]["GT"] == geno: - numgeno += 1 - return numgeno - - -def InbreedingCoef(sv): - """ - The inbreeding coefficient calculated as 1 - #observed het/#expected het - """ - obs_hetero = NumGeno(sv, HET_VAR) - exp_hetero = NumGeno(sv, HOM_VAR) + NumGeno(sv, HET_VAR)/2 - return 1 - float(obs_hetero)/exp_hetero - - -# Redundancy annotator -def getlikelihoods(sample): - # return an array of genotype loglikelihoods - return sample[Geno_likeli_tag] - - -def probas(likelihoods): - # transform log likelihoods into likelihoods - return np.exp(np.array(likelihoods)) - - -def getprobas(sample): - # Transforming likelihods into probabilities - return probas(getlikelihoods(sample))/np.sum(probas(getlikelihoods(sample))) - - -def gstrength(u): - # Sum of phred-like genotype qualities provides a measure of the - # combined genotype quality of the site - return np.sum([s['GQ'] if s['GQ'] is not None else 0 for s in u.samples.values()]) - - -def ondiagonal(s, u, v): - # Probability that, for that individual, the two SVs are identically - # genotyped P1(0/0)*P2(0/0) + ... P1(1/1)*P2(1/1) - # in the same way : - p = getprobas(u.samples[s]) - q = getprobas(v.samples[s]) - proba = 0 - for a, b in zip(p, q): - proba += a*b - # print("Proba on %3.5f" %(proba)) - return proba - - -def offdiagonal(s, u, v): - # Probability that, for that individual, the two SVs are not identically - # in the same way, complement of the previous one - p = getprobas(u.samples[s]) - q = getprobas(v.samples[s]) - proba = 0 - for i, a in enumerate(p): - for j, b in enumerate(q): - if i != j: - proba += a*b - # print("Proba off %3.2f" %(proba)) - return proba - - -def duplicatescore(u, v): - # For the two SVs, max discordant genotype log-ratio proba - # same genotypes against discordant against (worse individual) - - # Valid samples - valid_samples = [] - for s in u.samples: - if (u.samples[s]['GQ'] is not None - and v.samples[s]['GQ'] is not None): - valid_samples.append(s) - - max_disc = 0 - computed = float('NaN') - for s in valid_samples: - # ondiago is not used just for comprehension - # ondiago = ondiagonal(s, u, v) - offdiago = offdiagonal(s, u, v) - if offdiago > max_disc: - max_disc = offdiago - if max_disc > 0 and max_disc < 1: - ratio = (1-max_disc)/max_disc - computed = np.log(ratio) - return computed - - -def getduplicates(u, v): - # select the prefered duplicate - # returns prefered, discarded, strength of both - if gstrength(u) > gstrength(v): - return (u, v, gstrength(u), gstrength(v)) - else: - return (v, u, gstrength(v), gstrength(u)) - - -def GenotypeQuals(u): - # the genotype qualities as an array - return [u.samples[s]['GQ'] if u.samples[s]['GQ'] is not None else 0 - for s in u.samples] - - -def maxGQ(u): - # maximum genotype quality - return max(GenotypeQuals(u)) - - -def passed(u): - # did the variant passed the previous filter - if len(u.filter) == 0 or "PASS" in u.filter: - return True - else: - return False - - -def ordered(a, b): - # simply ordering the two string - return (a, b) if a < b else (b, a) - - -def svlen(u): - # the length of the SV - # vcf format is 1-based - return u.stop-u.pos+1 - - -def getoverlap(u, osize): - # percentage overlap given the size of the overlap - return 100*osize/svlen(u) - - -def setLikeliTag(genotyper): - global Geno_likeli_tag - - if genotyper == "svtyper": - Geno_likeli_tag = SVTYPER_GENO_LIKELI_TAG - warn("Assuming genotypes provided by svtyper software" + - " hence tag is %s" % (Geno_likeli_tag)) - - elif genotyper == "genomestrip": - Geno_likeli_tag = GENOMESTRIP_GENO_LIKELI_TAG - warn("Assuming genotypes provided by genomestrip software" + - " hence tag is %s" % (Geno_likeli_tag)) - else: - print("Unknown genotyping software") - exit(1) - - -def GenomeSTRIPLikeRedundancyAnnotator(SVSet, duplicatescore_threshold=-2, - genotyper="svtyper"): - """ Annotating duplicate candidates based on the genotype likelihoods - - genotype likelihoods can be provided by svtyper or genomestrip - """ - - setLikeliTag(genotyper) - - variants = defaultdict() - for sv in SVSet: - variants[sv.id] = sv - - pybed_variants = vcf_to_pybed(SVSet) - self_overlap = pybed_variants.intersect(pybed_variants, f=0.5, wo=True) - - seen = defaultdict(tuple) - duplicates = defaultdict(list) - overlapping = defaultdict(tuple) - for o in self_overlap: - if o[3] == o[7]: - continue - a, b = ordered(o[3], o[7]) - if seen[(a, b)]: - continue - seen[(a, b)] = True - u = variants[a] - v = variants[b] - score = duplicatescore(u, v) - if score > duplicatescore_threshold: - ref, dupli, s1, s2 = getduplicates(u, v) - overlap_size = int(o[-1]) - overlap = getoverlap(dupli, overlap_size) - if maxGQ(ref) > 0 and passed(dupli): - # are dismissed - # - reference variant with 0 genotype quality for all markers - # - duplicate that are already tagged as filtered out - # dupli.id is considered as a duplicate of ref.id - duplicates[dupli.id].append((ref.id, score, overlap)) - else: - overlap_size = int(o[-1]) - overlap_u = getoverlap(u, overlap_size) - overlap_v = getoverlap(v, overlap_size) - overlapping[(u, v)] = {'dupli_score': score, - 'overlap_left': overlap_u, - 'overlap_right': overlap_v, - } - - for u in SVSet: - if u.id in duplicates: - duplis = sorted([a for (a, s, o) in duplicates[u.id]]) - score = max([s for (a, s, o) in duplicates[u.id]]) - overlap = max([o for (a, s, o) in duplicates[u.id]]) - add_duplicate_info_sv(u, overlap, score, duplis) - - for o in overlapping: - info = overlapping[o] - u, v = o - add_overlap_info_sv(u, info['overlap_left'], info['dupli_score']) - add_overlap_info_sv(v, info['overlap_right'], info['dupli_score']) - - -def add_duplicate_info_sv(sv, duplicateoverlap, duplicatescore, duplicates): - """ - simply adding two information to the sv infos - """ - if isnan(duplicatescore): - sv.record.info['DUPLICATESCORE'] = float('nan') - else: - sv.record.info['DUPLICATESCORE'] = duplicatescore - sv.record.info['DUPLICATEOVERLAP'] = duplicateoverlap - sv.record.info['DUPLICATES'] = duplicates - - -def add_overlap_info_sv(sv, overlap, duplicatescore): - """ - simply adding two information to the sv infos - """ - if isnan(duplicatescore): - sv.record.info['DUPLICATESCORE'] = float('nan') - else: - sv.record.info['DUPLICATESCORE'] = duplicatescore - sv.record.info['NONDUPLICATEOVERLAP'] = overlap - - -def GenomeSTRIPLikefiltering(SVSet): - """ Filtering the candidate CNVs according to the following criteria - - non duplicate sites - - variant sites - - call rate > 0.8 - - at least one variant (homozygous or heterozygote) has a genotype - quality > 20 - - the variant is not everywhere heterozygote or homozygote - (use NONVARIANTSCORE in both cases) - """ - - for sv in SVSet: - info = sv.record.info - if callRate(sv) < 0.75: - sv.filter.add("CALLRATE") - if not Polymorph(sv): - sv.filter.add("POLYMORPH") - # if info['GSDUPLICATESCORE'] and info['GSDUPLICATESCORE']>0: - # sv.filter.add("DUPLICATE") - # DUPLICATE - if 'NONDUPLICATEOVERLAP' in info and info['NONDUPLICATEOVERLAP'] > 0.7: - sv.filter.add("OVERLAP") - - if "DUPLICATESCORE" in info is not None and info['DUPLICATESCORE'] > -2: - sv.filter.add("DUPLICATE") - # NONVARIANT - if 'GSNONVARSCORE' in info and info['GSNONVARSCORE'] and info['GSNONVARSCORE'] > 13: - sv.filter.add("NONVARIANT") - # GTDEPTH - if 'GSDEPTHRATIO' in info and (info['GSDEPTHRATIO'] is None or info['GSDEPTHRATIO'] > 0.8): - sv.filter.add("DEPTH") - if 'DEPTHRATIO' in info and (isnan(info['DEPTHRATIO']) or info['DEPTHRATIO'] > 0.8): - sv.filter.add("DDEPTH") - if 'DEPTHCALLTHRESHOLD' in info and info['DEPTHCALLTHRESHOLD'] > 1: - sv.filter.add("CCOVERAGE") - # CLUSTERSEP - if 'GSM1' in info and (info['GSM1'] <= -0.5 or info['GSM1'] >= 2.0): - sv.filter.add("CLUSTERSEP") diff --git a/snakecnv/lib/svreader/__init__.py b/snakecnv/lib/svreader/__init__.py deleted file mode 100644 index de64423..0000000 --- a/snakecnv/lib/svreader/__init__.py +++ /dev/null @@ -1,274 +0,0 @@ - - -import os -import re -import math - -import vcf -from pybedtools import BedTool -from pybedtools import create_interval_from_list -from pysam import tabix_compress, tabix_index -from svreader.vcf_utils import get_template -from svrunner_utils import eprint, vcf_to_pybed - -''' - -A Vcf wrapper object, following the specification of VCFv4.2 - https://samtools.github.io/hts-specs/VCFv4.2.pdf -''' - -GT_HOMO_REF = "0/0" -GT_HETERO = "0/1" -GT_HOMO_VAR = "1/1" - - -class SVInter(object): - ''' - SVInter object : - a lightweight interval object for the purpose of constructing - connected components of SV - each SVRecord corresponds to a node in the graph - ''' - def __init__(self, chrom, start, end, id): - self.__chrom = chrom - self.__start = start - self.__end = end - self.__id = id - self.__links = set() - - @property - def id(self): - return self.__id - - @id.setter - def id(self, x): - self.__id = x - - @property - def chrom(self): - return self.__chrom - - @property - def start(self): - return self.__start - - @property - def end(self): - return self.__end - - @start.setter - def start(self, x): - self.__start = x - - @end.setter - def end(self, x): - self.__end = x - - def length(self): - return self.end-self.start+1 - - # TODO should wee keep this here or have another object for connected - # component construction ? - @property - def links(self): - return set(self.__links) - - def add_link(self, other): - self.__links.add(other) - other.__links.add(self) - - def __str__(self): - return self.chrom+":"+str(self.start)+"-"+str(self.end) - - -class SVRecord(SVInter): - ''' - SVRecord object : - a generic object for storing SV records - Toolo specific SVrecods will inherit from this tool - ''' - def __init__(self, tool_name, chrom, pos, end, id, - ref=None, qual=None, filter=None): - super(SVRecord, self).__init__(chrom, pos, end, id) - - # VCF specific fields - self.__ref = ref or "N" - self.__qual = qual or "." - self.__filter = filter - - self.__tool_name = tool_name - self._sv_type = None - - @property - def tool_name(self): - return self.__tool_name - - @property - def sv_len(self): - return self.length() - - @property - def sv_type(self): - return self._sv_type - - @property - def filter(self): - return self.__filter - - def to_vcf_record(self): - return None - - def __str__(self): - return self.chrom+":"+str(self.start)+"-"+str(self.end) - - -def from_vcf_records_to_intervals(vcf_records): - - intervals = [] - for record in vcf_records: - chrom = record.chrom - start = record.start - 1 - end = record.end - name = record.id - intervals.append(create_interval_from_list( - map(str, [chrom, start, end, name]))) - return intervals - - -class SVReader(object): - ''' - SVReader object : - virtual object for reading sv-tool output files - The actual reading is made by the child objects - ''' - - svs_supported = set(["DEL", "INS", "DUP", "INV"]) - - def __init__(self, file_name, tool_name, reference_handle=None): - self.file_name = file_name - self.reference_handle = reference_handle - self.__tool_name = tool_name - - def __iter__(self): - return self - - def next(self): - while True: - raw_record = self.vcf_reader.next() - record = SVRecord(self.tool_name, - raw_record.CHROM, - raw_record.POS, - raw_record.INFO['END'], - raw_record.ID, - raw_record.REF, - raw_record.QUAL, - raw_record.FILTER) - return record - - @property - def tool_name(self): - return self.__tool_name - - def filtering(self, all_vcf_records, minlen, maxlen, - excluded_regions=None, cutoff_proximity=50): - """ - Filter the vcfrecords according to specified criteria - - keep only sv > mnilen and < max_len - - specific default filtering criteria for each tool - - sv close to gaps (less than cutoff_proximity) - :param all_vcf_records: list of VcfRecords - :param minlen: minimum SV length - :param maxlen: maximum SV length - :param excluded_regions: file with regions to exclude (e.g gaps) - :param cutoff_proximity: maximum tolerated proximity - with exlucded region - :return: list of VcfRecords - """ - # First filtering on size - vcf_records = [] - eprint("Filtering on size (>%d and <%d)" % (minlen, maxlen)) - for record in all_vcf_records: - if (self.GeneralFilterPass(record, minlen, maxlen) - and self.SpecificFilterPass(record)): - vcf_records.append(record) - - eprint("%d records" % (len(vcf_records))) - - if excluded_regions is not None: - # Convert vcf to intervals - eprint("Converting to bedtools intervals") - variants = vcf_to_pybed(vcf_records) - - # Identify SV overlapping regions to exclude - eprint("Loading exlusion intervals ") - eprint(excluded_regions) - toexclude = BedTool(excluded_regions).sort() - excluded = dict() - if len(toexclude): - eprint("Computing distance with gaps: %d gaps" % len(toexclude)) - # removing cnv in a close proximity to gaps in the assembly - # -d=True In addition to the closest feature in B, - # report its distance to A as an extra column - for sv in variants.closest(toexclude, d=True): - if (sv[4] != "." - and sv[4] != "-1" - and float(sv[-1]) < cutoff_proximity): - print("-".join(sv)) - excluded[sv[3]] = 1 - - vcf_records_filtered = [] - for record in vcf_records: - if record.id not in excluded: - vcf_records_filtered.append(record) - vcf_records = vcf_records_filtered - - return vcf_records - - def GeneralFilterPass(self, record, minlen, maxlen): - return (abs(record.sv_len) >= minlen and abs(record.sv_len) <= maxlen) - - def SpecificFilterPass(self, record): - return True - - def bnd_merge(sef, svtype, records): - return records # nothing to do in the majority of cases - - -class SVWriter(object): - def __init__(self, file_name, tool_name, template_reader): - self.tool_name = tool_name - self.template_reader = template_reader - self._setFilename(file_name) - self._isopen = False - - def _setFilename(self, filename): - if filename.endswith(".gz"): - filename = re.sub('\.gz$', '', filename) - self.__index = True - else: - self.__index = False - self.__filename = filename - - @property - def filename(self): - return self.__filename - - def write(self, record): - if not self._isopen: - self._open() - self._write(record) - - def close(self): - self._close() - if self.__index: - tabix_compress(self.__filename, self.__filename+".gz", force=True) - tabix_index(self.__filename+".gz", force=True, preset="vcf") - if os.path.exists(self.__filename): - os.remove(self.__filename) - - def _dumpemptyvcf(self): - vcf_template_reader = get_template(self.template_reader.tool_name) - vcf_template_reader.samples = self.template_reader.getOrderedSamples() - self.vcf_writer = vcf.Writer(open(self.filename, 'w'), - vcf_template_reader) - self.vcf_writer.close() diff --git a/snakecnv/lib/svreader/_version.py b/snakecnv/lib/svreader/_version.py deleted file mode 100644 index 7225152..0000000 --- a/snakecnv/lib/svreader/_version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "0.5.2" diff --git a/snakecnv/lib/svreader/defaults.py b/snakecnv/lib/svreader/defaults.py deleted file mode 100644 index 39f2d14..0000000 --- a/snakecnv/lib/svreader/defaults.py +++ /dev/null @@ -1,68 +0,0 @@ -MIN_SV_LENGTH = 50 -MAX_SV_LENGTH = 1000000 -OVERLAP_RATIO = 0.5 -WIGGLE = 100 -INS_WIGGLE = 100 -TX_WIGGLE = 5 -SVS_SUPPORTED = set(["DEL", "DUP", "INS", "INV", "ITX", "CTX"]) -SVS_ASSEMBLY_SUPPORTED = set(["DEL", "INS" , "INV", "DUP"]) -SVS_SOFTCLIP_SUPPORTED = set(["DEL", "INS" , "INV", "DUP"]) -MEAN_READ_LENGTH=100 - -# For generating candidate intervals for insertion assembly -MIN_SUPPORT_SC_ONLY = 2 -MIN_SUPPORT_INS = 15 -MIN_SUPPORT_DEL = 10 -MIN_SUPPORT_INV = 10 -MIN_SUPPORT_DUP = 10 -MIN_SUPPORT_FRAC_INS = 0.05 -MIN_SUPPORT_FRAC_DEL = 0.04 -MIN_SUPPORT_FRAC_INV = 0.015 -MIN_SUPPORT_FRAC_DUP = 0.015 -MEAN_READ_COVERAGE=50 -MIN_INS_COVERAGE_FRAC=0.5 -MAX_INS_COVERAGE_FRAC=1.5 -MAX_INTERVALS = 10000 -SC_PAD = 500 -SC_MIN_SOFT_CLIP = 20 -SC_MIN_AVG_BASE_QUAL = 20 -SC_MIN_MAPQ = 5 -SC_MAX_NM = 10 -SC_MIN_MATCHES = 50 -SC_OTHER_SCALE = 5 - - -ISIZE_MIN = 250 -ISIZE_MAX = 450 -ISIZE_MEAN = 350.0 -ISIZE_SD = 50.0 - -# For assembly read-extraction -EXTRACTION_MAX_READ_PAIRS = 10000 -EXTRACTION_MAX_NM = 5 -EXTRACTION_MAX_INTERVAL_TRUNCATION = 10000 -EXTRACTION_TRUNCATION_PAD = 4000 - -# For running SPAdes -ASSEMBLY_MAX_TOOLS = 1 -SPADES_TIMEOUT = 300 # in seconds -SPADES_PAD = 500 -SPADES_MAX_INTERVAL_SIZE = 50000 -SPADES_MAX_INTERVAL_SIZE_2BP = 1000000 - -# For running AGE -AGE_TIMEOUT = 300 # in seconds -AGE_MIN_CONTIG_LENGTH = 200 -AGE_PAD = 500 -AGE_MAX_REGION_LENGTH = 1000000 -AGE_MAX_INTERVAL_TRUNCATION = 10000 -AGE_TRUNCATION_PAD = 2000 -AGE_DIST_TO_BP = 400 -MIN_INV_SUBALIGN_LENGTH = 50 -MIN_DEL_SUBALIGN_LENGTH = 50 -AGE_WINDOW_SIZE = 20 -AGE_DIST_TO_EXP_BP_INS = 25 - -# For genotyping -GT_WINDOW = 100 -GT_NORMAL_FRAC = 0.05 \ No newline at end of file diff --git a/snakecnv/lib/svreader/delly.py b/snakecnv/lib/svreader/delly.py deleted file mode 100644 index 3a5ebd4..0000000 --- a/snakecnv/lib/svreader/delly.py +++ /dev/null @@ -1,272 +0,0 @@ -import logging -import os - - -from collections import defaultdict - -from pysam import VariantFile - -from svreader import SVRecord, SVReader, SVWriter -from svrunner_utils import eprint, vcf_to_pybed - -logger = logging.getLogger(__name__) - -mydir = os.path.dirname(os.path.realpath(__file__)) - -''' -Delly output is a vcf file, fields are described in the header - -#using awk -cat vcf | -awk '/##FORMAT/{ last=index($0,","); id=index($0,"ID"); \ - desc=substr($0,match($0,"Description")+12) ; gsub(/"/, "", desc); \ - printf "%-30s %s\n", substr($0,id+3,last-id-3), \ - substr(desc,0,length(desc)-1)}' - -FILTER - PASS All filters passed - LowQual PE/SR support below 3 or mapping quality below 20. - -INFO - CIEND PE confidence interval around END - CIPOS PE confidence interval around POS - CHR2 Chromosome for END coordinate in case of a translocation - END End position of the structural variant - PE Paired-end support of the structural variant - MAPQ Median mapping quality of paired-ends - SR Split-read supportSpecificFilterPass - SRQ Split-read consensus alignment quality - CONSENSUS Split-read consensus sequence - CE Consensus sequence entropy - CT Paired-end signature induced connection type - IMPRECISE Imprecise structural variation - PRECISE Precise structural variation - SVTYPE Type of structural variant - SVMETHOD Type of approach used to detect SV - INSLEN Predicted length of the insertion - HOMLEN Predicted microhomology length using a max. edit distance of 2 - RDRATIO Read-depth ratio of het. SV carrier vs. non-carrier. - -FORMAT - GT Genotype - GL Log10-scaled genotype likelihoods for RR,RA,AA genotypes - GQ Genotype Quality - FT Per-sample genotype filter - RC Raw high-quality read counts for the SV - RCL Raw high-quality read counts for the left control region - RCR Raw high-quality read counts for the right control region - CN Read-depth based copy-number estimate for autosomal sites - DR high-quality reference pairs - DV high-quality variant pairs - RR high-quality reference junction reads - RV high-quality variant junction reads - - -''' - -delly_name = "delly" -delly_source = set([delly_name]) - - -class DellyRecord(SVRecord): - - def __init__(self, record, sample_order, reference_handle=None): - - # Custom id - record.id = "_".join([delly_name, record.chrom, record.id]) - super(DellyRecord, self).__init__(delly_name, - record.chrom, - record.pos, - record.stop, - record.id, - ref=record.ref, - qual=record.qual, - filter=record.filter) - - self.__record = record - self._sv_type = record.info["SVTYPE"] - self.__pe = record.info["PE"] - self.__sr = record.info["SR"] if "SR" in list(record.info.keys()) else 0 - self.__ct = record.info["CT"] - self.__record.info['SR'] = self.__sr - - sv = {} - for sample in record.samples: - sample_name = sample.rsplit('.')[0] - sv[sample_name] = record.samples[sample] - self.sv = sv - self.__record.info['VSAMPLES'] = ",".join(self.variantSamples()) - - @property - def record(self): - return self.__record - - @property - def start(self): - return self.record.pos - - @property - def end(self): - return self.record.stop - - @property - def pe(self): - return self.__pe - - @property - def sr(self): - return self.__sr - - @property - def ct(self): - return self.__ct - - def update(self, start, end, pe_supp): - self.record.pos = start - self.record.stop = end - self.record.info['PE'] = pe_supp - - def variantSample(self, sample): - if self.sv[sample]["DV"] > 0 or self.sv[sample]["RV"] > 0: - return True - else: - return False - - def variantSamples(self): - variant_samples = [] - for sample in self.sv: - if self.variantSample(sample): - variant_samples.append(sample) - return sorted(variant_samples) - - def MaxIndSupportingRP(self): - ''' Maximum number of supporting PE among individuals. - ''' - max_ind_supp = 0 - su = 0 - rep_sample = "" - for sample in self.sv: - dv = self.sv[sample]["DV"] - su += dv - if dv > max_ind_supp: - rep_sample = sample - max_ind_supp = dv - if max_ind_supp == 0: - ratio = 0 - else: - total = self.sv[rep_sample]["DV"] + self.sv[rep_sample]["DR"] - ratio = float(self.sv[rep_sample]["DV"])/total - return su, max_ind_supp, ratio - - def to_svcf_record(self): - return self.record - - -class DellyReader(SVReader): - svs_supported = set(["DEL", "INS", "DUP", "INV"]) - - def __init__(self, file_name, reference_handle=None, svs_to_report=None): - super(DellyReader, self).__init__(file_name, - delly_name, - reference_handle) - self.vcf_reader = VariantFile(file_name) - if 'VSAMPLES' not in self.vcf_reader.header.info.keys(): - self.vcf_reader.header.info.add('VSAMPLES', - number='.', - type='String', - description='Samples with variant alleles') - self.svs_supported = DellyReader.svs_supported - if svs_to_report is not None: - self.svs_supported &= set(svs_to_report) - - def __iter__(self): - return self - - def __next__(self): - while True: - delly_record = next(self.vcf_reader) - record = DellyRecord(delly_record, self.reference_handle) - if record.sv_type in self.svs_supported: - return record - - def getHeader(self): - return self.vcf_reader.header - - def getOrderedSamples(self): - samples = self.vcf_reader.header.samples - sample_names = [sample.rsplit('.')[0] for sample in samples] - return sample_names - - def SpecificFilterPass(self, record): - # Filtering criteria more than 5 PE or more than SR and PE > 0 - # see http://bcb.io/2014/08/12/validated-whole-genome-structural-variation-detection-using-multiple-callers - # TODO : should we modifiy this in order to account for individual - # specific filtering (e.g pe > 5 at least for an individual) - return (record.pe > 5 or (record.pe > 0 and record.sr > 0)) - - def bnd_merge(sef, svtype, records): - """ - Merging delly two lines inversion format according to the - CT info field (distal connection) - 3to3 tail-to-tail - 5to5 head-to-head - CT types are separated and for each 3to3, the corresponding - 5to5 is searched for (bedtools intersect with >90% overlap) - A single line corresponding to the intersection of both - signature is kept - """ - if svtype != "INV": - return records - # a dictionnary of records with id as key - records_dict = defaultdict() - records_CT = defaultdict(list) - for r in records: - records_dict[r.id] = r - records_CT[r.ct].append(r) - - eprint("Converting to bedtools intervals") - tail2tail = vcf_to_pybed(records_CT["3to3"]) - head2head = vcf_to_pybed(records_CT["5to5"]) - - # TODO check if the following code really recover balanced inversions - - balanced = tail2tail.intersect(head2head, r=True, f=0.95, wo=True) - - confirmed = [] - seen = defaultdict() - for b in balanced: - start = int(b[5]) # tail - end = int(b[2]) # head - t2t = records_dict[b[3]] - h2h = records_dict[b[7]] - repre = t2t - if repre in seen: - continue - seen[repre] = 1 - pe_supp = t2t.record.info['PE'] + h2h.record.info['PE'] - print("merging %s and %s" % (t2t, h2h)) - repre.update(start, end, pe_supp) - confirmed.append(repre) - return confirmed - - -class DellyWriter(SVWriter): - - def __init__(self, file_name, reference_contigs, template_reader): - super(DellyWriter, self).__init__(file_name, - template_reader.tool_name, - template_reader) - - def _open(self): - self.vcf_writer = VariantFile(self.filename, 'w', - header=self.template_reader.getHeader()) - self._isopen = True - - def _write(self, record): - self.vcf_writer.write(record.record) - - def _close(self): - if self._isopen: - self.vcf_writer.close() - else: # nothing was written - self._dumpemptyvcf() diff --git a/snakecnv/lib/svreader/genomestrip.py b/snakecnv/lib/svreader/genomestrip.py deleted file mode 100644 index 8eaf2de..0000000 --- a/snakecnv/lib/svreader/genomestrip.py +++ /dev/null @@ -1,214 +0,0 @@ -import logging -import sys -import os - -from pysam import VariantFile - -from svreader import SVRecord, SVReader, SVWriter - -logger = logging.getLogger(__name__) -mydir = os.path.dirname(os.path.realpath(__file__)) - -''' - -An ultra lightweight vcf reader for genomeSTRIP - -#using awk -cat vcf | - - -awk '/##FORMAT/{ last=index($0,","); id=index($0,"ID"); \ - desc=substr($0,match($0,"Description")+12) ; \ - gsub(/"/, "", desc); printf "%-30s %s\n", \ - substr($0,id+3,last-id-3), substr(desc,0,length(desc)-1)}' - - -FILTER - - COHERENCE GSCOHPVALUE == NA || GSCOHPVALUE <= 0.01 - COVERAGE GSDEPTHCALLTHRESHOLD == NA || GSDEPTHCALLTHRESHOLD >= 1.0 - DEPTH GSDEPTHRATIO == NA || GSDEPTHRATIO > 0.8 || (GSDEPTHRATIO > 0.63 && (GSMEMBPVALUE == NA || GSMEMBPVALUE >= 0.01)) - DEPTHPVAL GSDEPTHPVALUE == NA || GSDEPTHPVALUE >= 0.01 - LQ Low Quality Genotyping filter - PAIRSPERSAMPLE GSNPAIRS <= 1.1 * GSNSAMPLES - -INFO: - CIEND Confidence interval around END for imprecise variants - CIPOS Confidence interval around POS for imprecise variants - END End coordinate of this variant - GSCOHERENCE Value of coherence statistic - GSCOHFN Coherence statistic per pair - GSCOHPVALUE Coherence metric (not a true p-value) - GSCOORDS Original cluster coordinates - GSCORA6 Correlation with array intensity from Affy6 arrays - GSCORI1M Correlation with array intensity from Illumina 1M arrays - GSCORNG Correlation with array intensity from NimbleGen arrays - GSDEPTHCALLS Samples with discrepant read pairs or low read depth - GSDEPTHCALLTHRESHOLD Read depth threshold (median read depth of samples with discrepant read pairs) - GSDEPTHNOBSSAMPLES Number of samples with discrepant read pairs in depth test - GSDEPTHNTOTALSAMPLES Total samples in depth test - GSDEPTHOBSSAMPLES Samples with discrepant read pairs in depth test - GSDEPTHPVALUE Depth p-value using chi-squared test - GSDEPTHPVALUECOUNTS Depth test read counts (carrier inside event, carrier outside event, non-carrier inside, non-carrier outside) - GSDEPTHRANKSUMPVALUE Depth p-value using rank-sum test - GSDEPTHRATIO Read depth ratio test - GSDMAX Maximum value considered for DOpt - GSDMIN Minimum value considered for DOpt - GSDOPT Most likely event length - GSDSPAN Inner span length of read pair cluster - GSELENGTH Effective length - GSEXPMEAN Expected read depth sample mean - GSGMMWEIGHTS Genotyping depth model cluster weights - GSM1 Genotyping depth model parameter M1 - GSM2 Genotyping depth model parameters M2[0],M2[1] - GSMEMBNPAIRS Number of pairs used in membership test - GSMEMBNSAMPLES Number of samples used in membership test - GSMEMBOBSSAMPLES Samples participating in membership test - GSMEMBPVALUE Membership p-value - GSMEMBSTATISTIC Value of membership statistic - GSNDEPTHCALLS Number of samples with discrepant read pairs or low read depth - GSNHET Number of heterozygous snp genotype calls inside the event - GSNHOM Number of homozygous snp genotype calls inside the event - GSNNOCALL Number of snp genotype non-calls inside the event - GSNPAIRS Number of discrepant read pairs - GSNSAMPLES Number of samples with discrepant read pairs - GSNSNPS Number of snps inside the event - GSOUTLEFT Number of outlier read pairs on left - GSOUTLIERS Number of outlier read pairs - GSOUTRIGHT Number of outlier read pairs on right - GSREADGROUPS Read groups contributing discrepant read pairs - GSREADNAMES Discrepant read pair identifiers - GSRPORIENTATION Read pair orientation - GSSAMPLES Samples contributing discrepant read pairs - GSSNPHET Fraction of het snp genotype calls inside the event - HOMLEN Length of base pair identical micro-homology at event breakpoints - HOMSEQ Sequence of base pair identical micro-homology at event breakpoints - IMPRECISE Imprecise structural variation - NOVEL Indicates a novel structural variation - SVLEN Difference in length between REF and ALT alleles - SVTYPE Type of structural variant - -FORMAT: - CN Copy number genotype for imprecise events - CNF Estimate of fractional copy number - CNL Copy number likelihoods with no frequency prior - CNP Copy number likelihoods - CNQ Copy number genotype quality for imprecise events - FT Per-sample genotype filter - GL Genotype likelihoods with no frequency prior - GP Genotype likelihoods - GQ Genotype Quality - GSPC Number of supporting read pairs for structural variant alleles - GT Genotype - PL Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification - -''' - - -genomestrip_name = "genomestrip" -genomestrip_source = set([genomestrip_name]) - - -class GenomeSTRIPRecord(SVRecord): - - def __init__(self, record, reference_handle=None): - record.id = "_".join([genomestrip_name, record.chrom, record.id]) - super(GenomeSTRIPRecord, self).__init__(genomestrip_name, - record.chrom, - record.pos, - record.stop, - record.id, - ref=record.ref, - qual=record.qual, - filter=record.filter) - self.__record = record - self.__sv_type = record.info['SVTYPE'] - sv = {} - for sample in record.samples: - sample_name = sample.rsplit('.')[0] - sv[sample_name] = record.samples[sample] - self.sv = sv - - variant_samples = self.variantSamples() - if len(variant_samples) > 0: - self.__record.info['VSAMPLES'] = ",".join(variant_samples) - - @property - def sv_type(self): - return self.__sv_type - - @property - def record(self): - return self.__record - - def variantSample(self, sample): - if self.sv[sample]["GSPC"] > 0: - return True - else: - return False - - def variantSamples(self): - variant_samples = [] - for sample in self.sv: - if self.variantSample(sample): - variant_samples.append(sample) - return sorted(variant_samples) - - -class GenomeSTRIPReader(SVReader): - svs_supported = set(["DEL"]) - - def __init__(self, file_name, reference_handle=None, svs_to_report=None): - super(GenomeSTRIPReader, self).__init__(file_name, - genomestrip_name, - reference_handle) - self.vcf_reader = VariantFile(file_name) - self.vcf_reader.header.info.add('VSAMPLES', - number='.', - type='String', - description='Samples with variant alleles') - if svs_to_report is not None: - self.svs_supported &= set(svs_to_report) - - def __iter__(self): - return self - - def __next__(self): - while True: - genomestrip_record = next(self.vcf_reader) - record = GenomeSTRIPRecord(genomestrip_record, - self.reference_handle) - if record.sv_type in self.svs_supported: - return record - - def getHeader(self): - return self.vcf_reader.header - - def getOrderedSamples(self): - samples = self.vcf_reader.header.samples - sample_names = [sample.rsplit('.')[0] for sample in samples] - return sample_names - - -class GenomeSTRIPWriter(SVWriter): - - def __init__(self, file_name, reference_contigs, template_reader): - super(GenomeSTRIPWriter, self).__init__(file_name, - template_reader.tool_name, - template_reader) - self.__template_reader = template_reader - - def _open(self): - header = self.__template_reader.getHeader() - self.vcf_writer = VariantFile(self.filename, 'w', - header=header) - self._isopen = True - - def _write(self, record): - self.vcf_writer.write(record.record) - - def _close(self): - if self._isopen: - self.vcf_writer.close() - else: # nothing was written - self._dumpemptyvcf() diff --git a/snakecnv/lib/svreader/lumpy.py b/snakecnv/lib/svreader/lumpy.py deleted file mode 100644 index cbb149d..0000000 --- a/snakecnv/lib/svreader/lumpy.py +++ /dev/null @@ -1,209 +0,0 @@ - -import logging -import os - -import re - -from pysam import VariantFile - -from svreader import SVRecord, SVReader, SVWriter - -logger = logging.getLogger(__name__) -mydir = os.path.dirname(os.path.realpath(__file__)) - -''' -An ultra lightweight vcf reader for lumpy - -INFO: - SVTYPE Type of structural variant - SVLEN Difference in length between REF and ALT alleles - END End position of the variant described in this record - STRANDS Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--) - IMPRECISE Imprecise structural variation - CIPOS Confidence interval around POS for imprecise variants - CIEND Confidence interval around END for imprecise variants - CIPOS95 Confidence interval (95%) around POS for imprecise variants - CIEND95 Confidence interval (95%) around END for imprecise variants - MATEID ID of mate breakends - EVENT ID of event associated to breakend - SECONDARY Secondary breakend in a multi-line variants - SU Number of pieces of evidence supporting the variant across all samples - PE Number of paired-end reads supporting the variant across all samples - SR Number of split reads supporting the variant across all samples - EV Type of LUMPY evidence contributing to the variant call - PRPOS LUMPY probability curve of the POS breakend - PREND LUMPY probability curve of the END breakend - -FORMAT: - GT Genotype - SU Number of pieces of evidence supporting the variant - PE Number of paired-end reads supporting the variant - SR Number of split reads supporting the variant - BD Amount of BED evidence supporting the variant - RO Reference allele observation count, with partial observations recorded fractionally - AO Alternate allele observations, with partial observations recorded fractionally - QR Sum of quality of reference observations - QA Sum of quality of alternate observations - RS Reference allele split-read observation count, with partial observations recorded fractionally - AS Alternate allele split-read observation count, with partial observations recorded fractionally - RP Reference allele paired-end observation count, with partial observations recorded fractionally - AP Alternate allele paired-end observation count, with partial observations recorded fractionally - AB Allele balance, fraction of observations from alternate allele, QA/(QR+QA) - - -''' - -lumpy_name = "lumpy" -lumpy_source = set([lumpy_name]) - - -class LumpyRecord(SVRecord): - - def __init__(self, record, reference_handle=None): - record.id = "_".join( - [lumpy_name, - record.chrom, - record.info['SVTYPE'], - record.id]) - super(LumpyRecord, self).__init__(lumpy_name, - record.chrom, - record.pos, - record.stop, - record.id, - ref=record.ref, - qual=record.qual, - filter=record.filter) - self.__record = record - - sv = {} - for sample in record.samples: - sample_name = sample.rsplit('.')[0] - sv[sample_name] = record.samples[sample] - self.sv = sv - # self.__record.info["MAX_IND_SU"] = self.MaxIndSupportingRP() - self._sv_type = self.__record.info['SVTYPE'] - self.__record.info['VSAMPLES'] = ",".join(self.variantSamples()) - - @property - def svlen(self): - svlen = 0 - if self.__record.INFO["SVTYPE"] == "BND": - chrom, end = self. getAltBND() - svlen = end - self.start + 1 - else: - svlen = abs(int(self.__record.INFO["SVLEN"][0])) - return svlen - - @property - def record(self): - return self.__record - - def getAltBND(self): - ''' extract second breakend chr,pos from ALT field ''' - alt = self.__record.ALT[0] - m = re.search(r"[\[\]]([\w\.:0-9]+)[\[\]]", str(alt)) - if m: - return m.group(1).split(":") - else: - return 0 - - def variantSampleOld(self, sample): - if (max(self.sv[sample]["AS"]) > 0 - or max(self.sv[sample]["AP"]) - or max(self.sv[sample]["ASC"]) > 0): - return True - else: - return False - - def variantSample(self, sample): - if self.sv[sample]["SU"] > 0: - return True - else: - return False - - def variantSamples(self): - variant_samples = [] - for sample in self.sv: - if self.variantSample(sample): - variant_samples.append(sample) - return sorted(variant_samples) - - def MaxIndSupportingRP(self): - sv = self.sv - max_ind_supp = 0 - for sample in sv: - max_ind_supp = max(max_ind_supp, sv[sample]['SU']) - return max_ind_supp - - -class LumpyReader(SVReader): - svs_supported = set(["DEL", "INS", "DUP", "INV"]) - - def __init__(self, file_name, reference_handle=None, svs_to_report=None): - super(LumpyReader, self).__init__(file_name, - lumpy_name, - reference_handle) - self.vcf_reader = VariantFile(file_name) - self.vcf_reader.header.info.add( - 'VSAMPLES', - number='.', - type='String', - description='Samples with variant alleles') - if svs_to_report is not None: - self.svs_supported &= set(svs_to_report) - - def __iter__(self): - return self - - def __next__(self): - while True: - lumpy_record = next(self.vcf_reader) - # BND not supported - if lumpy_record.info['SVTYPE'] == "BND": - continue - record = LumpyRecord(lumpy_record, self.reference_handle) - if record.sv_type in self.svs_supported: - return record - - def getHeader(self): - return self.vcf_reader.header - - def AreSamplesSpecified(self): - return 1 - - def getOrderedSamples(self): - if not hasattr(self.vcf_reader, "samples"): - return [] - return self.vcf_reader.samples - - def SpecificFilterPass(self, record): - # Filtering criteria more than 4 PE - # see - # http://bcb.io/2014/08/12/validated-whole-genome-structural-variation-detection-using-multiple-callers - return (record.MaxIndSupportingRP() > 4) - - -class LumpyWriter(SVWriter): - - def __init__(self, file_name, reference_contigs, template_reader): - super( - LumpyWriter, - self).__init__(file_name, - template_reader.tool_name, - template_reader) - - def _open(self): - self.vcf_writer = VariantFile( - self.filename, - 'w', - header=self.template_reader.getHeader()) - self._isopen = True - - def _write(self, record): - self.vcf_writer.write(record.record) - - def _close(self): - if self._isopen: - self.vcf_writer.close() - else: # nothing was written - self._dumpemptyvcf() diff --git a/snakecnv/lib/svreader/pindel.py b/snakecnv/lib/svreader/pindel.py deleted file mode 100644 index 8195297..0000000 --- a/snakecnv/lib/svreader/pindel.py +++ /dev/null @@ -1,389 +0,0 @@ -import logging -import sys -import os -import gzip - -import vcf - -from svrunner_utils import smart_open -from svreader.vcf_utils import get_template, get_template_header -from svreader import SVRecord, SVReader, SVWriter - -from pysam import VariantFile, VariantHeader - -logger = logging.getLogger(__name__) - -mydir = os.path.dirname(os.path.realpath(__file__)) - -''' -from http://gmt.genome.wustl.edu/packages/pindel/user-manual.html - -There is a head line for each variant reported, followed by the alignment of supporting reads to the reference on the -second line. The example variants are a 1671bp deletion and a 10bp insertion on chr20. The breakpoints are specified -after "BP". Due to microhomology around the breakpoints, the breakpoint coordinates may shift both upstream and -downstream,'BP_range' is used to indicate the range of this shift. The header line contains the following data: - -1) The index of the indel/SV (57 means that 57 insertions precede this insertion in the file) - -2) The type of indel/SV: I for insertion, D for deletion, INV for inversion, TD for tandem duplication - -3) The length of the SV - -4) "NT" (to indicate that the next number is the length of non-template sequences inserted; insertions are fully covered - by the NT-fields, deletions can have NT bases if the deletion is not 'pure', meaning that while bases have been - deleted, some bases have been inserted between the breakpoints) - -5) the length(s) of the NT fragment(s) - -6) the sequence(s) of the NT fragment(s) - -7-8) the identifier of the chromosome the read was found on - -9-10-11) BP: the start and end positions of the SV - -12-13-14) BP_range: if the exact position of the SV is unclear since bases at the edge of one read-half could equally -well be appended to the other read-half. In the deletion example, ACA could be on any side of the gap, so the original -deletion could have been between 1337143 and 1338815, between 1337144 and 1338816, or between 1337145 and 133817, or -between 1337146 and 133818. BP-range is used to indicate this range. - -15) "Supports": announces that the total count of reads supporting the SV follow. - -16) The number of reads supporting the SV - -17) The number of unique reads supporting the SV (so not counting duplicate reads) - -18) +: supports from reads whose anchors are upstream of the SV - -19-20) total number of supporting reads and unique number of supporting reads whose anchors are upstream of the SV. - -21) -: supports from reads whose anchors are downstream of the SV - -22-23) total number of supporting reads and unique number of supporting reads whose anchors are downstream of the SV - -24-25) S1: a simple score, ("# +" + 1)* ("# -" + 1) ; - -26-27) SUM_MS: sum of mapping qualities of anchor reads, The reads with variants or unmapped are called split-read, -whose mate is called anchor reads. We use anchor reads to narrow down the search space to speed up and increase -sensitivity; - -28) the number of different samples scanned - -29-30-31) NumSupSamples?: the number of samples supporting the SV, as well as the number of samples having unique -reads supporting the SV (in practice, these numbers are the same) - -32+) Per sample: the sample name, followed by the total number of supporting reads whose anchors are upstream, the -total number of unique supporting reads whose anchors are upstream, the total number of supporting reads whose anchors - are downstream, and finally the total number of unique supporting reads whose anchors are downstream. - -WARNING see below - ----- - -The reported larger insertion (LI) record is rather different than other types of variants. Here is the format: -1) index, -2) type(LI), -3) "ChrID", -4) chrName, -5) left breakpoint, -6) "+" -7) number of supporting reads for the left coordinate, -8) right breakpoint, -9) "-" -10) number of supporting reads for the right coordinate. - -Following lines are repeated for each sample -11) Sample name -12) "+" -13) upstream supporting reads -14) "-" -15) downstream supporting reads - -''' - -''' -New version of pindel changes the output starting from 32+ - Check https://trac.nbic.nl/pipermail/pindel-users/2013-March/000229.html - -hi mike, - -This is the output format for the new version where the number of reads -supporting the reference allele is also counted on the left and right -breakpoints of the variants. - -TS1_3051 164 167 0 0 2 2 TS1_3054 117 118 0 0 0 0 - -For example, 164 and 167 reads supporting the reference for sample TS1_3051 -but only 2 reads and 2 unique reads from - strand support the alternative. -It is less likely that this is a true germline variant. with the read count -for the reference allele, you may filter the calls in the same way as snps. - - -''' - -pindel_name = "pindel" -pindel_source = set([pindel_name]) -min_coverage = 10 -het_cutoff = 0.2 -hom_cutoff = 0.8 - -GT_REF = "0/0" -GT_HET = "0/1" -GT_HOM = "1/1" - -PINDEL_TO_SV_TYPE = {"I": "INS", "D": "DEL", "LI": "INS", - "TD": "DUP", "INV": "INV", "RPL":"RPL"} - - -class PindelHeader: - def __init__(self, fd): - self.header_dict = {} - self.header_fields = [] - - start = fd.tell() - # Parse header : read the header until the last line is - # encoutered (^#Chr1) break before reading the next line - while True: - try: - line = next(fd) - if line.find("ChrID") >= 1: - self.parse_firstrecord_line(line) - break - except StopIteration: - break - # We return to the start of the file - fd.seek(start) - - def parse_firstrecord_line(self, line): - fields = line.split() - num_samples = int(fields[27]) - pindel024u_or_later = len(fields) > 31 + 5 * num_samples - if pindel024u_or_later: - self.samples = [fields[i] for i in range(31, len(fields), 7)] - else: - self.samples = [fields[i] for i in range(31, len(fields), 5)] - logger.info("\t".join(self.samples)) - - def getOrderedSamples(self): - # sample order is given by the sample order in the pindel output - if not hasattr(self, "samples"): - return [] - return self.samples - - def __str__(self): - return str(self.__dict__) - - -def pysam_header(pindel_header): - # Using the new pysam interface for creating records and header - # not implemented in the python3.4 version of pysam (0.10.0) - # have to wait at least for the 0.11.2.2 pysam version - header = VariantHeader() - vcf_header = get_template_header("pindel") - - with open(vcf_header) as fin: - for line in fin: - if (line.startswith("##INFO") - or line.startswith("##FORMAT") - or line.startswith("##FILTER")): - header.add_line(line.rstrip()) - for sample in pindel_header.getOrderedSamples(): - header.add_sample(sample) - - return header - - -class PindelRecord(SVRecord): - counter = {"I": 0, "D": 0, "LI": 0, "TD": 0, "INV": 0, "RPL": 0} - - # Only DEL and DUP for the moment - svs_supported = set(["DEL", "DUP", "INV"]) - - def __init__(self, record_string, reference_handle=None): - self.name = pindel_name - - fields = record_string.split() - index = fields[0] - sv_type = fields[1] - - if PINDEL_TO_SV_TYPE[sv_type] not in PindelRecord.svs_supported: - print(("Unsupported structural variation " + sv_type)) - exit(1) - sv_len = int(fields[2]) - num_nt_added = fields[4] - # nt_added = fields[5] - chrom = fields[7] - start_pos = int(fields[9]) - end_pos = int(fields[10]) - bp_range = (int(fields[12]), int(fields[13])) - read_supp = int(fields[15]) - uniq_read_supp = int(fields[16]) - up_read_supp = int(fields[18]) - up_uniq_read_supp = int(fields[19]) - down_read_supp = int(fields[21]) - down_uniq_read_supp = int(fields[22]) - # score = int(fields[24]) - # nums_samples = int(fields[27]) - num_sample_supp = int(fields[29]) - # num_sample_uniq_supp = int(fields[30]) - - sv = {} - samples = [] - for i in range(31, len(fields), 7): - sv[fields[i]] = {"name": fields[i], - "up_ref_read_supp": int(fields[i + 1]), - "down_ref_read_supp": int(fields[i + 2]), - "up_var_read_supp": int(fields[i + 3]), - "up_var_uniq_read_supp": int(fields[i + 4]), - "down_var_read_supp": int(fields[i + 5]), - "down_var_uniq_read_supp": int(fields[i + 6]) - } - samples.append(sv[fields[i]]) - - self.pindel_sv_type = sv_type - self.up_read_supp = up_read_supp - self.down_read_supp = down_read_supp - self.up_uniq_read_supp = up_uniq_read_supp - self.down_uniq_read_supp = down_uniq_read_supp - self.uniq_read_supp = uniq_read_supp - - # Computing CIPOS and CIEND - pos_conf = max(1, abs(int((bp_range[0] - start_pos)/2))) - end_conf = max(1, abs(int((bp_range[1] - end_pos)/2))) - self.cipos = (-pos_conf, pos_conf) - self.ciend = (-end_conf, end_conf) - - PindelRecord.counter[sv_type] += 1 - id = "_".join([pindel_name, chrom, sv_type, - str(PindelRecord.counter[sv_type])]) - super(PindelRecord, self).__init__(pindel_name, chrom, - start_pos, end_pos, id) - - self._sv_type = PINDEL_TO_SV_TYPE[sv_type] - self.__sv_len = sv_len - self.__samples = samples - self.__sv = sv - - self.Modinfo = { - "END": end_pos, - "SU": read_supp, - "INDEX": index, - "NTLEN": num_nt_added, - "CIPOS": ",".join([str(x) for x in self.cipos]), - "CIEND": ",".join([str(x) for x in self.ciend]), - "NUM_SUPP_SAMPLES": num_sample_supp - } - - def variantSample(self, sample): - if (self.__sv[sample]["up_var_read_supp"] > 0 - or self.__sv[sample]["down_var_read_supp"] > 0): - return True - else: - return False - - def variantSamples(self): - variant_samples = [] - for sample in self.__sv: - if self.variantSample(sample): - variant_samples.append(sample) - return sorted(variant_samples) - - def getOrderedSamples(self): - # sample order is given by the sample order in the pindel output - return self.__samples - - def getSampleCalls(self): - # TODO make sure that the order of the sample conform to - # the order of the vcf header !! - calls = [] - for s in self.getOrderedSamples(): - sample = s["name"] - ad = s["up_var_read_supp"] + s["down_var_read_supp"] - called_value = vcf.model.make_calldata_tuple("AD")(AD=ad) - calldata = vcf.model._Call(None, sample, called_value) - calls.append(calldata) - return calls - - def MaxIndSupportingRP(self): - max_ind_supp = 0 - for s in self.__samples: - ad = s["up_var_read_supp"] + s["down_var_read_supp"] - max_ind_supp = max(ad, max_ind_supp) - return max_ind_supp - - def to_vcf_record(self): - alt = [vcf.model._SV(self.sv_type)] - info = {"SVLEN": self.sv_len, "SVTYPE": self.sv_type} - - info["MAX_IND_SU"] = self.MaxIndSupportingRP() - info["VSAMPLES"] = ",".join(self.variantSamples()) - - info.update(self.Modinfo) - - calls = self.getSampleCalls() - - vcf_record = vcf.model._Record(self.chrom, - self.start, - self.id, - "N", - alt, - ".", - "PASS", - info, - "AD", - [0], - calls) - return vcf_record - - -class PindelReader(SVReader): - svs_supported = set(["DEL", "INS", "DUP", "INV"]) - - def __init__(self, file_name, reference_handle=None, svs_to_report=None): - super(PindelReader, self).__init__(file_name, pindel_name, - reference_handle) - self.file_fd = smart_open(file_name) - self.header = PindelHeader(self.file_fd) - self.pysamheader = pysam_header(self.header) - self.svs_supported = PindelReader.svs_supported - if svs_to_report is not None: - self.svs_supported &= set(svs_to_report) - - def __iter__(self): - return self - - def __next__(self): - while True: - line = next(self.file_fd) - if line.find("ChrID") >= 1: - record = PindelRecord(line.strip(), self.reference_handle) - if record.sv_type in self.svs_supported: - return record - - def AreSamplesSpecified(self): - return 1 - - def getOrderedSamples(self): - return self.header.getOrderedSamples() - - def SpecificFilterPass(self, record): - # Filtering criteria more than 4 PE - # see - # http://bcb.io/2014/08/12/validated-whole-genome-structural-variation-detection-using-multiple-callers - return (record.length() > 60) - - -class PindelWriter(SVWriter): - def __init__(self, file_name, reference_contigs, template_reader): - super(PindelWriter, self).__init__(file_name, - template_reader.tool_name, - template_reader) - vcf_template_reader = get_template(template_reader.tool_name) - vcf_template_reader.samples = template_reader.getOrderedSamples() - self.vcf_writer = vcf.Writer(open(self.filename, 'w'), - vcf_template_reader) - - def write(self, record): - self.vcf_writer.write_record(record.to_vcf_record()) - - def _close(self): - self.vcf_writer.close() diff --git a/snakecnv/lib/svreader/resources/b37.gaps.bed b/snakecnv/lib/svreader/resources/b37.gaps.bed deleted file mode 100644 index 0f8c1f8..0000000 --- a/snakecnv/lib/svreader/resources/b37.gaps.bed +++ /dev/null @@ -1,457 +0,0 @@ -1 0 10000 1 N 10000 telomere no -1 177417 227417 4 N 50000 clone yes -1 267719 317719 6 N 50000 contig no -1 471368 521368 8 N 50000 contig no -1 2634220 2684220 32 N 50000 clone yes -1 3845268 3995268 47 N 150000 contig no -1 13052998 13102998 151 N 50000 clone yes -1 13219912 13319912 154 N 100000 contig no -1 13557162 13607162 157 N 50000 clone yes -1 17125658 17175658 196 N 50000 clone yes -1 29878082 30028082 337 N 150000 contig no -1 103863906 103913906 1088 N 50000 clone yes -1 120697156 120747156 1263 N 50000 clone yes -1 120936695 121086695 1265 N 150000 contig no -1 121485434 121535434 1269 N 50000 clone no -1 121535434 124535434 1270 N 3000000 centromere no -1 124535434 142535434 1271 N 18000000 heterochromatin no -1 142731022 142781022 1273 N 50000 clone yes -1 142967761 143117761 1275 N 150000 contig no -1 143292816 143342816 1277 N 50000 clone yes -1 143544525 143644525 1279 N 100000 contig no -1 143771002 143871002 1281 N 100000 contig no -1 144095783 144145783 1285 N 50000 contig no -1 144224481 144274481 1287 N 50000 contig no -1 144401744 144451744 1289 N 50000 clone yes -1 144622413 144672413 1291 N 50000 contig no -1 144710724 144810724 1293 N 100000 clone yes -1 145833118 145883118 1307 N 50000 clone yes -1 146164650 146214650 1310 N 50000 clone yes -1 146253299 146303299 1312 N 50000 clone yes -1 148026038 148176038 1328 N 150000 contig no -1 148361358 148511358 1331 N 150000 contig no -1 148684147 148734147 1333 N 50000 clone yes -1 148954460 149004460 1337 N 50000 clone yes -1 149459645 149509645 1342 N 50000 clone yes -1 205922707 206072707 1875 N 150000 contig no -1 206332221 206482221 1878 N 150000 contig no -1 223747846 223797846 2038 N 50000 clone yes -1 235192211 235242211 2159 N 50000 clone yes -1 248908210 249058210 2293 N 150000 contig no -1 249240621 249250621 2302 N 10000 telomere no -10 0 10000 1 N 10000 telomere no -10 10000 60000 2 N 50000 contig no -10 17974675 18024675 161 N 50000 clone yes -10 38818835 38868835 337 N 50000 clone yes -10 39154935 39254935 340 N 100000 contig no -10 39254935 42254935 341 N 3000000 centromere no -10 42254935 42354935 342 N 100000 contig no -10 42546687 42596687 344 N 50000 clone yes -10 46426964 46476964 375 N 50000 contig no -10 47429169 47529169 387 N 100000 contig no -10 47792476 47892476 390 N 100000 contig no -10 48055707 48105707 392 N 50000 contig no -10 49095536 49195536 403 N 100000 contig no -10 51137410 51187410 419 N 50000 clone yes -10 51398845 51448845 421 N 50000 clone yes -10 125869472 125919472 1061 N 50000 clone yes -10 128616069 128766069 1088 N 150000 contig no -10 133381404 133431404 1129 N 50000 clone yes -10 133677527 133727527 1135 N 50000 clone yes -10 135524747 135534747 1156 N 10000 telomere no -11 0 10000 1 N 10000 telomere no -11 10000 60000 2 N 50000 contig no -11 1162759 1212759 14 N 50000 clone yes -11 50783853 50833853 439 N 50000 clone no -11 50833853 51040853 440 N 207000 heterochromatin no -11 51040853 51090853 441 N 50000 clone no -11 51594205 51644205 446 N 50000 clone no -11 51644205 54644205 447 N 3000000 centromere no -11 54644205 54694205 448 N 50000 clone no -11 69089801 69139801 575 N 50000 clone yes -11 69724695 69774695 583 N 50000 clone yes -11 87688378 87738378 736 N 50000 clone yes -11 96287584 96437584 808 N 150000 contig no -11 134946516 134996516 1134 N 50000 contig no -11 134996516 135006516 1135 N 10000 telomere no -12 0 10000 1 N 10000 telomere no -12 10000 60000 2 N 50000 contig no -12 95739 145739 4 N 50000 clone yes -12 7189876 7239876 66 N 50000 contig no -12 34856694 37856694 304 N 3000000 centromere no -12 109373470 109423470 954 N 50000 contig no -12 122530623 122580623 1075 N 50000 contig no -12 132706992 132806992 1171 N 100000 contig no -12 133841895 133851895 1183 N 10000 telomere no -13 0 10000 1 N 10000 telomere no -13 10000 16000000 2 N 15990000 short_arm no -13 16000000 19000000 3 N 3000000 centromere no -13 19000000 19020000 4 N 20000 heterochromatin no -13 86760324 86910324 617 N 150000 contig no -13 112353994 112503994 848 N 150000 contig no -13 114325993 114425993 866 N 100000 contig no -13 114639948 114739948 870 N 100000 contig no -13 115109878 115159878 875 N 50000 contig no -13 115159878 115169878 876 N 10000 telomere no -14 0 10000 1 N 10000 telomere no -14 10000 16000000 2 N 15990000 short_arm no -14 16000000 19000000 3 N 3000000 centromere no -14 107289540 107339540 667 N 50000 contig no -14 107339540 107349540 668 N 10000 telomere no -15 0 10000 1 N 10000 telomere no -15 10000 17000000 2 N 16990000 short_arm no -15 17000000 20000000 3 N 3000000 centromere no -15 20894633 20935075 12 N 40442 clone yes -15 21398819 21885000 16 N 486181 clone yes -15 22212114 22262114 19 N 50000 contig no -15 22596193 22646193 23 N 50000 contig no -15 23514853 23564853 31 N 50000 contig no -15 29159443 29209443 83 N 50000 contig no -15 82829645 82879645 518 N 50000 contig no -15 84984473 85034473 539 N 50000 contig no -15 102521392 102531392 690 N 10000 telomere no -16 0 10000 1 N 10000 telomere no -16 10000 60000 2 N 50000 contig no -16 8636921 8686921 128 N 50000 clone yes -16 34023150 34173150 343 N 150000 contig no -16 35285801 35335801 353 N 50000 clone no -16 35335801 38335801 354 N 3000000 centromere no -16 38335801 46335801 355 N 8000000 heterochromatin no -16 46335801 46385801 356 N 50000 clone no -16 88389383 88439383 712 N 50000 contig no -16 90294753 90344753 731 N 50000 contig no -16 90344753 90354753 732 N 10000 telomere no -17 296626 396626 4 N 100000 contig no -17 21566608 21666608 185 N 100000 contig no -17 22263006 25263006 192 N 3000000 centromere no -17 34675848 34725848 278 N 50000 contig no -17 62410760 62460760 523 N 50000 clone yes -17 77546461 77596461 644 N 50000 clone yes -17 79709049 79759049 665 N 50000 contig no -17_ctg5_hap1 1256794 1306794 12 N 50000 clone yes -17_ctg5_hap1 1588968 1638968 15 N 50000 clone yes -18 0 10000 1 N 10000 telomere no -18 15410898 15460898 124 N 50000 clone no -18 15460898 18460898 125 N 3000000 centromere no -18 18460898 18510898 126 N 50000 clone no -18 52059136 52209136 398 N 150000 contig no -18 72283353 72333353 555 N 50000 clone yes -18 75721820 75771820 585 N 50000 clone yes -18 78017248 78067248 604 N 50000 contig no -18 78067248 78077248 605 N 10000 telomere no -19 0 10000 1 N 10000 telomere no -19 10000 60000 2 N 50000 contig no -19 7346004 7396004 170 N 50000 contig no -19 8687198 8737198 190 N 50000 contig no -19 20523415 20573415 367 N 50000 clone yes -19 24631782 24681782 409 N 50000 heterochromatin no -19 24681782 27681782 410 N 3000000 centromere no -19 27681782 27731782 411 N 50000 heterochromatin no -19 59118983 59128983 870 N 10000 telomere no -2 0 10000 1 N 10000 telomere no -2 3529312 3579312 38 N 50000 contig no -2 5018788 5118788 52 N 100000 contig no -2 16279724 16329724 149 N 50000 contig no -2 21153113 21178113 192 N 25000 contig no -2 87668206 87718206 739 N 50000 clone yes -2 89630436 89830436 755 N 200000 contig no -2 90321525 90371525 760 N 50000 clone yes -2 90545103 91545103 762 N 1000000 heterochromatin no -2 91545103 91595103 763 N 50000 clone no -2 92326171 95326171 770 N 3000000 centromere no -2 110109337 110251337 893 N 142000 contig no -2 149690582 149790582 1221 N 100000 contig no -2 234003741 234053741 1942 N 50000 contig no -2 239801978 239831978 1992 N 30000 contig no -2 240784132 240809132 2001 N 25000 contig no -2 243102476 243152476 2025 N 50000 clone yes -2 243189373 243199373 2027 N 10000 telomere no -20 0 10000 1 N 10000 telomere no -20 10000 60000 2 N 50000 contig no -20 26319569 26369569 274 N 50000 clone no -20 26369569 29369569 275 N 3000000 centromere no -20 29369569 29419569 276 N 50000 clone no -20 29653908 29803908 279 N 150000 contig no -20 34897085 34947085 336 N 50000 clone yes -20 61091437 61141437 624 N 50000 clone yes -20 61213369 61263369 628 N 50000 contig no -20 62965520 63015520 648 N 50000 contig no -20 63015520 63025520 649 N 10000 telomere no -21 0 10000 1 N 10000 telomere no -21 10000 5211193 2 N 5201193 short_arm no -21 5211193 9411193 3 N 4200000 contig no -21 9595548 9645548 5 N 50000 contig no -21 9775437 9825437 7 N 50000 contig no -21 10034920 10084920 10 N 50000 contig no -21 10215976 10365976 12 N 150000 contig no -21 10647896 10697896 15 N 50000 contig no -21 11188129 11238129 20 N 50000 clone no -21 11238129 11288129 21 N 50000 heterochromatin no -21 11288129 14288129 22 N 3000000 centromere no -21 14288129 14338129 23 N 50000 clone no -21 42955559 43005559 419 N 50000 contig no -21 44632664 44682664 447 N 50000 clone yes -21 48119895 48129895 515 N 10000 telomere no -22 0 10000 1 N 10000 telomere no -22 10000 13000000 2 N 12990000 short_arm no -22 13000000 16000000 3 N 3000000 centromere no -22 16000000 16050000 4 N 50000 contig no -22 16697850 16847850 27 N 150000 contig no -22 20509431 20609431 81 N 100000 contig no -22 50364777 50414777 563 N 50000 contig no -22 51244566 51294566 591 N 50000 contig no -22 51294566 51304566 592 N 10000 telomere no -3 0 10000 1 N 10000 telomere no -3 10000 60000 2 N 50000 clone no -3 66170270 66270270 565 N 100000 contig no -3 90504854 93504854 784 N 3000000 centromere no -3 194041961 194047251 1693 N 5290 contig no -3 197962430 198012430 1732 N 50000 contig no -3 198012430 198022430 1733 N 10000 telomere no -4 0 10000 1 N 10000 telomere no -4 1423146 1478646 15 N 55500 contig no -4 8799203 8818203 83 N 19000 contig no -4 9274642 9324642 88 N 50000 clone yes -4 31820917 31837417 283 N 16500 contig no -4 32834638 32840638 294 N 6000 contig no -4 40296396 40297096 365 N 700 contig no -4 49338941 49488941 445 N 150000 contig no -4 49660117 52660117 447 N 3000000 centromere no -4 59739333 59789333 510 N 50000 contig no -4 75427379 75452279 651 N 24900 contig no -4 191044276 191144276 1657 N 100000 clone no -4 191144276 191154276 1658 N 10000 telomere no -5 0 10000 1 N 10000 telomere no -5 17530657 17580657 171 N 50000 clone yes -5 46405641 49405641 452 N 3000000 centromere no -5 91636128 91686128 873 N 50000 contig no -5 138787073 138837073 1350 N 50000 contig no -5 155138727 155188727 1514 N 50000 contig no -5 180905260 180915260 1775 N 10000 telomere no -6 0 10000 1 N 10000 telomere no -6 10000 60000 2 N 50000 contig no -6 58087659 58137659 620 N 50000 clone yes -6 58780166 58830166 627 N 50000 clone no -6 58830166 61830166 628 N 3000000 centromere no -6 61830166 61880166 629 N 50000 clone no -6 62128589 62178589 633 N 50000 clone yes -6 95680543 95830543 992 N 150000 contig no -6 157559467 157609467 1652 N 50000 clone yes -6 157641300 157691300 1654 N 50000 clone yes -6 167942073 168042073 1761 N 100000 clone yes -6 170279972 170329972 1787 N 50000 clone yes -6 171055067 171105067 1797 N 50000 contig no -6 171105067 171115067 1798 N 10000 telomere no -6_apd_hap1 207807 252060 4 N 44253 clone yes -6_apd_hap1 327074 356458 6 N 29384 clone yes -6_apd_hap1 448574 536172 9 N 87598 clone yes -6_apd_hap1 588761 692953 11 N 104192 clone yes -6_apd_hap1 944435 963558 16 N 19123 clone yes -6_apd_hap1 970040 1029417 18 N 59377 clone yes -6_apd_hap1 1173934 1198528 23 N 24594 clone yes -6_apd_hap1 1307951 1317697 26 N 9746 clone yes -6_apd_hap1 1341208 1344139 28 N 2931 clone yes -6_apd_hap1 1451581 1556172 31 N 104591 clone yes -6_apd_hap1 1582549 1598989 33 N 16440 clone yes -6_apd_hap1 1736596 1786349 37 N 49753 clone yes -6_apd_hap1 1833811 1937682 39 N 103871 clone yes -6_apd_hap1 2009922 2164480 41 N 154558 clone yes -6_apd_hap1 2169002 2254169 43 N 85167 clone yes -6_apd_hap1 2359974 2761234 48 N 401260 clone yes -6_apd_hap1 2821679 2858431 51 N 36752 clone yes -6_apd_hap1 2894657 3037482 54 N 142825 clone yes -6_apd_hap1 3122973 3136480 56 N 13507 clone yes -6_apd_hap1 3180323 3355295 59 N 174972 clone yes -6_apd_hap1 3370179 3415446 62 N 45267 clone yes -6_apd_hap1 3491066 3594101 64 N 103035 clone yes -6_apd_hap1 3638667 3642320 66 N 3653 clone yes -6_apd_hap1 3696623 3736194 68 N 39571 clone yes -6_apd_hap1 3762870 3766192 70 N 3322 clone yes -6_apd_hap1 3790366 3970602 72 N 180236 clone yes -6_apd_hap1 4220467 4274176 76 N 53709 clone yes -6_apd_hap1 4390029 4597885 79 N 207856 clone yes -6_dbb_hap3 138055 245960 4 N 107905 clone yes -6_dbb_hap3 622457 640380 11 N 17923 clone yes -6_dbb_hap3 1337861 1344119 21 N 6258 clone yes -6_dbb_hap3 1836690 1837200 29 N 510 clone yes -6_dbb_hap3 2117295 2119813 33 N 2518 clone yes -6_dbb_hap3 2606257 2723615 42 N 117358 clone yes -6_dbb_hap3 3407000 3481304 52 N 74304 clone yes -6_dbb_hap3 3736386 3772614 59 N 36228 clone yes -6_dbb_hap3 4206542 4226964 66 N 20422 clone yes -6_dbb_hap3 4354126 4376794 69 N 22668 clone yes -6_mann_hap4 145852 184243 3 N 38391 clone yes -6_mann_hap4 1388812 1438812 19 N 50000 clone yes -6_mann_hap4 2047420 2056121 28 N 8701 clone yes -6_mann_hap4 2252213 2277026 32 N 24813 clone yes -6_mann_hap4 2312291 2324610 34 N 12319 clone yes -6_mann_hap4 2517052 2540962 37 N 23910 clone yes -6_mann_hap4 2772006 2810953 42 N 38947 clone yes -6_mann_hap4 2921096 2929556 45 N 8460 clone yes -6_mann_hap4 3062107 3151453 47 N 89346 clone yes -6_mann_hap4 3174271 3227203 49 N 52932 clone yes -6_mann_hap4 3261361 3374492 52 N 113131 clone yes -6_mann_hap4 3412258 3450867 54 N 38609 clone yes -6_mann_hap4 3553455 3557741 56 N 4286 clone yes -6_mann_hap4 3895092 3926341 62 N 31249 clone yes -6_mann_hap4 4236950 4245095 68 N 8145 clone yes -6_mann_hap4 4441658 4480941 73 N 39283 clone yes -6_mcf_hap5 135141 138043 3 N 2902 clone yes -6_mcf_hap5 179429 186757 5 N 7328 clone yes -6_mcf_hap5 229133 282732 7 N 53599 clone yes -6_mcf_hap5 383988 404415 10 N 20427 clone yes -6_mcf_hap5 491590 560421 12 N 68831 clone yes -6_mcf_hap5 676599 685588 15 N 8989 clone yes -6_mcf_hap5 994250 1009296 20 N 15046 clone yes -6_mcf_hap5 1177759 1292220 23 N 114461 clone yes -6_mcf_hap5 1562952 1726699 28 N 163747 clone yes -6_mcf_hap5 1815769 1838106 30 N 22337 clone yes -6_mcf_hap5 2091199 2189158 37 N 97959 clone yes -6_mcf_hap5 2241824 2251998 39 N 10174 clone yes -6_mcf_hap5 2300850 2305647 41 N 4797 clone yes -6_mcf_hap5 2712451 2811903 49 N 99452 clone yes -6_mcf_hap5 2947576 2958999 52 N 11423 clone yes -6_mcf_hap5 3154498 3188719 55 N 34221 clone yes -6_mcf_hap5 3329672 3355524 58 N 25852 clone yes -6_mcf_hap5 3602941 3659279 65 N 56338 clone yes -6_mcf_hap5 3899294 3929849 69 N 30555 clone yes -6_mcf_hap5 4149579 4149626 73 N 47 clone yes -6_mcf_hap5 4284353 4302274 76 N 17921 clone yes -6_mcf_hap5 4422172 4594253 78 N 172081 clone yes -6_qbl_hap6 521900 681210 9 N 159310 clone yes -6_qbl_hap6 2680914 2732064 37 N 51150 clone yes -6_qbl_hap6 3023176 3049607 42 N 26431 clone yes -6_qbl_hap6 3316246 3369039 46 N 52793 clone yes -6_qbl_hap6 3993031 4020006 57 N 26975 clone yes -6_ssto_hap7 861800 941242 15 N 79442 clone yes -6_ssto_hap7 1727717 1744408 28 N 16691 clone yes -6_ssto_hap7 1836632 1838892 34 N 2260 clone yes -6_ssto_hap7 1983561 1997824 37 N 14263 clone yes -6_ssto_hap7 2052735 2087415 39 N 34680 clone yes -6_ssto_hap7 2101474 2113875 41 N 12401 clone yes -6_ssto_hap7 2138491 2303018 43 N 164527 clone yes -6_ssto_hap7 2411247 2461193 48 N 49946 clone yes -6_ssto_hap7 3040184 3054966 60 N 14782 clone yes -6_ssto_hap7 3093568 3154699 62 N 61131 clone yes -6_ssto_hap7 3191196 3214366 64 N 23170 clone yes -6_ssto_hap7 3368931 3445987 68 N 77056 clone yes -6_ssto_hap7 4370564 4420564 83 N 50000 clone yes -6_ssto_hap7 4580410 4610218 86 N 29808 clone yes -6_ssto_hap7 4639806 4661556 88 N 21750 clone yes -6_ssto_hap7 4723509 4774367 90 N 50858 clone yes -6_ssto_hap7 4783798 4836049 92 N 52251 clone yes -7 0 10000 1 N 10000 telomere no -7 232484 282484 5 N 50000 clone yes -7 50370631 50410631 488 N 40000 contig no -7 58054331 61054331 564 N 3000000 centromere no -7 61310513 61360513 569 N 50000 heterochromatin no -7 61460465 61510465 571 N 50000 clone yes -7 61677020 61727020 573 N 50000 clone yes -7 61917157 61967157 575 N 50000 heterochromatin no -7 74715724 74765724 693 N 50000 clone yes -7 100556043 100606043 952 N 50000 clone yes -7 130154523 130254523 1278 N 100000 clone yes -7 139379377 139404377 1368 N 25000 contig no -7 142048195 142098195 1397 N 50000 clone yes -7 142276197 142326197 1399 N 50000 clone yes -7 143347897 143397897 1409 N 50000 clone yes -7 154270634 154370634 1517 N 100000 contig no -7 159128663 159138663 1563 N 10000 telomere no -8 0 10000 1 N 10000 telomere no -8 7474649 7524649 66 N 50000 contig no -8 12091854 12141854 105 N 50000 contig no -8 43838887 46838887 376 N 3000000 centromere no -8 48130499 48135599 389 N 5100 contig no -8 86576451 86726451 716 N 150000 contig no -8 142766515 142816515 1177 N 50000 clone yes -8 145332588 145432588 1209 N 100000 contig no -8 146304022 146354022 1216 N 50000 contig no -8 146354022 146364022 1217 N 10000 telomere no -9 0 10000 1 N 10000 telomere no -9 39663686 39713686 343 N 50000 clone yes -9 39974796 40024796 346 N 50000 contig no -9 40233029 40283029 348 N 50000 clone yes -9 40425834 40475834 350 N 50000 contig no -9 40940341 40990341 354 N 50000 contig no -9 41143214 41193214 357 N 50000 clone yes -9 41365793 41415793 359 N 50000 contig no -9 42613955 42663955 370 N 50000 contig no -9 43213698 43313698 375 N 100000 contig no -9 43946569 43996569 381 N 50000 clone yes -9 44676646 44726646 386 N 50000 clone yes -9 44908293 44958293 388 N 50000 clone yes -9 45250203 45350203 391 N 100000 contig no -9 45815521 45865521 397 N 50000 contig no -9 46216430 46266430 400 N 50000 clone yes -9 46461039 46561039 403 N 100000 contig no -9 47060133 47160133 408 N 100000 contig no -9 47317679 47367679 410 N 50000 clone no -9 47367679 50367679 411 N 3000000 centromere no -9 50367679 65367679 412 N 15000000 heterochromatin no -9 65367679 65467679 413 N 100000 contig no -9 65918360 65968360 418 N 50000 contig no -9 66192215 66242215 421 N 50000 clone yes -9 66404656 66454656 423 N 50000 clone yes -9 66614195 66664195 425 N 50000 clone yes -9 66863343 66913343 428 N 50000 clone yes -9 67107834 67207834 430 N 100000 contig no -9 67366296 67516296 432 N 150000 contig no -9 67987998 68137998 437 N 150000 contig no -9 68514181 68664181 441 N 150000 contig no -9 68838946 68988946 443 N 150000 contig no -9 69278385 69328385 446 N 50000 clone yes -9 70010542 70060542 452 N 50000 clone yes -9 70218729 70318729 454 N 100000 contig no -9 70506535 70556535 456 N 50000 contig no -9 70735468 70835468 458 N 100000 contig no -9 92343416 92443416 641 N 100000 clone yes -9 92528796 92678796 643 N 150000 clone yes -9 133073060 133223060 982 N 150000 contig no -9 137041193 137091193 1020 N 50000 contig no -9 139166997 139216997 1040 N 50000 contig no -9 141153431 141203431 1057 N 50000 contig no -9 141203431 141213431 1058 N 10000 telomere no -X 0 10000 1 N 10000 telomere no -X 10000 60000 2 N 50000 contig no -X 94821 144821 4 N 50000 contig no -X 231384 281384 8 N 50000 contig no -X 1047557 1097557 16 N 50000 contig no -X 1134113 1184113 18 N 50000 contig no -X 1264234 1314234 21 N 50000 contig no -X 2068238 2118238 31 N 50000 contig no -X 7623882 7673882 82 N 50000 clone yes -X 10738674 10788674 111 N 50000 clone yes -X 37098256 37148256 355 N 50000 contig no -X 49242997 49292997 484 N 50000 contig no -X 49974173 50024173 494 N 50000 contig no -X 52395914 52445914 518 N 50000 contig no -X 58582012 58632012 582 N 50000 clone no -X 58632012 61632012 583 N 3000000 centromere no -X 61632012 61682012 584 N 50000 clone no -X 76653692 76703692 724 N 50000 contig no -X 113517668 113567668 1183 N 50000 contig no -X 115682290 115732290 1214 N 50000 contig no -X 120013235 120063235 1268 N 50000 clone yes -X 143507324 143557324 1536 N 50000 contig no -X 148906424 148956424 1591 N 50000 clone yes -X 149032062 149082062 1594 N 50000 contig no -X 152277099 152327099 1624 N 50000 clone yes -X 155260560 155270560 1668 N 10000 telomere no -Y 0 10000 1 N 10000 telomere no -Y 44821 94821 3 N 50000 contig no -Y 181384 231384 7 N 50000 contig no -Y 997557 1047557 15 N 50000 contig no -Y 1084113 1134113 17 N 50000 contig no -Y 1214234 1264234 20 N 50000 contig no -Y 2018238 2068238 30 N 50000 contig no -Y 8914955 8964955 92 N 50000 contig no -Y 9241322 9291322 97 N 50000 contig no -Y 10104553 13104553 105 N 3000000 centromere no -Y 13143954 13193954 107 N 50000 contig no -Y 13748578 13798578 112 N 50000 contig no -Y 20143885 20193885 169 N 50000 clone yes -Y 22369679 22419679 190 N 50000 clone yes -Y 23901428 23951428 204 N 50000 contig no -Y 28819361 58819361 249 N 30000000 heterochromatin no -Y 58917656 58967656 251 N 50000 contig no -Y 59363566 59373566 255 N 10000 telomere no diff --git a/snakecnv/lib/svreader/resources/hg19.gaps.bed b/snakecnv/lib/svreader/resources/hg19.gaps.bed deleted file mode 100644 index d516325..0000000 --- a/snakecnv/lib/svreader/resources/hg19.gaps.bed +++ /dev/null @@ -1,457 +0,0 @@ -chr1 0 10000 1 N 10000 telomere no -chr1 177417 227417 4 N 50000 clone yes -chr1 267719 317719 6 N 50000 contig no -chr1 471368 521368 8 N 50000 contig no -chr1 2634220 2684220 32 N 50000 clone yes -chr1 3845268 3995268 47 N 150000 contig no -chr1 13052998 13102998 151 N 50000 clone yes -chr1 13219912 13319912 154 N 100000 contig no -chr1 13557162 13607162 157 N 50000 clone yes -chr1 17125658 17175658 196 N 50000 clone yes -chr1 29878082 30028082 337 N 150000 contig no -chr1 103863906 103913906 1088 N 50000 clone yes -chr1 120697156 120747156 1263 N 50000 clone yes -chr1 120936695 121086695 1265 N 150000 contig no -chr1 121485434 121535434 1269 N 50000 clone no -chr1 121535434 124535434 1270 N 3000000 centromere no -chr1 124535434 142535434 1271 N 18000000 heterochromatin no -chr1 142731022 142781022 1273 N 50000 clone yes -chr1 142967761 143117761 1275 N 150000 contig no -chr1 143292816 143342816 1277 N 50000 clone yes -chr1 143544525 143644525 1279 N 100000 contig no -chr1 143771002 143871002 1281 N 100000 contig no -chr1 144095783 144145783 1285 N 50000 contig no -chr1 144224481 144274481 1287 N 50000 contig no -chr1 144401744 144451744 1289 N 50000 clone yes -chr1 144622413 144672413 1291 N 50000 contig no -chr1 144710724 144810724 1293 N 100000 clone yes -chr1 145833118 145883118 1307 N 50000 clone yes -chr1 146164650 146214650 1310 N 50000 clone yes -chr1 146253299 146303299 1312 N 50000 clone yes -chr1 148026038 148176038 1328 N 150000 contig no -chr1 148361358 148511358 1331 N 150000 contig no -chr1 148684147 148734147 1333 N 50000 clone yes -chr1 148954460 149004460 1337 N 50000 clone yes -chr1 149459645 149509645 1342 N 50000 clone yes -chr1 205922707 206072707 1875 N 150000 contig no -chr1 206332221 206482221 1878 N 150000 contig no -chr1 223747846 223797846 2038 N 50000 clone yes -chr1 235192211 235242211 2159 N 50000 clone yes -chr1 248908210 249058210 2293 N 150000 contig no -chr1 249240621 249250621 2302 N 10000 telomere no -chr10 0 10000 1 N 10000 telomere no -chr10 10000 60000 2 N 50000 contig no -chr10 17974675 18024675 161 N 50000 clone yes -chr10 38818835 38868835 337 N 50000 clone yes -chr10 39154935 39254935 340 N 100000 contig no -chr10 39254935 42254935 341 N 3000000 centromere no -chr10 42254935 42354935 342 N 100000 contig no -chr10 42546687 42596687 344 N 50000 clone yes -chr10 46426964 46476964 375 N 50000 contig no -chr10 47429169 47529169 387 N 100000 contig no -chr10 47792476 47892476 390 N 100000 contig no -chr10 48055707 48105707 392 N 50000 contig no -chr10 49095536 49195536 403 N 100000 contig no -chr10 51137410 51187410 419 N 50000 clone yes -chr10 51398845 51448845 421 N 50000 clone yes -chr10 125869472 125919472 1061 N 50000 clone yes -chr10 128616069 128766069 1088 N 150000 contig no -chr10 133381404 133431404 1129 N 50000 clone yes -chr10 133677527 133727527 1135 N 50000 clone yes -chr10 135524747 135534747 1156 N 10000 telomere no -chr11 0 10000 1 N 10000 telomere no -chr11 10000 60000 2 N 50000 contig no -chr11 1162759 1212759 14 N 50000 clone yes -chr11 50783853 50833853 439 N 50000 clone no -chr11 50833853 51040853 440 N 207000 heterochromatin no -chr11 51040853 51090853 441 N 50000 clone no -chr11 51594205 51644205 446 N 50000 clone no -chr11 51644205 54644205 447 N 3000000 centromere no -chr11 54644205 54694205 448 N 50000 clone no -chr11 69089801 69139801 575 N 50000 clone yes -chr11 69724695 69774695 583 N 50000 clone yes -chr11 87688378 87738378 736 N 50000 clone yes -chr11 96287584 96437584 808 N 150000 contig no -chr11 134946516 134996516 1134 N 50000 contig no -chr11 134996516 135006516 1135 N 10000 telomere no -chr12 0 10000 1 N 10000 telomere no -chr12 10000 60000 2 N 50000 contig no -chr12 95739 145739 4 N 50000 clone yes -chr12 7189876 7239876 66 N 50000 contig no -chr12 34856694 37856694 304 N 3000000 centromere no -chr12 109373470 109423470 954 N 50000 contig no -chr12 122530623 122580623 1075 N 50000 contig no -chr12 132706992 132806992 1171 N 100000 contig no -chr12 133841895 133851895 1183 N 10000 telomere no -chr13 0 10000 1 N 10000 telomere no -chr13 10000 16000000 2 N 15990000 short_arm no -chr13 16000000 19000000 3 N 3000000 centromere no -chr13 19000000 19020000 4 N 20000 heterochromatin no -chr13 86760324 86910324 617 N 150000 contig no -chr13 112353994 112503994 848 N 150000 contig no -chr13 114325993 114425993 866 N 100000 contig no -chr13 114639948 114739948 870 N 100000 contig no -chr13 115109878 115159878 875 N 50000 contig no -chr13 115159878 115169878 876 N 10000 telomere no -chr14 0 10000 1 N 10000 telomere no -chr14 10000 16000000 2 N 15990000 short_arm no -chr14 16000000 19000000 3 N 3000000 centromere no -chr14 107289540 107339540 667 N 50000 contig no -chr14 107339540 107349540 668 N 10000 telomere no -chr15 0 10000 1 N 10000 telomere no -chr15 10000 17000000 2 N 16990000 short_arm no -chr15 17000000 20000000 3 N 3000000 centromere no -chr15 20894633 20935075 12 N 40442 clone yes -chr15 21398819 21885000 16 N 486181 clone yes -chr15 22212114 22262114 19 N 50000 contig no -chr15 22596193 22646193 23 N 50000 contig no -chr15 23514853 23564853 31 N 50000 contig no -chr15 29159443 29209443 83 N 50000 contig no -chr15 82829645 82879645 518 N 50000 contig no -chr15 84984473 85034473 539 N 50000 contig no -chr15 102521392 102531392 690 N 10000 telomere no -chr16 0 10000 1 N 10000 telomere no -chr16 10000 60000 2 N 50000 contig no -chr16 8636921 8686921 128 N 50000 clone yes -chr16 34023150 34173150 343 N 150000 contig no -chr16 35285801 35335801 353 N 50000 clone no -chr16 35335801 38335801 354 N 3000000 centromere no -chr16 38335801 46335801 355 N 8000000 heterochromatin no -chr16 46335801 46385801 356 N 50000 clone no -chr16 88389383 88439383 712 N 50000 contig no -chr16 90294753 90344753 731 N 50000 contig no -chr16 90344753 90354753 732 N 10000 telomere no -chr17 296626 396626 4 N 100000 contig no -chr17 21566608 21666608 185 N 100000 contig no -chr17 22263006 25263006 192 N 3000000 centromere no -chr17 34675848 34725848 278 N 50000 contig no -chr17 62410760 62460760 523 N 50000 clone yes -chr17 77546461 77596461 644 N 50000 clone yes -chr17 79709049 79759049 665 N 50000 contig no -chr17_ctg5_hap1 1256794 1306794 12 N 50000 clone yes -chr17_ctg5_hap1 1588968 1638968 15 N 50000 clone yes -chr18 0 10000 1 N 10000 telomere no -chr18 15410898 15460898 124 N 50000 clone no -chr18 15460898 18460898 125 N 3000000 centromere no -chr18 18460898 18510898 126 N 50000 clone no -chr18 52059136 52209136 398 N 150000 contig no -chr18 72283353 72333353 555 N 50000 clone yes -chr18 75721820 75771820 585 N 50000 clone yes -chr18 78017248 78067248 604 N 50000 contig no -chr18 78067248 78077248 605 N 10000 telomere no -chr19 0 10000 1 N 10000 telomere no -chr19 10000 60000 2 N 50000 contig no -chr19 7346004 7396004 170 N 50000 contig no -chr19 8687198 8737198 190 N 50000 contig no -chr19 20523415 20573415 367 N 50000 clone yes -chr19 24631782 24681782 409 N 50000 heterochromatin no -chr19 24681782 27681782 410 N 3000000 centromere no -chr19 27681782 27731782 411 N 50000 heterochromatin no -chr19 59118983 59128983 870 N 10000 telomere no -chr2 0 10000 1 N 10000 telomere no -chr2 3529312 3579312 38 N 50000 contig no -chr2 5018788 5118788 52 N 100000 contig no -chr2 16279724 16329724 149 N 50000 contig no -chr2 21153113 21178113 192 N 25000 contig no -chr2 87668206 87718206 739 N 50000 clone yes -chr2 89630436 89830436 755 N 200000 contig no -chr2 90321525 90371525 760 N 50000 clone yes -chr2 90545103 91545103 762 N 1000000 heterochromatin no -chr2 91545103 91595103 763 N 50000 clone no -chr2 92326171 95326171 770 N 3000000 centromere no -chr2 110109337 110251337 893 N 142000 contig no -chr2 149690582 149790582 1221 N 100000 contig no -chr2 234003741 234053741 1942 N 50000 contig no -chr2 239801978 239831978 1992 N 30000 contig no -chr2 240784132 240809132 2001 N 25000 contig no -chr2 243102476 243152476 2025 N 50000 clone yes -chr2 243189373 243199373 2027 N 10000 telomere no -chr20 0 10000 1 N 10000 telomere no -chr20 10000 60000 2 N 50000 contig no -chr20 26319569 26369569 274 N 50000 clone no -chr20 26369569 29369569 275 N 3000000 centromere no -chr20 29369569 29419569 276 N 50000 clone no -chr20 29653908 29803908 279 N 150000 contig no -chr20 34897085 34947085 336 N 50000 clone yes -chr20 61091437 61141437 624 N 50000 clone yes -chr20 61213369 61263369 628 N 50000 contig no -chr20 62965520 63015520 648 N 50000 contig no -chr20 63015520 63025520 649 N 10000 telomere no -chr21 0 10000 1 N 10000 telomere no -chr21 10000 5211193 2 N 5201193 short_arm no -chr21 5211193 9411193 3 N 4200000 contig no -chr21 9595548 9645548 5 N 50000 contig no -chr21 9775437 9825437 7 N 50000 contig no -chr21 10034920 10084920 10 N 50000 contig no -chr21 10215976 10365976 12 N 150000 contig no -chr21 10647896 10697896 15 N 50000 contig no -chr21 11188129 11238129 20 N 50000 clone no -chr21 11238129 11288129 21 N 50000 heterochromatin no -chr21 11288129 14288129 22 N 3000000 centromere no -chr21 14288129 14338129 23 N 50000 clone no -chr21 42955559 43005559 419 N 50000 contig no -chr21 44632664 44682664 447 N 50000 clone yes -chr21 48119895 48129895 515 N 10000 telomere no -chr22 0 10000 1 N 10000 telomere no -chr22 10000 13000000 2 N 12990000 short_arm no -chr22 13000000 16000000 3 N 3000000 centromere no -chr22 16000000 16050000 4 N 50000 contig no -chr22 16697850 16847850 27 N 150000 contig no -chr22 20509431 20609431 81 N 100000 contig no -chr22 50364777 50414777 563 N 50000 contig no -chr22 51244566 51294566 591 N 50000 contig no -chr22 51294566 51304566 592 N 10000 telomere no -chr3 0 10000 1 N 10000 telomere no -chr3 10000 60000 2 N 50000 clone no -chr3 66170270 66270270 565 N 100000 contig no -chr3 90504854 93504854 784 N 3000000 centromere no -chr3 194041961 194047251 1693 N 5290 contig no -chr3 197962430 198012430 1732 N 50000 contig no -chr3 198012430 198022430 1733 N 10000 telomere no -chr4 0 10000 1 N 10000 telomere no -chr4 1423146 1478646 15 N 55500 contig no -chr4 8799203 8818203 83 N 19000 contig no -chr4 9274642 9324642 88 N 50000 clone yes -chr4 31820917 31837417 283 N 16500 contig no -chr4 32834638 32840638 294 N 6000 contig no -chr4 40296396 40297096 365 N 700 contig no -chr4 49338941 49488941 445 N 150000 contig no -chr4 49660117 52660117 447 N 3000000 centromere no -chr4 59739333 59789333 510 N 50000 contig no -chr4 75427379 75452279 651 N 24900 contig no -chr4 191044276 191144276 1657 N 100000 clone no -chr4 191144276 191154276 1658 N 10000 telomere no -chr5 0 10000 1 N 10000 telomere no -chr5 17530657 17580657 171 N 50000 clone yes -chr5 46405641 49405641 452 N 3000000 centromere no -chr5 91636128 91686128 873 N 50000 contig no -chr5 138787073 138837073 1350 N 50000 contig no -chr5 155138727 155188727 1514 N 50000 contig no -chr5 180905260 180915260 1775 N 10000 telomere no -chr6 0 10000 1 N 10000 telomere no -chr6 10000 60000 2 N 50000 contig no -chr6 58087659 58137659 620 N 50000 clone yes -chr6 58780166 58830166 627 N 50000 clone no -chr6 58830166 61830166 628 N 3000000 centromere no -chr6 61830166 61880166 629 N 50000 clone no -chr6 62128589 62178589 633 N 50000 clone yes -chr6 95680543 95830543 992 N 150000 contig no -chr6 157559467 157609467 1652 N 50000 clone yes -chr6 157641300 157691300 1654 N 50000 clone yes -chr6 167942073 168042073 1761 N 100000 clone yes -chr6 170279972 170329972 1787 N 50000 clone yes -chr6 171055067 171105067 1797 N 50000 contig no -chr6 171105067 171115067 1798 N 10000 telomere no -chr6_apd_hap1 207807 252060 4 N 44253 clone yes -chr6_apd_hap1 327074 356458 6 N 29384 clone yes -chr6_apd_hap1 448574 536172 9 N 87598 clone yes -chr6_apd_hap1 588761 692953 11 N 104192 clone yes -chr6_apd_hap1 944435 963558 16 N 19123 clone yes -chr6_apd_hap1 970040 1029417 18 N 59377 clone yes -chr6_apd_hap1 1173934 1198528 23 N 24594 clone yes -chr6_apd_hap1 1307951 1317697 26 N 9746 clone yes -chr6_apd_hap1 1341208 1344139 28 N 2931 clone yes -chr6_apd_hap1 1451581 1556172 31 N 104591 clone yes -chr6_apd_hap1 1582549 1598989 33 N 16440 clone yes -chr6_apd_hap1 1736596 1786349 37 N 49753 clone yes -chr6_apd_hap1 1833811 1937682 39 N 103871 clone yes -chr6_apd_hap1 2009922 2164480 41 N 154558 clone yes -chr6_apd_hap1 2169002 2254169 43 N 85167 clone yes -chr6_apd_hap1 2359974 2761234 48 N 401260 clone yes -chr6_apd_hap1 2821679 2858431 51 N 36752 clone yes -chr6_apd_hap1 2894657 3037482 54 N 142825 clone yes -chr6_apd_hap1 3122973 3136480 56 N 13507 clone yes -chr6_apd_hap1 3180323 3355295 59 N 174972 clone yes -chr6_apd_hap1 3370179 3415446 62 N 45267 clone yes -chr6_apd_hap1 3491066 3594101 64 N 103035 clone yes -chr6_apd_hap1 3638667 3642320 66 N 3653 clone yes -chr6_apd_hap1 3696623 3736194 68 N 39571 clone yes -chr6_apd_hap1 3762870 3766192 70 N 3322 clone yes -chr6_apd_hap1 3790366 3970602 72 N 180236 clone yes -chr6_apd_hap1 4220467 4274176 76 N 53709 clone yes -chr6_apd_hap1 4390029 4597885 79 N 207856 clone yes -chr6_dbb_hap3 138055 245960 4 N 107905 clone yes -chr6_dbb_hap3 622457 640380 11 N 17923 clone yes -chr6_dbb_hap3 1337861 1344119 21 N 6258 clone yes -chr6_dbb_hap3 1836690 1837200 29 N 510 clone yes -chr6_dbb_hap3 2117295 2119813 33 N 2518 clone yes -chr6_dbb_hap3 2606257 2723615 42 N 117358 clone yes -chr6_dbb_hap3 3407000 3481304 52 N 74304 clone yes -chr6_dbb_hap3 3736386 3772614 59 N 36228 clone yes -chr6_dbb_hap3 4206542 4226964 66 N 20422 clone yes -chr6_dbb_hap3 4354126 4376794 69 N 22668 clone yes -chr6_mann_hap4 145852 184243 3 N 38391 clone yes -chr6_mann_hap4 1388812 1438812 19 N 50000 clone yes -chr6_mann_hap4 2047420 2056121 28 N 8701 clone yes -chr6_mann_hap4 2252213 2277026 32 N 24813 clone yes -chr6_mann_hap4 2312291 2324610 34 N 12319 clone yes -chr6_mann_hap4 2517052 2540962 37 N 23910 clone yes -chr6_mann_hap4 2772006 2810953 42 N 38947 clone yes -chr6_mann_hap4 2921096 2929556 45 N 8460 clone yes -chr6_mann_hap4 3062107 3151453 47 N 89346 clone yes -chr6_mann_hap4 3174271 3227203 49 N 52932 clone yes -chr6_mann_hap4 3261361 3374492 52 N 113131 clone yes -chr6_mann_hap4 3412258 3450867 54 N 38609 clone yes -chr6_mann_hap4 3553455 3557741 56 N 4286 clone yes -chr6_mann_hap4 3895092 3926341 62 N 31249 clone yes -chr6_mann_hap4 4236950 4245095 68 N 8145 clone yes -chr6_mann_hap4 4441658 4480941 73 N 39283 clone yes -chr6_mcf_hap5 135141 138043 3 N 2902 clone yes -chr6_mcf_hap5 179429 186757 5 N 7328 clone yes -chr6_mcf_hap5 229133 282732 7 N 53599 clone yes -chr6_mcf_hap5 383988 404415 10 N 20427 clone yes -chr6_mcf_hap5 491590 560421 12 N 68831 clone yes -chr6_mcf_hap5 676599 685588 15 N 8989 clone yes -chr6_mcf_hap5 994250 1009296 20 N 15046 clone yes -chr6_mcf_hap5 1177759 1292220 23 N 114461 clone yes -chr6_mcf_hap5 1562952 1726699 28 N 163747 clone yes -chr6_mcf_hap5 1815769 1838106 30 N 22337 clone yes -chr6_mcf_hap5 2091199 2189158 37 N 97959 clone yes -chr6_mcf_hap5 2241824 2251998 39 N 10174 clone yes -chr6_mcf_hap5 2300850 2305647 41 N 4797 clone yes -chr6_mcf_hap5 2712451 2811903 49 N 99452 clone yes -chr6_mcf_hap5 2947576 2958999 52 N 11423 clone yes -chr6_mcf_hap5 3154498 3188719 55 N 34221 clone yes -chr6_mcf_hap5 3329672 3355524 58 N 25852 clone yes -chr6_mcf_hap5 3602941 3659279 65 N 56338 clone yes -chr6_mcf_hap5 3899294 3929849 69 N 30555 clone yes -chr6_mcf_hap5 4149579 4149626 73 N 47 clone yes -chr6_mcf_hap5 4284353 4302274 76 N 17921 clone yes -chr6_mcf_hap5 4422172 4594253 78 N 172081 clone yes -chr6_qbl_hap6 521900 681210 9 N 159310 clone yes -chr6_qbl_hap6 2680914 2732064 37 N 51150 clone yes -chr6_qbl_hap6 3023176 3049607 42 N 26431 clone yes -chr6_qbl_hap6 3316246 3369039 46 N 52793 clone yes -chr6_qbl_hap6 3993031 4020006 57 N 26975 clone yes -chr6_ssto_hap7 861800 941242 15 N 79442 clone yes -chr6_ssto_hap7 1727717 1744408 28 N 16691 clone yes -chr6_ssto_hap7 1836632 1838892 34 N 2260 clone yes -chr6_ssto_hap7 1983561 1997824 37 N 14263 clone yes -chr6_ssto_hap7 2052735 2087415 39 N 34680 clone yes -chr6_ssto_hap7 2101474 2113875 41 N 12401 clone yes -chr6_ssto_hap7 2138491 2303018 43 N 164527 clone yes -chr6_ssto_hap7 2411247 2461193 48 N 49946 clone yes -chr6_ssto_hap7 3040184 3054966 60 N 14782 clone yes -chr6_ssto_hap7 3093568 3154699 62 N 61131 clone yes -chr6_ssto_hap7 3191196 3214366 64 N 23170 clone yes -chr6_ssto_hap7 3368931 3445987 68 N 77056 clone yes -chr6_ssto_hap7 4370564 4420564 83 N 50000 clone yes -chr6_ssto_hap7 4580410 4610218 86 N 29808 clone yes -chr6_ssto_hap7 4639806 4661556 88 N 21750 clone yes -chr6_ssto_hap7 4723509 4774367 90 N 50858 clone yes -chr6_ssto_hap7 4783798 4836049 92 N 52251 clone yes -chr7 0 10000 1 N 10000 telomere no -chr7 232484 282484 5 N 50000 clone yes -chr7 50370631 50410631 488 N 40000 contig no -chr7 58054331 61054331 564 N 3000000 centromere no -chr7 61310513 61360513 569 N 50000 heterochromatin no -chr7 61460465 61510465 571 N 50000 clone yes -chr7 61677020 61727020 573 N 50000 clone yes -chr7 61917157 61967157 575 N 50000 heterochromatin no -chr7 74715724 74765724 693 N 50000 clone yes -chr7 100556043 100606043 952 N 50000 clone yes -chr7 130154523 130254523 1278 N 100000 clone yes -chr7 139379377 139404377 1368 N 25000 contig no -chr7 142048195 142098195 1397 N 50000 clone yes -chr7 142276197 142326197 1399 N 50000 clone yes -chr7 143347897 143397897 1409 N 50000 clone yes -chr7 154270634 154370634 1517 N 100000 contig no -chr7 159128663 159138663 1563 N 10000 telomere no -chr8 0 10000 1 N 10000 telomere no -chr8 7474649 7524649 66 N 50000 contig no -chr8 12091854 12141854 105 N 50000 contig no -chr8 43838887 46838887 376 N 3000000 centromere no -chr8 48130499 48135599 389 N 5100 contig no -chr8 86576451 86726451 716 N 150000 contig no -chr8 142766515 142816515 1177 N 50000 clone yes -chr8 145332588 145432588 1209 N 100000 contig no -chr8 146304022 146354022 1216 N 50000 contig no -chr8 146354022 146364022 1217 N 10000 telomere no -chr9 0 10000 1 N 10000 telomere no -chr9 39663686 39713686 343 N 50000 clone yes -chr9 39974796 40024796 346 N 50000 contig no -chr9 40233029 40283029 348 N 50000 clone yes -chr9 40425834 40475834 350 N 50000 contig no -chr9 40940341 40990341 354 N 50000 contig no -chr9 41143214 41193214 357 N 50000 clone yes -chr9 41365793 41415793 359 N 50000 contig no -chr9 42613955 42663955 370 N 50000 contig no -chr9 43213698 43313698 375 N 100000 contig no -chr9 43946569 43996569 381 N 50000 clone yes -chr9 44676646 44726646 386 N 50000 clone yes -chr9 44908293 44958293 388 N 50000 clone yes -chr9 45250203 45350203 391 N 100000 contig no -chr9 45815521 45865521 397 N 50000 contig no -chr9 46216430 46266430 400 N 50000 clone yes -chr9 46461039 46561039 403 N 100000 contig no -chr9 47060133 47160133 408 N 100000 contig no -chr9 47317679 47367679 410 N 50000 clone no -chr9 47367679 50367679 411 N 3000000 centromere no -chr9 50367679 65367679 412 N 15000000 heterochromatin no -chr9 65367679 65467679 413 N 100000 contig no -chr9 65918360 65968360 418 N 50000 contig no -chr9 66192215 66242215 421 N 50000 clone yes -chr9 66404656 66454656 423 N 50000 clone yes -chr9 66614195 66664195 425 N 50000 clone yes -chr9 66863343 66913343 428 N 50000 clone yes -chr9 67107834 67207834 430 N 100000 contig no -chr9 67366296 67516296 432 N 150000 contig no -chr9 67987998 68137998 437 N 150000 contig no -chr9 68514181 68664181 441 N 150000 contig no -chr9 68838946 68988946 443 N 150000 contig no -chr9 69278385 69328385 446 N 50000 clone yes -chr9 70010542 70060542 452 N 50000 clone yes -chr9 70218729 70318729 454 N 100000 contig no -chr9 70506535 70556535 456 N 50000 contig no -chr9 70735468 70835468 458 N 100000 contig no -chr9 92343416 92443416 641 N 100000 clone yes -chr9 92528796 92678796 643 N 150000 clone yes -chr9 133073060 133223060 982 N 150000 contig no -chr9 137041193 137091193 1020 N 50000 contig no -chr9 139166997 139216997 1040 N 50000 contig no -chr9 141153431 141203431 1057 N 50000 contig no -chr9 141203431 141213431 1058 N 10000 telomere no -chrX 0 10000 1 N 10000 telomere no -chrX 10000 60000 2 N 50000 contig no -chrX 94821 144821 4 N 50000 contig no -chrX 231384 281384 8 N 50000 contig no -chrX 1047557 1097557 16 N 50000 contig no -chrX 1134113 1184113 18 N 50000 contig no -chrX 1264234 1314234 21 N 50000 contig no -chrX 2068238 2118238 31 N 50000 contig no -chrX 7623882 7673882 82 N 50000 clone yes -chrX 10738674 10788674 111 N 50000 clone yes -chrX 37098256 37148256 355 N 50000 contig no -chrX 49242997 49292997 484 N 50000 contig no -chrX 49974173 50024173 494 N 50000 contig no -chrX 52395914 52445914 518 N 50000 contig no -chrX 58582012 58632012 582 N 50000 clone no -chrX 58632012 61632012 583 N 3000000 centromere no -chrX 61632012 61682012 584 N 50000 clone no -chrX 76653692 76703692 724 N 50000 contig no -chrX 113517668 113567668 1183 N 50000 contig no -chrX 115682290 115732290 1214 N 50000 contig no -chrX 120013235 120063235 1268 N 50000 clone yes -chrX 143507324 143557324 1536 N 50000 contig no -chrX 148906424 148956424 1591 N 50000 clone yes -chrX 149032062 149082062 1594 N 50000 contig no -chrX 152277099 152327099 1624 N 50000 clone yes -chrX 155260560 155270560 1668 N 10000 telomere no -chrY 0 10000 1 N 10000 telomere no -chrY 44821 94821 3 N 50000 contig no -chrY 181384 231384 7 N 50000 contig no -chrY 997557 1047557 15 N 50000 contig no -chrY 1084113 1134113 17 N 50000 contig no -chrY 1214234 1264234 20 N 50000 contig no -chrY 2018238 2068238 30 N 50000 contig no -chrY 8914955 8964955 92 N 50000 contig no -chrY 9241322 9291322 97 N 50000 contig no -chrY 10104553 13104553 105 N 3000000 centromere no -chrY 13143954 13193954 107 N 50000 contig no -chrY 13748578 13798578 112 N 50000 contig no -chrY 20143885 20193885 169 N 50000 clone yes -chrY 22369679 22419679 190 N 50000 clone yes -chrY 23901428 23951428 204 N 50000 contig no -chrY 28819361 58819361 249 N 30000000 heterochromatin no -chrY 58917656 58967656 251 N 50000 contig no -chrY 59363566 59373566 255 N 10000 telomere no diff --git a/snakecnv/lib/svreader/resources/template.vcf b/snakecnv/lib/svreader/resources/template.vcf deleted file mode 100644 index 68d8147..0000000 --- a/snakecnv/lib/svreader/resources/template.vcf +++ /dev/null @@ -1,2 +0,0 @@ -##fileformat=VCFv4.2 -#CHROM POS ID REF ALT QUAL FILTER INFO diff --git a/snakecnv/lib/svreader/resources/template_breakdancer.vcf b/snakecnv/lib/svreader/resources/template_breakdancer.vcf deleted file mode 100644 index c097539..0000000 --- a/snakecnv/lib/svreader/resources/template_breakdancer.vcf +++ /dev/null @@ -1,37 +0,0 @@ -##fileformat=VCFv4.1 -##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record"> -##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation"> -##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles"> -##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant"> -##INFO=<ID=NUM_READS,Number=1,Type=Integer,Description="Number of reads supporting event"> -##INFO=<ID=SCORE,Number=1,Type=Integer,Description="Score reported by tool"> -##INFO=<ID=SVMETHOD,Number=.,Type=String,Description="Type of approach used to detect SV: RP (read pair), RD (read depth), SR (split read), JT (junction) or AS (assembly)"> -##INFO=<ID=CHR2,Number=1,Type=String,Description="Chromosome for END coordinate in case of a translocation"> -##INFO=<ID=POS2,Number=1,Type=String,Description="The new position of the translocated or duplicated region. Used of ITX, CTX, and IDP SVs"> -##INFO=<ID=BD_CHR1,Number=1,Type=String,Description="BreakDancer: chr1 field"> -##INFO=<ID=BD_POS1,Number=1,Type=Integer,Description="BreakDancer: pos1 field"> -##INFO=<ID=BD_ORI1,Number=1,Type=String,Description="BreakDancer: orientation1 field"> -##INFO=<ID=BD_CHR2,Number=1,Type=String,Description="BreakDancer: chr2 field"> -##INFO=<ID=BD_POS2,Number=1,Type=Integer,Description="BreakDancer: pos2 field"> -##INFO=<ID=BD_ORI2,Number=1,Type=String,Description="BreakDancer: orientation2 field"> -##INFO=<ID=BD_SCORE,Number=1,Type=Float,Description="BreakDancer: score field"> -##INFO=<ID=BD_SU,Number=1,Type=Integer,Description="BreakDancer: num_Reads field"> -##INFO=<ID=MAX_IND_SU,Number=1,Type=Integer,Description="Maximum individual PE support of the variant"> -##INFO=<ID=VSAMPLES,Number=.,Type=String,Description="Samples with variant alleles"> -##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> -##FORMAT=<ID=DV,Number=1,Type=Integer,Description="supporting read pair for the sample"> -##FILTER=<ID=LowQual,Description="Low Quality"> -##ALT=<ID=DEL,Description="Deletion"> -##ALT=<ID=DEL:ME:ALU,Description="Deletion of ALU element"> -##ALT=<ID=DEL:ME:L1,Description="Deletion of L1 element"> -##ALT=<ID=DUP,Description="Duplication"> -##ALT=<ID=DUP:TANDEM,Description="Tandem Duplication"> -##ALT=<ID=IDP,Description="Interspersed Duplication"> -##ALT=<ID=INS,Description="Insertion of novel sequence"> -##ALT=<ID=INS:ME:ALU,Description="Insertion of ALU element"> -##ALT=<ID=INS:ME:L1,Description="Insertion of L1 element"> -##ALT=<ID=INV,Description="Inversion"> -##ALT=<ID=CNV,Description="Copy number variable region"> -##ALT=<ID=ITX,Description="Intra-chromosomal translocation"> -##ALT=<ID=CTX,Description="Inter-chromosomal translocation"> -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT diff --git a/snakecnv/lib/svreader/resources/template_cnvnator.vcf b/snakecnv/lib/svreader/resources/template_cnvnator.vcf deleted file mode 100644 index 112ce67..0000000 --- a/snakecnv/lib/svreader/resources/template_cnvnator.vcf +++ /dev/null @@ -1,30 +0,0 @@ -##fileformat=VCFv4.1 -##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants"> -##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record"> -##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation"> -##INFO=<ID=SVCNV,Number=0,Type=Flag,Description="Structural variation or copy number variation"> -##INFO=<ID=MEINFO,Number=4,Type=String,Description="Mobile element info of the form NAME,START,END,POLARITY"> -##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles"> -##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant"> -##INFO=<ID=VT,Number=1,Type=String,Description="indicates what type of variant the line represents"> -##INFO=<ID=SVMETHOD,Number=.,Type=String,Description="Type of approach used to detect SV: RP (read pair), RD (read depth), SR (split read), JT (junction) or AS (assembly)"> -##INFO=<ID=CHR2,Number=1,Type=String,Description="Chromosome for END coordinate in case of a translocation"> -##INFO=<ID=POS2,Number=1,Type=String,Description="The new position of the translocated or duplicated region. Used of ITX, CTX, and IDP SVs"> -##INFO=<ID=CN_NORMALIZED_RD,Number=1,Type=Float,Description="CNVnator: Normalized RD"> -##INFO=<ID=CN_EVAL1,Number=1,Type=Float,Description="CNVnator: e-val by t-test"> -##INFO=<ID=CN_EVAL2,Number=1,Type=Float,Description="CNVnator: e-val by Gaussian tail"> -##INFO=<ID=CN_EVAL3,Number=1,Type=Float,Description="CNVnator: e-val by t-test (middle)"> -##INFO=<ID=CN_EVAL4,Number=1,Type=Float,Description="CNVnator: e-val by Gaussian tail (middle)"> -##INFO=<ID=CN_Q0,Number=1,Type=Float,Description="CNVnator: Fraction of reads with 0 mapping quality"> -##INFO=<ID=CN,Number=1,Type=Float,Description="CNVnator: genotype"> -##INFO=<ID=NUM_IND,Number=1,Type=Integer,Description="CNVnator merge: number of individuals merged in this call"> -##INFO=<ID=VSAMPLES,Number=.,Type=String,Description="Samples with variant alleles"> -##FILTER=<ID=LowQual,Description="Low Quality"> -##ALT=<ID=DEL,Description="Deletion"> -##ALT=<ID=DEL:ME:ALU,Description="Deletion of ALU element"> -##ALT=<ID=DEL:ME:L1,Description="Deletion of L1 element"> -##ALT=<ID=DUP,Description="Duplication"> -##ALT=<ID=DUP:TANDEM,Description="Tandem Duplication"> -##ALT=<ID=CNV,Description="Copy number variable region"> -##FORMAT=<ID=CN,Number=A,Type=Float,Description="Copy number"> -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT diff --git a/snakecnv/lib/svreader/resources/template_merge.vcf b/snakecnv/lib/svreader/resources/template_merge.vcf deleted file mode 100644 index 1d3a706..0000000 --- a/snakecnv/lib/svreader/resources/template_merge.vcf +++ /dev/null @@ -1,154 +0,0 @@ -##fileformat=VCFv4.1 -##ALT=<ID=DEL,Description="Deletion"> -##ALT=<ID=DUP,Description="Duplication"> -##ALT=<ID=DUP:TANDEM,Description="Tandem Duplication"> -##ALT=<ID=INV,Description="Inversion"> -##ALT=<ID=CNV,Description="Copy number variable region"> -##FILTER=<ID=COHERENCE,Description="GSCOHPVALUE == \NA\ || GSCOHPVALUE <= 0.01"> -##FILTER=<ID=COVERAGE,Description="GSDEPTHCALLTHRESHOLD == \NA\ || GSDEPTHCALLTHRESHOLD >= 1.0"> -##FILTER=<ID=DEPTH,Description="GSDEPTHRATIO == \NA\ || GSDEPTHRATIO > 0.8 || (GSDEPTHRATIO > 0.63 && (GSMEMBPVALUE == \NA\ || GSMEMBPVALUE >= 0.01))"> -##FILTER=<ID=DEPTHPVAL,Description="GSDEPTHPVALUE == \NA\ || GSDEPTHPVALUE >= 0.01"> -##FILTER=<ID=LQ,Description="Low Quality Genotyping filter"> -##FILTER=<ID=LowQual,Description="Low Quality"> -##FILTER=<ID=PAIRSPERSAMPLE,Description="GSNPAIRS <= 1.1 * GSNSAMPLES"> -##FILTER=<ID=PASS,Description="All filters passed"> -##FILTER=<ID=POLYMORPH,Description="All samples have the same genotype"> -##FILTER=<ID=ALIGNLENGTH,Description="Site has insufficient alignable bases (default 200)"> -##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant"> -##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles"> -##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record"> -##INFO=<ID=CALLERS,Number=.,Type=String,Description="Callers that made this call"> -##INFO=<ID=CNV,Number=.,Type=String,Description="CNV merged in this CNVR"> -##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants"> -##INFO=<ID=CIPOS95,Number=2,Type=Integer,Description="Confidence interval (95%) around POS for imprecise variants"> -##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants"> -##INFO=<ID=CIEND95,Number=2,Type=Integer,Description="Confidence interval (95%) around END for imprecise variants"> -##INFO=<ID=PRECISION,Number=2,Type=Integer,Description="Precision measure"> -##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation"> -##INFO=<ID=SVMETHOD,Number=.,Type=String,Description="Type of approach used to detect SV: RP (read pair), RD (read depth), SR (split read), JT (junction) or AS (assembly)"> -##INFO=<ID=CHR2,Number=1,Type=String,Description="Chromosome for END coordinate in case of a translocation"> -##INFO=<ID=POS2,Number=1,Type=String,Description="The new position of the translocated or duplicated region. Used of ITX, CTX, and IDP SVs"> -##INFO=<ID=BD_CHR1,Number=1,Type=String,Description="BreakDancer: chr1 field"> -##INFO=<ID=BD_POS1,Number=1,Type=Integer,Description="BreakDancer: pos1 field"> -##INFO=<ID=BD_ORI1,Number=1,Type=String,Description="BreakDancer: orientation1 field"> -##INFO=<ID=BD_CHR2,Number=1,Type=String,Description="BreakDancer: chr2 field"> -##INFO=<ID=BD_POS2,Number=1,Type=Integer,Description="BreakDancer: pos2 field"> -##INFO=<ID=BD_ORI2,Number=1,Type=String,Description="BreakDancer: orientation2 field"> -##INFO=<ID=BD_SCORE,Number=1,Type=Float,Description="BreakDancer: score field"> -##INFO=<ID=BD_SU,Number=1,Type=Integer,Description="BreakDancer: num_Reads field"> -##INFO=<ID=VSAMPLES,Number=.,Type=String,Description="Samples with variant alleles"> -##INFO=<ID=NORMAL_COUNT,Number=1,Type=Integer,Description="Number of normal reads supporting reference"> -##INFO=<ID=NUM_READS,Number=1,Type=Integer,Description="Number of reads supporting event"> -##INFO=<ID=CONSENSUS,Number=1,Type=String,Description="Split-read consensus sequence"> -##INFO=<ID=CT,Number=1,Type=String,Description="Paired-end signature induced connection type"> -##INFO=<ID=EV,Number=.,Type=String,Description="Type of LUMPY evidence contributing to the variant call"> -##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend"> -##INFO=<ID=GCFRACTION,Number=1,Type=Float,Description="GC content fraction"> -##INFO=<ID=GCLENGTH,Number=1,Type=Integer,Description="Denominator of GC content"> -##INFO=<ID=GSCLUSTERSEP,Number=1,Type=Float,Description="Cluster separation in depth-based genotyping model"> -##INFO=<ID=GSCLUSTERSEPWEIGHTEDMEAN,Number=1,Type=Float,Description="Weighted cluster separation mean"> -##INFO=<ID=GSCLUSTERSEPWEIGHTEDMEDIAN,Number=1,Type=Float,Description="Weighted cluster separation median"> -##INFO=<ID=GSCOHERENCE,Number=1,Type=Float,Description="Value of coherence statistic"> -##INFO=<ID=GSCOHFN,Number=1,Type=Float,Description="Coherence statistic per pair"> -##INFO=<ID=GSCOHPVALUE,Number=1,Type=Float,Description="Coherence metric (not a true p-value)"> -##INFO=<ID=GSCOORDS,Number=4,Type=Integer,Description="Original cluster coordinates"> -##INFO=<ID=GSCORA6,Number=1,Type=Float,Description="Correlation with array intensity from Affy6 arrays"> -##INFO=<ID=GSCORI1M,Number=1,Type=Float,Description="Correlation with array intensity from Illumina 1M arrays"> -##INFO=<ID=GSCORNG,Number=1,Type=Float,Description="Correlation with array intensity from NimbleGen arrays"> -##INFO=<ID=GSDEPTHCALLS,Number=.,Type=String,Description="Samples with discrepant read pairs or low read depth"> -##INFO=<ID=GSDEPTHCALLTHRESHOLD,Number=1,Type=Float,Description="Read depth threshold (median read depth of samples with discrepant read pairs)"> -##INFO=<ID=GSDEPTHNOBSSAMPLES,Number=1,Type=Integer,Description="Number of samples with discrepant read pairs in depth test"> -##INFO=<ID=GSDEPTHNTOTALSAMPLES,Number=1,Type=Integer,Description="Total samples in depth test"> -##INFO=<ID=GSDEPTHOBSSAMPLES,Number=.,Type=String,Description="Samples with discrepant read pairs in depth test"> -##INFO=<ID=GSDEPTHPVALUE,Number=1,Type=Float,Description="Depth p-value using chi-squared test"> -##INFO=<ID=GSDEPTHPVALUECOUNTS,Number=4,Type=Integer,Description="Depth test read counts (carrier inside event, carrier outside event, non-carrier inside, non-carrier outside)"> -##INFO=<ID=GSDEPTHRANKSUMPVALUE,Number=1,Type=Float,Description="Depth p-value using rank-sum test"> -##INFO=<ID=GSDEPTHRATIO,Number=1,Type=Float,Description="Read depth ratio test"> -##INFO=<ID=GSDMAX,Number=1,Type=Integer,Description="Maximum value considered for DOpt"> -##INFO=<ID=GSDMIN,Number=1,Type=Integer,Description="Minimum value considered for DOpt"> -##INFO=<ID=GSDOPT,Number=1,Type=Integer,Description="Most likely event length"> -##INFO=<ID=GSDSPAN,Number=1,Type=Integer,Description="Inner span length of read pair cluster"> -##INFO=<ID=GSDUPLICATEOVERLAP,Number=1,Type=Float,Description="Highest overlap with a duplicate event"> -##INFO=<ID=GSDUPLICATES,Number=.,Type=String,Description="List of duplicate events preferred to this one"> -##INFO=<ID=GSDUPLICATESCORE,Number=1,Type=Float,Description="LOD score that the events are distinct based on the genotypes of the most discordant sample"> -##INFO=<ID=GSELENGTH,Number=1,Type=Integer,Description="Effective length"> -##INFO=<ID=GSEXPMEAN,Number=1,Type=Float,Description="Expected read depth sample mean"> -##INFO=<ID=GSGMMWEIGHTS,Number=.,Type=Float,Description="Genotyping depth model cluster weights"> -##INFO=<ID=GSM1,Number=1,Type=Float,Description="Genotyping depth model parameter M1"> -##INFO=<ID=GSM2,Number=2,Type=Float,Description="Genotyping depth model parameters M2[0],M2[1]"> -##INFO=<ID=GSMEMBNPAIRS,Number=1,Type=Integer,Description="Number of pairs used in membership test"> -##INFO=<ID=GSMEMBNSAMPLES,Number=1,Type=Integer,Description="Number of samples used in membership test"> -##INFO=<ID=GSMEMBOBSSAMPLES,Number=.,Type=String,Description="Samples participating in membership test"> -##INFO=<ID=GSMEMBPVALUE,Number=1,Type=Float,Description="Membership p-value"> -##INFO=<ID=GSMEMBSTATISTIC,Number=1,Type=Float,Description="Value of membership statistic"> -##INFO=<ID=GSNDEPTHCALLS,Number=1,Type=Integer,Description="Number of samples with discrepant read pairs or low read depth"> -##INFO=<ID=GSNHET,Number=1,Type=Integer,Description="Number of heterozygous snp genotype calls inside the event"> -##INFO=<ID=GSNHOM,Number=1,Type=Integer,Description="Number of homozygous snp genotype calls inside the event"> -##INFO=<ID=GSNNOCALL,Number=1,Type=Integer,Description="Number of snp genotype non-calls inside the event"> -##INFO=<ID=GSNONVARSCORE,Number=1,Type=Float,Description="For homozygous reference sites, the minimum genotype quality score"> -##INFO=<ID=GSNPAIRS,Number=1,Type=Integer,Description="Number of discrepant read pairs"> -##INFO=<ID=GSNSAMPLES,Number=1,Type=Integer,Description="Number of samples with discrepant read pairs"> -##INFO=<ID=GSNSNPS,Number=1,Type=Integer,Description="Number of snps inside the event"> -##INFO=<ID=GSOUTLEFT,Number=1,Type=Integer,Description="Number of outlier read pairs on left"> -##INFO=<ID=GSOUTLIERS,Number=1,Type=Integer,Description="Number of outlier read pairs"> -##INFO=<ID=GSOUTRIGHT,Number=1,Type=Integer,Description="Number of outlier read pairs on right"> -##INFO=<ID=GSREADGROUPS,Number=.,Type=String,Description="Read groups contributing discrepant read pairs"> -##INFO=<ID=GSREADNAMES,Number=.,Type=String,Description="Discrepant read pair identifiers"> -##INFO=<ID=GSRPORIENTATION,Number=1,Type=String,Description="Read pair orientation"> -##INFO=<ID=GSSAMPLES,Number=.,Type=String,Description="Samples contributing discrepant read pairs"> -##INFO=<ID=GSSNPHET,Number=1,Type=Float,Description="Fraction of het snp genotype calls inside the event"> -##INFO=<ID=HOMLEN,Number=1,Type=Integer,Description="Predicted microhomology length using a max. edit distance of 2"> -##INFO=<ID=HOMSEQ,Number=.,Type=String,Description="Sequence of base pair identical micro-homology at event breakpoints"> -##INFO=<ID=INSLEN,Number=1,Type=Integer,Description="Predicted length of the insertion"> -##INFO=<ID=MAPQ,Number=1,Type=Integer,Description="Median mapping quality of paired-ends"> -##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends"> -##INFO=<ID=MAX_IND_SU,Number=1,Type=Integer,Description="Maximum individual PE support of the variant"> -##INFO=<ID=NOVEL,Number=0,Type=Flag,Description="Indicates a novel structural variation"> -##INFO=<ID=NUM_READS,Number=1,Type=Integer,Description="Number of reads supporting event"> -##INFO=<ID=PE,Number=1,Type=Integer,Description="Paired-end support of the structural variant"> -##INFO=<ID=POS2,Number=1,Type=String,Description="The new position of the translocated or duplicated region. Used of ITX, CTX, and IDP SVs"> -##INFO=<ID=PRECISE,Number=0,Type=Flag,Description="Precise structural variation"> -##INFO=<ID=PREND,Number=.,Type=String,Description="LUMPY probability curve of the END breakend"> -##INFO=<ID=PRPOS,Number=.,Type=String,Description="LUMPY probability curve of the POS breakend"> -##INFO=<ID=RDRATIO,Number=1,Type=Float,Description="Read-depth ratio of het. SV carrier vs. non-carrier."> -##INFO=<ID=SCORE,Number=1,Type=Integer,Description="Score reported by tool"> -##INFO=<ID=SECONDARY,Number=0,Type=Flag,Description="Secondary breakend in a multi-line variants"> -##INFO=<ID=SR,Number=1,Type=Integer,Description="Split-read support"> -##INFO=<ID=SRQ,Number=1,Type=Float,Description="Split-read consensus alignment quality"> -##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)"> -##INFO=<ID=SU,Number=.,Type=Integer,Description="Number of pieces of evidence supporting the variant across all samples"> -##FORMAT=<ID=AB,Number=A,Type=Float,Description="Allele balance, fraction of observations from alternate allele, QA/(QR+QA)"> -##FORMAT=<ID=AO,Number=A,Type=Integer,Description="Alternate allele observations, with partial observations recorded fractionally"> -##FORMAT=<ID=AP,Number=A,Type=Integer,Description="Alternate allele paired-end observation count, with partial observations recorded fractionally"> -##FORMAT=<ID=AS,Number=A,Type=Integer,Description="Alternate allele split-read observation count, with partial observations recorded fractionally"> -##FORMAT=<ID=BD,Number=1,Type=Integer,Description="Amount of BED evidence supporting the variant"> -##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events"> -##FORMAT=<ID=CNF,Number=1,Type=Float,Description="Estimate of fractional copy number"> -##FORMAT=<ID=CNL,Number=.,Type=Float,Description="Copy number likelihoods with no frequency prior"> -##FORMAT=<ID=CNP,Number=.,Type=Float,Description="Copy number likelihoods"> -##FORMAT=<ID=CNQ,Number=1,Type=Float,Description="Copy number genotype quality for imprecise events"> -##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth"> -##FORMAT=<ID=DR,Number=1,Type=Integer,Description="# high-quality reference pairs"> -##FORMAT=<ID=DV,Number=1,Type=Integer,Description="supporting read pair for the sample"> -##FORMAT=<ID=FT,Number=1,Type=String,Description="Per-sample genotype filter"> -##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype likelihoods with no frequency prior"> -##FORMAT=<ID=GP,Number=G,Type=Float,Description="Genotype likelihoods"> -##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> -##FORMAT=<ID=GSPC,Number=1,Type=Integer,Description="Number of supporting read pairs for structural variant alleles"> -##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> -##FORMAT=<ID=PE,Number=1,Type=Integer,Description="Number of paired-end reads supporting the variant"> -##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"> -##FORMAT=<ID=QA,Number=A,Type=Integer,Description="Sum of quality of alternate observations"> -##FORMAT=<ID=QR,Number=1,Type=Integer,Description="Sum of quality of reference observations"> -##FORMAT=<ID=RC,Number=1,Type=Integer,Description="Raw high-quality read counts for the SV"> -##FORMAT=<ID=RCL,Number=1,Type=Integer,Description="Raw high-quality read counts for the left control region"> -##FORMAT=<ID=RCR,Number=1,Type=Integer,Description="Raw high-quality read counts for the right control region"> -##FORMAT=<ID=RO,Number=1,Type=Integer,Description="Reference allele observation count, with partial observations recorded fractionally"> -##FORMAT=<ID=RP,Number=1,Type=Integer,Description="Reference allele paired-end observation count, with partial observations recorded fractionally"> -##FORMAT=<ID=RR,Number=1,Type=Integer,Description="# high-quality reference junction reads"> -##FORMAT=<ID=RS,Number=1,Type=Integer,Description="Reference allele split-read observation count, with partial observations recorded fractionally"> -##FORMAT=<ID=RV,Number=1,Type=Integer,Description="# high-quality variant junction reads"> -##FORMAT=<ID=SQ,Number=1,Type=Float,Description="Phred-scaled probability that this site is variant (non-reference in this sample"> -##FORMAT=<ID=SR,Number=1,Type=Integer,Description="Number of split reads supporting the variant"> -##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant"> -#CHROM POS ID REF ALT QUAL FILTER INFO diff --git a/snakecnv/lib/svreader/resources/template_pindel.vcf b/snakecnv/lib/svreader/resources/template_pindel.vcf deleted file mode 100644 index e4645d2..0000000 --- a/snakecnv/lib/svreader/resources/template_pindel.vcf +++ /dev/null @@ -1,22 +0,0 @@ -##fileformat=VCFv4.1 -##fileDate=2015-9-16 -##source=pindel -##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record"> -##INFO=<ID=HOMLEN,Number=1,Type=Integer,Description="Length of base pair identical micro-homology at event breakpoints"> -##INFO=<ID=PF,Number=1,Type=Integer,Description="The number of samples carry the variant"> -##INFO=<ID=HOMSEQ,Number=.,Type=String,Description="Sequence of base pair identical micro-homology at event breakpoints"> -##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles"> -##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant"> -##INFO=<ID=NTLEN,Number=.,Type=Integer,Description="Number of bases inserted in place of deleted code"> -##INFO=<ID=NUM_SUPP_SAMPLES,Number=.,Type=Integer,Description="Number of samples with supporting reads"> -##INFO=<ID=SU,Number=.,Type=Integer,Description="The number of unique reads supporting SV (not count duplicate reads)"> -##INFO=<ID=INDEX,Number=1,Type=Integer,Description="The index of the indel/SV"> -##INFO=<ID=MAX_IND_SU,Number=1,Type=Integer,Description="The mamximum number (among all individuals) of supporting PE"> -##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants"> -##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants"> -##INFO=<ID=VSAMPLES,Number=.,Type=String,Description="Samples with variant alleles"> -##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"> -##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> -##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Reference depth, how many reads support the reference"> -##FORMAT=<ID=AD,Number=2,Type=Integer,Description="Allele depth, how many reads support this allele"> -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT diff --git a/snakecnv/lib/svreader/vcf_utils.py b/snakecnv/lib/svreader/vcf_utils.py deleted file mode 100644 index 6b53046..0000000 --- a/snakecnv/lib/svreader/vcf_utils.py +++ /dev/null @@ -1,32 +0,0 @@ - -import os -import vcf - - -def get_template_header(toolname): - """ - Add contig infos to vcf - :param toolname: the name of the tool - :type toolname: string - :return: file (path) with vcf header with toll-specific fields - """ - thismoduledir = os.path.dirname(os.path.realpath(__file__)) - templateRessources = os.path.join(thismoduledir, "resources") - if os.path.exists(os.path.join(templateRessources, - "template_" + toolname + ".vcf")): - template = os.path.join(templateRessources, - "template_" + toolname + ".vcf") - else: - template = os.path.join(templateRessources, "template.vcf") - return template - - -def get_template(toolname): - """ - Add contig infos to vcf - :param toolname: the name of the tool - :type toolname: string - :return: a vcf.Reader object with tool specific vcf header - """ - template = get_template_header(toolname) - return vcf.Reader(open(template)) diff --git a/snakecnv/lib/svreader/vcfwrapper.py b/snakecnv/lib/svreader/vcfwrapper.py deleted file mode 100644 index 94dedde..0000000 --- a/snakecnv/lib/svreader/vcfwrapper.py +++ /dev/null @@ -1,121 +0,0 @@ - -from svreader import SVRecord, SVReader, SVWriter - -from pysam import VariantFile - -generic_name = "generic" -generic_source = set([generic_name]) - - -class VCFRecord(SVRecord): - """ - - """ - def __init__(self, record): - """ The required vcf record values and adittional optional values - required: chrom, pos, sropt, id, - optional: reference, quality - - """ - super(VCFRecord, self).__init__(generic_name, - record.chrom, - record.pos, - record.stop, - record.id, - ref=record.ref, - qual=record.qual, - filter=record.filter) - self.__record = record - if "SVTYPE" in record.info.keys(): - self._sv_type = record.info["SVTYPE"] - - @property - def record(self): - return self.__record - - @property - def id(self): - return self.record.id - - @property - def pos(self): - return self.record.pos - - @property - def chrom(self): - return self.record.chrom - - @property - def stop(self): - return self.record.stop - - @property - def filter(self): - return self.record.filter - - @property - def samples(self): - return self.record.samples - - -class VCFReader(SVReader): - - def __init__(self, file_name, reference_handle=None, svs_to_report=None): - super(VCFReader, self).__init__(file_name, - generic_name, - reference_handle) - self.vcf_reader = VariantFile(file_name) - - def __iter__(self): - return self - - def __next__(self): - while True: - raw_record = next(self.vcf_reader) - record = VCFRecord(raw_record) - return record - - def getHeader(self): - return self.vcf_reader.header - - def addInfo(self, name, number, type, description): - self.vcf_reader.header.info.add(id=name, - number=number, - type=type, - description=description) - - def addFilter(self, name, description): - self.vcf_reader.header.filters.add(id=name, - number=None, - type=None, - description=description) - - def getOrderedSamples(self): - samples = self.vcf_reader.header.samples - sample_names = [sample.rsplit('.')[0] for sample in samples] - return sample_names - - def numSamples(self): - return len(self.vcf_reader.header.samples) - - -class VCFWriter(SVWriter): - - def __init__(self, file_name, template_reader, index=True): - super(VCFWriter, self).__init__(file_name, - template_reader.tool_name, - template_reader) - - def _open(self): - self.vcf_writer = VariantFile(self.filename, 'w', - header=self.template_reader.getHeader()) - self._isopen = True - - def _write(self, record): - self.vcf_writer.write(record.record) - - def _close(self): - if self._isopen: - self.vcf_writer.close() - else: # nothing was written - self._dumpemptyvcf() diff --git a/snakecnv/lib/svrunner_utils.py b/snakecnv/lib/svrunner_utils.py deleted file mode 100644 index 6215b95..0000000 --- a/snakecnv/lib/svrunner_utils.py +++ /dev/null @@ -1,230 +0,0 @@ - -import sys -import os -import collections -import gzip -from subprocess import call -import re - -from collections import defaultdict - -from pybedtools import BedTool, create_interval_from_list -from pysam import AlignmentFile -import numpy as np - -VERBOSE = True - -def get_insert_size(bam): - """ - Compute mean insert size - :param bam: bam file - :type bam: str - :return: mean insert size - :rtype: int - """ - fbam = AlignmentFile(bam, "rb") - count = 0 - isizes_raw = [] - for read in fbam: - if read.is_proper_pair: - isizes_raw.append(np.abs(read.template_length)) - count += 1 - if count == 50000: - break - n99 = np.percentile(isizes_raw, 99) - n1 = np.percentile(isizes_raw, 1) - isizes = [] - for isize in isizes_raw: - if n1 < isize < n99: - isizes.append(isize) - fbam.close() - return int(np.round(np.mean(np.round(isizes)))) - - -def getChromLength(reference, chrom): - """ - Retrieve the length of chromosome chrom from .fai file - :param reference: the genome fasta file - :param chrom: the chromosome of interest - :type reference: file - :type chrom: str - :return: chromosome length - :rtype: int - """ - reference_fai = str(reference) + ".fai" - chroms = {} - if reference is not None and os.path.isfile(reference_fai): - with open(reference_fai) as fai_file: - for line in fai_file: - line_items = line.strip().split("\t") - name, length = line_items[0:2] - name = name.split(" ")[0] - chroms[name] = int(length) - if chrom not in chroms: - raise KeyError('the chromosome '+chrom+' is not in the *.fai file....') - return chroms[chrom] - - -def fetchId(bamfile): - """ - Fetch sample id in a bam file - :param bamfile: the bam file - :type bamfile: file - :return: sample name - :rtype: str - """ - bamfile_fin = AlignmentFile(bamfile, 'rb') - name = bamfile_fin.header['RG'][0]['SM'] - bamfile_fin.close() - return name - - -_Contig = collections.namedtuple('Contig', ['name', 'length']) - - -def get_contigs(reference): - """ - Retrieve the contigs (chromosomes) from the genome fasta file - :param reference: the genome fasta file - :type reference: fasta filefile - :return: list of contigs in the format specified by PyCvf - :rtype: list - """ - contigs = [] - reference_fai = str(reference) + ".fai" - if reference is not None and os.path.isfile(reference_fai): - with open(reference_fai) as fai_file: - for line in fai_file: - line_items = line.strip().split("\t") - name, length = line_items[0:2] - name = name.split(" ")[0] - contigs.append(_Contig(name, int(length))) - return contigs - - -def chromNum(chrom): - m = re.search(r"[cC]hr(?P<num>[\dXY]+)", chrom) - if m is not None: - if m.group('num') == 'X': - num = 100 - elif m.group('num') == 'Y': - num = 101 - else: - num = int(m.group('num')) - return num - else: - return chrom - - -def addContigInfosTovcf(vcffile, outvcffile, reference): - """ - Add contig infos to vcf - :param vcffile: input vcffile - :param outvcffile: output vcffile - :param reference: the reference genome fasta filefile - :type vcffile: file - :type outvcffile: vcffile - :type reference: fasta filefile - :return: nothing - """ - # compute contig information - contigs = get_contigs(reference) - filename, extension = os.path.splitext(outvcffile) - if extension == ".gz": - outputfile = filename - gzip = True - else: - outputfile = outvcffile - gzip = False - with open(outputfile, "w") as fout: - with smart_open(vcffile) as fd: - for line in fd: - if re.search(r"^##contig", line) is not None: - continue - if re.search(r"^#CHROM", line): - for ctg in sorted(contigs, key=lambda x: chromNum(x.name)): - fout.write('##contig=<ID=%s,length=%d>\n' - % (ctg.name, ctg.length)) - fout.write(line) - # bgzip file - if gzip: - cmd = "bgzip -f "+outputfile - call(cmd, shell=True) - cmd = "tabix -p vcf " + outvcffile - call(cmd, shell=True) - - -def sorting(records, reference_contigs): - if reference_contigs: - contigs_order_dict = defaultdict() - for (index, contig) in enumerate(reference_contigs): - contigs_order_dict[contig.name] = index - records.sort( - key=lambda x: (contigs_order_dict[x.chrom], x.start)) - else: - records.sort(key=lambda x: (x.chrom, x.start)) - return records - - -def vcf_to_pybed(vcf_records): - """ - Convert vcf records to bed records - :param vcf_records: the vcf records - :type vcfrecords: a list of vcf records - :return: the corresponding betools records - :rtype: a bedtools object - """ - intervals = [] - for record in vcf_records: - chrom = record.chrom - start = record.start - 1 - end = record.end - name = record.id - interval_list = list(map(str, [chrom, start, end, name])) - intervals.append(create_interval_from_list(interval_list)) - return BedTool(intervals).sort() - - -# A small wrapper to print to stderr -def eprint(*args, **kwargs): - if VERBOSE: - print(*args, file=sys.stderr, **kwargs) - - -def warn(*args, **kwargs): - print(*args, file=sys.stderr, **kwargs) - - -def log(objs): - print("-- log -- ", objs, file=sys.stderr) - - -def warning(*objs): - print("WARNING: ", *objs, file=sys.stderr) - - -def smart_open(file_name): - """ - Opening compressed or uncompressed file according to the extension (.gz) - :param file_name: the file to open - :type file_name: str - :return: the corresponding file handle - :rtype: a file handle - """ - if file_name is not None: - if file_name.endswith(".gz"): - file_fd = gzip.open(file_name, "rt") - else: - file_fd = open(file_name) - else: - file_fd = sys.stdin - return file_fd - - -def flatten(l): - for el in l: - if isinstance(el, collections.Iterable) and not isinstance(el, str): - for sub in flatten(el): - yield sub - else: - yield el diff --git a/snakecnv/lib/svsnakemake_utils.py b/snakecnv/svsnakemake_utils.py similarity index 100% rename from snakecnv/lib/svsnakemake_utils.py rename to snakecnv/svsnakemake_utils.py -- GitLab