Skip to content
Snippets Groups Projects
Commit ea770be0 authored by Floreal Cabanettes's avatar Floreal Cabanettes
Browse files

Add genotype_results script (new implementation based on T. Faraut script)

parent 794af239
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
from collections import defaultdict
from pybedtools import BedTool
from pybedtools import create_interval_from_list
from pysam import VariantFile
def variants_to_pybed(variants):
intervals = []
for variant in variants:
r = variants[variant]
chrom = r.chrom
start = r.start
end = r.stop
name = r.id
intervals.append(create_interval_from_list(list(map(str, [chrom, start, end, name]))))
return BedTool(intervals).sort()
def passed_variant(record):
"""
Did this variant pass?
:param record: vcf record object
:return: True if pass, False else
"""
return record.filter is None or len(record.filter) == 0 or "PASS" in record.filter
def readgenotypes(vcffile, dofilter=False):
vcfin = VariantFile(vcffile)
variants = defaultdict()
for r in vcfin:
if not dofilter or passed_variant(r):
variants[r.id] = r
return variants
def canonize(geno):
if geno == (1, 0):
return 0, 1
else:
return geno
def getgenotypes(record):
genotypes = []
for s in record.samples:
genotypes.append(canonize(record.samples[s]['GT']))
return genotypes
def precision(true_genos, pred_genos):
if len(true_genos) != len(pred_genos):
print("genotypes not of the same length")
exit(1)
correct, wrong = (0, 0)
for g, h in zip(true_genos, pred_genos):
if g == h:
correct += 1
else:
wrong += 1
return correct, wrong
def getvarsize(variant):
return variant.stop - variant.start + 1
def main(genotypes, predicted, filtered, output, verbose=False):
true_genotypes = readgenotypes(genotypes)
pred_genotypes = defaultdict()
filtered_genotypes = {}
with open(predicted, "r") as preds:
for pred in preds:
pred = pred.rstrip()
if pred != "":
pred_genotypes.update(readgenotypes(pred))
if filtered:
filtered_genotypes.update(readgenotypes(filtered, True))
true_pybed = variants_to_pybed(true_genotypes)
pred_pybed = variants_to_pybed(pred_genotypes)
pred_with_true = pred_pybed.intersect(true_pybed, r=True, f=0.7, wo=True)
with open(output, "w") as out:
for sv in pred_with_true:
predicted = getgenotypes(pred_genotypes[sv[3]])
true_geno = getgenotypes(true_genotypes[sv[7]])
left_prec = abs(int(sv[1]) - int(sv[5]))
right_prec = abs(int(sv[2]) - int(sv[6]))
correct, wrong = precision(true_geno, predicted)
varsize = getvarsize(true_genotypes[sv[7]])
soft = sv[3].split('_')[0]
if sv[3] in filtered_genotypes:
out.write("%s\t%s\t%d\t%3.2f\t%d\t%d" %
(sv[3], "pass", varsize, correct / (wrong + correct), left_prec, right_prec))
out.write("%s\t%s\t%d\t%3.2f\t%d\t%d" % (sv[3], soft, varsize, correct / (wrong + correct), left_prec, right_prec))
if verbose:
print("%d variants" % (len(true_genotypes)))
print("%d predicted variants" % (len(pred_genotypes)))
print("%d detected variants (%3.2f)" % (len(pred_with_true), 100 * len(pred_with_true) / len(true_genotypes)))
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(description="Build results for genotype precision of tools")
parser.add_argument('-t', '--true-vcf', type=str, required=True, help='VCF file containing the simulated deletions')
parser.add_argument('-p', '--predictions-vcf', type=str, required=True,
help='File listing VCF files containing the predicted results, with genotypes')
parser.add_argument('--filter', action='store_const', const=True, default=False,
help='Predicted results has filter information')
parser.add_argument('-o', '--output', type=str, required=True, help="Output file")
parser.add_argument('-v', '--verbose', action='store_const', const=True, default=False,
help='Verbose mode')
args = parser.parse_args()
main(genotypes=args.true_vcf,
predicted=args.predictions_vcf,
filtered=args.filter,
output=args.output,
verbose=args.verbode)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment