Skip to content
Snippets Groups Projects
Commit 15f5c616 authored by Thomas Faraut's avatar Thomas Faraut
Browse files

adding svtyper_utils

parent 42ac8f47
No related branches found
No related tags found
No related merge requests found
import sys
import gzip
import shutil
import tempfile
import os
import multiprocessing
from functools import partial
from subprocess import call
import random
import string
from pysam import AlignmentFile, tabix_compress, tabix_index
from svtyper import singlesample as single
def fetchId(bamfile):
"""
Fetch sample id in a bam file
:param bamfile: the bam file
:type bamfile: file
:return: sample name
:rtype: str
"""
bamfile_fin = AlignmentFile(bamfile, 'rb')
name = bamfile_fin.header['RG'][0]['SM']
bamfile_fin.close()
return name
def get_bamfiles(bamlist):
with open(bamlist, "r") as fin:
bamfiles = [os.path.abspath(f.strip()) for f in fin]
return bamfiles
def random_string(length=10):
ramdom_str = [random.choice(string.ascii_uppercase + string.digits)
for _ in range(length)]
return ''.join(ramdom_str)
def genotype_multiple_samples(bamlist, vcf_in, vcf_out, cores=1):
bamfiles = get_bamfiles(bamlist)
if vcf_out == sys.stdout:
tmp_dir = tempfile.mkdtemp()
else:
tmp_dir = os.path.join(os.path.dirname(vcf_out), "tmp" + random_string())
os.mkdir(tmp_dir)
if vcf_in == sys.stdin:
vcf_in = single.dump_piped_vcf_to_file(vcf_in, tmp_dir)
elif vcf_in.endswith(".gz"):
vcf_in_gz = vcf_in
vcf_in = os.path.join(tmp_dir, os.path.basename(vcf_in[:-3]))
with gzip.open(vcf_in_gz, 'rb') as f_in, open(vcf_in, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
pool = multiprocessing.Pool(processes=cores)
launch_ind = partial(genotype_single_sample,
vcf_in=vcf_in, out_dir=tmp_dir)
vcf_files = pool.map(launch_ind, bamfiles)
merge_cmd = "bcftools merge -m id "
if vcf_out != sys.stdout:
merge_cmd += "-O z " + " -o " + vcf_out + " "
merge_cmd += " ".join(vcf_files)
exit_code = call(merge_cmd, shell=True)
if exit_code == 0:
if vcf_out != sys.stdout:
tabix_index(vcf_out, force=True, preset="vcf")
shutil.rmtree(tmp_dir)
else:
print("Failed: bcftools merge exits with status %d" % exit_code)
exit(1)
def genotype_single_sample(bam, vcf_in, out_dir):
lib_info_json = bam + ".json"
sample = fetchId(bam)
out_vcf = os.path.join(out_dir, sample + ".gt.vcf")
with open(vcf_in, "r") as inf, open(out_vcf, "w") as outf:
single.sso_genotype(bam_string=bam,
vcf_in=inf,
vcf_out=outf,
min_aligned=20,
split_weight=1,
disc_weight=1,
num_samp=1000000,
lib_info_path=lib_info_json,
debug=False,
ref_fasta=None,
sum_quals=False,
max_reads=1000,
max_ci_dist=1e10,
cores=None,
batch_size=None)
out_gz = out_vcf + ".gz"
tabix_compress(out_vcf, out_gz, force=True)
tabix_index(out_gz, force=True, preset="vcf")
return out_gz
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment