Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import sys
import gzip
import shutil
import tempfile
import os
import multiprocessing
from functools import partial
from subprocess import call
import random
import string
from pysam import AlignmentFile, tabix_compress, tabix_index
from svtyper import singlesample as single
def fetchId(bamfile):
"""
Fetch sample id in a bam file
:param bamfile: the bam file
:type bamfile: file
:return: sample name
:rtype: str
"""
bamfile_fin = AlignmentFile(bamfile, 'rb')
name = bamfile_fin.header['RG'][0]['SM']
bamfile_fin.close()
return name
def get_bamfiles(bamlist):
with open(bamlist, "r") as fin:
bamfiles = [os.path.abspath(f.strip()) for f in fin]
return bamfiles
def random_string(length=10):
ramdom_str = [random.choice(string.ascii_uppercase + string.digits)
for _ in range(length)]
return ''.join(ramdom_str)
def genotype_multiple_samples(bamlist, vcf_in, vcf_out, cores=1):
bamfiles = get_bamfiles(bamlist)
if vcf_out == sys.stdout:
tmp_dir = tempfile.mkdtemp()
else:
tmp_dir = os.path.join(os.path.dirname(vcf_out), "tmp" + random_string())
os.mkdir(tmp_dir)
if vcf_in == sys.stdin:
vcf_in = single.dump_piped_vcf_to_file(vcf_in, tmp_dir)
elif vcf_in.endswith(".gz"):
vcf_in_gz = vcf_in
vcf_in = os.path.join(tmp_dir, os.path.basename(vcf_in[:-3]))
with gzip.open(vcf_in_gz, 'rb') as f_in, open(vcf_in, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
pool = multiprocessing.Pool(processes=cores)
launch_ind = partial(genotype_single_sample,
vcf_in=vcf_in, out_dir=tmp_dir)
vcf_files = pool.map(launch_ind, bamfiles)
merge_cmd = "bcftools merge -m id "
if vcf_out != sys.stdout:
merge_cmd += "-O z " + " -o " + vcf_out + " "
merge_cmd += " ".join(vcf_files)
exit_code = call(merge_cmd, shell=True)
if exit_code == 0:
if vcf_out != sys.stdout:
tabix_index(vcf_out, force=True, preset="vcf")
shutil.rmtree(tmp_dir)
else:
print("Failed: bcftools merge exits with status %d" % exit_code)
exit(1)
def genotype_single_sample(bam, vcf_in, out_dir):
lib_info_json = bam + ".json"
sample = fetchId(bam)
out_vcf = os.path.join(out_dir, sample + ".gt.vcf")
with open(vcf_in, "r") as inf, open(out_vcf, "w") as outf:
single.sso_genotype(bam_string=bam,
vcf_in=inf,
vcf_out=outf,
min_aligned=20,
split_weight=1,
disc_weight=1,
num_samp=1000000,
lib_info_path=lib_info_json,
debug=False,
ref_fasta=None,
sum_quals=False,
max_reads=1000,
max_ci_dist=1e10,
cores=None,
batch_size=None)
out_gz = out_vcf + ".gz"
tabix_compress(out_vcf, out_gz, force=True)
tabix_index(out_gz, force=True, preset="vcf")
return out_gz