Newer
Older
#! /usr/bin/env python3
"""mergeSV : merge SV on the basis of their intervals
The merging is done by identifiying connected components in a graph.
The graph is constructing taking the SV a node and an edge connects two nodes
if the conresponding intervals exhibits a reciprocal overlap of a least R0%.
Additional constraints on the precision of breakpoints (left and right) can be specified.
Each connected component is termed a CNVR (Copy Number Variation Region)
and is associated to the corresponding CNVs.
"""
import argparse, sys
import glob
import vcf
import string
import re
from svreader.vcfwrapper import VCFReader
from svinterval import construct_overlap_graph, connected_components
import xlsxwriter
prg_path = os.path.dirname(os.path.realpath(__file__))
alphabet = list(string.ascii_uppercase)
xlsx_cols = alphabet.copy()
xlsx_cols.append(alp + j)
color_not_found = "#FE2E2E"
color_col_filter = "#BEF781"
color_is_kept = "#81F781"
color_false_positive = "#FE642E"
color_wrong_gt = "#B40404"
def get_args():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="\
mergeIndividualSV \n \
description: Merge SV based on reciprocal overlap")
#parser.add_argument('-v', '--vcf', type=str, required=True, help='vcf file(s), comma-separated')
parser.add_argument('-v', '--vcf', type=str, required=True, help='folder containing all vcf results files')
parser.add_argument('-t', '--true-vcf', type=str, required=True, help='VCF file containing the simulated deletions')
parser.add_argument('-f', '--filtered-vcf', type=str, required=False,
help='VCF file containing the filtered results')
parser.add_argument('-g', '--genotypes', type=str, help="VCF file containing genotypes")
parser.add_argument('--overlap_cutoff', type=float, default=0.5, help='cutoff for reciprocal overlap')
parser.add_argument('--left_precision', type=int, default=-1, help='left breakpoint precision')
parser.add_argument('--right_precision', type=int, default=-1, help='right breakpoint precision')
parser.add_argument('-o', '--output', type=str, default="results", help='output prefix')
parser.add_argument('--no-xls', action='store_const', const=True, default=False, help='Do not build Excel file')
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# parse the arguments
args = parser.parse_args()
if args.left_precision == -1:
args.left_precision = sys.maxsize
if args.right_precision == -1:
args.right_precision = sys.maxsize
# send back the user input
return args
# A small wrapper to print to stderr
def eprint(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
def to_vcf_record(cnv):
info = { "SVLEN" : cnv.length(),
"SVTYPE" : "DEL",
"END" : cnv.end,
"NUM_CNV" : cnv.NumCNV(),
"CNV":",".join(cnv.CNV()),
"CALLERS":",".join(cnv.Callers()),
"NUM_CALLERS":cnv.NumCallers(),
"CIPOS":cnv.cipos(),
"CIEND":cnv.ciend(),
"PRECISION":",".join(map(str,cnv.precision())),
"CALLERSVSAMPLES":",".join(cnv.CallersVsamples())
}
alt = [vcf.model._SV("DEL")]
if not cnv.IsPrecise():
info['IMPRECISE'] = True
else:
info['REPR_CNV'] = cnv.repr_cnv()
vcf_record = vcf.model._Record(cnv.chrom,
cnv.start,
cnv.name,
"N",
alt,
".",
".",
info,
"",
[0],
[])
return vcf_record
def passed_variant(record):
"""Did this variant pass?"""
return record.filter is None or len(record.filter) == 0 or "PASS" in record.filter
# --------------------------------------
# main function
def read_vcf_file(infile):
SVSet=[]
ids = []
for record in VCFReader(infile):
if not passed_variant(record):
continue
SVSet.append(record)
ids.append(record.id)
return SVSet, ids
def svsort(sv, records):
"""
Function to sort regions
"""
if records[sv]["start"] != "":
return int(records[sv]["start"])
first_tool = list(records[sv]["tools"].keys())[0]
return int(records[sv]["tools"][first_tool]["start"])
def get_gt(geno, true_gt):
"""
Get the genotype: consider 1/0 and 0/1 as equivalent (we can't identify a specific chromosome)
We want geno equal to true_gt in these cases
:param geno: the genotype
:param true_gt: the true genotype
:return: the genotype
"""
if true_gt is not None:
if geno == "1/0" and true_gt == "0/1":
geno = "0/1"
elif geno == "0/1" and true_gt == "1/0":
geno = "1/0"
return geno
def get_max_len(cell, col, max_col_len):
"""
Get the max content length of a column
:param cell: the cell content
:param col: the column number
:param max_col_len: current max content length for each column
:return: the new max content length for the given column
"""
return max(len(str(cell)), max_col_len[col] if col in max_col_len else 0)
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
def get_quality_color(quality):
"""
Returns the color associated to a genotype quality value
:param quality: the quality value
:return: the associated color
"""
color_very_low_quality = "#FE2E2E"
color_low_quality = "#FE9A2E"
color_medium_quality = "#FFFF00"
color_high_quality = "#81F781"
if quality > 60:
return color_high_quality
elif quality > 40:
return color_medium_quality
elif quality > 20:
return color_low_quality
else:
return color_very_low_quality
def get_genotypes(genotypes_file, true_vcf_file):
"""
Get genotype of each individual for each SV
:param genotypes_file: VCF file containing genotypes
:param true_vcf_file: VCF file containing real data
:return: genotypes of each individual for each SV, quality of each genotype, number of individuals
"""
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
# Genotypes:
genotypes_raw = os.popen("zcat " + genotypes_file + " " + true_vcf_file + " | " +
os.path.join(prg_path, "vawk") + " '{ print $3,S$*$GT}'").read().split("\n")
for gt in genotypes_raw:
if gt != "":
gt_l = gt.split("\t")
genotypes[gt_l[0]] = gt_l[1:]
nb_inds = len(gt_l[1:])
gt_quality_raw = os.popen("zcat " + genotypes_file + " | " +
os.path.join(prg_path, "vawk") + " '{ print $3,S$*$GQ}'").read().split("\n")
for gq in gt_quality_raw:
if gq != "":
gq_l = gq.split("\t")
gt_quality[gq_l[0]] = gq_l[1:]
return genotypes, gt_quality, nb_inds
def build_records(genotypes, SVSet, true_ones_records, filtered_records, gt_quality):
"""
Build records for each SV
:param genotypes: list of genotypes of each individual for each SV
:param SVSet: set of all SVs
:param true_ones_records: records of the real data
:param filtered_records: records of the filtered data
:param gt_quality: list of qualities of each genotype of individuals for each SV
:return: records dict, tools set, list of orphans records (associated to None real data)
"""
number = 1
records = {}
tools = set()
orphans = 0
for components in connected_components(SVSet):
names = []
results = {}
true_one = None
for node in components:
names.append(node.id)
if node.id in true_ones_records:
true_one = node.id
records[true_one] = {
"start": node.start,
"end": node.end,
"length": int(node.end) - int(node.start),
"tools": {},
"orphan": False,
"genotypes": genotypes[true_one] if true_one in genotypes else None
}
else:
tool_name = node.id.split("_")[0]
results[tool_name] = {
"start": node.start,
"end": node.end,
"length": int(node.end) - int(node.start),
"filtered": (node.id in filtered_records) if filtered_records is not None else False,
"genotypes": genotypes[node.id] if node.id in genotypes else None,
"qualities": gt_quality[node.id] if node.id in gt_quality else None
}
tools.add(tool_name)
if true_one is not None:
records[true_one]["tools"] = results
else:
orphans += 1
records["orphan_" + str(orphans)] = {
"start": "",
"end": "",
"length": "",
"tools": results,
"orphan": True,
"genotypes": None
}
names.sort()
eprint("Group #%i: %s" % (number, ", ".join(names)))
number += 1
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
return records, tools, orphans
def build_header(tools, cells, cells_gt, cells_gq, filtered_records, nb_records, max_col_len, nb_inds):
"""
Build tools headers, and header cells for each tool
:param tools: list of tools
:param cells: cells of the first sheet (sv description)
:param cells_gt: cells of the second sheet (genotype)
:param cells_gq: cells of the third sheet (genotype quality)
:param filtered_records: (bool) is there filtered records
:param nb_records: number of records
:param max_col_len: max content length for each column
:param nb_inds: number of individuals
:return: headers (list of tools + read and filtered data) ; cells, cells_gt, cells_gq and max_col_len updated
"""
i = 1
j = 1
headers = []
for tool in ["Real data"] + tools + (["Filtered results"] if filtered_records else []):
headers.append(tool)
l_format = {"bold": True}
if tool == "Filtered results":
l_format["bg_color"] = color_col_filter
cells[xlsx_cols[i] + "2"] = cells[xlsx_cols[i] + str(2 + nb_records + 3)] = {"text": "Start",
"format": l_format}
max_col_len[i] = get_max_len("Start", i, max_col_len)
cells[xlsx_cols[i + 1] + "2"] = cells[xlsx_cols[i + 1] + str(2 + nb_records + 3)] = {"text": "End",
"format": l_format}
max_col_len[i + 1] = get_max_len("End", i + 1, max_col_len)
cells[xlsx_cols[i + 2] + "2"] = cells[xlsx_cols[i + 2] + str(2 + nb_records + 3)] = {"text": "Length",
"format": l_format}
max_col_len[i + 2] = get_max_len("Length", i + 2, max_col_len)
# Genotypes:
for k in range(0, nb_inds):
cells_gt[xlsx_cols[j + k] + "2"] = cells_gq[xlsx_cols[j + k] + "2"] = \
{"text": "INDIV_" + str(k + 1), "format": {"bold": True}}
i += 3
j += nb_inds
return headers, cells, cells_gt, cells_gq, max_col_len
def fill_real_data(row, cells, cells_gt, cells_gq, max_col_len, record, rec_id, nb_records):
"""
Fill cells of the first data column (real data simulated)
:param cells: cells of the first sheet (sv description)
:param cells_gt: cells of the second sheet (genotype)
:param cells_gq: cells of the third sheet (genotype quality)
:param max_col_len: max text length for each column
:param record: data of the record
:param nb_records: number of records (total)
:return: cells, cells_gt, cells_gq and max_col_len updated
"""
# SV ID:
cells[xlsx_cols[0] + str(row)] = cells[xlsx_cols[0] + str(row + nb_records + 3)] = {"text": rec_id, "format": {}}
max_col_len[0] = get_max_len(rec_id, 0, max_col_len)
cells_gt[xlsx_cols[0] + str(row)] = cells_gq[xlsx_cols[0] + str(row)] = {"text": rec_id, "format": {}}
# START:
cells[xlsx_cols[1] + str(row)] = cells[xlsx_cols[1] + str(row + nb_records + 3)] = {"text": record["start"],
"format": {}}
max_col_len[1] = get_max_len(record["start"], 1, max_col_len)
# END:
cells[xlsx_cols[2] + str(row)] = cells[xlsx_cols[2] + str(row + nb_records + 3)] = {"text": record["end"],
"format": {}}
max_col_len[2] = get_max_len(record["end"], 2, max_col_len)
# LENGTH:
cells[xlsx_cols[3] + str(row)] = cells[xlsx_cols[3] + str(row + nb_records + 3)] = {"text": record["length"],
"format": {}}
max_col_len[3] = get_max_len(record["length"], 3, max_col_len)
# GENOTYPES:
if record["genotypes"] is not None:
for gt in range(0, len(record["genotypes"])):
cells_gt[xlsx_cols[1 + gt] + str(row)] = cells_gq[xlsx_cols[1 + 0 + gt] + str(row)] = \
{"text": record["genotypes"][gt], "format": {}}
return cells, cells_gt, cells_gq, max_col_len
def fill_tool_data(row, col, cells, max_col_len, record, nb_records, my_start, my_end, my_length,
sv_format=None):
"""
Fill cells for a tool and a record
:param row: row position
:param col: columns position
:param cells: cells of the first sheet (sv description)
:param max_col_len: max text length for each column
:param record: data of the record for a tool
:param nb_records: number of records (total)
:param my_start: real start of the SV
:param my_end: real end of the SV
:param my_length: real length of the SV
:param sv_format: format for the cell
:return: cells and max_col_len updated
"""
if sv_format is None:
sv_format = {}
#############
# RAW DATA: #
#############
# START:
cells[xlsx_cols[col] + str(row)] = {"text": record["start"], "format": sv_format}
max_col_len[col] = get_max_len(record["start"], col, max_col_len)
# END:
cells[xlsx_cols[col + 1] + str(row)] = {"text": record["end"], "format": sv_format}
max_col_len[col + 1] = get_max_len(record["end"], col + 1, max_col_len)
# LENGTH:
cells[xlsx_cols[col + 2] + str(row)] = {"text": record["length"], "format": sv_format}
max_col_len[col + 2] = get_max_len(record["length"], col + 2, max_col_len)
###########
# DIFFS: #
###########
if my_start != "":
start = record["start"] - my_start
end = record["end"] - my_end
length = record["length"] - my_length
else:
start = end = length = "NA"
# START:
cells[xlsx_cols[col] + str(row + nb_records + 3)] = {"text": start, "format": sv_format}
# END:
cells[xlsx_cols[col + 1] + str(row + nb_records + 3)] = {"text": end, "format": sv_format}
# LENGTH:
cells[xlsx_cols[col + 2] + str(row + nb_records + 3)] = {"text": length,
"format": sv_format}
return cells, max_col_len
def fill_genotypes_data(row, col, cells_gt, cells_gq, record, my_genotypes):
"""
Fill cells for a tool and a record (genotype/genotype quality parts)
:param row: row position
:param col: column position
:param cells_gt: cells for genotype sheet (2)
:param cells_gq: cells for the genotype quality sheet (3)
:param record: data of the record for a tool
:param my_genotypes: real genotypes for each individual
:return: cells_gt and cells_gq updated
"""
the_genotypes = record["genotypes"]
for gt in range(0, len(the_genotypes)):
true_gt = my_genotypes[gt] if my_genotypes is not None and gt in my_genotypes else ""
geno = get_gt(the_genotypes[gt], true_gt)
# Format:
gt_format = {"bg_color": get_quality_color(int(record["qualities"][gt]))}
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
gt_format["font_color"] = "#ff0000"
# Genotype:
cells_gt[xlsx_cols[col + gt] + str(row)] = {"text": geno, "format": gt_format}
# Quality:
cells_gq[xlsx_cols[col + gt] + str(row)] = {"text": int(record["qualities"][gt]), "format": gt_format}
return cells_gt, cells_gq
def write_headers(workbook, worksheet, headers, cell_len, rows, filtered_records):
"""
Write header on the given sheet of the XLSX file
:param workbook: the XSLX file workbook
:param worksheet: th sheet
:param headers: headers values
:param cell_len: length of each cell of the header
:param rows: (list) rows into write the header
:param filtered_records: (bool) is there filtered records
"""
i = 0
for header in headers:
if i < len(headers) - 1 or not filtered_records:
merge_format = workbook.add_format({'align': 'center'})
else:
merge_format = workbook.add_format({'align': 'center', 'bg_color': color_col_filter})
for row in rows:
worksheet.merge_range(xlsx_cols[1 + i * cell_len] + str(row) + ":" + xlsx_cols[cell_len + i * cell_len] + str(row),
header, merge_format)
i += 1
def create_xls_document(args, headers, filtered_records, nb_records, nb_inds, cells, cells_gt, cells_gq, max_col_len):
"""
Create the XLSX file
:param args: arguments given by the user (argparse)
:param headers: headers of each sheet
:param filtered_records: (bool) has filtered records
:param nb_records: number of records
:param nb_inds: number of individuals
:param cells: cells for first sheet (SV description)
:param cells_gt: cells for second sheet (genotypes)
:param cells_gq: cells for third sheet (genotype quality)
:param max_col_len: max content length for each column
"""
with xlsxwriter.Workbook(args.output + ".xslx") as workbook:
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
#################################
# First sheet (SV description): #
#################################
worksheet = workbook.add_worksheet("SVs")
write_headers(workbook, worksheet, headers, 3, [1, 1+nb_records+3], filtered_records)
# Body:
for cell_id, cell_content in cells.items():
cell_format = workbook.add_format(cell_content["format"])
worksheet.write(cell_id, cell_content["text"], cell_format)
# Resize columns:
for col, max_len in max_col_len.items():
worksheet.set_column(col, col, max_len+1)
if args.genotypes:
#############################
# Second sheet (Genotypes): #
#############################
worksheet_gt = workbook.add_worksheet("Genotypes")
write_headers(workbook, worksheet_gt, headers, nb_inds, [1], filtered_records)
# Body:
for cell_id, cell_content in cells_gt.items():
cell_format = workbook.add_format(cell_content["format"])
worksheet_gt.write(cell_id, cell_content["text"], cell_format)
worksheet_gt.freeze_panes(0, 9)
# Resize columns:
worksheet_gt.set_column(0, 0, max_col_len[0]+1)
####################################
# Third sheet (Genotypes quality): #
####################################
worksheet_gq = workbook.add_worksheet("Gt quality")
write_headers(workbook, worksheet_gq, headers, nb_inds, [1], filtered_records)
# Body
for cell_id, cell_content in cells_gq.items():
cell_format = workbook.add_format(cell_content["format"])
worksheet_gq.write(cell_id, cell_content["text"], cell_format)
worksheet_gq.freeze_panes(0, 9)
# Resize columns:
worksheet_gq.set_column(0, 0, max_col_len[0]+1)
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
def create_tsv_file(filename: str, headers: list, cells: dict, nb_tools: int, nb_per_tool: int, records_range: ()):
# Init rows:
head = [""]
top_headers = {}
h = 1
for header in headers:
# Define top headers to each column:
for i in range(0, nb_per_tool):
top_headers[h] = header
head.append("")
h += 1
rows = [head]
for i in range(0, records_range[1]-records_range[0]+1):
rows.append(["" for x in range(0, (nb_tools * nb_per_tool) + 1)])
# Fill content:
for id_cell, cell in cells.items():
id_m = re.match(r"^([A-Z]+)(\d+)$", id_cell)
col = xlsx_cols.index(id_m.group(1))
row = int(id_m.group(2))
if records_range[0] <= row <= records_range[1]:
r = row - records_range[0]
if r == 0 and col > 0:
rows[r][col] = top_headers[col] + " / " + cell["text"]
else:
rows[r][col] = str(cell["text"])
# List as text:
for r in range(0, len(rows)):
rows[r] = "\t".join(rows[r])
tsv = "\n".join(rows)
with open(filename, "w") as tsv_file:
tsv_file.write(tsv)
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
# noinspection PyUnresolvedReferences
def main():
# parse the command line args
args = get_args()
genotypes = {}
gt_quality = {}
nb_inds = 0
if args.genotypes:
genotypes, gt_quality, nb_inds = get_genotypes(args.genotypes, args.true_vcf)
overlap_cutoff = args.overlap_cutoff
left_precision = args.left_precision
right_precision = args.right_precision
filenames = []
filenames += glob.glob(args.vcf + "/**/*.vcf", recursive=True)
filenames += glob.glob(args.vcf + "/**/*.vcf.gz", recursive=True)
true_ones = args.true_vcf
# Reading all the vcf files
SVSet=[]
for infile in filenames:
eprint(" Reading file %s" % (infile))
SVSet += read_vcf_file(infile)[0]
eprint(" Reading file %s" % (true_ones))
SVSet_to, true_ones_records = read_vcf_file(true_ones)
SVSet += SVSet_to
filtered_records = None
if args.filtered_vcf:
eprint(" Reading file %s" % (args.filtered_vcf))
filtered_records = read_vcf_file(args.filtered_vcf)[1]
# Compute connected components:
eprint("Computing Connected components")
construct_overlap_graph(SVSet,overlap_cutoff,left_precision,right_precision)
# Build records:
records, tools, orphans = build_records(genotypes, SVSet, true_ones_records, filtered_records, gt_quality)
#################################################
# Define cells of each sheet of the excel file: #
#################################################
# First one (SV description)
cells = {
"A1": {"text": "RESULTS", "format": {"bg_color": "#ffe856"}},
"A2": {"text": "Deletion", "format": {"bold": True}},
"A" + str(1+nb_records+3): {"text": "DIFFS", "format": {"bg_color": "#ffe856"}},
"A" + str(2+nb_records+3): {"text": "Deletion", "format": {"bold": True}}
}
cells_gt = {
"A1": {"text": "GENOTYPES", "format": {"bg_color": "#ffe856"}},
"A2": {"text": "Deletion", "format": {"bold": True}}
}
# Third one (genotype quality)
cells_gq = {
"A1": {"text": "GT QUALITY", "format": {"bg_color": "#ffe856"}},
"A2": {"text": "Deletion", "format": {"bold": True}}
}
tools = sorted(tools)
nb_tools = len(tools)
######################
# BUILD HEADER CELLS #
######################
headers, cells, cells_gt, cells_gq, max_col_len = build_header(tools, cells, cells_gt, cells_gq,
filtered_records is not None,
nb_records, max_col_len, nb_inds)
rec_keys = sorted(records.keys(), key=lambda x:svsort(x, records))
####################
# BUILD BODY CELLS #
####################
for rec_id in rec_keys:
record = records[rec_id]
my_start = record["start"]
my_end = record["end"]
my_length = record["length"]
my_genotypes = record["genotypes"]
# Real data of the simulation:
cells, cells_gt, cells_gq, max_col_len = fill_real_data(i, cells, cells_gt, cells_gq, max_col_len, record,
rec_id, nb_records)
is_kept = False
for tool in tools:
if tool in record["tools"]:
###########################
# IF TOOL DETECTS THE SV: #
###########################
######################
# SET FILTERED DATA: #
######################
is_kept = True
sv_format = {"bg_color": color_is_kept}
# SV data (sheet 1):
cells, max_col_len = fill_tool_data(i, 1+((nb_tools+1)*3), cells, max_col_len,
record["tools"][tool], nb_records, my_start, my_end, my_length,
{"bg_color": color_col_filter})
# Genotype (sheets 2&3):
cells_gt, cells_gq = fill_genotypes_data(i, 1 + ((nb_tools + 1) * nb_inds), cells_gt, cells_gq,
record["tools"][tool], my_genotypes)
#################
# SET TOOL DATA #
#################
# SV data (sheet 1):
cells, max_col_len = fill_tool_data(i, j, cells, max_col_len,
record["tools"][tool], nb_records, my_start, my_end, my_length,
sv_format)
# Genotype (sheets 2&3):
cells_gt, cells_gq = fill_genotypes_data(i, g, cells_gt, cells_gq,
record["tools"][tool], my_genotypes)
###############################
# TOOL DOES NOT DETECT THE SV #
###############################
# noinspection PyUnresolvedReferences
cells[xlsx_cols[j + k] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
cells[xlsx_cols[j + k] + str(i+nb_records+3)] = {"text": "", "format": {"bg_color": color_not_found}}
# Genotype:
for gt in range(0, nb_inds):
# noinspection PyUnresolvedReferences
cells_gt[xlsx_cols[g + gt] + str(i)] = cells_gq[xlsx_cols[g + gt] + str(i)] = \
{"text": "", "format": {"bg_color": color_not_found_2}}
###############################################################################
# Until we have filled all tools, check if the record is kept after filtering: #
###############################################################################
# SV is kept: color bg in green in the corresponding tool
cells[xlsx_cols[0] + str(i)]["format"]["bg_color"] = \
cells[xlsx_cols[0] + str(i+nb_records+3)]["format"]["bg_color"] = color_is_kept
elif filtered_records is not None:
# SV does not pass the filter: color bg in red in the filter column
cells[xlsx_cols[1 + ((nb_tools + 1) * 3)] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 1] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 2] + str(i)] = {"text": "", "format": {"bg_color": color_not_found}}
cells[xlsx_cols[1 + ((nb_tools + 1) * 3)] + str(i + nb_records + 3)] = {"text": "",
"format": {"bg_color": color_not_found}}
cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 1] + str(i + nb_records + 3)] = {
"text": "", "format": {"bg_color": color_not_found}}
cells[xlsx_cols[1 + ((nb_tools + 1) * 3) + 2] + str(i + nb_records + 3)] = {
"text": "", "format": {"bg_color": color_not_found}}
# Color in black in the filter column
for gt in range(0, nb_inds):
cells_gt[xlsx_cols[1 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = \
cells_gq[xlsx_cols[1 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = {"text": "", "format":
if re.match(r"^orphan_\d+$", rec_id):
cells[xlsx_cols[0] + str(i)]["format"]["bg_color"] = \
cells[xlsx_cols[0] + str(i+nb_records+3)]["format"]["bg_color"] = color_false_positive
i += 1
# Create document:
if not args.no_xls:
create_xls_document(args, headers, filtered_records is not None, nb_records, nb_inds, cells, cells_gt, cells_gq,
max_col_len)
# Create CSV files:
create_tsv_file(args.output + "_sv_per_tools.tsv", headers, cells,
nb_tools + (2 if filtered_records is not None else 1),
3, (2, nb_records+2))
create_tsv_file(args.output + "_sv_diffs_per_tools.tsv", headers, cells,
nb_tools + (2 if filtered_records is not None else 1),
3, (2+nb_records+3, nb_records * 2 + 5))
create_tsv_file(args.output + "_sv_genotypes_per_tools.tsv", headers, cells_gt,
nb_tools + (2 if filtered_records is not None else 1),
nb_inds, (2, nb_records + 2))
create_tsv_file(args.output + "_sv_genotypes_quality_per_tools.tsv", headers, cells_gq,
nb_tools + (2 if filtered_records is not None else 1),
nb_inds, (2, nb_records + 2))
print("")
print("###########")
print("# RESULTS #")
print("###########")
print("")
print(str(nb_records) + " Results found")
print(str(orphans) + " False Positive")
print("")
if not args.no_xls:
print("Results saved in :\n\t- " + args.output + ".xslx")
else:
print("Results:")
print("")
print("TSV files:")
print("\t- " + args.output + "_sv_per_tools.tsv")
print("\t- " + args.output + "_sv_diffs_per_tools.tsv")
print("\t- " + args.output + "_sv_genotypes_per_tools.tsv")
print("\t- " + args.output + "_sv_genotypes_quality_per_tools.tsv")
print("")
# initialize the script
if __name__ == '__main__':
try:
sys.exit(main())
except:
raise