Skip to content
Snippets Groups Projects
Commit 09249c14 authored by Floreal Cabanettes's avatar Floreal Cabanettes
Browse files

Allow several filter and genotype files

parent 4461fed1
No related branches found
No related tags found
No related merge requests found
......@@ -50,8 +50,8 @@ description: Build results of the simulated data detection")
parser.add_argument('-v', '--vcf', type=str, required=True, help='folder containing all vcf results files')
parser.add_argument('-t', '--true-vcf', type=str, required=True, help='VCF file containing the simulated deletions')
parser.add_argument('-f', '--filtered-vcf', type=str, required=False,
help='VCF file containing the filtered results')
parser.add_argument('-g', '--genotypes', type=str, help="VCF file containing genotypes")
help='VCF file containing the filtered results', nargs='+')
parser.add_argument('-g', '--genotypes', type=str, help="VCF file containing genotypes", nargs='+')
parser.add_argument('--overlap_cutoff', type=float, default=0.5, help='cutoff for reciprocal overlap')
parser.add_argument('--left_precision', type=int, default=-1, help='left breakpoint precision')
parser.add_argument('--right_precision', type=int, default=-1, help='right breakpoint precision')
......@@ -162,7 +162,7 @@ def get_quality_color(quality):
return color_very_low_quality
def get_genotypes(genotypes_file, true_vcf_file):
def get_genotypes(genotypes_files, true_vcf_file):
"""
Get genotype of each individual for each SV
:param genotypes_file: VCF file containing genotypes
......@@ -182,12 +182,13 @@ def get_genotypes(genotypes_file, true_vcf_file):
nb_inds = len(list(genotypes.values())[0])
# Samples:
reader = VariantFile(genotypes_file)
for rec in reader:
samples = rec.samples
genotypes[rec.id] = ["/".join(map(str, samples[x]["GT"])) for x in samples_t.keys()] # Fixed: use samples keys
# from real data to keep the same order
gt_quality[rec.id] = [samples[x]["GQ"] for x in samples_t.keys()]
for genotypes_file in genotypes_files:
reader = VariantFile(genotypes_file)
for rec in reader:
samples = rec.samples
genotypes[rec.id] = ["/".join(map(str, samples[x]["GT"])) for x in samples_t.keys()] # Fixed: use samples keys
# from real data to keep the same order
gt_quality[rec.id] = [samples[x]["GQ"] for x in samples_t.keys()]
return genotypes, gt_quality, nb_inds
......@@ -625,14 +626,14 @@ def search_vcf_files(my_folder):
return vcf_files
def print_results(nb_records, orphans, with_xlsx, output, genotypes_file):
def print_results(nb_records, orphans, with_xlsx, output, do_genotype):
"""
Print list of outputs
:param nb_records: number of records {int}
:param orphans: sv found in tools but not present in real data {dict}
:param with_xlsx: build xlsx file {bool}
:param output: output prefix {str}
:param genotypes_file: genotypes file {str}
:param do_genotype: do the genotype {bool}
"""
print("")
print("###########")
......@@ -650,13 +651,13 @@ def print_results(nb_records, orphans, with_xlsx, output, genotypes_file):
print("TSV files:")
print("\t- " + output + "_sv_per_tools.tsv")
print("\t- " + output + "_sv_diffs_per_tools.tsv")
if genotypes_file:
if do_genotype:
print("\t- " + output + "_sv_genotypes_per_tools.tsv")
print("\t- " + output + "_sv_genotypes_quality_per_tools.tsv")
print("")
def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds, genotypes_file):
def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds, do_genotype):
"""
Fill cells when a tool does not detect a SV
:param cells: cells definition
......@@ -667,7 +668,7 @@ def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,
:param g: column index for genotypes and genotypes quality tables
:param nb_records: total number of records
:param nb_inds: total number of individuals
:param genotypes_file: VCF file containing genotypes
:param do_genotype: do the genotypes
:return: cells, completed
"""
for k in range(0, 3):
......@@ -677,7 +678,7 @@ def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,
"format": {"bg_color": COLOR_NOT_FOUND}}
# Genotype:
if genotypes_file:
if do_genotype:
for gt in range(0, nb_inds):
# noinspection PyUnresolvedReferences
cells_gt[XLSX_COLS[g + gt] + str(i)] = cells_gq[XLSX_COLS[g + gt] + str(i)] = \
......@@ -686,7 +687,7 @@ def fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,
def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_records, nb_inds, nb_tools, filtered_records,
genotypes_file, rec_id):
do_genotype, rec_id):
"""
Apply style of cells
:param cells: cells of the default table {dict}
......@@ -698,7 +699,7 @@ def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_record
:param nb_inds: number of individuals {int}
:param nb_tools: number of tools {int}
:param filtered_records: file containing filtered records {str}
:param genotypes_file: file containing genotypes {str}
:param do_genotype: do the genotype {bool}
:param rec_id: id of the record {str}
:return:
"""
......@@ -722,7 +723,7 @@ def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_record
"text": "", "format": {"bg_color": COLOR_NOT_FOUND}}
# Genotype:
if genotypes_file:
if do_genotype:
# Color in gray in the filter column
for gt in range(0, nb_inds):
cells_gt[XLSX_COLS[2 + ((nb_tools + 1) * nb_inds) + gt] + str(i)] = \
......@@ -739,7 +740,7 @@ def apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_record
def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells_gt, cells_gq, max_col_len,
nb_tools, genotypes_file, haploid, filtered_records):
nb_tools, do_genotype, haploid, filtered_records):
i = 3
for rec_id in rec_keys:
record = records[rec_id]
......@@ -776,7 +777,7 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
record["tools"][tool], nb_records, my_start, my_end, my_length,
{"bg_color": COLOR_COL_FILTER})
if genotypes_file:
if do_genotype:
# Genotype (sheets 2&3):
cells_gt, cells_gq = fill_genotypes_data(i, 2 + ((nb_tools + 1) * nb_inds), cells_gt, cells_gq,
record["tools"][tool], my_genotypes, haploid)
......@@ -792,7 +793,7 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
record["tools"][tool], nb_records, my_start, my_end, my_length,
sv_format)
if genotypes_file:
if do_genotype:
# Genotype (sheets 2&3):
cells_gt, cells_gq = fill_genotypes_data(i, g, cells_gt, cells_gq,
record["tools"][tool], my_genotypes, haploid)
......@@ -802,7 +803,7 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
# TOOL DOES NOT DETECT THE SV #
###############################
cells, cells_gt, cells_gq = fill_cells_no_tools(cells, cells_gt, cells_gq, i, j, g, nb_records, nb_inds,
genotypes_file)
do_genotype)
j += 3
g += nb_inds
......@@ -811,7 +812,7 @@ def build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells, cells
# Until we have filled all tools, check if the record is kept after filtering: #
###############################################################################
cells, cells_gt, cells_gq = apply_style_of_filter_cells(cells, cells_gt, cells_gq, i, is_kept, nb_records,
nb_inds, nb_tools, filtered_records, genotypes_file,
nb_inds, nb_tools, filtered_records, do_genotype,
rec_id)
i += 1
......@@ -825,7 +826,7 @@ def build_xlsx_cols():
XLSX_COLS.append(alp + j)
def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, overlap_cutoff=0.5,
def init(output, vcf_folder, true_vcf, filtered_vcfs=None, genotypes_files=None, overlap_cutoff=0.5,
left_precision=sys.maxsize, right_precision=sys.maxsize, no_xls=False, haploid=False):
build_xlsx_cols()
......@@ -834,8 +835,10 @@ def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, o
nb_inds = 0
if genotypes_file:
genotypes, gt_quality, nb_inds = get_genotypes(genotypes_file, true_vcf)
if genotypes_files:
genotypes, gt_quality, nb_inds = get_genotypes(genotypes_files, true_vcf)
do_genotype = genotypes_files is not None
filenames = search_vcf_files(vcf_folder)
......@@ -853,11 +856,12 @@ def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, o
eprint(" Reading file %s" % true_ones)
sv_set_to, true_ones_records = read_vcf_file(true_ones)
sv_set += sv_set_to
filtered_records = None
filtered_records = []
if filtered_vcf:
eprint(" Reading file %s" % filtered_vcf)
filtered_records = read_vcf_file(filtered_vcf)[1]
if filtered_vcfs:
for filtered_vcf in filtered_vcfs:
eprint(" Reading file %s" % filtered_vcf)
filtered_records += read_vcf_file(filtered_vcf)[1]
# Compute connected components:
eprint("Computing Connected components")
......@@ -890,7 +894,7 @@ def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, o
# BUILD BODY CELLS #
####################
cells, cells_gt, cells_gq, max_col_len = build_body_cells(rec_keys, records, nb_records, nb_inds, tools, cells,
cells_gt, cells_gq, max_col_len, nb_tools, genotypes_file,
cells_gt, cells_gq, max_col_len, nb_tools, do_genotype,
haploid, filtered_records)
# Create document:
......@@ -906,7 +910,7 @@ def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, o
create_tsv_file(output + "_sv_diffs_per_tools.tsv", headers, cells,
nb_tools + (2 if filtered_records is not None else 1),
3, (2 + nb_records + 3, nb_records * 2 + 5))
if genotypes_file:
if do_genotype:
create_tsv_file(output + "_sv_genotypes_per_tools.tsv", headers, cells_gt,
nb_tools + (2 if filtered_records is not None else 1),
nb_inds, (2, nb_records + 2))
......@@ -914,7 +918,7 @@ def init(output, vcf_folder, true_vcf, filtered_vcf=None, genotypes_file=None, o
nb_tools + (2 if filtered_records is not None else 1),
nb_inds, (2, nb_records + 2))
print_results(nb_records, orphans, with_xlsx, output, genotypes_file)
print_results(nb_records, orphans, with_xlsx, output, do_genotype)
def main():
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment