Commit e6f253ea authored by Floreal Cabanettes's avatar Floreal Cabanettes
Browse files

A lot of works, not working for contigs fix

parent 55115024
#!/usr/bin/env python3
import os
import re
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from pysam import FastaFile
GAP_BLOCK_SIZE_MIN = 3
def get_valid_maps(maps):
"""
Parse diamond maps and get valid and unvalid maps
:param maps: diamond file path
:type maps: str
:return: pass matchs, filtered ones
:rtype: list, list
"""
matchs = []
bad_matchs = []
with open(maps, 'r') as mps:
......@@ -35,14 +53,24 @@ def get_valid_maps(maps):
"btop": btop
})
return matchs, bad_matchs
return matchs, bad_matchs
def get_gap_sequence(seq, pos_gap):
"""
Get gap sequence from btop
:param seq: btop sequence
:type seq: str
:param pos_gap: position of gaps in btop (odd or even chars)
:type pos_gap: int
:return: gap sequence (on sequence which has no gap)
:rtype: str
"""
i = 0
gap_seq = ""
end_seq = False
while i < len(seq) and not end_seq:
while i < len(seq)-1 and not end_seq:
if pos_gap == 0:
next_base = seq[i+1]
if seq[i] != "-":
......@@ -52,57 +80,260 @@ def get_gap_sequence(seq, pos_gap):
if seq[i+1] != "-":
end_seq = True
if not end_seq:
gap_seq += next_base
if next_base.isalpha() or next_base == "*":
gap_seq += next_base
else:
end_seq = True
i += 2
return gap_seq
def parse_btop(btop, rseq, pseq):
def parse_btop(btop):
"""
Parse btop
:param btop: btop string
:param rseq: reference sequence
:param pseq: protein sequence
:return: parsed btop
:type btop: str
:return: parsed btop: two list (one for reference, one for protein). In each position, we have what we have
for each one, at the same index of the list.
Example: FGP-47 devient : reference = ["F", "P", 47] et protein = ["G", "-", 47]
"""
reference = []
protein = []
i = 0
while i < len(btop):
char = btop[i]
if char.isalpha():
if char.isalpha() or char == "*": # It's a char (A-Z)
pchar = btop[i+1]
if pchar.isalpha():
if pchar.isalpha() or pchar == "*": # Next char is a char (A-Z)
reference.append(char)
protein.append(btop[i+1])
i += 2
else:
gap_seq = get_gap_sequence(rseq[i:], 1)
else: # Next char is a "-" (gap)
gap_seq = get_gap_sequence(btop[i:], 1)
reference.append(gap_seq)
protein.append("-" * len(gap_seq))
i += len(gap_seq) + 1
elif char.isdigit():
i += len(gap_seq) * 2
elif char.isdigit(): # It's a number (match)
j = i+1
while btop[i:j].isdigit():
while btop[i:j].isdigit() and j <= len(btop):
j += 1
j -= 1
m_size = int(btop[i:j])
reference.append(int(btop[i:j]))
protein.append(int(btop[i:j]))
i = j
elif char == "-":
gap_seq =get_gap_sequence(pseq[i:], 0)
elif char == "-": #It's a gap
gap_seq = get_gap_sequence(btop[i:], 0)
protein.append(gap_seq)
reference.append("-" * len(gap_seq))
elif char == "/":
pass # TODO
i += len(gap_seq) * 2
elif char == "/": # Frameshift
pchar = btop[i + 1]
if pchar == "-":
reference.append("/-")
protein.append("")
i += 2
else:
print("Oups! Frameshift not followed by a gap")
exit(1)
elif char == "\\":
pass # TODO
pchar = btop[i + 1]
if pchar == "-":
reference.append("\\-")
protein.append("")
i += 2
else:
print("Oups! Frameshift not followed by a gap")
exit(1)
else:
print("Unexpected case:")
print(btop)
print(i)
print(btop[i], btop[i+1])
exit(1)
return reference, protein
def parse_valid_matches(matches):
def fix_contig(reference, protein, ref_seq, id_ref, id_prot, reverse, fixed_orf_file, prots_file,
fixed_orf_file_wgaps, prots_file_wgaps, fixed_contigs_file, fixed_contigs_file_wgaps, fasta):
"""
Fix contig sequence with protein one
:param reference: reference sequence list
:type reference: list
:param protein: protein sequence list
:type protein: list
:param ref_seq: reference sequence fasta
:type ref_seq: str
:param id_ref: reference fasta id
:type id_ref: str
:param id_prot: protein fasta id
:type id_prot: str
:param: reverse: do we have to reverse complement
:type reverse: bool
:param fixed_orf_file: fixed orf fasta file path
:type fixed_orf_file: str
:param prots_file: proteins file path for matchs without gaps
:type prots_file: str
:param fixed_orf_file_wgaps: fixed orf with gaps fasta file path
:type fixed_orf_file_wgaps: str
:param prots_file_wgaps: proteins file path for matchs with gaps
:type prots_file_wgaps: str
:param fasta: FastaFile pysam object
:type fasta: FastaFile
:return: True if gaps blocks, else False
"""
contains_stop = False
gaps_block = []
position = 0
if reverse:
ref_seq = str(Seq(ref_seq).reverse_complement())
new_refseq = ""
reshifted = 0
for item in reference:
if isinstance(item, str):
if len(item) == 1 and (item.isalpha() or item == "*"):
new_refseq += ref_seq[position:position+3]
if item == "*":
contains_stop = True
position += 3
elif item == "\\-":
reshifted = 1
position += 1
elif item == "/-":
reshifted = 1
new_refseq += "N"
elif "-" in item:
len_gap = len(item)
if len_gap >= GAP_BLOCK_SIZE_MIN:
gaps_block.append((position, position + (len_gap * 3), "ref"))
new_refseq += ref_seq[position: position + (len_gap * 3)] # Remove this to remove gaps in new map
position += len_gap * 3
elif re.match("^[A-Za-z*]+$", item) is not None:
if len(item) >= GAP_BLOCK_SIZE_MIN:
gaps_block.append((position, len(item), "prot"))
new_refseq += ref_seq[position: position + (len(item) * 3)]
position += len(item) * 3
if "*" in item:
contains_stop = True
else:
raise Exception("Unexpected case: %s!" % item)
elif isinstance(item, int):
new_refseq += ref_seq[position: position + (item * 3)]
position += item * 3
else:
raise Exception("Unexpected case: %s!" % item)
if len(new_refseq) == 0:
raise Exception("Empty sequence for %s!" % id_ref)
prot_seq = Seq(new_refseq).translate()
if reverse: # Restore intial sequence
new_refseq = str(Seq(new_refseq).reverse_complement())
contig_seq = fasta.fetch(id_ref)
fixed_contig = contig_seq.replace(ref_seq, new_refseq)
if len(gaps_block) == 0:
with open(fixed_orf_file, "a") as fasta:
record = SeqRecord(Seq(new_refseq), id=id_ref, name=id_ref,
description="match_prot=%s contains_stop=%s reshifted=%s reversed=%s" %
(id_prot, contains_stop, reshifted, "1" if reverse else "0"))
SeqIO.write([record], fasta, "fasta")
with open(prots_file, "a") as fasta:
record = SeqRecord(prot_seq, id=id_ref, name=id_ref,
description="match_prot=%s contains_stop=%s reshifted=%s reversed=%s" %
(id_prot, contains_stop, reshifted, "1" if reverse else "0"))
SeqIO.write([record], fasta, "fasta")
with open(fixed_contigs_file, "a") as fasta:
record = SeqRecord(Seq(fixed_contig), id=id_ref, name=id_ref,
description="match_prot=%s contains_stop=%s reshifted=%s reversed=%s" %
(id_prot, contains_stop, reshifted, "1" if reverse else "0"))
SeqIO.write([record], fasta, "fasta")
return False
ref_gaps = []
prot_gaps = []
for gap in gaps_block:
if gap[2] == "ref":
ref_gaps.append("%d-%d" % gap[:2])
else:
prot_gaps.append("%d[%d]" % gap[:2])
with open(fixed_orf_file_wgaps, "a") as fasta:
record = SeqRecord(Seq(new_refseq), id=id_ref, name=id_ref,
description=
"match_prot=%s contains_stop=%s reshifted=%s reversed=%s ref-gaps=%s prot-gaps=%s" %
(id_prot, contains_stop, reshifted, "1" if reverse else "0", ",".join(ref_gaps),
",".join(prot_gaps)))
SeqIO.write([record], fasta, "fasta")
with open(prots_file_wgaps, "a") as fasta:
record = SeqRecord(prot_seq, id=id_ref, name=id_ref,
description=
"match_prot=%s contains_stop=%s reshifted=%s reversed=%s ref-gaps=%s prot-gaps=%s" %
(id_prot, contains_stop, reshifted, "1" if reverse else "0", ",".join(ref_gaps),
",".join(prot_gaps)))
SeqIO.write([record], fasta, "fasta")
with open(fixed_contigs_file_wgaps, "a") as fasta:
record = SeqRecord(Seq(fixed_contig), id=id_ref, name=id_ref,
description=
"match_prot=%s contains_stop=%s reshifted=%s reversed=%s ref-gaps=%s prot-gaps=%s" %
(id_prot, contains_stop, reshifted, "1" if reverse else "0", ",".join(ref_gaps),
",".join(prot_gaps)))
SeqIO.write([record], fasta, "fasta")
return True
def parse_valid_matches(matches, out_prefix, reference_fasta):
"""
Parse valid matches (the ones with 98% of coverage)
:param matches: matches list
:type matches: list
:param out_prefix: outpout prefix
:type out_prefix: str
"""
i = 0
out_file = out_prefix + "_fixed_orf.fasta"
prots_file = out_prefix + "_fixed_prots.fasta"
fix_cont = out_prefix + "_fixed_contigs.fasta"
out_file_wgaps = out_prefix + "_fixed_orf_wgaps.fasta"
prots_file_wgaps = out_prefix + "_fixed_prots_wgaps.fasta"
fix_cont_wgaps = out_prefix + "_fixed_contigs_wgaps.fasta"
n_prot_with_gaps = 0
if os.path.exists(out_file):
os.remove(out_file)
if os.path.exists(prots_file):
os.remove(prots_file)
if os.path.exists(out_file_wgaps):
os.remove(out_file_wgaps)
if os.path.exists(prots_file_wgaps):
os.remove(prots_file_wgaps)
if os.path.exists(fix_cont):
os.remove(fix_cont)
if os.path.exists(fix_cont_wgaps):
os.remove(fix_cont_wgaps)
align = FastaFile(reference_fasta)
for match in matches:
reference, protein = parse_btop(match["btop"], match["ref"]["seq"], match["prot"]["seq"])
reference, protein = parse_btop(match["btop"])
has_gaps_block = fix_contig(reference=reference, protein=protein, ref_seq=match["ref"]["seq"],
id_ref=match["ref"]["id"], id_prot=match["prot"]["id"],
reverse=match["ref"]["start"]>match["ref"]["end"], fixed_orf_file=out_file,
prots_file=prots_file, fixed_orf_file_wgaps=out_file_wgaps,
prots_file_wgaps=prots_file_wgaps, fasta=align, fixed_contigs_file=fix_cont,
fixed_contigs_file_wgaps=fix_cont_wgaps)
if has_gaps_block:
n_prot_with_gaps += 1
i += 1
print("Total number of matching proteins: %d" % len(matches))
print("Proteins filtered out (aligned with gaps block of size >= %d: %d (%.02f %%)" %
(GAP_BLOCK_SIZE_MIN, n_prot_with_gaps, n_prot_with_gaps / len(matches) * 100))
if __name__ == "__main__":
......@@ -113,3 +344,9 @@ if __name__ == "__main__":
parser.add_argument("-r", "--reference", help="Reference fasta file", required=True)
parser.add_argument("-d", "--diamond", help="Diamond mapping file", required=True)
parser.add_argument("-o", "--output-prefix", help="Output prefix", required=True)
args = parser.parse_args()
maps, bad_maps = get_valid_maps(args.diamond)
print("load OK", flush=True)
parse_valid_matches(maps, args.output_prefix, args.reference)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment