Commit 6c319bf7 authored by Celine Noirot's avatar Celine Noirot
Browse files

Add option -z zerobased and debug when file enzyme is provide

parent a76b20c4
......@@ -36,7 +36,23 @@ import seqio as seqio
GRAPH_CLASS_PARTITION = 100
MAX_FRAGMENT_LEN_IN_HISTOGRAM = 5000
ENZYME_PATH=os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","lib","enzyme.txt")
ENZYME_DICT={ "SbfI":["CCTGCAGG","CCTGCA*GG"],
"PstI":["CTGCAG","CTGCA*G"],
"NsiI":["ATGCAT","ATGCA*T"],
"NotI":["GCGGCCGC","GC*GGCCGC"],
"EagI":["CGGCCG","C*GGCCG"],
"EcoRI":["GAATTC","G*AATTC"],
"MfeI":["CAATTG","C*AATTG"],
"BamHI":["GGATCC","G*GATCC"],
"BclI":["TGATCA","T*GATCA"],
"BglII":["AGATCT","A*GATCT"],
"BbvCI":["CCTCAGC","CC*TCAGC"],
"HaeIII":["GGCC","GG*CC"],
"RsaI":["GTAC","GT*AC"],
"MspI":["CCGG","C*CGG"],
"TaqI":["TCGA","T*CGA"],
"Alu":["AGCT","AG*CT"]}
def version_string ():
"""
Return the insilicoRRBSdigestion version
......@@ -59,7 +75,7 @@ def bedtools_intersect(bedtools,external_bed,fragment_bed, output):
'''Validate with R'''
nb_cols_external_bed = nb_col_file(external_bed)
fh_out=open(output,"w")
cmd = [bedtools, "intersect", "-wo", "-a", external_bed, "-b", fragment_bed]
cmd = [bedtools, "intersect", "-wo", "-nonamecheck", "-a", external_bed, "-b", fragment_bed]
p_intersect = Popen(cmd, stdout=fh_out, stderr=PIPE)
stderr = p_intersect.communicate()[1]
if stderr.decode("utf-8") != "" :
......@@ -130,7 +146,7 @@ def get_fragment_histogram(fragments_length, bins=None):
results={"edges":edges.tolist(),"hist":hist.tolist(),"select": "exclude fragment longer than "+str(MAX_FRAGMENT_LEN_IN_HISTOGRAM)}
return results
def split_genome(fasta, enz_dict, bed=None, bed_select=None, min=30, max=500 ,bedCpG=None, double_selection=False):
def split_genome(fasta, enz_dict,zerobased, bed=None, bed_select=None, min=30, max=500 ,bedCpG=None, double_selection=False):
#Compared to http://www.restrictionmapper.org/cgi-bin/sitefind3.pl
fa_fh = open(fasta, "rU")
reader = seqio.FastaReader(fa_fh)
......@@ -153,15 +169,21 @@ def split_genome(fasta, enz_dict, bed=None, bed_select=None, min=30, max=500 ,be
pattern1 = "^"+enz_dict[keys_enz[0]][1].split('*')[1]+ ".*"+enz_dict[keys_enz[1]][1].split('*')[0]+"$"
pattern2 = "^"+enz_dict[keys_enz[1]][1].split('*')[1]+ ".*"+enz_dict[keys_enz[0]][1].split('*')[0]+"$"
pattern=pattern1+'|'+pattern2
for id, desc, seq, qual in reader:
genome_size+=len(seq)
# for each enzyme replace sequence by pattern
new_seq=seq
for key, val in enz_dict.items():
new_seq = new_seq.replace(val[0],val[1])
position=0
if zerobased :
position=0
else:
position=1
for frag in new_seq.split("*") :
if not select_fh is None:
if len(frag)>=min and len(frag) <= max:
if double_selection :
......@@ -172,19 +194,21 @@ def split_genome(fasta, enz_dict, bed=None, bed_select=None, min=30, max=500 ,be
select_fh.write("\t".join([id,str(position), str(position+len(frag)),str(len(frag))])+"\n")
fragments_len_selection.append(len(frag))
fragments_len.append(len(frag))
bed_fh.write("\t".join([id,str(position), str(position+len(frag)),str(len(frag))])+"\n")
bed_fh.write("\t".join([id,str(position), str(position+len(frag)),str(len(frag)),frag])+"\n")
position+=len(frag)
# compute position of ecah CpG
cpg_seq = seq.replace("CG","C*G")
position=0
for frag in cpg_seq.split("*") :
if position == 0 :
if seq.startswith('CG') :
bedCpG_fh.write("\t".join([id,str(position), str(position+1), "CpG"])+"\n")
else:
bedCpG_fh.write("\t".join([id,str(position), str(position+1), "CpG"])+"\n")
if zerobased :
position=-1
else:
position=0
# for all fragment but not the last one which end cannot end with CG
for frag in cpg_seq.split("*")[0:-1] :
#Count last value of fragment to get position 0-based
position+=len(frag)
bedCpG_fh.write("\t".join([id,str(position), str(position+1), "CpG"])+"\n")
nb_cpg+=1
fa_fh.close()
bed_fh.close()
......@@ -214,23 +238,26 @@ def get_available_enzymes(file):
@param file : the file with enzyme to load, format : MspI CCGG C*CGG
@return : dict of enzyme_name -> [sequence, pattern]
"""
prog_seq = re.compile("^[ACGT]*$")
prog_pattern = re.compile("^[ACGT\*]*$")
enz_fh = open(file, "rU")
enz_dict = {}
msg=""
for line in enz_fh :
a_enz=line.rstrip().split("\t")
if len(a_enz)==3:
if prog_seq.match(a_enz[1]) is not None and prog_pattern.match(a_enz[2]) is not None :
enz_dict[a_enz[0]]=a_enz[1:]
if (os.path.exists(file)) :
prog_seq = re.compile("^[ACGT]*$")
prog_pattern = re.compile("^[ACGT\*]*$")
enz_fh = open(file, "rU")
msg=""
for line in enz_fh :
a_enz=line.rstrip().split("\t")
if len(a_enz)==3:
if prog_seq.match(a_enz[1]) is not None and prog_pattern.match(a_enz[2]) is not None :
enz_dict[a_enz[0]]=a_enz[1:]
else :
msg+="Only ACGT are authorize in sequence and AC*GT in pattern, not true for "+a_enz[0]+"\n"
else :
msg+="Only ACGT are authorize in sequence and AC*GT in pattern, not true for "+a_enz[0]+"\n"
else :
msg+="Enzyme file format doesn't correspond to ENZ_NAME\tSEQTOCUT\tPATTERN*CUT\n"
print (msg)
enz_fh.close()
msg+="Enzyme file format doesn't correspond to ENZ_NAME\tSEQTOCUT\tPATTERN*CUT\n"
print (msg)
enz_fh.close()
else:
enz_dict=ENZYME_DICT
return enz_dict
def generate_bins(bins_size,min,max):
......@@ -245,7 +272,7 @@ def generate_bins(bins_size,min,max):
index=int(min)
ranges={}
bins=[index]
while ((index+bins_size-1) <= max):
while ((index+bins_size-1) < max):
ranges[str(index)]={'coord':[index,index+bins_size-1]}
bins.append(index+bins_size-1)
index=index+bins_size
......@@ -268,9 +295,6 @@ def compute_fragment_size_per_range(ranges,fragments_length):
if __name__ == "__main__":
# Get available enzyme
all_enzymes = {'None':0}
if os.path.exists(ENZYME_PATH) :
all_enzymes = get_available_enzymes(ENZYME_PATH)
parser = OptionParser(usage="Usage: insilicoRRBS.py -i FILE [options]")
usage = "usage: %prog -i file.fa -o output_directory -c file.bed"
......@@ -285,6 +309,9 @@ if __name__ == "__main__":
help="The genome to digest [fasta]", metavar="FILE")
igroup.add_option("-c", "--bed", dest="bedfile",
help="A bed file of region of interrest (eg CpG island), to get coverage information with enzymatic digestion ", metavar="FILE")
igroup.add_option("-z", "--zero-based", dest="zerobased", action="store_true",default=False,
help="Bed provided and generate are zero-based")
parser.add_option_group(igroup)
ogroup = OptionGroup(parser, "Output directory options","")
......@@ -297,7 +324,7 @@ if __name__ == "__main__":
cgroup = OptionGroup(parser, "Fragment options","")
cgroup.add_option("-e", "--enzyme", dest="enzyme", default=None, metavar="STRING",
help="Enzyme to use for digestion, if two enzyme or more, perform double digestion or more eg : -e 'MspI TaqI'\n\
Available enzyme : "+ " ".join(all_enzymes.keys()))
Available enzyme : "+ " ".join(ENZYME_DICT.keys()))
cgroup.add_option("-y", "--enzyme-file", dest="enzyme_file", default=None,
help="Provide your own enzyme file with following format :\n\
MspI CCGG C*CGG")
......@@ -314,7 +341,7 @@ if __name__ == "__main__":
bgroup = OptionGroup(parser, "Dependencies options","")
bgroup.add_option("-l", "--bedtools", dest="bedtools",
help="The bedtools program path [REQUIRED If not in path]", default='bedtools')
help="The bedtools program path [REQUIRED If not in path]", default='/usr/local/bioinfo/src/bedtools/bedtools2-2.25.0/bin/bedtools')
parser.add_option_group(bgroup)
(options, args) = parser.parse_args()
......@@ -325,7 +352,7 @@ if __name__ == "__main__":
msg="Input file is required\n"
if options.output == None:
msg="Output directory is required\n"
if options.enzyme == None and options.enzyme == None :
if options.enzyme == None and options.enzyme_file == None :
msg="Enzyme file OR enzyme name is required\n"
if options.output == None or os.path.exists(options.output):
msg="Output dir already exists\n"
......@@ -347,10 +374,11 @@ if __name__ == "__main__":
#retrieve enzyme to process
if options.enzyme_file is not None :
all_enzymes = get_available_enzymes(options.enzyme_file)
enz_dict = all_enzymes
enz_dict = get_available_enzymes(options.enzyme_file)
else :
enz_dict = ENZYME_DICT
if options.enzyme is not None :
enz_dict = get_enzymes_pattern(all_enzymes, options.enzyme.split(" "))
enz_dict = get_enzymes_pattern(enz_dict, options.enzyme.split(" "))
if options.double_selection and len(enz_dict.keys()) != 2 :
print ("ERROR option double-selection/-d can only be use with 2 enzymes selected")
parser.print_help()
......@@ -362,7 +390,7 @@ if __name__ == "__main__":
json_file=options.json_file
stats_file=os.path.join(options.output,fasta_basename+".stats.txt")
#digest genome with enzyme
fragments_length,fragments_length_selection,nb_cpg,genome_size = split_genome(options.fasta_file,enz_dict, digestion_genome_bed,digestion_selection_bed,options.min_size,options.max_size,CpG_genome_bed,options.double_selection)
fragments_length,fragments_length_selection,nb_cpg,genome_size = split_genome(options.fasta_file,enz_dict,options.zerobased, digestion_genome_bed,digestion_selection_bed,options.min_size,options.max_size,CpG_genome_bed,options.double_selection)
dict_genome={"fasta":options.fasta_file,
"size" :genome_size,
......@@ -379,6 +407,7 @@ if __name__ == "__main__":
result["fragments"]=dict_fragments
# generate bins
(bins,ranges) = generate_bins(int(options.bins_size), options.min_size, options.max_size)
compute_fragment_size_per_range(ranges,fragments_length_selection)
#Statistics of selected fragments thanks to bins
nb_base,nb_fragment = count_bp_element_in_bed(digestion_selection_bed)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment