Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
bios4biol
bioinfo-utils
Commits
5a468bea
Commit
5a468bea
authored
Jun 23, 2014
by
no_author
Browse files
Script perl for in silico enzymatic digestion
parent
15f707f7
Changes
1
Hide whitespace changes
Inline
Side-by-side
bin/DigestEnzym_Statistics.pl
0 → 100755
View file @
5a468bea
#!/usr/bin/perl -w
#
# Give some statistic for RRBS analysis
# PLENECASSAGNES Julien (02 Apr 2014), contact: julien.plenecassagnes@gmail.com
# Example usage : perl DigestEnzym_Statistics.pl -i galGal4.fa -p toto -d enzyme.txt -l 100 -s 20 -S 40 -e 220
use
strict
;
use
Getopt::
Std
;
my
%opts
;
getopts
('
i:p:d:w:s:S:m:b:BR
',
\
%opts
);
usage
()
if
(
$opts
{
h
});
my
$infile
=
$opts
{
i
}
||
usage
();
my
$prefix
=
$opts
{
p
}
||
usage
();
my
$enzymeFile
=
$opts
{
d
}
||
usage
();
my
$length
=
$opts
{
w
}
||
100
;
my
$step
=
$opts
{
s} || 10;
my $start = $opts{S} || 40;
my $end = $opts{m}
||
200
;
my
$bedfile
=
$opts
{
b
}
||
"";
my
$both
=
$opts
{
B
}
||
0
;
my
$rrbs
=
$opts
{
R
}
||
0
;
my
$taille_genome
=
0
;
my
$nb_CG
=
0
;
# Avant tout on teste si les bedtools sont dans le path de l'utilisateur
my
$retour
=
system
"
bedtools > /dev/null
";
if
(
$retour
!=
0
){
print
("
Please put bedtools in your path for use this script.
\n
");
exit
1
;
}
##### Création des répertoire de sortie
my
$tmpDirectory
=
"
tmp_
"
.
$prefix
;
my
$outDirectory
=
$prefix
.
"
_output
";
system
"
mkdir
$tmpDirectory
";
system
"
mkdir
$outDirectory
";
########################################################################
# POURCENTAGE DE COUVERTURE DU GENOME #
########################################################################
print
("
########### Processing parameters... ###########
\n
");
print
("
Reference genome:
$infile
\n
");
print
("
Prefix:
$prefix
\n
");
print
("
Enzyme filename:
$enzymeFile
\n
");
print
("
Window length:
$length
\n
");
print
("
Step:
$step
\n
");
print
("
Start in size:
$start
\n
");
print
("
Stop in size:
$end
\n
");
if
(
$bedfile
ne
""){
print
("
Your input bed/vcf file:
$bedfile
\n
");
}
if
(
int
(
$both
)
==
1
){
print
("
Both-strand option: enable
\n
");
}
else
{
print
("
Both-strand option: disable
\n
");
}
# utilisation ou non en RRBS
if
(
int
(
$rrbs
)
==
1
){
print
("
RRBS: enable
\n\n
");
}
else
{
print
("
RRBS: disable
\n\n
");
}
# Creation du fichier contenant le génome de ref splité par l'enzyme de restriction
split_genome
(
$infile
,
$enzymeFile
,
"
$tmpDirectory
/genome_split.txt
");
# On détermine l'ensemble des fenetres à parcourir
my
%h_windows
=
generateWindows
(
$start
,
$end
,
$step
,
$length
);
print
("
Windows generated: DONE
\n
");
# On ouvre le fichier genome_split.txt généré par la split_genome
# chr1 2287 22 CGGCACCGCTGTGCCCTCAGCC
open
(
GENOME
,"
$tmpDirectory
/genome_split.txt
")
||
die
"
Can't open file : GENOME
";
# lecture du fichier splité ligne à ligne
while
(
my
$line
=<
GENOME
>
)
{
chomp
$line
;
# Découpage de la ligne sur la tabulation
my
(
$chr
,
$posDebut
,
$tailleFragment
)
=
split
(
/\s+/
,
$line
);
# Boucle sur chaque fenetre
foreach
my
$window
(
keys
(
%h_windows
))
{
my
(
$deb
,
$fin
)
=
split
(
/_/
,
$window
);
# si le fragment correspond a la fenetre actuelle
if
(
(
$tailleFragment
>=
$deb
)
and
(
$tailleFragment
<=
$fin
)
){
# taille totale des fragments pour une fenetre
$h_windows
{
$window
}[
0
]
+=
$tailleFragment
;
# nombre de fragment pour une fenetre
$h_windows
{
$window
}[
1
]
+=
1
;
# Ecriture à la fin du bed pour les fragments de $deb $fin (ex : 40_220)
open
(
BED
,"
>>
$tmpDirectory
/
$window
.bed
")
||
die
"
Can't open file :
$tmpDirectory
/
$window
.bed
";
my
$posEnd
=
$posDebut
+
(
$tailleFragment
-
1
);
print
BED
("
$chr
\t
$posDebut
\t
$posEnd
\n
"
);
close
BED
;
}
}
# Taille totale du génome
$taille_genome
+=
$tailleFragment
;
}
close
GENOME
;
print
("
Fragment processed: DONE
\n
");
########################################################################
# POURCENTAGE DE COUVERTURE DES CG EN RRBS #
########################################################################
# Si on veut traiter les fichiers en RRBS on calcule le nombre de CG dans
# le génome de référence
if
(
int
(
$rrbs
)
==
1
){
# Création du fichier tmp d'enzyme CG
open
(
ENZ
,"
>
","
$tmpDirectory
/enzymeCG.txt
")
||
die
"
Can't open file :
$tmpDirectory
/enzymeCG.txt
";
print
ENZ
("
cutCG
\t
CG
\t
C*G
");
close
ENZ
;
# On découpe le génome sur l'enzyme C*G
split_genome
(
$infile
,
"
$tmpDirectory
/enzymeCG.txt
",
"
$tmpDirectory
/genome_split_CG.txt
");
open
(
GENOMECG
,"
<
","
$tmpDirectory
/genome_split_CG.txt
")
||
die
"
Can't open file :
$tmpDirectory
/genome_split_CG.txt
";
open
(
BEDCG
,"
>
","
$tmpDirectory
/CG.bed
")
||
die
"
Can't open file :
$tmpDirectory
/CG.bed
";
# Lecture ligne a ligne du fichier genome_split_CG.txt pour
# compter le nombre de CG dans le génome de référence
while
(
my
$line
=
<
GENOMECG
>
)
{
chomp
$line
;
# Découpage de la ligne
my
$chr
=
"";
my
$posDebut
=
0
;
my
$tailleFragment
=
0
;
(
$chr
,
$posDebut
,
$tailleFragment
)
=
split
(
/\s+/
,
$line
);
# Ecriture dans le fichier bed pour le CG en cours en fonction de l'option both strand
if
(
int
(
$both
)
!=
1
){
if
(((
$posDebut
-
1
)
!=
0
)
and
((
$posDebut
-
1
)
!=
1
)){
print
BEDCG
(
$chr
.
"
\t
"
.
(
$posDebut
-
1
)
.
"
\t
"
.
(
$posDebut
-
1
)
.
"
\n
"
);
# On compte le nombre de CG sur un seul brin
$nb_CG
+=
1
;
}
}
else
{
if
(((
$posDebut
-
1
)
!=
0
)
and
((
$posDebut
-
1
)
!=
1
)){
print
BEDCG
(
$chr
.
"
\t
"
.
(
$posDebut
-
1
)
.
"
\t
"
.
(
$posDebut
-
1
)
.
"
\n
"
.
$chr
.
"
\t
"
.
$posDebut
.
"
\t
"
.
$posDebut
.
"
\n
");
# On compte le nombre de CG sur les deux brins
$nb_CG
+=
2
;
}
}
}
close
BEDCG
;
close
GENOMECG
;
print
("
CG processed: DONE
\n
");
}
########################################################################
# STATISTIQUES #
########################################################################
my
$outfile
=
"
$outDirectory
/results.txt
";
print
("
Write statistics in output file:
$outfile
...
\n
");
open
(
OUTPUT
,"
>
$outfile
")
||
die
"
Can't open file :
$outfile
";
# En-tete du TSV de résultats
# sans bedfile, avec RRBS
if
(
$bedfile
eq
""
and
int
(
$rrbs
)
==
1
){
print
OUTPUT
("
#Windows
\t
nbFragment
\t
FragmentSize
\t
%GenomeCoverage
\t
nbCG
\t
%CGcoverage
\n
");
print
("
#Windows
\t
nbFragment
\t
FragmentSize
\t
%GenomeCoverage
\t
nbCG
\t
%CGcoverage
\n
");
# sans bedfile, sans RRBS
}
elsif
(
$bedfile
eq
""
and
int
(
$rrbs
)
==
0
){
print
OUTPUT
("
#Windows
\t
nbFragment
\t
FragmentSize
\t
%GenomeCoverage
\n
");
print
("
#Windows
\t
nbFragment
\t
FragmentSize
\t
%GenomeCoverage
\n
");
# avec bedfile, avec RRBS
}
elsif
(
$bedfile
ne
""
and
int
(
$rrbs
)
==
1
){
print
OUTPUT
("
#Windows
\t
nbFragment
\t
FragmentSize
\t
%GenomeCoverage
\t
nbCG
\t
%CGcoverage
\t
nbBed
\t
%bedCoverage
\n
");
print
("
#Windows
\t
nbFragment
\t
FragmentSize
\t
%GenomeCoverage
\t
nbCG
\t
%CGcoverage
\t
nbBed
\t
%bedCoverage
\n
");
# avec bedfile, sans RRBS
}
elsif
(
$bedfile
ne
""
and
int
(
$rrbs
)
==
0
){
print
OUTPUT
("
#Windows
\t
nbFragment
\t
FragmentSize
\t
%GenomeCoverage
\t
nbBed
\t
%bedCoverage
\n
");
print
("
#Windows
\t
nbFragment
\t
FragmentSize
\t
%GenomeCoverage
\t
nbBed
\t
%bedCoverage
\n
");
}
# Boucle sur chaques fenetres possibles avec les paramètres données pour
# récupérer les stats de couverture du génome pour remplir le TSV
my
$nbIntersect
=
"";
my
$pourcentCG
=
0.0
;
my
$nb_candidat_bed
=
"";
my
$nbIntersectbed
=
"";
my
$pourcentbed
=
0.0
;
my
$listeIntersect
=
"";
foreach
my
$window
(
keys
(
%h_windows
))
{
# Statistique de couverture du genome
my
(
$deb
,
$fin
)
=
split
(
/_/
,
$window
);
my
$pourcentCouv
=
0
;
if
(
$h_windows
{
$window
}[
0
]
!=
0
){
$pourcentCouv
=
(
$h_windows
{
$window
}[
0
]
*
100.0
)
/
$taille_genome
;
}
# sans bedfile, avec RRBS OK
if
(
$bedfile
eq
""
and
int
(
$rrbs
)
==
1
){
# Statistique de couverture en CG
$nbIntersect
=
`
bedtools intersect -a
$tmpDirectory
/CG.bed -b
$tmpDirectory
/
$window
.bed | wc -l
`;
$pourcentCG
=
0.0
;
if
(
$nbIntersect
!=
0
){
$pourcentCG
=
(
int
(
$nbIntersect
)
*
100.0
)
/
$nb_CG
;
printf
OUTPUT
("
$deb
-
$fin
\t
$h_windows
{
$window
}[1]
\t
$h_windows
{
$window
}[0]
\t
%.2f
\t
"
.
int
(
$nbIntersect
)
.
"
\t
%.2f
\n
",
$pourcentCouv
,
$pourcentCG
);
printf
("
$deb
-
$fin
\t
$h_windows
{
$window
}[1]
\t
$h_windows
{
$window
}[0]
\t
%.2f
\t
"
.
int
(
$nbIntersect
)
.
"
\t
%.2f
\n
",
$pourcentCouv
,
$pourcentCG
);
}
# sans bedfile, sans RRBS OK
}
elsif
(
$bedfile
eq
""
and
int
(
$rrbs
)
==
0
){
printf
OUTPUT
("
$deb
-
$fin
\t
$h_windows
{
$window
}[1]
\t
$h_windows
{
$window
}[0]
\t
%.2f
\n
",
$pourcentCouv
);
printf
("
$deb
-
$fin
\t
$h_windows
{
$window
}[1]
\t
$h_windows
{
$window
}[0]
\t
%.2f
\n
",
$pourcentCouv
);
# avec bedfile, avec RRBS OK
}
elsif
(
$bedfile
ne
""
and
int
(
$rrbs
)
==
1
){
# Statistique de couverture en CG
$nbIntersect
=
`
bedtools intersect -a
$tmpDirectory
/CG.bed -b
$tmpDirectory
/
$window
.bed | wc -l
`;
$pourcentCG
=
0.0
;
if
(
$nbIntersect
!=
0
){
$pourcentCG
=
(
int
(
$nbIntersect
)
*
100.0
)
/
$nb_CG
;
# Avec bedfile
$nb_candidat_bed
=
`
bedtools sort -i
$bedfile
| uniq | wc -l
`;
$listeIntersect
=
`
bedtools intersect -a
$bedfile
-b
$tmpDirectory
/
$window
.bed >
$outDirectory
/intersection.out
`;
$nbIntersectbed
=
`
cat
$outDirectory
/intersection.out | wc -l
`;
$pourcentbed
=
0.0
;
if
(
int
(
$nb_candidat_bed
)
!=
0
){
$pourcentbed
=
(
int
(
$nbIntersectbed
)
*
100.0
)
/
int
(
$nb_candidat_bed
);
}
printf
OUTPUT
("
$deb
-
$fin
\t
$h_windows
{
$window
}[1]
\t
$h_windows
{
$window
}[0]
\t
%.2f
\t
"
.
int
(
$nbIntersect
)
.
"
\t
%.2f
\t
"
.
int
(
$nbIntersectbed
)
.
"
\t
%.2f
\n
",
$pourcentCouv
,
$pourcentCG
,
$pourcentbed
);
printf
("
$deb
-
$fin
\t
$h_windows
{
$window
}[1]
\t
$h_windows
{
$window
}[0]
\t
%.2f
\t
"
.
int
(
$nbIntersect
)
.
"
\t
%.2f
\t
"
.
int
(
$nbIntersectbed
)
.
"
\t
%.2f
\n
",
$pourcentCouv
,
$pourcentCG
,
$pourcentbed
);
}
# avec bedfile, sans RRBS
}
elsif
(
$bedfile
ne
""
and
int
(
$rrbs
)
==
0
){
# avec bedfile
$nb_candidat_bed
=
`
bedtools sort -i
$bedfile
| uniq | wc -l
`;
$listeIntersect
=
`
bedtools intersect -a
$bedfile
-b
$tmpDirectory
/
$window
.bed >
$outDirectory
/intersection.out
`;
$nbIntersectbed
=
`
cat
$outDirectory
/intersection.out | wc -l
`;
$pourcentbed
=
0.0
;
if
(
int
(
$nb_candidat_bed
)
!=
0
){
$pourcentbed
=
(
int
(
$nbIntersectbed
)
*
100.0
)
/
int
(
$nb_candidat_bed
);
}
printf
OUTPUT
("
$deb
-
$fin
\t
$h_windows
{
$window
}[1]
\t
$h_windows
{
$window
}[0]
\t
%.2f
\t
"
.
int
(
$nbIntersectbed
)
.
"
\t
%.2f
\n
",
$pourcentCouv
,
$pourcentbed
);
printf
("
$deb
-
$fin
\t
$h_windows
{
$window
}[1]
\t
$h_windows
{
$window
}[0]
\t
%.2f
\t
"
.
int
(
$nbIntersectbed
)
.
"
\t
%.2f
\n
",
$pourcentCouv
,
$pourcentbed
);
}
}
print
("
\n
");
close
(
OUTPUT
);
# Suppression de l'ensemble des fichiers temporaires
system
"
rm -rf
$tmpDirectory
";
########################################################################
# FONCTIONS #
########################################################################
#renvoi un tableau avec l'ensemble des fenetres
sub
generateWindows
{
my
(
$start
,
$end
,
$step
,
$length
)
=
@_
;
my
%h_tabFenetre
;
while
((
$start
+
$length
)
<=
$end
){
my
$cle
=
$start
.
"
_
"
.
(
$start
+
$length
);
my
@a_valeur
=
(
0
,
0
);
$h_tabFenetre
{
$cle
}
=
\
@a_valeur
;
$start
+=
$step
;
}
return
%h_tabFenetre
;
}
# fonction de patrice qui découpe le génome avec un enzyme de restriction
# spécifié dans le fichier : enzyme.txt
sub
split_genome
{
# On récupère les paramètres de la fonction
my
(
$infile
,
$enzymeFile
,
$sortie
)
=
@_
;
open
(
FIN
,"
<
",
$enzymeFile
)
||
die
"
Can't open file :
$enzymeFile
";
my
%Enzyme
=
();
while
(
my
$enz
=<
FIN
>
)
{
next
if
(
$enz
=~
/^#/
);
next
if
(
$enz
=~
/^\s*$/
);
chomp
$enz
;
my
@F
=
split
/\s+/
,
$enz
;
$#F
==
2
||
die
"
bad formated enzyme file, required three columns (enzyme_name sequence sequence_with_cut=new_lin
\n
(ex:MspI CCGG C*CGG))
";
$Enzyme
{
$F
[
0
]}{
seq
}
=
$F
[
1
];
$Enzyme
{
$F
[
0
]}{
pat
}
=
$F
[
2
];
}
close
FIN
;
open
(
FICH
,"
<
",
$infile
)
||
die
"
Can't open file :
$infile
";
open
(
OUT
,"
>
",
$sortie
)
||
die
"
Can't open file :
$sortie
";
my
$save
=
$/
;
$/
=
"
>
";
while
(
my
$seq
=
<
FICH
>
){
chomp
$seq
;
$seq
=~
s/^>//
;
my
@Lines
=
split
/\n/
,
$seq
;
$#Lines
>
0
||
next
;
my
(
$ac
)
=
(
$Lines
[
0
]
=~
/^(\S+)/
);
map
{
s/\s//g
;}
@Lines
;
$seq
=
uc
(
join
('',
@Lines
[
1
..
$#Lines
]));
foreach
my
$enz
(
keys
%Enzyme
)
{
$seq
=~
s/$Enzyme{$enz}{seq}/$Enzyme{$enz}{pat}/g
;
}
my
$pos
=
1
;
foreach
my
$frag
(
split
/\*/
,
$seq
)
{
my
$len
=
length
(
$frag
);
printf
OUT
("
%s
\t
%d
\t
%d
\n
",
$ac
,
$pos
,
$len
);
$pos
+=
$len
;
}
}
$/
=
$save
;
close
OUT
;
close
FICH
;
}
########################################################################
# UTILISATION DU PROGRAMME #
########################################################################
sub
usage
{
$
0
=~
s/.*\///
;
print
("
Description:
\n
DigestEnzym_Statistics.pl digest genome file with all enzymes in digestion file and compute number of C (in CpG context)
and generate statistics such as genome coverage, number C covered in each windows...
Differents windows correspond slinding windows from min-range fragment with windows size and step untils max-range.
If you provide a bed file this program check if yours positions are included in the fragment selection for each windows.
Schema:
length
<--------->
<--------->
...
-------|----*------|--*--------|---
min step max
Usage:
\n
required:
\n
-i[nfile reference genome (fasta)]
-p[refix (prefix for yours files)]
-d[igestion: enzyme digestion file]
Enzyme file format : MspI CCGG C*CGG
optionnal:
\n
-w[indows-size: (default: 100bp)]
-s[tep: (default: 10bp)]
-S[min-range: (default: 40bp)]
-m[max-range: (default: 200bp)]
-b[ed/vcf: (list of positions)]
-B[oth-strand: (default: disable)]
-R[RBS: (default: disable)]
\n\n
");
exit
1
;
}
=pod
=head1 NAME
DigestEnzym_Statistics.pl
=head1 SYNOPSIS
DigestEnzym_Statistics.pl [options]
=head1 DESCRIPTION
DigestEnzym_Statistics.pl digest genome file with all enzymes in digestion file and compute number of C (in CpG context)
and generate statistics such as genome coverage, number C covered in each windows...
Differents windows correspond slinding windows from min-range fragment with windows size and step untils max-range.
If you provide a bed file this program check if yours positions are included in the fragment selection for each windows.
Schema :
length
<---------->
<--------->
...
-------|----*------|--*--------|---
min step max
=head1 OPTIONS
required :
-i[nfile reference genome (fasta)]
-p[refix (prefix for yours files)]
-d[igestion: enzyme digestion file]
Enzyme file format : MspI CCGG C*CGG
optionnal :
-w[indows-size: (ex: 100bp)]
-s[tep: (ex: default bp)]
-S[min-range: (default: 40bp)]
-m[max-range: (default: 200bp)]
-b[ed/vcf: (list of positions)]
-B[oth-strand: (default: disable)]
-R[RBS: (default: disable)]
=head1 AUTHORS
PLENECASSAGNES Julien
=head1 VERSION
1
=head1 DATE
04/02/2014
=head1 KEYWORDS
RRBS enzyme-digestion methylation bsseq
=head1 EXAMPLE
perl DigestEnzym_Statistics.pl [options]
=cut
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment