Commit 5a468bea authored by no_author's avatar no_author
Browse files

Script perl for in silico enzymatic digestion

parent 15f707f7
#!/usr/bin/perl -w
#
# Give some statistic for RRBS analysis
# PLENECASSAGNES Julien (02 Apr 2014), contact: julien.plenecassagnes@gmail.com
# Example usage : perl DigestEnzym_Statistics.pl -i galGal4.fa -p toto -d enzyme.txt -l 100 -s 20 -S 40 -e 220
use strict;
use Getopt::Std;
my %opts;
getopts('i:p:d:w:s:S:m:b:BR', \%opts);
usage() if ( $opts{h});
my $infile = $opts{i} || usage();
my $prefix = $opts{p} || usage();
my $enzymeFile = $opts{d} || usage();
my $length = $opts{w} || 100;
my $step = $opts{s} || 10;
my $start = $opts{S} || 40;
my $end = $opts{m} || 200;
my $bedfile = $opts{b} || "";
my $both = $opts{B} || 0;
my $rrbs = $opts{R} || 0;
my $taille_genome = 0;
my $nb_CG = 0;
# Avant tout on teste si les bedtools sont dans le path de l'utilisateur
my $retour = system "bedtools > /dev/null";
if ($retour != 0){
print("Please put bedtools in your path for use this script.\n");
exit 1;
}
##### Création des répertoire de sortie
my $tmpDirectory = "tmp_".$prefix;
my $outDirectory = $prefix."_output";
system "mkdir $tmpDirectory";
system "mkdir $outDirectory";
########################################################################
# POURCENTAGE DE COUVERTURE DU GENOME #
########################################################################
print("########### Processing parameters... ###########\n");
print("Reference genome: $infile\n");
print("Prefix: $prefix\n");
print("Enzyme filename: $enzymeFile\n");
print("Window length: $length\n");
print("Step: $step\n");
print("Start in size: $start\n");
print("Stop in size: $end\n");
if ($bedfile ne ""){
print("Your input bed/vcf file: $bedfile\n");
}
if (int($both) == 1){
print("Both-strand option: enable\n");
}else{
print("Both-strand option: disable\n");
}
# utilisation ou non en RRBS
if (int($rrbs) == 1){
print("RRBS: enable\n\n");
}else{
print("RRBS: disable\n\n");
}
# Creation du fichier contenant le génome de ref splité par l'enzyme de restriction
split_genome($infile, $enzymeFile, "$tmpDirectory/genome_split.txt");
# On détermine l'ensemble des fenetres à parcourir
my %h_windows = generateWindows($start,$end,$step,$length);
print("Windows generated: DONE\n");
# On ouvre le fichier genome_split.txt généré par la split_genome
# chr1 2287 22 CGGCACCGCTGTGCCCTCAGCC
open(GENOME,"$tmpDirectory/genome_split.txt") || die "Can't open file : GENOME";
# lecture du fichier splité ligne à ligne
while(my $line=<GENOME>) {
chomp $line;
# Découpage de la ligne sur la tabulation
my ($chr, $posDebut, $tailleFragment) = split(/\s+/,$line);
# Boucle sur chaque fenetre
foreach my $window (keys(%h_windows)) {
my ($deb,$fin) = split(/_/,$window);
# si le fragment correspond a la fenetre actuelle
if ( ($tailleFragment >= $deb) and ($tailleFragment <= $fin) ){
# taille totale des fragments pour une fenetre
$h_windows{$window}[0] += $tailleFragment;
# nombre de fragment pour une fenetre
$h_windows{$window}[1] += 1;
# Ecriture à la fin du bed pour les fragments de $deb $fin (ex : 40_220)
open(BED,">>$tmpDirectory/$window.bed") || die "Can't open file : $tmpDirectory/$window.bed";
my $posEnd = $posDebut+($tailleFragment-1);
print BED ("$chr\t$posDebut\t$posEnd\n" );
close BED;
}
}
# Taille totale du génome
$taille_genome += $tailleFragment;
}
close GENOME;
print("Fragment processed: DONE\n");
########################################################################
# POURCENTAGE DE COUVERTURE DES CG EN RRBS #
########################################################################
# Si on veut traiter les fichiers en RRBS on calcule le nombre de CG dans
# le génome de référence
if (int($rrbs)==1){
# Création du fichier tmp d'enzyme CG
open(ENZ,">","$tmpDirectory/enzymeCG.txt") || die "Can't open file : $tmpDirectory/enzymeCG.txt";
print ENZ ("cutCG\tCG\tC*G");
close ENZ;
# On découpe le génome sur l'enzyme C*G
split_genome($infile, "$tmpDirectory/enzymeCG.txt", "$tmpDirectory/genome_split_CG.txt");
open(GENOMECG,"<","$tmpDirectory/genome_split_CG.txt") || die "Can't open file : $tmpDirectory/genome_split_CG.txt";
open(BEDCG,">","$tmpDirectory/CG.bed") || die "Can't open file : $tmpDirectory/CG.bed";
# Lecture ligne a ligne du fichier genome_split_CG.txt pour
# compter le nombre de CG dans le génome de référence
while(my $line = <GENOMECG>) {
chomp $line;
# Découpage de la ligne
my $chr = "";
my $posDebut = 0;
my $tailleFragment = 0;
($chr,$posDebut,$tailleFragment) = split(/\s+/, $line);
# Ecriture dans le fichier bed pour le CG en cours en fonction de l'option both strand
if (int($both) != 1){
if((($posDebut-1) != 0) and (($posDebut-1) != 1)){
print BEDCG ($chr."\t".($posDebut-1)."\t".($posDebut-1)."\n" );
# On compte le nombre de CG sur un seul brin
$nb_CG += 1;
}
}else{
if((($posDebut-1) != 0) and (($posDebut-1) != 1)){
print BEDCG ($chr."\t".($posDebut-1)."\t".($posDebut-1)."\n".$chr."\t".$posDebut."\t".$posDebut."\n");
# On compte le nombre de CG sur les deux brins
$nb_CG += 2;
}
}
}
close BEDCG;
close GENOMECG;
print("CG processed: DONE\n");
}
########################################################################
# STATISTIQUES #
########################################################################
my $outfile = "$outDirectory/results.txt";
print("Write statistics in output file: $outfile...\n");
open(OUTPUT,">$outfile") || die "Can't open file : $outfile";
# En-tete du TSV de résultats
# sans bedfile, avec RRBS
if ($bedfile eq "" and int($rrbs) == 1){
print OUTPUT ("#Windows\tnbFragment\tFragmentSize\t%GenomeCoverage\tnbCG\t%CGcoverage\n");
print ("#Windows\tnbFragment\tFragmentSize\t%GenomeCoverage\tnbCG\t%CGcoverage\n");
# sans bedfile, sans RRBS
}elsif($bedfile eq "" and int($rrbs) == 0){
print OUTPUT ("#Windows\tnbFragment\tFragmentSize\t%GenomeCoverage\n");
print ("#Windows\tnbFragment\tFragmentSize\t%GenomeCoverage\n");
# avec bedfile, avec RRBS
}elsif($bedfile ne "" and int($rrbs) == 1){
print OUTPUT ("#Windows\tnbFragment\tFragmentSize\t%GenomeCoverage\tnbCG\t%CGcoverage\tnbBed\t%bedCoverage\n");
print ("#Windows\tnbFragment\tFragmentSize\t%GenomeCoverage\tnbCG\t%CGcoverage\tnbBed\t%bedCoverage\n");
# avec bedfile, sans RRBS
}elsif($bedfile ne "" and int($rrbs) == 0){
print OUTPUT ("#Windows\tnbFragment\tFragmentSize\t%GenomeCoverage\tnbBed\t%bedCoverage\n");
print ("#Windows\tnbFragment\tFragmentSize\t%GenomeCoverage\tnbBed\t%bedCoverage\n");
}
# Boucle sur chaques fenetres possibles avec les paramètres données pour
# récupérer les stats de couverture du génome pour remplir le TSV
my $nbIntersect = "";
my $pourcentCG = 0.0;
my $nb_candidat_bed = "";
my $nbIntersectbed = "";
my $pourcentbed = 0.0;
my $listeIntersect = "";
foreach my $window (keys(%h_windows)) {
# Statistique de couverture du genome
my ($deb,$fin) = split(/_/,$window);
my $pourcentCouv = 0;
if ($h_windows{$window}[0] != 0){
$pourcentCouv = ($h_windows{$window}[0] * 100.0) / $taille_genome;
}
# sans bedfile, avec RRBS OK
if ($bedfile eq "" and int($rrbs) == 1){
# Statistique de couverture en CG
$nbIntersect = `bedtools intersect -a $tmpDirectory/CG.bed -b $tmpDirectory/$window.bed | wc -l`;
$pourcentCG = 0.0;
if ($nbIntersect != 0){
$pourcentCG = (int($nbIntersect) * 100.0) / $nb_CG;
printf OUTPUT ("$deb-$fin\t$h_windows{$window}[1]\t$h_windows{$window}[0]\t%.2f\t".int($nbIntersect)."\t%.2f\n", $pourcentCouv, $pourcentCG);
printf ("$deb-$fin\t$h_windows{$window}[1]\t$h_windows{$window}[0]\t%.2f\t".int($nbIntersect)."\t%.2f\n", $pourcentCouv, $pourcentCG);
}
# sans bedfile, sans RRBS OK
}elsif($bedfile eq "" and int($rrbs) == 0){
printf OUTPUT ("$deb-$fin\t$h_windows{$window}[1]\t$h_windows{$window}[0]\t%.2f\n",$pourcentCouv);
printf ("$deb-$fin\t$h_windows{$window}[1]\t$h_windows{$window}[0]\t%.2f\n",$pourcentCouv);
# avec bedfile, avec RRBS OK
}elsif($bedfile ne "" and int($rrbs) == 1){
# Statistique de couverture en CG
$nbIntersect = `bedtools intersect -a $tmpDirectory/CG.bed -b $tmpDirectory/$window.bed | wc -l`;
$pourcentCG = 0.0;
if ($nbIntersect != 0){
$pourcentCG = (int($nbIntersect) * 100.0) / $nb_CG;
# Avec bedfile
$nb_candidat_bed = `bedtools sort -i $bedfile | uniq | wc -l`;
$listeIntersect = `bedtools intersect -a $bedfile -b $tmpDirectory/$window.bed > $outDirectory/intersection.out`;
$nbIntersectbed = `cat $outDirectory/intersection.out | wc -l`;
$pourcentbed = 0.0;
if (int($nb_candidat_bed) != 0){
$pourcentbed = (int($nbIntersectbed) * 100.0) / int($nb_candidat_bed);
}
printf OUTPUT ("$deb-$fin\t$h_windows{$window}[1]\t$h_windows{$window}[0]\t%.2f\t".int($nbIntersect)."\t%.2f\t".int($nbIntersectbed)."\t%.2f\n", $pourcentCouv, $pourcentCG, $pourcentbed);
printf ("$deb-$fin\t$h_windows{$window}[1]\t$h_windows{$window}[0]\t%.2f\t".int($nbIntersect)."\t%.2f\t".int($nbIntersectbed)."\t%.2f\n", $pourcentCouv, $pourcentCG, $pourcentbed);
}
# avec bedfile, sans RRBS
}elsif($bedfile ne "" and int($rrbs) == 0){
# avec bedfile
$nb_candidat_bed = `bedtools sort -i $bedfile | uniq | wc -l`;
$listeIntersect = `bedtools intersect -a $bedfile -b $tmpDirectory/$window.bed > $outDirectory/intersection.out`;
$nbIntersectbed = `cat $outDirectory/intersection.out | wc -l`;
$pourcentbed = 0.0;
if (int($nb_candidat_bed) != 0){
$pourcentbed = (int($nbIntersectbed) * 100.0) / int($nb_candidat_bed);
}
printf OUTPUT ("$deb-$fin\t$h_windows{$window}[1]\t$h_windows{$window}[0]\t%.2f\t".int($nbIntersectbed)."\t%.2f\n", $pourcentCouv, $pourcentbed);
printf ("$deb-$fin\t$h_windows{$window}[1]\t$h_windows{$window}[0]\t%.2f\t".int($nbIntersectbed)."\t%.2f\n", $pourcentCouv, $pourcentbed);
}
}
print("\n");
close(OUTPUT);
# Suppression de l'ensemble des fichiers temporaires
system "rm -rf $tmpDirectory";
########################################################################
# FONCTIONS #
########################################################################
#renvoi un tableau avec l'ensemble des fenetres
sub generateWindows {
my ($start,$end,$step,$length) = @_;
my %h_tabFenetre;
while (($start+$length) <= $end){
my $cle = $start."_".($start+$length);
my @a_valeur = (0,0);
$h_tabFenetre{$cle} = \@a_valeur;
$start += $step;
}
return %h_tabFenetre;
}
# fonction de patrice qui découpe le génome avec un enzyme de restriction
# spécifié dans le fichier : enzyme.txt
sub split_genome {
# On récupère les paramètres de la fonction
my ($infile, $enzymeFile, $sortie) = @_;
open(FIN,"<",$enzymeFile) || die "Can't open file : $enzymeFile";
my %Enzyme=();
while(my $enz=<FIN>) {
next if ($enz=~/^#/);
next if ($enz=~/^\s*$/);
chomp $enz;
my @F=split/\s+/,$enz;
$#F==2||die "bad formated enzyme file, required three columns (enzyme_name sequence sequence_with_cut=new_lin\n(ex:MspI CCGG C*CGG))";
$Enzyme{$F[0]}{seq}=$F[1];
$Enzyme{$F[0]}{pat}=$F[2];
}
close FIN;
open(FICH,"<",$infile) || die "Can't open file : $infile";
open(OUT,">",$sortie) || die "Can't open file : $sortie";
my $save=$/;
$/=">";
while(my $seq = <FICH>){
chomp $seq;
$seq=~s/^>//;
my @Lines=split/\n/,$seq;
$#Lines>0||next;
my ($ac)=($Lines[0]=~/^(\S+)/);
map{s/\s//g;}@Lines;
$seq=uc(join('',@Lines[1..$#Lines]));
foreach my $enz (keys %Enzyme) {
$seq=~s/$Enzyme{$enz}{seq}/$Enzyme{$enz}{pat}/g;
}
my $pos=1;
foreach my $frag (split /\*/,$seq) {
my $len=length($frag);
printf OUT ("%s\t%d\t%d\n",$ac,$pos,$len);
$pos+=$len;
}
}
$/=$save;
close OUT;
close FICH;
}
########################################################################
# UTILISATION DU PROGRAMME #
########################################################################
sub usage {
$0 =~ s/.*\///;
print("
Description: \n
DigestEnzym_Statistics.pl digest genome file with all enzymes in digestion file and compute number of C (in CpG context)
and generate statistics such as genome coverage, number C covered in each windows...
Differents windows correspond slinding windows from min-range fragment with windows size and step untils max-range.
If you provide a bed file this program check if yours positions are included in the fragment selection for each windows.
Schema:
length
<--------->
<--------->
...
-------|----*------|--*--------|---
min step max
Usage: \n
required:\n
-i[nfile reference genome (fasta)]
-p[refix (prefix for yours files)]
-d[igestion: enzyme digestion file]
Enzyme file format : MspI CCGG C*CGG
optionnal:\n
-w[indows-size: (default: 100bp)]
-s[tep: (default: 10bp)]
-S[min-range: (default: 40bp)]
-m[max-range: (default: 200bp)]
-b[ed/vcf: (list of positions)]
-B[oth-strand: (default: disable)]
-R[RBS: (default: disable)]\n\n");
exit 1;
}
=pod
=head1 NAME
DigestEnzym_Statistics.pl
=head1 SYNOPSIS
DigestEnzym_Statistics.pl [options]
=head1 DESCRIPTION
DigestEnzym_Statistics.pl digest genome file with all enzymes in digestion file and compute number of C (in CpG context)
and generate statistics such as genome coverage, number C covered in each windows...
Differents windows correspond slinding windows from min-range fragment with windows size and step untils max-range.
If you provide a bed file this program check if yours positions are included in the fragment selection for each windows.
Schema :
length
<---------->
<--------->
...
-------|----*------|--*--------|---
min step max
=head1 OPTIONS
required :
-i[nfile reference genome (fasta)]
-p[refix (prefix for yours files)]
-d[igestion: enzyme digestion file]
Enzyme file format : MspI CCGG C*CGG
optionnal :
-w[indows-size: (ex: 100bp)]
-s[tep: (ex: default bp)]
-S[min-range: (default: 40bp)]
-m[max-range: (default: 200bp)]
-b[ed/vcf: (list of positions)]
-B[oth-strand: (default: disable)]
-R[RBS: (default: disable)]
=head1 AUTHORS
PLENECASSAGNES Julien
=head1 VERSION
1
=head1 DATE
04/02/2014
=head1 KEYWORDS
RRBS enzyme-digestion methylation bsseq
=head1 EXAMPLE
perl DigestEnzym_Statistics.pl [options]
=cut
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment