Commit d806fc9b authored by Celine Noirot's avatar Celine Noirot
Browse files

Add new script and some update

parent e7f3e34f
......@@ -4,8 +4,9 @@ use strict;
use Pod::Usage;
use Getopt::Long;
my ($man, $help, $nb_best);
my ($man, $help, $focus, $nb_best);
GetOptions(
'f=s' => \$focus,
'b=i' => \$nb_best,
'help' => \$help,
'man' => \$man,
......@@ -14,18 +15,16 @@ GetOptions(
pod2usage(1) if ($help);
pod2usage(-exitstatus => 0, -verbose => 2) if ($man);
pod2usage(-exitstatus => 0, -verbose => 1) if (-t);
$focus ||= 'q';
$nb_best ||= 1;
my (%option, $line);
do {
$line = <STDIN>;
} until $line =~ /^\d+/;
$option{'m'} = 1;
$option{'p'} = pslIsProtein($line) || 0;
my %h;
while ($line) {
my $f = $focus eq 'q' ? 9 : 13;
while ($line = <STDIN>) {
next unless ($line =~ /^\d+/);
my @t = split(/\t/,$line);
my $pid = sprintf("%.1f",get_pid(@t));
......@@ -33,29 +32,28 @@ while ($line) {
$t[15] += 1;
my $Qcov = sprintf("%.1f%",(abs($t[12]-$t[11])-$t[5]+1)/$t[10]*100);
my $Tcov = sprintf("%.1f%",(abs($t[16]-$t[15])-$t[7]+1)/$t[14]*100);
my @list = (get_score(@t),$t[11],$t[12],$t[10],$Qcov,$pid.'%',$t[8],$t[13],$t[15],$t[16],$t[14],$Tcov,$t[4],$t[5],$t[6],$t[7]);
if (exists($h{$t[9]})) {
scalar(@{$h{$t[9]}}) == $nb_best ? push_if_better(\@{$h{$t[9]}}, \@list) : push_and_sort(\@{$h{$t[9]}},\@list);
my @list = ($t[9],get_score(@t),$t[11],$t[12],$t[10],$Qcov,$pid.'%',$t[8],$t[13],$t[15],$t[16],$t[14],$Tcov,$t[4],$t[5],$t[6],$t[7]);
if (exists($h{$t[$f]})) {
scalar(@{$h{$t[$f]}}) == $nb_best ? push_if_better(\@{$h{$t[$f]}}, \@list) : push_and_sort(\@{$h{$t[$f]}},\@list);
}
else {
push(@{$h{$t[9]}},\@list);
push(@{$h{$t[$f]}},\@list);
}
$line = <STDIN>;
}
print join("\t",qw(qName score qStart qEnd qSize qCoverage identity strand tName tStart tEnd tSize tCoverage qNumInsert qBaseInsert tNumInsert tBaseInsert))."\n";
foreach my $seq (sort keys %h) {
foreach my $match (@{$h{$seq}}) {
print join("\t",$seq,@$match)."\n";
print join("\t",@$match)."\n";
}
}
sub push_if_better {
my ($first, $second) = @_;
return unless (${$$first[-1]}[0] < $$second[0]);
return unless (${$$first[-1]}[1] < $$second[1]);
$$first[-1] = $second;
my @t = sort { $$b[0] <=> $$a[0] } @$first;
my @t = sort { $$b[1] <=> $$a[1] } @$first;
@$first = @t;
return;
}
......@@ -63,7 +61,7 @@ sub push_if_better {
sub push_and_sort {
my ($first, $second) = @_;
push(@$first, $second);
my @t = sort { $$b[0] <=> $$a[0] } @$first;
my @t = sort { $$b[1] <=> $$a[1] } @$first;
@$first = @t;
return;
}
......@@ -183,9 +181,13 @@ sub pslIsProtein {
Prints the manual page and exits.
=item B<-f>
Focus best matches extraction on query (q) or target (t) [q]
=item B<-b>
Max number of best matches to print for each query [1]
Max number of best matches to extract [1]
=back
......@@ -203,7 +205,7 @@ sub pslIsProtein {
=head1 VERSION
1
2
=head1 DATE
......
......@@ -58,10 +58,10 @@ for line in f :
if previous_strand != strand and int(previous_pos)+1 == int(pos) and int(previous_c_met)>0:
C_previous_m=float(float(previous_c_met)/(float(previous_c_met)+float(previous_c_unmet))*100)
C_m=float(float(c_met)/(float(c_met)+float(c_unmet))*100)
if previous_strand == "+" :
if previous_strand != strand :
N_values_forward.append(C_previous_m)
else :
N_values_reverse.append(C_previous_m)
N_values_reverse.append(C_m)
fout.write( "\t".join([previous_pos,str(C_previous_m),str(C_m),previous_strand])+"\n" )
(previous_chr,previous_pos,previous_strand,previous_c_met,previous_c_unmet,previous_c_context)=(chr,pos,strand,c_met,c_unmet,c_context)
......@@ -72,4 +72,4 @@ else :
N_values_reverse.append(C_previous_m)
print "Correlation on forward VS reverse values : " + str(pearsonr(N_values_forward,N_values_reverse))
fout.close()
\ No newline at end of file
fout.close()
#!/bin/sh
##NAME = "colstat.sh"
##SYNOPSIS = "colstat.sh [-h] [INT(default 1)]"
##DATE = "2012"
##AUTHORS = "Jean Marc Aury, Genoscope CEA. Maria Bernard"
##KEYWORDS = "compute statistcs on integer column"
##DESCRIPTION = "colstat index_col"
pg=`basename $0`
usage() {
echo USAGE :
echo " $pg <field number : int>"
echo " Return statistics from a set of value (stdin). Optionnal argument is the field where to find the dataset, 1 by default."
exit 1
}
field=1
while [ $1 ]
do
case $1 in
"-h") usage ;;
*) field=$1 ;;
esac
shift
done
gawk -v field=$field '
BEGIN {
if(field=="") { field=1; }
}
NF<field {
printerr("Not enough field : NF="NF" and field="field"\n");
exit;
}
NF>0 {
n++;
s+=$field;
a[n]=$field;
}
n==1 {
min=$field;
max=$field;
}
NF>0 {
if ($field<min) min=$field;
if ($field>max) max=$field;
}
END {
# Calcul de la moyenne
mean=0;
if(n > 0) { mean=s/n ; }
# calcul de l ecart type
for (i=1; i<=n; i++ ) {
x=a[i];
sum_ecart += ( (x-mean) * (x-mean ) )
}
SD=0;
if(n>0) { SD=sqrt(sum_ecart / n) ; }
# calcul de la mediane
nbelem = asort(a);
med = a[int((nbelem+1)/2)];
printf("%s %d %s %d %s %.2f %s %.2f %s %.2f %s %.2f %s %.2f\n", "Nombre=",n,"Somme=",s,"Moyenne=",mean,"SD=",SD,"max=",max, "min=", min, "Mediane=", med);
}
function printerr(message) {
printf("%s\n", message) > "/dev/fd/2";
}'
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
my %options;
Getopt::Long::Configure("no_ignorecase");
Getopt::Long::Configure("bundling");
GetOptions( \%options,
'fasta=s', # fasta file
'fastq=s', # fastq file
'output', # write sequences ?
'help|h', # help
) or pod2usage(-verbose => 1);
pod2usage(-verbose => 1,-noperldoc => 1) if (exists $options{'help'});
pod2usage(-message => "A fasta or fastq file must be provided !",-verbose => 1) if (! exists $options{'fasta'} && ! exists $options{'fastq'});
# Main
my %h;
if (exists $options{'fasta'}){
open FASTA,$options{'fasta'};
while(<FASTA>){
next if (/^>/);
chomp;
if (exists $h{$_}){
$h{$_}++;
}
else{
$h{$_}=1;
}
}
close FASTA;
}
if (exists $options{'fastq'}){
open FASTQ,$options{'fastq'};
while(<FASTQ>){
chomp;
next if not ($_ =~ /^[actgnACTGN]+$/);
if (exists $h{$_}){
$h{$_}++;
}
else{
$h{$_}=1;
}
}
close FASTQ;
}
if (exists $options{'output'}){
foreach my $k (keys %h){
print $k,"\t",$h{$k},"\n";
}
}
else{
my $nbr = keys (%h);
print $nbr;
}
=pod
=head1 NAME
count_unique_seq.pl
=head1 SYNOPSIS
count_unique_seq.pl [--fasta FILE] [--fastq FILE] [--output] [--help]
=head1 DESCRIPTION
count_unique_seq.pl counts the number of unique sequences (remove redundancy)
outputs sequences and their occurences if --output
=head1 AUTHORS
Olivier Rue
=head1 VERSION
1
=head1 DATE
05/2013
=head1 KEYWORDS
count sequences unique fasta fastq
=head1 EXAMPLE
perl count_unique_seq.pl --fasta myfile.fa
=cut
#!/bin/sh
##NAME = "distrib_class.sh"
##SYNOPSIS = "distrib_class.sh [-h] [<SIZE_OF_BIN>INT(default 10)] [<COLUMN_INDEX>INT(default 1)]"
##DATE = "2012"
##AUTHORS = "Jean Marc Aury, Genoscope CEA. Maria Bernard"
##KEYWORDS = "compute histogram distribution on integer column"
##DESCRIPTION = "distrib_class.sh <window : int> <field number : int>"
pg=`basename $0`
usage() {
echo USAGE :
echo " $pg <window : int> <field number : int> <field value : int>"
echo " Return a bin distribution (y axes = number of cases) from a set of values (stdin)"
echo " First argument is the size of the bin, 10 by default"
echo " Second argument is the field where to find the dataset, 1 by default"
echo " Third argument is the field of the value for each element, none by default"
exit 1
}
WIN=
FIELD=
FVALUE=
while [ $1 ]
do
case $1 in
"-h") usage ;;
*) if [ "$WIN" = "" ]; then WIN=$1; elif [ "$FIELD" = "" ]; then FIELD=$1; else FVALUE=$1; fi ;;
esac
shift
done
gawk -v win=$WIN -v field=$FIELD -v fvalue=$FVALUE 'BEGIN { if(win=="") { win=10; } if(field == "") { field=1; } mul=1; while(win < 1) { win=win*10; mul=mul*10; }} { value = 1; if(fvalue != "") { value = $(fvalue); } offset=1; if(val < 0) { offset = -1; } val = int($field*mul); if(val%win ==0 ) { idp[val]+=value; idp2[val]++; } else { idp[int((val/win)+offset)*win]+=value; idp2[int((val/win)+offset)*win]++; } tot++; } END{ for(i in idp) { if(fvalue == "") { idp2[i]=1; } print i/mul, idp[i]/idp2[i]; } }' | sort -k1,1n
#!/bin/sh
##NAME = "distrib_class_percent.sh"
##SYNOPSIS = "distrib_class_percent.sh [-h] [<SIZE_OF_BIN>INT(default 10)] [<COLUMN_INDEX>INT(default 1)]"
##DATE = "2012"
##AUTHORS = "Jean Marc Aury, Genoscope CEA. Maria Bernard"
##KEYWORDS = "compute histogram distribution on integer column transform in proportion (percent)"
##DESCRIPTION = "distrib_class_percent <window : int> <field number : int>"
pg=`basename $0`
usage() {
echo USAGE :
echo " $pg <window : int> <field number : int>"
echo " Return a bin distribution (y axes = % of cases) from a set of values (stdin)"
echo " First argument is the size of the bin, 10 by default"
echo " Second argument is the field where to find the dataset, 1 by default"
exit 1
}
WIN=
FIELD=
while [ $1 ]
do
case $1 in
"-h") usage ;;
*) if [ "$WIN" = "" ]; then WIN=$1; else FIELD=$1; fi ;;
esac
shift
done
gawk -v win=$WIN -v field=$FIELD 'BEGIN { if(win=="") { win=10; } if(field == "") { field=1; } mul=1; while(win < 1) { win=win*10; mul=mul*10; }} { offset=1; if(val < 0) { offset = -1; } val = int($field*mul); if(val%win == 0 ) { idp[val]++; } else { idp[int((val/win)+offset)*win]++; } tot++; } END{ for(i in idp) { print i/mul, (idp[i]/tot)*100; } }' | sort -k1,1n
#!/usr/bin/perl
#use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
my %options;
my %h_type = (
'SNP' => 1,
'INDEL' => 1,
);
Getopt::Long::Configure("no_ignorecase");
Getopt::Long::Configure("bundling");
GetOptions(\%options,
'man',
'h|help',
'f|file=s',
't|type=s',
'o|output=s'
)
or pod2usage(-verbose => 1);
pod2usage(-verbose => 2) if (exists $options{'man'});
pod2usage(-verbose => 1) if (exists $options{'h'});
pod2usage(-message => "Please provide A VCF file ! (-f)",-verbose => 1) if (! exists $options{'f'});
pod2usage(-message => "Please specify if you want to extract SNP or INDEL ! (-t)",-verbose => 1) if (! exists($options{'t'}) || ! exists $h_type{$options{'t'}});
#pod2usage(-message => "Please specify the output file ! (-o)",-verbose => 1) if (! exists $options{'o'});
my $output = *STDOUT;
my $file=$options{'f'};
my $type=$options{'t'};
if (exists ($options{'o'})){
$output = $options{'o'};
open($output, ">$output") or die "can't open $output: $!\n";
}
my @line;
open FILE,$file or die "Cannot open file $file\n";
while(<FILE>){
chomp();
if(/^#/){
print $output $_,"\n";
next;
}
# SNP case
if ($type eq 'SNP'){
@line = split(/\s/,$_);
if ( (length($line[3]) == 1 && length($line[4]) == 1) || (length($line[3]) == 1 && $line[4] =~ m/[ACGT],[ACGT]/ && length($line[4]) == 3) ){
for (my $i=0;$i<=2;$i++){
print $output $line[$i],"\t";
}
for (my $i=3;$i<scalar(@line)-1;$i++){
print $output $line[$i],"\t";
}
print $output $line[scalar(@line)-1],"\n";
}
else{
next;
}
}
# INDEL case
else {
@line = split(/\s/,$_);
next if ( (length($line[3]) == 1 && length($line[4]) == 1) || (length($line[3]) == 1 && $line[4] =~ m/[ACGT],[ACGT]/ && length($line[4]) == 3) );
for (my $i=0;$i<=2;$i++){
print $output $line[$i],"\t";
}
for (my $i=3;$i<scalar(@line)-1;$i++){
print $output $line[$i],"\t";
}
print $output $line[scalar(@line)-1],"\n";
}
}
close FILE;
=pod
=head1 NAME
extract_variants_from_VCF.pl
=head1 SYNOPSIS
extract_variants_from_VCF.pl -f FILE -t [SNP|INDEL] [options]
=head1 OPTIONS
-o <FILE> : output FILE
=head1 DESCRIPTION
Extracts SNP or INDEL from a VCF file.
=head1 AUTHORS
Olivier Rue
=head1 VERSION
1
=head1 DATE
06/2012
=head1 KEYWORDS
VCF, extract, SNP, INDEL, variants
=head1 EXAMPLE
./extract_variants_from_VCF.pl -f snp_and_indel.vcf -t SNP -o only_snp.vcf
=cut
......@@ -2,6 +2,8 @@
use Pod::Usage;
use Getopt::Long;
use FindBin;
use lib $FindBin::RealBin;
use bioutils;
BEGIN
......
......@@ -10,8 +10,8 @@ final_count.pl
final_count.pl [options]
-arg1 <Path to genome bowtie2bd fasta file (/bank2/bowtie2db/genome.fa)>
-arg2<Path to your merged.gtf file>
-arg3<Path to your project>
-arg2 <Path to your merged.gtf file>
-arg3 <Path to your project>
=head1 OPTIONS
......
......@@ -3,6 +3,7 @@
from Bio import SeqIO
import sys
import string
import math
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from optparse import *
......@@ -15,9 +16,11 @@ __keywords__ = "fasta histogram length"
__description__ = "This script generate a histogram of the length of the sequences."
__version__ = '1.0'
parser = OptionParser(usage="Usage: length_histogram.py -i FILE ")
parser = OptionParser(usage="Usage: length_histogram.py -i FILE -l")
parser.add_option("-i", "--in", dest="fasta_file",
help="The fasta file", metavar="FILE")
parser.add_option("-l", "--log", action="store_true", dest="log",
help="Log10 x axis")
(options, args) = parser.parse_args()
if options.fasta_file == None :
......@@ -25,12 +28,16 @@ if options.fasta_file == None :
sys.exit(1)
else :
x = []
for seq_record in SeqIO.parse(open(options.fasta_file, "rU"), "fasta") :
x.append(len(seq_record.seq.tostring()))
x = []
if options.log == None :
for seq_record in SeqIO.parse(open(options.fasta_file, "rU"), "fasta") :
x.append(len(seq_record.seq.tostring()))
else :
for seq_record in SeqIO.parse(open(options.fasta_file, "rU"), "fasta") :
x.append(math.log(len(seq_record.seq.tostring()),10))
plt.hist(x, 50, facecolor='green', alpha=0.75)
plt.grid(True)
plt.savefig(options.fasta_file+".lengthhist.png")
sys.exit(0)
\ No newline at end of file
plt.hist(x, 50, facecolor='green', alpha=0.75)
plt.grid(True)
plt.savefig(options.fasta_file+".lengthhist.png")
sys.exit(0)
package bioutils;
use strict;
=head1 Description
Contains some usefull functions such as
=head2 reverse(seq)
seq is a string, this function returns another string with all characters in reverse order
=head2 complement(seq)
seq is a flat nucleic acid sequence stored in a string,
this function returns the complement of the sequence (in uppercase)
=head2 translate(seq)
seq is a flat nucleic acid sequence stored in a string,
this function returns the proteic translation of this sequence (in uppercase)
=head2 fasta2flat(file) fasta2flat(file, AC)
file is the path of a FASTA formated file.
This function returns the first flat sequence of this file or the one with the given
accession number AC (an accession number is a valid identifier WITHOUT internal blanks)
=head2 flat2fasta(file, AC, seq, line_length)
This function APPENDS a sequence to a FASTA formatted file (file),
creates a header line with AC and formats the sequence (seq) with line_length375
characters per lines
=head2 fasta_cut(file, begin, end)
This function reads a ONE ENTRY FASTA file and return the sub sequence from begin to end
=head2 fasta_length(file)
This function reads a ONE ENTRY FASTA file and return the length of the sequence
=head2 flat2gb(file, LOCUS, seq)
This function CREATE a GenBank formated file containing the sequence
given as parameter, and labelled with LOCUS.