Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
bios4biol
bioinfo-utils
Commits
d806fc9b
Commit
d806fc9b
authored
Feb 02, 2015
by
Celine Noirot
Browse files
Add new script and some update
parent
e7f3e34f
Changes
17
Hide whitespace changes
Inline
Side-by-side
bin/blat_parser.pl
View file @
d806fc9b
...
...
@@ -4,8 +4,9 @@ use strict;
use
Pod::
Usage
;
use
Getopt::
Long
;
my
(
$man
,
$help
,
$nb_best
);
my
(
$man
,
$help
,
$focus
,
$nb_best
);
GetOptions
(
'
f=s
'
=>
\
$focus
,
'
b=i
'
=>
\
$nb_best
,
'
help
'
=>
\
$help
,
'
man
'
=>
\
$man
,
...
...
@@ -14,18 +15,16 @@ GetOptions(
pod2usage
(
1
)
if
(
$help
);
pod2usage
(
-
exitstatus
=>
0
,
-
verbose
=>
2
)
if
(
$man
);
pod2usage
(
-
exitstatus
=>
0
,
-
verbose
=>
1
)
if
(
-
t
);
$focus
||=
'
q
';
$nb_best
||=
1
;
my
(
%option
,
$line
);
do
{
$line
=
<
STDIN
>
;
}
until
$line
=~
/^\d+/
;
$option
{'
m
'}
=
1
;
$option
{'
p
'}
=
pslIsProtein
(
$line
)
||
0
;
my
%h
;
while
(
$line
)
{
my
$f
=
$focus
eq
'
q
'
?
9
:
13
;
while
(
$line
=
<
STDIN
>
)
{
next
unless
(
$line
=~
/^\d+/
);
my
@t
=
split
(
/\t/
,
$line
);
my
$pid
=
sprintf
("
%.1f
",
get_pid
(
@t
));
...
...
@@ -33,29 +32,28 @@ while ($line) {
$t
[
15
]
+=
1
;
my
$Qcov
=
sprintf
("
%.1f%
",(
abs
(
$t
[
12
]
-
$t
[
11
])
-
$t
[
5
]
+
1
)
/
$t
[
10
]
*
100
);
my
$Tcov
=
sprintf
("
%.1f%
",(
abs
(
$t
[
16
]
-
$t
[
15
])
-
$t
[
7
]
+
1
)
/
$t
[
14
]
*
100
);
my
@list
=
(
get_score
(
@t
),
$t
[
11
],
$t
[
12
],
$t
[
10
],
$Qcov
,
$pid
.
'
%
',
$t
[
8
],
$t
[
13
],
$t
[
15
],
$t
[
16
],
$t
[
14
],
$Tcov
,
$t
[
4
],
$t
[
5
],
$t
[
6
],
$t
[
7
]);
if
(
exists
(
$h
{
$t
[
9
]}))
{
scalar
(
@
{
$h
{
$t
[
9
]}})
==
$nb_best
?
push_if_better
(
\
@
{
$h
{
$t
[
9
]}},
\
@list
)
:
push_and_sort
(
\
@
{
$h
{
$t
[
9
]}},
\
@list
);
my
@list
=
(
$t
[
9
],
get_score
(
@t
),
$t
[
11
],
$t
[
12
],
$t
[
10
],
$Qcov
,
$pid
.
'
%
',
$t
[
8
],
$t
[
13
],
$t
[
15
],
$t
[
16
],
$t
[
14
],
$Tcov
,
$t
[
4
],
$t
[
5
],
$t
[
6
],
$t
[
7
]);
if
(
exists
(
$h
{
$t
[
$f
]}))
{
scalar
(
@
{
$h
{
$t
[
$f
]}})
==
$nb_best
?
push_if_better
(
\
@
{
$h
{
$t
[
$f
]}},
\
@list
)
:
push_and_sort
(
\
@
{
$h
{
$t
[
$f
]}},
\
@list
);
}
else
{
push
(
@
{
$h
{
$t
[
9
]}},
\
@list
);
push
(
@
{
$h
{
$t
[
$f
]}},
\
@list
);
}
$line
=
<
STDIN
>
;
}
print
join
("
\t
",
qw(qName score qStart qEnd qSize qCoverage identity strand tName tStart tEnd tSize tCoverage qNumInsert qBaseInsert tNumInsert tBaseInsert)
)
.
"
\n
";
foreach
my
$seq
(
sort
keys
%h
)
{
foreach
my
$match
(
@
{
$h
{
$seq
}})
{
print
join
("
\t
",
$seq
,
@$match
)
.
"
\n
";
print
join
("
\t
",
@$match
)
.
"
\n
";
}
}
sub
push_if_better
{
my
(
$first
,
$second
)
=
@_
;
return
unless
(
$
{
$$first
[
-
1
]}[
0
]
<
$$second
[
0
]);
return
unless
(
$
{
$$first
[
-
1
]}[
1
]
<
$$second
[
1
]);
$$first
[
-
1
]
=
$second
;
my
@t
=
sort
{
$$b
[
0
]
<=>
$$a
[
0
]
}
@$first
;
my
@t
=
sort
{
$$b
[
1
]
<=>
$$a
[
1
]
}
@$first
;
@$first
=
@t
;
return
;
}
...
...
@@ -63,7 +61,7 @@ sub push_if_better {
sub
push_and_sort
{
my
(
$first
,
$second
)
=
@_
;
push
(
@$first
,
$second
);
my
@t
=
sort
{
$$b
[
0
]
<=>
$$a
[
0
]
}
@$first
;
my
@t
=
sort
{
$$b
[
1
]
<=>
$$a
[
1
]
}
@$first
;
@$first
=
@t
;
return
;
}
...
...
@@ -183,9 +181,13 @@ sub pslIsProtein {
Prints the manual page and exits.
=item B<-f>
Focus best matches extraction on query (q) or target (t) [q]
=item B<-b>
Max number of best matches to
print for each query
[1]
Max number of best matches to
extract
[1]
=back
...
...
@@ -203,7 +205,7 @@ sub pslIsProtein {
=head1 VERSION
1
2
=head1 DATE
...
...
bin/build_matrix_from_CXreport.py
View file @
d806fc9b
...
...
@@ -58,10 +58,10 @@ for line in f :
if
previous_strand
!=
strand
and
int
(
previous_pos
)
+
1
==
int
(
pos
)
and
int
(
previous_c_met
)
>
0
:
C_previous_m
=
float
(
float
(
previous_c_met
)
/
(
float
(
previous_c_met
)
+
float
(
previous_c_unmet
))
*
100
)
C_m
=
float
(
float
(
c_met
)
/
(
float
(
c_met
)
+
float
(
c_unmet
))
*
100
)
if
previous_strand
=
=
"+"
:
if
previous_strand
!
=
strand
:
N_values_forward
.
append
(
C_previous_m
)
else
:
N_values_reverse
.
append
(
C_previous_m
)
N_values_reverse
.
append
(
C_m
)
fout
.
write
(
"
\t
"
.
join
([
previous_pos
,
str
(
C_previous_m
),
str
(
C_m
),
previous_strand
])
+
"
\n
"
)
(
previous_chr
,
previous_pos
,
previous_strand
,
previous_c_met
,
previous_c_unmet
,
previous_c_context
)
=
(
chr
,
pos
,
strand
,
c_met
,
c_unmet
,
c_context
)
...
...
@@ -72,4 +72,4 @@ else :
N_values_reverse
.
append
(
C_previous_m
)
print
"Correlation on forward VS reverse values : "
+
str
(
pearsonr
(
N_values_forward
,
N_values_reverse
))
fout
.
close
()
\ No newline at end of file
fout
.
close
()
bin/colstat.sh
0 → 100755
View file @
d806fc9b
#!/bin/sh
##NAME = "colstat.sh"
##SYNOPSIS = "colstat.sh [-h] [INT(default 1)]"
##DATE = "2012"
##AUTHORS = "Jean Marc Aury, Genoscope CEA. Maria Bernard"
##KEYWORDS = "compute statistcs on integer column"
##DESCRIPTION = "colstat index_col"
pg
=
`
basename
$0
`
usage
()
{
echo
USAGE :
echo
"
$pg
<field number : int>"
echo
" Return statistics from a set of value (stdin). Optionnal argument is the field where to find the dataset, 1 by default."
exit
1
}
field
=
1
while
[
$1
]
do
case
$1
in
"-h"
)
usage
;;
*
)
field
=
$1
;;
esac
shift
done
gawk
-v
field
=
$field
'
BEGIN {
if(field=="") { field=1; }
}
NF<field {
printerr("Not enough field : NF="NF" and field="field"\n");
exit;
}
NF>0 {
n++;
s+=$field;
a[n]=$field;
}
n==1 {
min=$field;
max=$field;
}
NF>0 {
if ($field<min) min=$field;
if ($field>max) max=$field;
}
END {
# Calcul de la moyenne
mean=0;
if(n > 0) { mean=s/n ; }
# calcul de l ecart type
for (i=1; i<=n; i++ ) {
x=a[i];
sum_ecart += ( (x-mean) * (x-mean ) )
}
SD=0;
if(n>0) { SD=sqrt(sum_ecart / n) ; }
# calcul de la mediane
nbelem = asort(a);
med = a[int((nbelem+1)/2)];
printf("%s %d %s %d %s %.2f %s %.2f %s %.2f %s %.2f %s %.2f\n", "Nombre=",n,"Somme=",s,"Moyenne=",mean,"SD=",SD,"max=",max, "min=", min, "Mediane=", med);
}
function printerr(message) {
printf("%s\n", message) > "/dev/fd/2";
}'
bin/count_unique_seq.pl
0 → 100755
View file @
d806fc9b
#!/usr/bin/perl
use
strict
;
use
warnings
;
use
Getopt::
Long
;
use
Pod::
Usage
;
my
%options
;
Getopt::Long::
Configure
("
no_ignorecase
");
Getopt::Long::
Configure
("
bundling
");
GetOptions
(
\
%options
,
'
fasta=s
',
# fasta file
'
fastq=s
',
# fastq file
'
output
',
# write sequences ?
'
help|h
',
# help
)
or
pod2usage
(
-
verbose
=>
1
);
pod2usage
(
-
verbose
=>
1
,
-
noperldoc
=>
1
)
if
(
exists
$options
{'
help
'});
pod2usage
(
-
message
=>
"
A fasta or fastq file must be provided !
",
-
verbose
=>
1
)
if
(
!
exists
$options
{'
fasta
'}
&&
!
exists
$options
{'
fastq
'});
# Main
my
%h
;
if
(
exists
$options
{'
fasta
'}){
open
FASTA
,
$options
{'
fasta
'};
while
(
<
FASTA
>
){
next
if
(
/^>/
);
chomp
;
if
(
exists
$h
{
$_
}){
$h
{
$_
}
++
;
}
else
{
$h
{
$_
}
=
1
;
}
}
close
FASTA
;
}
if
(
exists
$options
{'
fastq
'}){
open
FASTQ
,
$options
{'
fastq
'};
while
(
<
FASTQ
>
){
chomp
;
next
if
not
(
$_
=~
/^[actgnACTGN]+$/
);
if
(
exists
$h
{
$_
}){
$h
{
$_
}
++
;
}
else
{
$h
{
$_
}
=
1
;
}
}
close
FASTQ
;
}
if
(
exists
$options
{'
output
'}){
foreach
my
$k
(
keys
%h
){
print
$k
,"
\t
",
$h
{
$k
},"
\n
";
}
}
else
{
my
$nbr
=
keys
(
%h
);
print
$nbr
;
}
=pod
=head1 NAME
count_unique_seq.pl
=head1 SYNOPSIS
count_unique_seq.pl [--fasta FILE] [--fastq FILE] [--output] [--help]
=head1 DESCRIPTION
count_unique_seq.pl counts the number of unique sequences (remove redundancy)
outputs sequences and their occurences if --output
=head1 AUTHORS
Olivier Rue
=head1 VERSION
1
=head1 DATE
05/2013
=head1 KEYWORDS
count sequences unique fasta fastq
=head1 EXAMPLE
perl count_unique_seq.pl --fasta myfile.fa
=cut
bin/distrib_class.sh
0 → 100755
View file @
d806fc9b
#!/bin/sh
##NAME = "distrib_class.sh"
##SYNOPSIS = "distrib_class.sh [-h] [<SIZE_OF_BIN>INT(default 10)] [<COLUMN_INDEX>INT(default 1)]"
##DATE = "2012"
##AUTHORS = "Jean Marc Aury, Genoscope CEA. Maria Bernard"
##KEYWORDS = "compute histogram distribution on integer column"
##DESCRIPTION = "distrib_class.sh <window : int> <field number : int>"
pg
=
`
basename
$0
`
usage
()
{
echo
USAGE :
echo
"
$pg
<window : int> <field number : int> <field value : int>"
echo
" Return a bin distribution (y axes = number of cases) from a set of values (stdin)"
echo
" First argument is the size of the bin, 10 by default"
echo
" Second argument is the field where to find the dataset, 1 by default"
echo
" Third argument is the field of the value for each element, none by default"
exit
1
}
WIN
=
FIELD
=
FVALUE
=
while
[
$1
]
do
case
$1
in
"-h"
)
usage
;;
*
)
if
[
"
$WIN
"
=
""
]
;
then
WIN
=
$1
;
elif
[
"
$FIELD
"
=
""
]
;
then
FIELD
=
$1
;
else
FVALUE
=
$1
;
fi
;;
esac
shift
done
gawk
-v
win
=
$WIN
-v
field
=
$FIELD
-v
fvalue
=
$FVALUE
'BEGIN { if(win=="") { win=10; } if(field == "") { field=1; } mul=1; while(win < 1) { win=win*10; mul=mul*10; }} { value = 1; if(fvalue != "") { value = $(fvalue); } offset=1; if(val < 0) { offset = -1; } val = int($field*mul); if(val%win ==0 ) { idp[val]+=value; idp2[val]++; } else { idp[int((val/win)+offset)*win]+=value; idp2[int((val/win)+offset)*win]++; } tot++; } END{ for(i in idp) { if(fvalue == "") { idp2[i]=1; } print i/mul, idp[i]/idp2[i]; } }'
|
sort
-k1
,1n
bin/distrib_class_percent.sh
0 → 100755
View file @
d806fc9b
#!/bin/sh
##NAME = "distrib_class_percent.sh"
##SYNOPSIS = "distrib_class_percent.sh [-h] [<SIZE_OF_BIN>INT(default 10)] [<COLUMN_INDEX>INT(default 1)]"
##DATE = "2012"
##AUTHORS = "Jean Marc Aury, Genoscope CEA. Maria Bernard"
##KEYWORDS = "compute histogram distribution on integer column transform in proportion (percent)"
##DESCRIPTION = "distrib_class_percent <window : int> <field number : int>"
pg
=
`
basename
$0
`
usage
()
{
echo
USAGE :
echo
"
$pg
<window : int> <field number : int>"
echo
" Return a bin distribution (y axes = % of cases) from a set of values (stdin)"
echo
" First argument is the size of the bin, 10 by default"
echo
" Second argument is the field where to find the dataset, 1 by default"
exit
1
}
WIN
=
FIELD
=
while
[
$1
]
do
case
$1
in
"-h"
)
usage
;;
*
)
if
[
"
$WIN
"
=
""
]
;
then
WIN
=
$1
;
else
FIELD
=
$1
;
fi
;;
esac
shift
done
gawk
-v
win
=
$WIN
-v
field
=
$FIELD
'BEGIN { if(win=="") { win=10; } if(field == "") { field=1; } mul=1; while(win < 1) { win=win*10; mul=mul*10; }} { offset=1; if(val < 0) { offset = -1; } val = int($field*mul); if(val%win == 0 ) { idp[val]++; } else { idp[int((val/win)+offset)*win]++; } tot++; } END{ for(i in idp) { print i/mul, (idp[i]/tot)*100; } }'
|
sort
-k1
,1n
bin/extract_variants_from_VCF.pl
0 → 100755
View file @
d806fc9b
#!/usr/bin/perl
#use strict;
use
warnings
;
use
Getopt::
Long
;
use
Pod::
Usage
;
my
%options
;
my
%h_type
=
(
'
SNP
'
=>
1
,
'
INDEL
'
=>
1
,
);
Getopt::Long::
Configure
("
no_ignorecase
");
Getopt::Long::
Configure
("
bundling
");
GetOptions
(
\
%options
,
'
man
',
'
h|help
',
'
f|file=s
',
'
t|type=s
',
'
o|output=s
'
)
or
pod2usage
(
-
verbose
=>
1
);
pod2usage
(
-
verbose
=>
2
)
if
(
exists
$options
{'
man
'});
pod2usage
(
-
verbose
=>
1
)
if
(
exists
$options
{'
h
'});
pod2usage
(
-
message
=>
"
Please provide A VCF file ! (-f)
",
-
verbose
=>
1
)
if
(
!
exists
$options
{'
f
'});
pod2usage
(
-
message
=>
"
Please specify if you want to extract SNP or INDEL ! (-t)
",
-
verbose
=>
1
)
if
(
!
exists
(
$options
{'
t
'})
||
!
exists
$h_type
{
$options
{'
t
'}});
#pod2usage(-message => "Please specify the output file ! (-o)",-verbose => 1) if (! exists $options{'o'});
my
$output
=
*STDOUT
;
my
$file
=
$options
{'
f
'};
my
$type
=
$options
{'
t
'};
if
(
exists
(
$options
{'
o
'})){
$output
=
$options
{'
o
'};
open
(
$output
,
"
>
$output
")
or
die
"
can't open
$output
: $!
\n
";
}
my
@line
;
open
FILE
,
$file
or
die
"
Cannot open file
$file
\n
";
while
(
<
FILE
>
){
chomp
();
if
(
/^#/
){
print
$output
$_
,"
\n
";
next
;
}
# SNP case
if
(
$type
eq
'
SNP
'){
@line
=
split
(
/\s/
,
$_
);
if
(
(
length
(
$line
[
3
])
==
1
&&
length
(
$line
[
4
])
==
1
)
||
(
length
(
$line
[
3
])
==
1
&&
$line
[
4
]
=~
m/[ACGT],[ACGT]/
&&
length
(
$line
[
4
])
==
3
)
){
for
(
my
$i
=
0
;
$i
<=
2
;
$i
++
){
print
$output
$line
[
$i
],"
\t
";
}
for
(
my
$i
=
3
;
$i
<
scalar
(
@line
)
-
1
;
$i
++
){
print
$output
$line
[
$i
],"
\t
";
}
print
$output
$line
[
scalar
(
@line
)
-
1
],"
\n
";
}
else
{
next
;
}
}
# INDEL case
else
{
@line
=
split
(
/\s/
,
$_
);
next
if
(
(
length
(
$line
[
3
])
==
1
&&
length
(
$line
[
4
])
==
1
)
||
(
length
(
$line
[
3
])
==
1
&&
$line
[
4
]
=~
m/[ACGT],[ACGT]/
&&
length
(
$line
[
4
])
==
3
)
);
for
(
my
$i
=
0
;
$i
<=
2
;
$i
++
){
print
$output
$line
[
$i
],"
\t
";
}
for
(
my
$i
=
3
;
$i
<
scalar
(
@line
)
-
1
;
$i
++
){
print
$output
$line
[
$i
],"
\t
";
}
print
$output
$line
[
scalar
(
@line
)
-
1
],"
\n
";
}
}
close
FILE
;
=pod
=head1 NAME
extract_variants_from_VCF.pl
=head1 SYNOPSIS
extract_variants_from_VCF.pl -f FILE -t [SNP|INDEL] [options]
=head1 OPTIONS
-o <FILE> : output FILE
=head1 DESCRIPTION
Extracts SNP or INDEL from a VCF file.
=head1 AUTHORS
Olivier Rue
=head1 VERSION
1
=head1 DATE
06/2012
=head1 KEYWORDS
VCF, extract, SNP, INDEL, variants
=head1 EXAMPLE
./extract_variants_from_VCF.pl -f snp_and_indel.vcf -t SNP -o only_snp.vcf
=cut
bin/fasta_split.pl
View file @
d806fc9b
...
...
@@ -2,6 +2,8 @@
use
Pod::
Usage
;
use
Getopt::
Long
;
use
FindBin
;
use
lib
$
FindBin::
RealBin
;
use
bioutils
;
BEGIN
...
...
bin/final_count_file.pl
View file @
d806fc9b
...
...
@@ -10,8 +10,8 @@ final_count.pl
final_count.pl [options]
-arg1 <Path to genome bowtie2bd fasta file (/bank2/bowtie2db/genome.fa)>
-arg2<Path to your merged.gtf file>
-arg3<Path to your project>
-arg2
<Path to your merged.gtf file>
-arg3
<Path to your project>
=head1 OPTIONS
...
...
bin/length_histogram.py
View file @
d806fc9b
...
...
@@ -3,6 +3,7 @@
from
Bio
import
SeqIO
import
sys
import
string
import
math
import
matplotlib.mlab
as
mlab
import
matplotlib.pyplot
as
plt
from
optparse
import
*
...
...
@@ -15,9 +16,11 @@ __keywords__ = "fasta histogram length"
__description__
=
"This script generate a histogram of the length of the sequences."
__version__
=
'1.0'
parser
=
OptionParser
(
usage
=
"Usage: length_histogram.py -i FILE "
)
parser
=
OptionParser
(
usage
=
"Usage: length_histogram.py -i FILE
-l
"
)
parser
.
add_option
(
"-i"
,
"--in"
,
dest
=
"fasta_file"
,
help
=
"The fasta file"
,
metavar
=
"FILE"
)
parser
.
add_option
(
"-l"
,
"--log"
,
action
=
"store_true"
,
dest
=
"log"
,
help
=
"Log10 x axis"
)
(
options
,
args
)
=
parser
.
parse_args
()
if
options
.
fasta_file
==
None
:
...
...
@@ -25,12 +28,16 @@ if options.fasta_file == None :
sys
.
exit
(
1
)
else
:
x
=
[]
for
seq_record
in
SeqIO
.
parse
(
open
(
options
.
fasta_file
,
"rU"
),
"fasta"
)
:
x
.
append
(
len
(
seq_record
.
seq
.
tostring
()))
x
=
[]
if
options
.
log
==
None
:
for
seq_record
in
SeqIO
.
parse
(
open
(
options
.
fasta_file
,
"rU"
),
"fasta"
)
:
x
.
append
(
len
(
seq_record
.
seq
.
tostring
()))
else
:
for
seq_record
in
SeqIO
.
parse
(
open
(
options
.
fasta_file
,
"rU"
),
"fasta"
)
:
x
.
append
(
math
.
log
(
len
(
seq_record
.
seq
.
tostring
()),
10
))
plt
.
hist
(
x
,
50
,
facecolor
=
'green'
,
alpha
=
0.75
)
plt
.
grid
(
True
)
plt
.
savefig
(
options
.
fasta_file
+
".lengthhist.png"
)
sys
.
exit
(
0
)
\ No newline at end of file
plt
.
hist
(
x
,
50
,
facecolor
=
'green'
,
alpha
=
0.75
)
plt
.
grid
(
True
)
plt
.
savefig
(
options
.
fasta_file
+
".lengthhist.png"
)
sys
.
exit
(
0
)
bin/not_documented/bioutils.pm
0 → 100755
View file @
d806fc9b
package
bioutils
;
use
strict
;
=head1 Description
Contains some usefull functions such as
=head2 reverse(seq)
seq is a string, this function returns another string with all characters in reverse order
=head2 complement(seq)
seq is a flat nucleic acid sequence stored in a string,
this function returns the complement of the sequence (in uppercase)
=head2 translate(seq)
seq is a flat nucleic acid sequence stored in a string,
this function returns the proteic translation of this sequence (in uppercase)
=head2 fasta2flat(file) fasta2flat(file, AC)
file is the path of a FASTA formated file.
This function returns the first flat sequence of this file or the one with the given
accession number AC (an accession number is a valid identifier WITHOUT internal blanks)
=head2 flat2fasta(file, AC, seq, line_length)
This function APPENDS a sequence to a FASTA formatted file (file),
creates a header line with AC and formats the sequence (seq) with line_length375
characters per lines
=head2 fasta_cut(file, begin, end)
This function reads a ONE ENTRY FASTA file and return the sub sequence from begin to end
=head2 fasta_length(file)
This function reads a ONE ENTRY FASTA file and return the length of the sequence
=head2 flat2gb(file, LOCUS, seq)
This function CREATE a GenBank formated file containing the sequence
given as parameter, and labelled with LOCUS.