Commit 94152add authored by Maria Bernard's avatar Maria Bernard
Browse files

No commit message

No commit message
parent 8ed4535f
......@@ -15,7 +15,6 @@
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
use strict;
use warnings;
use IO::Handle;
......@@ -24,7 +23,137 @@ use PerlIO::gzip;
use Data::Dumper;
use Getopt::Long;
use Carp;
use Pod::Usage;
=pod
=head1 NAME
splitbc.pl
=head1 SYNOPSIS
splitbc.pl read_file_r1 [read_file_r2] --bcfile FILE --prefix-r1 r1.%.fq [--prefix-r2 r2.%.fq] --bol|--eol]
[--mismatches N] [--exact] [--partial N] [--trim|--trim2] [--no_adapt] [--rad STR --radTAG STR] [--help] [--quiet] [--debug]
=head1 DESCRIPTION
splitbc.pl classify fasta/fastq single or paired end reads in function of barcode forward or reverse in the first or both reads.
Also, whether reads are RAD sequence, it checks that barcode is followed by RAD tag.
=head1 OPTIONS
=over
=item B<--bcfile FILE>
Barcodes file name. (splitbc.pl --doc for details)
=item B<--prefix-r1 PREFIX>
File prefix. The sample name will be placed where the % is placed.
=item B<--prefix-r2 PREFIX>
File prefix. The sample name will be placed where the % is placed.
=item B<--bol>
Try to match barcodes at the BEGINNING of sequences.
(What biologists would call the 5' end, and programmers
would call index 0.)
=item B<--eol>
Try to match barcodes at the END of sequences.
(What biologists would call the 3' end, and programmers
would call the end of the string.). Give barcode as they appear in the sequence file, so reverse complement if needed.
NOTE: one of --bol, --eol must be specified, but not both.
=item B<--mismatches N>
Max. number of mismatches allowed. default is 1.
=item B<--exact>
Same as '--mismatches 0'. If both --exact and --mismatches are specified, '--exact' takes precedence.
=item B <--partial N>
Allow partial overlap of barcodes. (splitbc.pl --doc for details)
(Default is not partial matching)
item B<--trim>
Should the barecode be trimmed.
=item B<--trim2>
Shoud the read 2 be trimmed to have the same length as the read1
NOTE: one of --trim, --trim2 must be specified, but not both.
=item B<--no_adapt>
there is no adaptator (T or A ) between barcode and the sequence
=item B<--rad STR>
sequence are RADSeq, barcode is immediately followed by rad sequence <STR>.
NOTE: --rad implies --no_adapt
=item B<--radTAG STR>
sequence retain at the begining of the sequence after cliping.
NOTE: if define rad, radTAG must be defined to, and conversely
Barcode will be splited if they are folowed by radTAG, otherwise they will be recorded in the unmatched file.
=item B<--TAG_mismatch N>
Max. number of mismatches allowed in the radTAG sequence. default is 0.
=item B<--quiet>
Don't print counts and summary at the end of the run.
(Default is to print.)
=item B<--debug>
Print lots of useless debug information to STDERR.
=item B<--help>
This helpful help screen.
=back
=head1 AUTHORS
Jérôme Mariette
modified by Maria Bernard
=head1 VERSION
1.1
=head1 DATE
modified 21/05/2013
=head1 KEYWORDS
Barcode RAD
=head1 EXAMPLE
(Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is the barcodes file):
$0 cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\
--prefix-r1 /tmp/bla_%.txt"
=head1 CHANGELOG
1.1: 18/10/2013 Maria
construct list of best barcode to deal with equal match. In this case reads will be written in "ambiguous" files
=cut
##
## This program splits a FASTQ/FASTA file into several smaller files,
......@@ -117,7 +246,7 @@ sub parse_command_line {
my $help;
my $doc;
usage() if (scalar @ARGV==0);
pod2usage(1) if (scalar @ARGV==0);
my $result = GetOptions ( "bcfile=s" => \$barcode_file,
"eol" => \$barcodes_at_eol,
......@@ -139,7 +268,7 @@ sub parse_command_line {
"doc" => \$doc
) ;
usage() if ($help);
pod2usage(1) if ($help);
doc() if ($doc);
die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file;
......@@ -230,7 +359,7 @@ sub load_barcode_file ($) {
print STDERR "barcode\tsequence\n";
foreach my $barcoderef (@barcodes) {
my ($ident, $seq) = @{$barcoderef};
print STDERR $ident,"\t", $seq ,"\n";
print STDERR $ident,"\t", $seq ,"\t",length($seq),"\n";
}
}
}
......@@ -240,56 +369,59 @@ sub load_barcode_file ($) {
sub create_output_files {
my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
$barcodes{'unmatched'} = 1 ;
$barcodes{'ambiguous'} = 1 ;
foreach my $ident (keys %barcodes) {
if ($paired == 1) {
my $new_filename = $newfile_prefix_r1;
$new_filename =~ s/%/$ident/g;
$filenames{$ident} = $new_filename;
open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
$files{$ident} = $file ;
if (defined $rad && $ident ne "unmatched" && $ident ne "ambiguous"){
my $new_filename = $newfile_prefix_r1;
$new_filename =~ s/%/$ident/g;
$new_filename =~ s/%/${ident}_2rad/g;
$filenames{$ident."_2rad"} = $new_filename;
open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
$files{$ident."_2rad"} = $file ;
}
if ($paired == 1) {
my $new_filename_r2 = $newfile_prefix_r2;
$new_filename_r2 =~ s/%/$ident/;
$filenames{$ident} = $new_filename;
$new_filename_r2 =~ s/%/$ident/g;
$filenames{$ident."r2"} = $new_filename_r2;
open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
open my $file_r2, ">$new_filename_r2" or die "Error: failed to create output file ($new_filename_r2)\n";
$files{$ident} = $file ;
$files{$ident."r2"} = $file_r2 ;
if (defined $rad && $ident ne "unmatched"){
my $new_filename = $newfile_prefix_r1;
$new_filename =~ s/%/${ident}_2rad/g;
open my $file, ">$new_filename_r2" or die "Error: failed to create output file ($new_filename)\n";
$files{$ident."r2"} = $file ;
if (defined $rad && $ident ne "unmatched" && $ident ne "ambiguous"){
my $new_filename_r2 = $newfile_prefix_r2;
$new_filename_r2 =~ s/%/${ident}_2rad/;
$filenames{$ident."_2rad"} = $new_filename;
$new_filename_r2 =~ s/%/${ident}_2rad/g;
$filenames{$ident."_2radr2"} = $new_filename_r2;
open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
open my $file_r2, ">$new_filename_r2" or die "Error: failed to create output file ($new_filename_r2)\n";
$files{$ident."_2rad"} = $file ;
$files{$ident."_2radr2"} = $file_r2 ;
#~ warn $ident."_2rad"," ",$ident."_2radr2";
#~ warn $new_filename," ",$new_filename_r2;
}
} else {
my $new_filename = $newfile_prefix_r1;
$new_filename =~ s/%/$ident/g;
$filenames{$ident} = $new_filename;
open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
$files{$ident} = $file ;
if (defined $rad && $ident ne "unmatched"){
my $new_filename = $newfile_prefix_r1;
$new_filename =~ s/%/${ident}_2rad/g;
$filenames{$ident} = $new_filename;
open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
$files{$ident."2rad"} = $file ;
open my $file, ">$new_filename_r2" or die "Error: failed to create output file ($new_filename)\n";
$files{$ident."_2radr2"} = $file ;
}
}
}
}
sub reverse_complement {
my $dna = shift;
# reverse the DNA sequence
my $revcomp = reverse($dna);
# complement the reversed DNA sequence
$revcomp =~ tr/ACGTacgt/TGCAtgca/;
return $revcomp;
}
sub match_sequences {
my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
$barcodes{'unmatched'} = 1 ;
$barcodes{'ambiguous'} = 1 ;
#reset counters
foreach my $ident ( keys %barcodes ) {
$counts{$ident} = 0;
......@@ -301,15 +433,17 @@ sub match_sequences {
# split accotding to barcodes
do {
chomp $seq_bases;
chomp $seq_qualities;
print STDERR "sequence $seq_bases: \n" if $debug;
#~ print STDERR "sequence ".length($seq_bases)." $seq_bases: \n" if $debug;
if ($fastq_format){
chomp $seq_qualities;
}
if ($paired == 1) {
chomp $seq_bases_r2;
chomp $seq_qualities_r2;
print STDERR "sequence $seq_bases_r2: \n" if $debug;
if ($fastq_format){
chomp $seq_qualities_r2;
}
}
my $best_barcode_mismatches_count = $barcodes_length;
my $best_barcode_ident = undef;
my $sequence_fragment;
......@@ -324,12 +458,11 @@ sub match_sequences {
}
} else {
$sequence_fragment = substr $seq_bases, - $barcodes_length;
#~ warn "sequence fragment : $sequence_fragment\n" if $debug;
#~ $sequence_fragment = reverse_complement $sequence_fragment;
#~ warn "sequence fragment revcomp: $sequence_fragment \n" if $debug;
}
#~ warn $seq_bases;
#~ warn $sequence_fragment;
#~ warn $sequence_fragment_radTAG;
foreach my $barcoderef (@barcodes) {
my ($ident, $barcode) = @{$barcoderef};
......@@ -337,9 +470,11 @@ sub match_sequences {
# The barcode will be tested only against this fragment
# (no point in testing the barcode against the whole sequence)
my $mm=$barcodes_length;
# check validity of RAD.
if (defined $rad){
my $mm_rad = mismatch_count($sequence_fragment_radTAG, $radTAG) ;
if ($mm_rad< $TAG_mm+1){ $mm = mismatch_count($sequence_fragment, $barcode) ; }
if ($mm_rad<= $TAG_mm){ $mm = mismatch_count($sequence_fragment, $barcode) ; }
}else {
$mm = mismatch_count($sequence_fragment, $barcode) ;
}
......@@ -351,30 +486,39 @@ sub match_sequences {
if ( $mm < $best_barcode_mismatches_count ) {
$best_barcode_mismatches_count = $mm ;
$best_barcode_ident = $ident ;
}elsif($mm == $best_barcode_mismatches_count){
$best_barcode_ident = "ambiguous"
}
}
if ($best_barcode_mismatches_count<=$allowed_mismatches){
if ($best_barcode_mismatches_count<=$allowed_mismatches && $best_barcode_ident ne 'ambiguous'){
my $start;
my $stop;
# trim the first read
if ($trim) {
#~ warn $seq_len," ",$barcodes_length;
if ($no_adapt && $barcodes_at_bol){
$start=$barcodes_length;
$stop=$seq_len;
}elsif ($no_adapt && $barcodes_at_eol){
$start=0;
$stop=$seq_len-$barcodes_length;
$stop=$seq_len;
}elsif ($barcodes_at_bol){
$start=$barcodes_length+1;
$stop=$seq_len;
}elsif ($barcodes_at_eol){
$start=0;
$stop=$seq_len-($barcodes_length+1);
$stop=$seq_len-1;
}
warn "position of fragment to keep : $start, $stop \n" if $debug;
$seq_bases = substr $seq_bases, $start, $stop;
if ($fastq_format) {
$seq_qualities = substr $seq_qualities, $start, $stop;
}
$seq_bases = substr $seq_bases, $start, $stop;
$seq_qualities = substr $seq_qualities, $start, $stop;
}
# trim both read
if ($trim2){
my $start;
my $stop;
......@@ -385,21 +529,23 @@ sub match_sequences {
$stop2=$seq_len_r2;
}elsif ($no_adapt && $barcodes_at_eol){
$start=0;
$stop=$seq_len-$barcodes_length;
$stop2=$seq_len_r2-$barcodes_length;
$stop=$seq_len;
$stop2=$seq_len_r2;
}elsif ($barcodes_at_bol){
$start=$barcodes_length+1;
$stop=$seq_len;
$stop2=$seq_len_r2;
}elsif ($barcodes_at_eol){
$start=0;
$stop=$seq_len-($barcodes_length+1);
$stop2=$seq_len_r2-($barcodes_length+1);
$stop=$seq_len-1;
$stop2=$seq_len_r2-1;
}
$seq_bases = substr $seq_bases, $start, $stop;
$seq_qualities = substr $seq_qualities, $start, $stop;
$seq_bases_r2 = substr $seq_bases_r2, $start, $stop2;
$seq_qualities_r2 = substr $seq_qualities_r2, $start, $stop2;
$seq_bases_r2 = substr $seq_bases_r2, $start, $stop2;
if ($fastq_format){
$seq_qualities = substr $seq_qualities, $start, $stop;
$seq_qualities_r2 = substr $seq_qualities_r2, $start, $stop2;
}
}
}
......@@ -409,7 +555,8 @@ sub match_sequences {
#get the file associated with the matched barcode.
#(note: there's also a file associated with 'unmatched' barcode)
if (defined $rad && $best_barcode_ident ne 'unmatched' ){
if (defined $rad && $best_barcode_ident ne 'unmatched' && $best_barcode_ident ne 'ambiguous'){
my $nb_rad =()= ($seq_bases =~ /$rad/g);
#~ warn $seq_bases," ",$nb_rad;
$best_barcode_ident.='_2rad' if ($nb_rad > 0);
......@@ -418,12 +565,15 @@ sub match_sequences {
$counts{$best_barcode_ident}++;
my $file = $files{$best_barcode_ident};
#~ warn "$seq_name , $best_barcode_ident \n";
#~ warn $file ;
$write_r2 = 0;
write_record($file);
if ($paired == 1) {
#~ warn $best_barcode_ident."r2";
my $file = $files{$best_barcode_ident."r2"};
$write_r2 = 1;
#~ warn $best_barcode_ident."r2";
write_record($file);
}
......@@ -473,7 +623,7 @@ sub read_record
$seq_bases = $input_file_io->getline();
die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases;
$seq_len = length($seq_bases)-$barcodes_length-1;
# If using FASTQ format, read two more lines
if ($fastq_format) {
$seq_name2 = $input_file_io->getline();
......@@ -490,7 +640,7 @@ sub read_record
$seq_bases_r2 = $input_file_io_r2->getline();
die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases_r2;
$seq_len_r2 = length($seq_bases_r2)-$barcodes_length-1;
# If using FASTQ format, read two more lines
if ($fastq_format) {
$seq_name2_r2 = $input_file_io_r2->getline();
......@@ -520,7 +670,7 @@ sub write_record($)
}
} else {
my $file = shift;
croak "Bad file handle" unless defined $file;
croak "Bad file handle $seq_name" unless defined $file;
print $file $seq_name;
print $file $seq_bases,"\n";
......@@ -594,45 +744,7 @@ sub open_and_detect_input_format
die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n";
}
}
sub usage()
{
print<<EOF;
usage: $0 read_file_r1 [read_file_r2] --bcfile FILE --prefix-r1 r1.%.fq [--prefix-r2 r2.%.fq] --bol|--eol]
[--mismatches N] [--exact] [--partial N] [--trim|--trim2] [--no_adapt] [--rad STR --radTAG STR] [--help] [--quiet] [--debug]
Arguments:
--bcfile FILE - Barcodes file name. (see explanation below.)
--prefix-r1 PREFIX - File prefix. The sample name will be placed where the % is placed.
--prefix-r2 PREFIX - File prefix. The sample name will be placed where the % is placed.
--bol - Try to match barcodes at the BEGINNING of sequences.
(What biologists would call the 5' end, and programmers
would call index 0.)
--eol - Try to match barcodes at the END of sequences.
(What biologists would call the 3' end, and programmers
would call the end of the string.)
NOTE: one of --bol, --eol must be specified, but not both.
--mismatches N - Max. number of mismatches allowed. default is 1.
--exact - Same as '--mismatches 0'. If both --exact and --mismatches
are specified, '--exact' takes precedence.
--partial N - Allow partial overlap of barcodes. (see explanation below.)
(Default is not partial matching)
--trim - Should the barecode be trimmed.
--trim2 - Shoud the read 2 be trimmed to have the same length as the read1
NOTE: one of --trim, --trim2 must be specified, but not both.
--no_adapt - there is no adaptator (T or A ) between barcode and the sequence
--rad STR - sequence are RADSeq, barcode is immediately followed by rad sequence <STR>.
NOTE: --rad implies --no_adapt
--radTAG STR - sequence retain at the begining of the sequence after cliping.
NOTE: if define rad radTAG must be defined to, and conversely
--TAG_mismatch N - Max. number of mismatches allowed in the radTAG sequence. default is 0.
--quiet - Don't print counts and summary at the end of the run.
(Default is to print.)
--debug - Print lots of useless debug information to STDERR.
--help - This helpful help screen.
EOF
exit 1;
}
sub doc()
{
print<<EOF;
......@@ -657,7 +769,7 @@ Arguments:
would call index 0.)
--eol - Try to match barcodes at the END of sequences.
(What biologists would call the 3' end, and programmers
would call the end of the string.)
would call the end of the string.). Give barcode as they appear in the sequence file, so reverse complement if needed.
NOTE: one of --bol, --eol must be specified, but not both.
--mismatches N - Max. number of mismatches allowed. default is 1.
--exact - Same as '--mismatches 0'. If both --exact and --mismatches
......@@ -665,14 +777,15 @@ Arguments:
--partial N - Allow partial overlap of barcodes. (see explanation below.)
(Default is not partial matching)
--trim - Should the barecode be trimmed.
--trim2 - both read are trimmed to have the same length.
--trim2 - Shoud the read 2 be trimmed to have the same length as the read1
NOTE: one of --trim, --trim2 must be specified, but not both.
--no_adapt - there is no adaptator (T or A ) between barcode and the sequence
--rad STR - sequence are RADSeq, barcode is immediately followed by rad sequence <STR>.
NOTE: --rad implies --no_adapt
--radTAG STR - sequence retain at the begining of the sequence after cliping.
NOTE: if define rad radTAG must be defined to, and conversely
--TAG_mismatch N - Max. number of mismatches allowed in the radTAG sequence. default is 0.
NOTE: if define rad, radTAG must be defined to, and conversely
Barcode will be splited if they are folowed by radTAG, otherwise they will be recorded in the unmatched file.
--TAG_mismatch N - Max. number of mismatches allowed in the radTAG sequence. default is 0.
--quiet - Don't print counts and summary at the end of the run.
(Default is to print.)
--debug - Print lots of useless debug information to STDERR.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment