Commit bd9e9d79 authored by no_author's avatar no_author
Browse files

No commit message

No commit message
parent 3f1cda9a
#!/usr/bin/perl
# FASTX-toolkit - FASTA/FASTQ preprocessing tools.
# FASTX-toolkit - FASTA/FASTQ preprocessing tools. fastx_barcode_splitter.pl
# Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
#
# This program is free software: you can redistribute it and/or modify
......@@ -155,7 +155,6 @@ item B<--trim>
=cut
##
## This program splits a FASTQ/FASTA file into several smaller files,
## Based on barcode matching.
......@@ -189,6 +188,7 @@ my $quiet = 0 ;
my $debug = 0 ;
my $trim = 0 ;
my $trim2 = 0 ;
my $adapt = 1 ;
my $no_adapt = 0 ;
my $rad;
my $radTAG;
......@@ -246,6 +246,7 @@ print_results unless $quiet;
sub parse_command_line {
my $help;
my $doc;
my $version;
pod2usage(1) if (scalar @ARGV==0);
......@@ -266,12 +267,14 @@ sub parse_command_line {
"radTAG=s"=> \$radTAG,
"TAG_mismatch=i"=> \$TAG_mm,
"help" => \$help,
"doc" => \$doc
"doc" => \$doc,
"version" => \$version
) ;
pod2usage(1) if ($help);
doc() if ($doc);
version() if ($version);
die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file;
die "Error: prefix path/filename not specified (use '--prefix-r1 [PATH]%.fq')\n" unless defined $newfile_prefix_r1;
# If a read2 file is provided
......@@ -293,19 +296,19 @@ sub parse_command_line {
die "Error: partial overlap value ($allow_partial_overlap) bigger than " .
"max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches);
if ($trim){
die "Error: can't specify --trim and --trim2\n" if $trim2;
}
$trim = 1 if $trim2 ;
if ((defined $rad && !defined $radTAG) || (!defined $rad && defined $radTAG)){
die "Error: you must defined --rad AND --radTAG option\n";
}
if ( $no_adapt){
$adapt=0;
}
if (defined $rad ){
chomp $rad;
chomp $radTAG;
die "Error: $rad does not ended by $radTAG\n" if ($rad !~ /$radTAG$/g);
$no_adapt=1;
$adapt=0;
$check_rad=1;
#~ warn $rad, " ",$radTAG;
$radTAG_length=length($radTAG);
......@@ -449,7 +452,11 @@ sub match_sequences {
my $best_barcode_ident = undef;
my $sequence_fragment;
my $sequence_fragment_radTAG;
my $start = 0;
my $cut_len = $seq_len;
my $start2 = 0;
my $cut_len2 = $seq_len_r2;
warn $start , " " , $cut_len , " " , $start2 , " " , $cut_len2;
#Try all barcodes, find the one with the lowest mismatch count
if ($barcodes_at_bol) {
$sequence_fragment = substr $seq_bases, 0, $barcodes_length;
......@@ -457,12 +464,43 @@ sub match_sequences {
{
$sequence_fragment_radTAG = substr $seq_bases, $barcodes_length, $radTAG_length;
}
} else {
$sequence_fragment = substr $seq_bases, - $barcodes_length;
if ($trim){
$start=$barcodes_length+$adapt;
$cut_len=$seq_len-$barcodes_length-$adapt;
if ($trim2) {
$start2 = 0 ;
$cut_len2=$cut_len;
}
}
} elsif($barcodes_at_eol) {
if ($paired != 1){
$sequence_fragment = substr $seq_bases, - $barcodes_length ;
if (defined $rad){
$sequence_fragment_radTAG = substr $seq_bases, - ($barcodes_length+$adapt+$radTAG_length), $radTAG_length;
}
if ($trim){
$start=0 ;
$cut_len=$seq_len-$barcodes_length-$adapt;
}
} else {
$sequence_fragment = substr $seq_bases_r2, - $barcodes_length;
if (defined $rad){
$sequence_fragment_radTAG = substr $seq_bases_r2, - ($barcodes_length+$adapt+$radTAG_length), $radTAG_length;
}
if ($trim){
$start2=0 ;
$cut_len2=$seq_len_r2-$barcodes_length-$adapt;
if ($trim2) {
$start = 0 ;
$cut_len=$cut_len2 ;
}
}
}
#~ warn "sequence fragment : $sequence_fragment\n" if $debug;
#~ $sequence_fragment = reverse_complement $sequence_fragment;
#~ warn "sequence fragment revcomp: $sequence_fragment \n" if $debug;
}
warn $start," ", $cut_len, " ",$start2, " ",$cut_len2;
foreach my $barcoderef (@barcodes) {
my ($ident, $barcode) = @{$barcoderef};
......@@ -493,61 +531,85 @@ sub match_sequences {
}
if ($best_barcode_mismatches_count<=$allowed_mismatches && $best_barcode_ident ne 'ambiguous'){
my $start;
my $stop;
# trim the first read
if ($trim) {
#~ warn $seq_len," ",$barcodes_length;
if ($no_adapt && $barcodes_at_bol){
$start=$barcodes_length;
$stop=$seq_len;
}elsif ($no_adapt && $barcodes_at_eol){
$start=0;
$stop=$seq_len;
}elsif ($barcodes_at_bol){
$start=$barcodes_length+1;
$stop=$seq_len;
}elsif ($barcodes_at_eol){
$start=0;
$stop=$seq_len-1;
}
warn "position of fragment to keep : $start, $stop \n" if $debug;
$seq_bases = substr $seq_bases, $start, $stop;
if ($fastq_format) {
$seq_qualities = substr $seq_qualities, $start, $stop;
}
$seq_bases = substr $seq_bases, $start, $cut_len;
if ($fastq_format) {
$seq_qualities = substr $seq_qualities, $start, $cut_len;
}
# trim both read
if ($trim2){
my $start;
my $stop;
my $stop2;
if ($no_adapt && $barcodes_at_bol){
$start=$barcodes_length;
$stop=$seq_len;
$stop2=$seq_len_r2;
}elsif ($no_adapt && $barcodes_at_eol){
$start=0;
$stop=$seq_len;
$stop2=$seq_len_r2;
}elsif ($barcodes_at_bol){
$start=$barcodes_length+1;
$stop=$seq_len;
$stop2=$seq_len_r2;
}elsif ($barcodes_at_eol){
$start=0;
$stop=$seq_len-1;
$stop2=$seq_len_r2-1;
}
$seq_bases = substr $seq_bases, $start, $stop;
$seq_bases_r2 = substr $seq_bases_r2, $start, $stop2;
if ($fastq_format){
$seq_qualities = substr $seq_qualities, $start, $stop;
$seq_qualities_r2 = substr $seq_qualities_r2, $start, $stop2;
if ($paired == 1 ) {
$seq_bases_r2 = substr $seq_bases_r2, $start2, $cut_len2;
if ($fastq_format) {
$seq_qualities_r2 = substr $seq_qualities_r2, $start2, $cut_len2;
}
}
# trim the barcode
# if ($trim) {
# if ($barcodes_at_bol){
# $start=$barcodes_length+$adapt;
# $stop=$seq_len;
# } elsif ($barcodes_at_eol) {
# $start=0;
# $stop=$seq_len-$adapt;
# }
#
# #~ warn $seq_len," ",$barcodes_length;
## if ($adapt && $barcodes_at_bol){
## $start=$barcodes_length;
## $stop=$seq_len;
## }elsif ($adapt && $barcodes_at_eol){
## $start=0;
## $stop=$seq_len;
## }elsif ($barcodes_at_bol){
## $start=$barcodes_length+1;
## $stop=$seq_len;
## }elsif ($barcodes_at_eol){
## $start=0;
## $stop=$seq_len-1;
## }
# warn "position of fragment to keep : $start, $stop \n" if $debug;
# if ($paired==1 && $barcodes_at_eol){
# $seq_bases_r2 = substr $seq_bases_r2, $start, $stop;
# } else {
# $seq_bases = substr $seq_bases, $start, $stop;
# }
#
# if ($fastq_format) {
# if ($paired==1 && $barcodes_at_eol){
# $seq_qualities_r2 = substr $seq_qualities_r2, $start, $stop;
# } else {
# $seq_qualities = substr $seq_qualities, $start, $stop;
# }
# }
# }
#
# # trim both read
# if ($trim2){
# my $start;
# my $stop;
# my $stop2;
# if ($adapt && $barcodes_at_bol){
# $start=$barcodes_length;
# $stop=$seq_len;
# $stop2=$seq_len_r2;
# }elsif ($adapt && $barcodes_at_eol){
# $start=0;
# $stop=$seq_len;
# $stop2=$seq_len_r2;
# }elsif ($barcodes_at_bol){
# $start=$barcodes_length+1;
# $stop=$seq_len;
# $stop2=$seq_len_r2;
# }elsif ($barcodes_at_eol){
# $start=0;
# $stop=$seq_len-1;
# $stop2=$seq_len_r2-1;
# }
# $seq_bases = substr $seq_bases, $start, $stop;
# $seq_bases_r2 = substr $seq_bases_r2, $start, $stop2;
# if ($fastq_format){
# $seq_qualities = substr $seq_qualities, $start, $stop;
# $seq_qualities_r2 = substr $seq_qualities_r2, $start, $stop2;
# }
# }
}
if ( (!defined $best_barcode_ident) || ($best_barcode_mismatches_count>$allowed_mismatches)){
......@@ -624,7 +686,7 @@ sub read_record
$seq_bases = $input_file_io->getline();
die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases;
$seq_len = length($seq_bases)-$barcodes_length-1;
$seq_len = length($seq_bases)-1;
# If using FASTQ format, read two more lines
if ($fastq_format) {
$seq_name2 = $input_file_io->getline();
......@@ -641,7 +703,7 @@ sub read_record
$seq_bases_r2 = $input_file_io_r2->getline();
die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases_r2;
$seq_len_r2 = length($seq_bases_r2)-$barcodes_length-1;
$seq_len_r2 = length($seq_bases_r2)-1;
# If using FASTQ format, read two more lines
if ($fastq_format) {
$seq_name2_r2 = $input_file_io_r2->getline();
......@@ -717,11 +779,11 @@ sub open_and_detect_input_format
$input_file_io->ungetc(ord $first_char);
$seq_name = $input_file_io->getline();
$seq_bases = $input_file_io->getline();
$seq_len = length($seq_bases)-$barcodes_length-1;
$seq_len = length($seq_bases)-1;
if ($paired == 1) {
$seq_name_r2 = $input_file_io_r2->getline();
$seq_bases_r2 = $input_file_io_r2->getline();
$seq_len_r2 = length($seq_bases_r2)-$barcodes_length-1;
$seq_len_r2 = length($seq_bases_r2)-1;
}
if ($first_char eq '>') {
# FASTA format
......@@ -744,6 +806,7 @@ sub open_and_detect_input_format
} else {
die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n";
}
warn "len1: ", $seq_len, "; len2: ", $seq_len_r2, " ;barcode len: ", $barcodes_length, "; no adapt: ", $adapt;
}
sub doc()
......@@ -877,3 +940,9 @@ EOF
exit 1;
}
sub version()
{
print STDOUT "1.0";
exit 0 ;
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment