Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
bios4biol
bioinfo-utils
Commits
bd9e9d79
Commit
bd9e9d79
authored
Mar 02, 2015
by
no_author
Browse files
No commit message
No commit message
parent
3f1cda9a
Changes
1
Hide whitespace changes
Inline
Side-by-side
bin/splitbc.pl
View file @
bd9e9d79
#!/usr/bin/perl
# FASTX-toolkit - FASTA/FASTQ preprocessing tools.
# FASTX-toolkit - FASTA/FASTQ preprocessing tools.
fastx_barcode_splitter.pl
# Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
#
# This program is free software: you can redistribute it and/or modify
...
...
@@ -155,7 +155,6 @@ item B<--trim>
=cut
##
## This program splits a FASTQ/FASTA file into several smaller files,
## Based on barcode matching.
...
...
@@ -189,6 +188,7 @@ my $quiet = 0 ;
my
$debug
=
0
;
my
$trim
=
0
;
my
$trim2
=
0
;
my
$adapt
=
1
;
my
$no_adapt
=
0
;
my
$rad
;
my
$radTAG
;
...
...
@@ -246,6 +246,7 @@ print_results unless $quiet;
sub
parse_command_line
{
my
$help
;
my
$doc
;
my
$version
;
pod2usage
(
1
)
if
(
scalar
@ARGV
==
0
);
...
...
@@ -266,12 +267,14 @@ sub parse_command_line {
"
radTAG=s
"
=>
\
$radTAG
,
"
TAG_mismatch=i
"
=>
\
$TAG_mm
,
"
help
"
=>
\
$help
,
"
doc
"
=>
\
$doc
"
doc
"
=>
\
$doc
,
"
version
"
=>
\
$version
)
;
pod2usage
(
1
)
if
(
$help
);
doc
()
if
(
$doc
);
version
()
if
(
$version
);
die
"
Error: barcode file not specified (use '--bcfile [FILENAME]')
\n
"
unless
defined
$barcode_file
;
die
"
Error: prefix path/filename not specified (use '--prefix-r1 [PATH]%.fq')
\n
"
unless
defined
$newfile_prefix_r1
;
# If a read2 file is provided
...
...
@@ -293,19 +296,19 @@ sub parse_command_line {
die
"
Error: partial overlap value (
$allow_partial_overlap
) bigger than
"
.
"
max. allowed mismatches (
$allowed_mismatches
)
\n
"
if
(
$allow_partial_overlap
>
$allowed_mismatches
);
if
(
$trim
){
die
"
Error: can't specify --trim and --trim2
\n
"
if
$trim2
;
}
$trim
=
1
if
$trim2
;
if
((
defined
$rad
&&
!
defined
$radTAG
)
||
(
!
defined
$rad
&&
defined
$radTAG
)){
die
"
Error: you must defined --rad AND --radTAG option
\n
";
}
if
(
$no_adapt
){
$adapt
=
0
;
}
if
(
defined
$rad
){
chomp
$rad
;
chomp
$radTAG
;
die
"
Error:
$rad
does not ended by
$radTAG
\n
"
if
(
$rad
!~
/$radTAG$/g
);
$
no_
adapt
=
1
;
$adapt
=
0
;
$check_rad
=
1
;
#~ warn $rad, " ",$radTAG;
$radTAG_length
=
length
(
$radTAG
);
...
...
@@ -449,7 +452,11 @@ sub match_sequences {
my
$best_barcode_ident
=
undef
;
my
$sequence_fragment
;
my
$sequence_fragment_radTAG
;
my
$start
=
0
;
my
$cut_len
=
$seq_len
;
my
$start2
=
0
;
my
$cut_len2
=
$seq_len_r2
;
warn
$start
,
"
"
,
$cut_len
,
"
"
,
$start2
,
"
"
,
$cut_len2
;
#Try all barcodes, find the one with the lowest mismatch count
if
(
$barcodes_at_bol
)
{
$sequence_fragment
=
substr
$seq_bases
,
0
,
$barcodes_length
;
...
...
@@ -457,12 +464,43 @@ sub match_sequences {
{
$sequence_fragment_radTAG
=
substr
$seq_bases
,
$barcodes_length
,
$radTAG_length
;
}
}
else
{
$sequence_fragment
=
substr
$seq_bases
,
-
$barcodes_length
;
if
(
$trim
){
$start
=
$barcodes_length
+
$adapt
;
$cut_len
=
$seq_len
-
$barcodes_length
-
$adapt
;
if
(
$trim2
)
{
$start2
=
0
;
$cut_len2
=
$cut_len
;
}
}
}
elsif
(
$barcodes_at_eol
)
{
if
(
$paired
!=
1
){
$sequence_fragment
=
substr
$seq_bases
,
-
$barcodes_length
;
if
(
defined
$rad
){
$sequence_fragment_radTAG
=
substr
$seq_bases
,
-
(
$barcodes_length
+
$adapt
+
$radTAG_length
),
$radTAG_length
;
}
if
(
$trim
){
$start
=
0
;
$cut_len
=
$seq_len
-
$barcodes_length
-
$adapt
;
}
}
else
{
$sequence_fragment
=
substr
$seq_bases_r2
,
-
$barcodes_length
;
if
(
defined
$rad
){
$sequence_fragment_radTAG
=
substr
$seq_bases_r2
,
-
(
$barcodes_length
+
$adapt
+
$radTAG_length
),
$radTAG_length
;
}
if
(
$trim
){
$start2
=
0
;
$cut_len2
=
$seq_len_r2
-
$barcodes_length
-
$adapt
;
if
(
$trim2
)
{
$start
=
0
;
$cut_len
=
$cut_len2
;
}
}
}
#~ warn "sequence fragment : $sequence_fragment\n" if $debug;
#~ $sequence_fragment = reverse_complement $sequence_fragment;
#~ warn "sequence fragment revcomp: $sequence_fragment \n" if $debug;
}
warn
$start
,"
",
$cut_len
,
"
",
$start2
,
"
",
$cut_len2
;
foreach
my
$barcoderef
(
@barcodes
)
{
my
(
$ident
,
$barcode
)
=
@
{
$barcoderef
};
...
...
@@ -493,61 +531,85 @@ sub match_sequences {
}
if
(
$best_barcode_mismatches_count
<=
$allowed_mismatches
&&
$best_barcode_ident
ne
'
ambiguous
'){
my
$start
;
my
$stop
;
# trim the first read
if
(
$trim
)
{
#~ warn $seq_len," ",$barcodes_length;
if
(
$no_adapt
&&
$barcodes_at_bol
){
$start
=
$barcodes_length
;
$stop
=
$seq_len
;
}
elsif
(
$no_adapt
&&
$barcodes_at_eol
){
$start
=
0
;
$stop
=
$seq_len
;
}
elsif
(
$barcodes_at_bol
){
$start
=
$barcodes_length
+
1
;
$stop
=
$seq_len
;
}
elsif
(
$barcodes_at_eol
){
$start
=
0
;
$stop
=
$seq_len
-
1
;
}
warn
"
position of fragment to keep :
$start
,
$stop
\n
"
if
$debug
;
$seq_bases
=
substr
$seq_bases
,
$start
,
$stop
;
if
(
$fastq_format
)
{
$seq_qualities
=
substr
$seq_qualities
,
$start
,
$stop
;
}
$seq_bases
=
substr
$seq_bases
,
$start
,
$cut_len
;
if
(
$fastq_format
)
{
$seq_qualities
=
substr
$seq_qualities
,
$start
,
$cut_len
;
}
# trim both read
if
(
$trim2
){
my
$start
;
my
$stop
;
my
$stop2
;
if
(
$no_adapt
&&
$barcodes_at_bol
){
$start
=
$barcodes_length
;
$stop
=
$seq_len
;
$stop2
=
$seq_len_r2
;
}
elsif
(
$no_adapt
&&
$barcodes_at_eol
){
$start
=
0
;
$stop
=
$seq_len
;
$stop2
=
$seq_len_r2
;
}
elsif
(
$barcodes_at_bol
){
$start
=
$barcodes_length
+
1
;
$stop
=
$seq_len
;
$stop2
=
$seq_len_r2
;
}
elsif
(
$barcodes_at_eol
){
$start
=
0
;
$stop
=
$seq_len
-
1
;
$stop2
=
$seq_len_r2
-
1
;
}
$seq_bases
=
substr
$seq_bases
,
$start
,
$stop
;
$seq_bases_r2
=
substr
$seq_bases_r2
,
$start
,
$stop2
;
if
(
$fastq_format
){
$seq_qualities
=
substr
$seq_qualities
,
$start
,
$stop
;
$seq_qualities_r2
=
substr
$seq_qualities_r2
,
$start
,
$stop2
;
if
(
$paired
==
1
)
{
$seq_bases_r2
=
substr
$seq_bases_r2
,
$start2
,
$cut_len2
;
if
(
$fastq_format
)
{
$seq_qualities_r2
=
substr
$seq_qualities_r2
,
$start2
,
$cut_len2
;
}
}
# trim the barcode
# if ($trim) {
# if ($barcodes_at_bol){
# $start=$barcodes_length+$adapt;
# $stop=$seq_len;
# } elsif ($barcodes_at_eol) {
# $start=0;
# $stop=$seq_len-$adapt;
# }
#
# #~ warn $seq_len," ",$barcodes_length;
## if ($adapt && $barcodes_at_bol){
## $start=$barcodes_length;
## $stop=$seq_len;
## }elsif ($adapt && $barcodes_at_eol){
## $start=0;
## $stop=$seq_len;
## }elsif ($barcodes_at_bol){
## $start=$barcodes_length+1;
## $stop=$seq_len;
## }elsif ($barcodes_at_eol){
## $start=0;
## $stop=$seq_len-1;
## }
# warn "position of fragment to keep : $start, $stop \n" if $debug;
# if ($paired==1 && $barcodes_at_eol){
# $seq_bases_r2 = substr $seq_bases_r2, $start, $stop;
# } else {
# $seq_bases = substr $seq_bases, $start, $stop;
# }
#
# if ($fastq_format) {
# if ($paired==1 && $barcodes_at_eol){
# $seq_qualities_r2 = substr $seq_qualities_r2, $start, $stop;
# } else {
# $seq_qualities = substr $seq_qualities, $start, $stop;
# }
# }
# }
#
# # trim both read
# if ($trim2){
# my $start;
# my $stop;
# my $stop2;
# if ($adapt && $barcodes_at_bol){
# $start=$barcodes_length;
# $stop=$seq_len;
# $stop2=$seq_len_r2;
# }elsif ($adapt && $barcodes_at_eol){
# $start=0;
# $stop=$seq_len;
# $stop2=$seq_len_r2;
# }elsif ($barcodes_at_bol){
# $start=$barcodes_length+1;
# $stop=$seq_len;
# $stop2=$seq_len_r2;
# }elsif ($barcodes_at_eol){
# $start=0;
# $stop=$seq_len-1;
# $stop2=$seq_len_r2-1;
# }
# $seq_bases = substr $seq_bases, $start, $stop;
# $seq_bases_r2 = substr $seq_bases_r2, $start, $stop2;
# if ($fastq_format){
# $seq_qualities = substr $seq_qualities, $start, $stop;
# $seq_qualities_r2 = substr $seq_qualities_r2, $start, $stop2;
# }
# }
}
if
(
(
!
defined
$best_barcode_ident
)
||
(
$best_barcode_mismatches_count
>
$allowed_mismatches
)){
...
...
@@ -624,7 +686,7 @@ sub read_record
$seq_bases
=
$input_file_io
->
getline
();
die
"
Error: bad input file, expecting line with sequences
\n
"
unless
defined
$seq_bases
;
$seq_len
=
length
(
$seq_bases
)
-
$barcodes_length
-
1
;
$seq_len
=
length
(
$seq_bases
)
-
1
;
# If using FASTQ format, read two more lines
if
(
$fastq_format
)
{
$seq_name2
=
$input_file_io
->
getline
();
...
...
@@ -641,7 +703,7 @@ sub read_record
$seq_bases_r2
=
$input_file_io_r2
->
getline
();
die
"
Error: bad input file, expecting line with sequences
\n
"
unless
defined
$seq_bases_r2
;
$seq_len_r2
=
length
(
$seq_bases_r2
)
-
$barcodes_length
-
1
;
$seq_len_r2
=
length
(
$seq_bases_r2
)
-
1
;
# If using FASTQ format, read two more lines
if
(
$fastq_format
)
{
$seq_name2_r2
=
$input_file_io_r2
->
getline
();
...
...
@@ -717,11 +779,11 @@ sub open_and_detect_input_format
$input_file_io
->
ungetc
(
ord
$first_char
);
$seq_name
=
$input_file_io
->
getline
();
$seq_bases
=
$input_file_io
->
getline
();
$seq_len
=
length
(
$seq_bases
)
-
$barcodes_length
-
1
;
$seq_len
=
length
(
$seq_bases
)
-
1
;
if
(
$paired
==
1
)
{
$seq_name_r2
=
$input_file_io_r2
->
getline
();
$seq_bases_r2
=
$input_file_io_r2
->
getline
();
$seq_len_r2
=
length
(
$seq_bases_r2
)
-
$barcodes_length
-
1
;
$seq_len_r2
=
length
(
$seq_bases_r2
)
-
1
;
}
if
(
$first_char
eq
'
>
')
{
# FASTA format
...
...
@@ -744,6 +806,7 @@ sub open_and_detect_input_format
}
else
{
die
"
Error: unknown file format. First character = '
$first_char
' (expecting > or
\@
)
\n
";
}
warn
"
len1:
",
$seq_len
,
"
; len2:
",
$seq_len_r2
,
"
;barcode len:
",
$barcodes_length
,
"
; no adapt:
",
$adapt
;
}
sub
doc
()
...
...
@@ -877,3 +940,9 @@ EOF
exit
1
;
}
sub
version
()
{
print
STDOUT
"
1.0
";
exit
0
;
}
\ No newline at end of file
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment