Commit 0755db9b authored by Penom Nom's avatar Penom Nom
Browse files

Add Perl script: convert Mothur classification output to a Krona chart

parent ebbee70f
#! /usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
use File::Basename;
use feature 'say';
use lib (`ktGetLibPath`);
use KronaTools;
# Default values for the 'name' and 'out' options, set before the option list
# below is handed to getKronaOptions.
setOption('name', 'root');
setOption('out', 'mothur.krona.html');

# Option names passed to getKronaOptions and to the usage text.
my @options = ('out', 'name', 'combine');

# Option names that are deliberately left out of the list above:
# minConfidence
# depth
# hueBad
# hueGood
# local
# url
# postUrl

getKronaOptions(@options);

# No input given: print the usage text and stop.
unless (@ARGV) {
    printUsage(
        'Create a Krona chart from Mothur classifications.',
        'mothur_details',
        'Option 1 : Take 1 argument : file.taxonomy
or Option 2 : Take 3 arguments : file.taxonomy, file.groups and file.names.',
        0,
        1,
        \@options
    );
    exit 0;
}

# State shared by the import loop and the final writeTree call.
my $tree = newTree();    # tree that addByLineage fills in
my @datasetNames;        # one entry per dataset added to the tree
my $set = 0;             # index of the dataset currently being filled
# Split a file path into its directory part and its base name with the final
# extension removed. Despite the sub's name, the extension itself is not
# returned.
#   $_[0] : path of the file
# Returns the list (directory, base name), both as produced by
# File::Basename::fileparse.
sub getExtension {
    my ($fileName) = @_;
    my $suffixPattern = qr"\..[^.]*$";
    my ($base, $dir) = fileparse($fileName, $suffixPattern);
    return ($dir, $base);
}
# Build a lookup table from a ".names" file.
#   $_[0] : file name without the ".names" extension
#   $_[1] : directory prefix, prepended to the file name as-is (callers pass
#           it with its trailing separator)
# Returns a hash reference: key = first column (ID of the representative
# sequence), value = array reference holding the comma-separated IDs of the
# second column.
# Dies when the file cannot be opened.
sub getNames {
    my ($filename, $path) = @_;
    my $namesFile = "${path}${filename}.names";

    # Lexical handle + 3-arg open: the handle is private to this sub and
    # cannot clobber (or be clobbered by) a handle opened elsewhere.
    open my $in, '<', $namesFile or die "Cannot open $namesFile: $!";

    my %hashNames;
    while (my $line = <$in>) {
        # split ' ' ignores leading whitespace and the line terminator.
        my ($key, $value) = split ' ', $line;

        # Blank lines carry no ID; skipping them avoids a bogus "" key.
        next unless defined $key;

        push @{ $hashNames{$key} },
            defined $value ? split(/\s*,\s*/, $value) : ();
    }
    close $in;

    return \%hashNames;
}
# Build a lookup table from a ".groups" file.
#   $_[0] : file name without the ".groups" extension
#   $_[1] : directory prefix, prepended to the file name as-is (callers pass
#           it with its trailing separator)
# Returns a hash (as a list, not a reference): key = second column (sample /
# group name), value = array reference of the first-column sequence IDs that
# belong to that group.
# Dies when the file cannot be opened.
sub getGroups {
    my ($filename, $path) = @_;
    my $groupsFile = "${path}${filename}.groups";

    open my $in, '<', $groupsFile or die "Cannot open $groupsFile: $!";

    my %hashGroups;
    while (my $line = <$in>) {
        # The group name is the last column: without removing the line
        # terminator every key would end in "\n" (or "\r\n").
        $line =~ s/\r?\n\z//;

        my ($value, $key) = split /\t/, $line;

        # Lines without both columns (e.g. blank lines) carry no assignment.
        next unless defined $value && defined $key;

        push @{ $hashGroups{$key} }, $value;
    }
    close $in;

    return %hashGroups;
}
# Find the key of the names table whose member list contains a sequence ID.
#   $_[0] : sequence ID to look for
#   $_[1] : hash reference, key => array reference of member IDs
# Returns the first key (in the hash's iteration order) whose list contains
# the ID; returns undef in scalar context / the empty list in list context
# when the ID is undefined or is not listed anywhere.
sub getIdNames {
    my ($idGroups, $hashNames) = @_;
    return unless defined $idGroups;

    foreach my $key (keys %$hashNames) {
        foreach my $member (@{ $hashNames->{$key} }) {
            return $key if $idGroups eq $member;
        }
    }

    # Explicit "not found": the value of a sub that ends by falling out of
    # a loop is otherwise unspecified.
    return;
}
# Parse a ".taxonomy" file into two parallel tables.
#   $_[0] : file name without the ".taxonomy" extension
#   $_[1] : directory prefix, prepended to the file name as-is (callers pass
#           it with its trailing separator)
# Returns two hash references:
#   1. lineage : key = ID found in the first column, value = array reference
#                of taxon names, in file order
#   2. scores  : same keys, value = array reference with the number found in
#                parentheses after each taxon ('' when a taxon has none)
# A header line containing "OTU<TAB>Size<TAB>Taxonomy" is skipped, and a
# purely numeric column between two tabs (the Size column of such a file) is
# treated as a separator.
# Prints the path of the file on STDOUT and dies when it cannot be opened.
sub getTaxonomy {
    my ($filename, $path) = @_;
    my $taxonomyFile = "${path}${filename}.taxonomy";
    my %taxonomyLineage;
    my %taxonomyScores;

    say $taxonomyFile;
    open my $in, '<', $taxonomyFile or die "Cannot open $taxonomyFile: $!";

    while (my $line = <$in>) {
        next if $line =~ m/OTU\tSize\tTaxonomy/;

        # Remove the line terminator explicitly, so that the last taxon is
        # handled the same way whether or not the line ends with ';' and
        # whether or not the file ends with a newline.
        $line =~ s/\r?\n\z//;

        # [0-9]+ (not a single digit): sizes of 10 and above must not leak
        # into the lineage as a taxon.
        my ($queryID, @taxa) = split /\t[0-9]+\t|[\t;]/, $line;
        next unless defined $queryID && length $queryID;

        my @lineage;
        my @scores;
        foreach my $taxon (@taxa) {
            next unless $taxon =~ /\S/;    # empty field, e.g. from ";;"

            # Optional surrounding double quotes, then the name, then an
            # optional "(number)". The name may contain any character, so
            # names such as "Escherichia-Shigella" are kept whole.
            my ($val, $int) = $taxon =~
                /\A\s*"?(.*?)"?\s*(?:\(([0-9]+(?:\.[0-9]+)?)\))?\s*\z/;

            push @lineage, $val;
            push @scores, defined $int ? $int : '';
        }

        push @{ $taxonomyLineage{$queryID} }, @lineage;
        push @{ $taxonomyScores{$queryID} },  @scores;
    }
    close $in;

    return (\%taxonomyLineage, \%taxonomyScores);
}
# Fetch the lineage stored for one sequence ID.
#   $_[0] : hash reference, sequence ID => array reference of taxa
#   $_[1] : sequence ID to look up
# Returns a copy of the stored list (its element count in scalar context).
# The stored value is dereferenced unconditionally, so the ID has to be
# present in the table.
sub getLineage {
    my ($hashLineage, $idNames) = @_;
    my $taxa    = $hashLineage->{$idNames};
    my @lineage = @{$taxa};
    return @lineage;
}
# Fetch the scores stored for one sequence ID.
#   $_[0] : hash reference, sequence ID => array reference of scores
#   $_[1] : sequence ID to look up
# Returns a copy of the stored list (its element count in scalar context).
# The stored value is dereferenced unconditionally, so the ID has to be
# present in the table.
sub getScores {
    my ($hashScores, $idNames) = @_;
    my $values = $hashScores->{$idNames};
    my @scores = @{$values};
    return @scores;
}
# List which of the files "<name>.taxonomy", "<name>.groups" and
# "<name>.names" exist in a directory.
#   $_[0] : file name without extension
#   $_[1] : directory to scan
# Returns an array reference with the matching directory entries (file names
# only, in readdir order).
# Dies when the directory cannot be read.
sub getListFiles {
    my ($filename, $path) = @_;

    opendir(my $dir, $path) or die "Cannot open directory $path: $!";

    # \Q...\E : the file name is literal text, not a pattern ('.', '+', ...).
    # \A...\z : only the exact three names count, so that leftovers such as
    #           "<name>.taxonomy.bak" do not change the number of files found.
    my @files =
        grep { /\A\Q$filename\E\.(?:taxonomy|groups|names)\z/ } readdir($dir);
    closedir($dir);

    return \@files;
}
# Import every command-line argument into the Krona tree.
#
# Each argument is processed as its own input. The companion "<name>.groups"
# and "<name>.names" files are not taken from the argument list: they are
# looked up next to the input file by getListFiles.
#   - three files found (.taxonomy, .groups, .names): one dataset per sample
#     of the .groups file;
#   - one file found: every ID of the .taxonomy file goes into the current
#     dataset;
#   - anything else: an explanatory message is printed and the script stops
#     with a non-zero exit status.
foreach my $input (@ARGV) {
    say "MY INPUT $input";
    my ($fileName) = parseDataset($input);
    say "Importing $fileName...";

    # The first line is only used to report what kind of file was given.
    # A lexical handle, closed right away, cannot be clobbered by the helper
    # subs and is not leaked on any branch.
    open my $in, '<', $fileName or die "Cannot open $fileName: $!";
    my $line = <$in>;
    close $in;

    if ( defined $line && $line =~ /\t/ ) {
        say "File taxonomy found.";
    }
    else {
        say "Command line format detected.";
    }

    # Derive directory and base name from the parsed file name (the one that
    # was just opened), not from the raw command-line argument.
    my ($path, $filewe) = getExtension($fileName);

    my $listFiles = getListFiles($filewe, $path);
    my $size      = scalar @$listFiles;
    my ($hashLineage, $hashScores) = getTaxonomy($filewe, $path);

    # Several samples: taxonomy + groups + names.
    if ( $size == 3 ) {
        my %samples   = getGroups($filewe, $path);
        my $hashNames = getNames($filewe, $path);

        # Reverse index (member ID => key of the names table), built once
        # instead of scanning the whole names table for every sequence.
        # The first key seen wins, which is what a scan of
        # "keys %$hashNames" would return as well.
        my %representativeOf;
        foreach my $key ( keys %$hashNames ) {
            foreach my $member ( @{ $hashNames->{$key} } ) {
                $representativeOf{$member} = $key
                    unless exists $representativeOf{$member};
            }
        }

        foreach my $sample ( keys %samples ) {
            say "My sample ", $sample;

            foreach my $seq ( @{ $samples{$sample} } ) {
                my $idNames = $representativeOf{$seq};

                # A sequence without an entry in the names table, or whose
                # entry has no classification, cannot be placed in the tree.
                unless ( defined $idNames && exists $hashLineage->{$idNames} ) {
                    warn "No classification found for sequence \"$seq\"; skipped.\n";
                    next;
                }

                my @lineage = getLineage($hashLineage, $idNames);
                my @scores  = getScores($hashScores, $idNames);

                # When datasets are not combined and the first taxon shows
                # up again deeper in the lineage, that taxon becomes the
                # value of the 'name' option.
                if ( !getOption('combine') ) {
                    my $root = $lineage[0];
                    foreach my $taxon ( @lineage[ 1 .. $#lineage ] ) {
                        setOption('name', $root) if $taxon eq $root;
                    }
                }

                addByLineage(
                    $tree,
                    $set,         # index of the dataset of this sample
                    \@lineage,
                    $seq,         # ID of the sequence
                    undef,
                    \@scores      # percentages parsed from the taxonomy file
                );
            }

            # One dataset per sample unless everything is combined.
            if ( !getOption('combine') ) {
                $set++;
                push @datasetNames, $sample;
            }
        }
    }
    # Only the taxonomy file is available.
    # NOTE(review): this branch neither advances $set nor records a dataset
    # name, so several plain taxonomy inputs all land in the same dataset --
    # confirm that this is intended.
    elsif ( $size == 1 ) {
        foreach my $seq ( keys %$hashLineage ) {
            my @lineage = @{ $hashLineage->{$seq} };
            my @scores  = @{ $hashScores->{$seq} };

            addByLineage(
                $tree,
                $set,
                \@lineage,
                $seq,
                undef,
                \@scores
            );
        }
    }
    else {
        say "There are not all files required. Please check if you have opt1 : file.taxonomy or opt2 : file.taxonomy, file.groups and file.names in your current directory.";

        # This is a failure: report it through the exit status as well.
        exit 1;
    }
}
say "Done";

# Attribute identifiers handed to writeTree, followed by the labels handed
# over in the same order.
my @attributeNames        = qw(count unassigned score rank);
my @attributeDisplayNames = ('Count', 'Unassigned', 'Avg. % Confidence', 'Rank');

# The 'hueBad' / 'hueGood' options are not passed on here.
writeTree($tree, \@attributeNames, \@attributeDisplayNames, \@datasetNames);
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment