Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/usr/local/bin/perl
use strict;
use warnings;
# ---------------------------------------------------------------------------
# Main script.
#
# Usage: <script> <input.fasta> <output.tsv>
#
# Reads FASTA records from the input file, extracts a numeric taxid from each
# header, walks the taxonomy tree upwards from that taxid and assigns the
# record to the first ancestor that has a non-empty label in lineages.dmp.
# Writes one tab-separated line per (lineage, taxid, species name) with the
# number of records counted for it.
#
# nodes.dmp, merged.dmp, names.dmp and lineages.dmp are opened by relative
# name, i.e. they are looked up in the current working directory.
# ---------------------------------------------------------------------------
my $infile  = $ARGV[0];
my $outfile = $ARGV[1];
die "Usage: $0 <input.fasta> <output.tsv>\n"
    unless defined $infile && defined $outfile;

my $nodes_file  = "nodes.dmp";
my $merged_file = "merged.dmp";
my $names_file  = "names.dmp";
my $lin_file    = 'lineages.dmp';

# These bareword handles are consumed (and closed) by taxinit().
open (NFIL, '<', $nodes_file) or die "Couldn't open file $nodes_file: $!\n";
open (MFIL, '<', $merged_file) or die "Couldn't open file $merged_file: $!\n";
open (SFIL, '<', $names_file) or die "Couldn't open file $names_file: $!\n";
open (LFIL, '<', $lin_file) or die "Couldn't open file $lin_file: $!\n";

my %taxnodes;    # taxid -> next taxid to visit (filled from nodes.dmp and merged.dmp)
my %sp_names;    # taxid -> scientific name (from names.dmp)
my %lin;         # taxid -> lineage label (from lineages.dmp)
taxinit();

open(my $IFIL, '<', $infile) or die "Couldn't open file $infile: $!\n";

my %refseq;      # lineage -> "taxid\tspecies name" -> number of records
my %sequence_data;

RECORD:
while (read_fasta_sequence($IFIL, \%sequence_data)) {
    my $header = $sequence_data{header};

    # The header must look like <field>-<taxid>-<field>-<field>.  The match
    # result has to be checked: after a failed match $1 is not reset, so it
    # would be undef or left over from an earlier record.
    my $tax;
    if (defined $header && $header =~ /^[^-]+-(\d+)-[^-]+-([^-]+)$/) {
        $tax = $1;
    }
    else {
        my $shown = defined $header ? $header : '(no header)';
        warn "Skipping record with unrecognised header: $shown\n";
        next RECORD;
    }

    my $species_name = exists $sp_names{$tax} ? $sp_names{$tax} : '';

    # Walk from the record's own taxid towards the root and keep the first
    # non-empty lineage label found on the way.
    my $lineage = '';
    foreach my $taxon_id (split(/,/, taxdump($tax))) {
        my $label = $lin{$taxon_id};
        next unless defined $label && $label ne '';
        $lineage = $label;
        last;
    }

    $refseq{$lineage}{"$tax\t$species_name"}++;
}
close $IFIL;

open(my $OFIL, '>', $outfile) or die "Couldn't write to file $outfile: $!\n";
# Sorted so that repeated runs on the same input give identical files.
foreach my $lineage (sort keys %refseq) {
    foreach my $species (sort keys %{ $refseq{$lineage} }) {
        print {$OFIL} "$lineage\t$species\t$refseq{$lineage}{$species}\n";
    }
}
# Buffered write errors (e.g. disk full) only surface when the handle is closed.
close($OFIL) or die "Couldn't write to file $outfile: $!\n";
# taxdump($taxid)
#
# Follows %taxnodes from $taxid upwards and returns the visited taxids as a
# comma-separated string, starting with $taxid itself.  The walk ends after
# taxid 1 has been recorded, or when a taxid has no entry in %taxnodes.
# Returns the empty string when $taxid is undefined.
#
# A taxid that was already visited also ends the walk, so a cyclic or
# self-referencing entry in %taxnodes cannot cause an endless loop.
sub taxdump {
    my $taxid = shift;
    my @taxlinarr;
    my %seen;

    while (defined $taxid) {
        last if $seen{$taxid}++;    # cycle guard
        push @taxlinarr, $taxid;

        # Only compare numerically when the value really is a number, so
        # that an unexpected id does not raise a "not numeric" warning.
        last if $taxid =~ /\A\d+\z/ && $taxid == 1;

        $taxid = $taxnodes{$taxid};
    }

    return join(",", @taxlinarr);
}
# taxinit()
#
# Fills the file-level lookup tables from the already opened handles and
# closes each handle when done:
#   SFIL -> %sp_names  taxid => name, only for rows whose fourth field is
#                      'scientific name'
#   NFIL -> %taxnodes  first numeric field => second numeric field
#   MFIL -> %taxnodes  first numeric field => second numeric field; an entry
#                      here replaces one with the same key read from NFIL
#   LFIL -> %lin       first tab-separated field => second field
#
# Lines that do not have the expected layout (blank lines included) are
# skipped.  Every match is checked because capture variables are not reset by
# a failed match and would otherwise insert wrong or undefined entries.
sub taxinit {
    while ( my $line = <SFIL> ) {
        chomp $line;
        next unless $line =~ /^(\d+)\t\|\t([^\t]+)\t\|\t[^\t]*\t\|\t([^\t]+).*$/;
        $sp_names{$1} = $2 if $3 eq 'scientific name';
    }
    close SFIL;

    while ( my $line = <NFIL> ) {
        chomp $line;
        next unless $line =~ /^(\d+)\t\|\t(\d+)\t\|\t[^\t]+\t\|/;
        $taxnodes{$1} = $2;
    }
    close NFIL;

    while ( my $line = <MFIL> ) {
        chomp $line;
        next unless $line =~ /^(\d+)\t\|\t(\d+)\t\|/;
        $taxnodes{$1} = $2;
    }
    close MFIL;

    while ( my $line = <LFIL> ) {
        chomp $line;
        my ($taxid, $lineage) = split("\t", $line);
        # Need both a key and a label; a line without a tab has no label.
        next unless defined $taxid && length $taxid && defined $lineage;
        $lin{$taxid} = $lineage;
    }
    close LFIL;

    return;
}
# read_fasta_sequence($IFIL, $seq_info)
#
# Reads the next FASTA record from the filehandle $IFIL into the hash
# referenced by $seq_info:
#   header       the header line without the leading '>'
#   seq          the concatenated sequence lines with all whitespace removed,
#                or undef when the record has no sequence lines
#   next_header  internal: the header of the following record, which had to
#                be read in order to detect the end of the current one
#
# The same hash must be passed to every call for a given filehandle because
# it carries the look-ahead header between calls.
#
# Returns $seq_info when a record was read.  At end of input the three keys
# are set to undef and nothing (an empty list / undef) is returned.
sub read_fasta_sequence {
    my ($IFIL, $seq_info) = @_;

    $seq_info->{seq} = undef;    # clear out previous sequence

    # A header consumed at the end of the previous call starts this record.
    # Tested with defined() so that a header of "0" or "" is not lost, and
    # counted as a record on its own so that a final header without any
    # sequence lines is still returned.
    my $have_record = 0;
    if (defined $seq_info->{next_header}) {
        $seq_info->{header} = delete $seq_info->{next_header};
        $have_record = 1;
    }
    else {
        $seq_info->{header} = undef;
    }

    while (my $line = <$IFIL>) {
        next if $line =~ /^\s*$/;       # skip blank lines
        $line =~ s/\r?\n\z//;           # strip the line ending, LF or CRLF

        if ($line =~ /^>/) {            # fasta header line
            (my $h = $line) =~ s/^>//;
            if (defined $seq_info->{header}) {
                # Start of the following record: remember it and hand back
                # the record collected so far.
                $seq_info->{next_header} = $h;
                return $seq_info;
            }
            $seq_info->{header} = $h;   # first header of the file
            $have_record = 1;
        }
        else {
            $line =~ s/\s+//g;          # remove all white space, not just the first run
            $seq_info->{seq} .= $line;
            $have_record = 1;
        }
    }

    return $seq_info if $have_record;

    # End of input: clean everything up.
    $seq_info->{header} = $seq_info->{seq} = $seq_info->{next_header} = undef;
    return;
}