Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/usr/local/bin/perl
use strict;
use warnings;
# ---------------------------------------------------------------------------
# Script setup: command-line arguments, input/output handles and the global
# lookup tables that taxinit() fills.
#
# Arguments:
#   1. accession table   (tab-separated; read by taxinit() through AFIL)
#   2. nr accession table (tab-separated; read by taxinit() through NRFIL)
#   3. output FASTA file; a log is written next to it as "<outfile>.log"
# Either accession table may be gzip-compressed (name ending in "gz").
#
# nodes.dmp, merged.dmp, lineages.dmp and names.dmp are opened by relative
# path, i.e. they must be present in the current working directory.
# ---------------------------------------------------------------------------
die "Usage: $0 <accn_file> <nraccn_file> <outfile>\n" if @ARGV < 3;

my $accn_file = $ARGV[0];
my $nraccn_file = $ARGV[1];
my $outfile = $ARGV[2];
my $logfile = "${outfile}.log";
my $nodes_file = "nodes.dmp";
my $merged_file = "merged.dmp";
my $lineages_file = "lineages.dmp";
my $names_file = "names.dmp";

open (NFIL, '<', $nodes_file) or die "Couldn't open file $nodes_file: $!\n";
open (MFIL, '<', $merged_file) or die "Couldn't open file $merged_file: $!\n";
open (LFIL, '<', $lineages_file) or die "Couldn't open file $lineages_file: $!\n";
open (NMFIL, '<', $names_file) or die "Couldn't open file $names_file: $!\n";

# Compressed tables are streamed through gunzip. The list form of the pipe
# open hands the file name to gunzip as a single argument, so it is never
# interpreted by a shell.
if ($accn_file =~ /gz$/){
    open (AFIL, '-|', 'gunzip', '-c', $accn_file) or die "Couldn't open pipe to $accn_file: $!\n";
}else{
    open (AFIL, '<', $accn_file) or die "Couldn't open file $accn_file: $!\n";
}
# Decide on the nr table's OWN name: the two tables need not share a
# compression state.
if ($nraccn_file =~ /gz$/){
    open (NRFIL, '-|', 'gunzip', '-c', $nraccn_file) or die "Couldn't open pipe to $nraccn_file: $!\n";
}else{
    open (NRFIL, '<', $nraccn_file) or die "Couldn't open file $nraccn_file: $!\n";
}

# Output FASTA and its log.
open (OFIL, '>', $outfile) or die "Couldn't write to file $outfile: $!\n";
open (LOG, '>', $logfile) or die "Couldn't write to file $logfile: $!\n";

# Lookup tables populated by taxinit():
my %taxnodes;       # taxid => parent taxid
my %accn_hash;      # accession => { taxid => name }
my %rank_hash;      # taxid => rank
my %lin;            # taxid => lineage label (first two columns of lineages.dmp)
my %genus_species;  # taxid => scientific name (or 1) for the ranks used as labels
taxinit();
# ===== Rewrite every FASTA file under complete/ =====
# For each record whose accession is known to %accn_hash, one output record
# is written per associated taxid, with the header rebuilt as
#   >ACCESSION-TAXID-NAME-LINEAGE
# NAME is cut to 25 characters and non-word runs are replaced by "_".
# Records that cannot be annotated are reported in the log and skipped.
my @files = glob("complete/*faa");
my %sequence_data;
foreach my $infile (@files) {
    print "READING $infile\n";
    open (my $IFIL, '<', $infile) or die "Couldn't read from file $infile: $!\n";
    RECORD: while (read_fasta_sequence($IFIL, \%sequence_data)) {
        my $header = $sequence_data{header};
        my $seq = $sequence_data{seq};

        # Sequence lines that precede the first header have nothing to be
        # attached to.
        unless (defined $header) {
            print LOG "NO HEADER : skipping sequence data in $infile\n";
            next RECORD;
        }
        # A header immediately followed by another header carries no
        # sequence; writing it would produce an empty FASTA entry.
        unless (defined $seq && length $seq) {
            print LOG "EMPTY SEQUENCE : skipping $header\n";
            next RECORD;
        }

        # The accession is the leading "word.version" token of the header.
        # (Older refseq81-style headers were matched with
        #  /^[A-z]+\|([^\|]+)\|/ instead.)
        my $accn = '';
        if ($header =~ /^(\w+\.\d+)/){
            $accn = $1;
        }
        unless (exists $accn_hash{$accn}){
            print LOG "ACCN NOT STORED : skipping $header\n";
            next RECORD;
        }

        TAXON: foreach my $tax (keys %{$accn_hash{$accn}}){
            my $name = $accn_hash{$accn}{$tax};
            $name = '' unless defined $name;
            unless ($tax =~ /^\d+$/){
                print LOG "NOT NUMERIC [$tax] : skipping $header\n";
                next TAXON;
            }

            # Walk from the taxon towards the root and take the first
            # non-empty lineage label found on the way.
            my $lineage = '';
            foreach my $taxon_id (split(/,/, taxdump($tax))) {
                next unless exists $lin{$taxon_id};
                $lineage = $lin{$taxon_id};
                last if defined $lineage && $lineage ne '';
            }
            if (!defined $lineage || $lineage eq ''){
                print LOG "LINEAGE UNDEFINED [$tax - $name] : skipping $header\n";
                next TAXON;
            }

            if ( length($name) > 25){
                $name = substr($name, 0, 25);
            }
            $name =~ s/\W+/_/g;
            $name =~ s/_+$//;
            my $final_string = "${accn}-${tax}-${name}-$lineage";
            print OFIL ">$final_string\n$seq\n";
        }
    }
    close $IFIL;
}
# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the result of close is checked for both output files.
close OFIL or die "Couldn't close file $outfile: $!\n";
close LOG or die "Couldn't close file $logfile: $!\n";
# Build the lineage of a taxon as a comma-separated string.
#
# Starting at the given taxid, follows the parent links in %taxnodes and
# collects every id visited, the starting id first. The walk ends after
# taxid 1 has been collected, or as soon as an id has no parent entry.
# Returns the empty string when called with an undefined taxid.
sub taxdump {
    my ($current) = @_;
    my @path;
    while (defined $current) {
        my $at_root = ($current == 1);
        push @path, $current;
        last if $at_root;
        $current = $taxnodes{$current};
    }
    return join(",", @path);
}
# Populate the global lookup tables from the already-opened input handles
# and close each handle once it has been consumed.
#
#   NFIL  (nodes.dmp)    -> %taxnodes (taxid => parent), %rank_hash (taxid => rank)
#   MFIL  (merged.dmp)   -> merged (old) taxids inherit parent and rank of
#                           their replacement
#   NMFIL (names.dmp)    -> scientific names for the taxa in %genus_species
#   LFIL  (lineages.dmp) -> %lin (column 1 => column 2)
#   NRFIL, AFIL          -> %accn_hash (accession => { taxid => name })
#
# Takes no arguments and returns nothing useful.
sub taxinit {
    # --- nodes.dmp: parent link and rank of every taxon ---
    while ( my $line = <NFIL> ) {
        chomp $line;
        # Lines that do not have the expected layout are ignored rather
        # than read through stale/undefined capture variables.
        next unless $line =~ /^(\d+)\t\|\t(\d+)\t\|\t([^\t]+)\t\|/;
        my ($taxid, $parentid, $rank) = ($1, $2, $3);
        $taxnodes{$taxid} = $parentid;
        $rank_hash{$taxid} = $rank;
    }
    close NFIL;

    # --- merged.dmp: old taxid => replacement taxid ---
    while ( my $line = <MFIL> ) {
        chomp $line;
        next unless $line =~ /^(\d+)\t\|\t(\d+)\t\|/;
        my ($old_id, $new_id) = ($1, $2);
        # Nothing to inherit if the replacement is unknown.
        next unless exists $taxnodes{$new_id};
        $taxnodes{$old_id} = $taxnodes{$new_id};
        $rank_hash{$old_id} = $rank_hash{$new_id};
    }
    close MFIL;

    # --- choose the taxa that serve as name labels ---
    # Genera always qualify. Taxa whose lineage contains taxid 2 as an
    # ancestor (Bacteria in the NCBI taxonomy) additionally qualify at
    # family rank, all others at species rank. The lineage is only
    # computed for ranks where it can make a difference.
    foreach my $taxid (keys %taxnodes){
        my $rank = $rank_hash{$taxid};
        next unless defined $rank;
        if ($rank eq 'genus'){
            $genus_species{$taxid} = 1;
        }
        elsif ($rank eq 'family' || $rank eq 'species'){
            my $wanted_rank = (taxdump($taxid) =~ /,2,/) ? 'family' : 'species';
            $genus_species{$taxid} = 1 if $rank eq $wanted_rank;
        }
    }

    # --- names.dmp: replace the placeholder 1 by the scientific name ---
    while ( my $line = <NMFIL> ) {
        chomp $line;
        if ($line =~ /^(\d+)\t\|\t([^\t]+)\t.*\tscientific name\t/){
            my ($taxid, $name) = ($1, $2);
            if ( exists $genus_species{$taxid}){
                $genus_species{$taxid} = $name;
            }
        }
    }
    close NMFIL;

    # --- lineages.dmp: first column => second column ---
    while ( my $line = <LFIL> ) {
        chomp $line;
        my @col = split(/\t/, $line);
        next unless @col >= 2;    # blank or single-column line
        $lin{$col[0]} = $col[1];
    }
    close LFIL;

    # --- nr accession table: accession, taxid and name in columns 0, 3, 5 ---
    while ( my $line = <NRFIL> ) {
        chomp $line;
        my @col = split(/\t/, $line);
        _assign_accession_taxon($col[0], $col[3], $col[5]);
    }
    close NRFIL;

    # --- accession table: taxid, name and accession in columns 0, 1, 2 ---
    # Accessions already resolved from the nr table keep that assignment.
    while ( my $line = <AFIL> ) {
        chomp $line;
        my @col = split(/\t/, $line);
        my ($taxid, $name, $accn) = @col[0, 1, 2];
        next if defined $accn && exists $accn_hash{$accn};
        _assign_accession_taxon($accn, $taxid, $name);
    }
    close AFIL;
}

# Record in %accn_hash which label taxon (and name) an accession belongs to.
#
# If the taxid itself is a label taxon (a key of %genus_species) it is stored
# with the name given in the table. Otherwise the nearest ancestor that is a
# label taxon is stored, with the name held in %genus_species. Nothing is
# stored when no such ancestor exists.
sub _assign_accession_taxon {
    my ($accn, $taxid, $name) = @_;

    # The main loop uses '' as the accession of headers it cannot parse, so
    # an entry keyed by an empty accession would wrongly match all of them.
    return unless defined $accn && length $accn;
    # Every known taxid is numeric; anything else cannot be resolved.
    return unless defined $taxid && $taxid =~ /^\d+$/;

    if (exists $genus_species{$taxid}){
        $accn_hash{$accn}{$taxid} = $name;
        return;
    }
    foreach my $taxon_id (split(/,/, taxdump($taxid))) {
        next unless exists $genus_species{$taxon_id};
        $accn_hash{$accn}{$taxon_id} = $genus_species{$taxon_id};
        return;
    }
    return;
}
# Read the next FASTA record from an open filehandle.
#
# Arguments:
#   $IFIL     - filehandle to read from
#   $seq_info - hash reference that carries state between calls and receives
#               the result in the keys:
#                 header      - header line of the record, without the ">"
#                 seq         - sequence with all whitespace removed
#                               (undef if the record has no sequence lines)
#                 next_header - header already read for the following record
#
# Returns $seq_info when a record was read. When the handle is exhausted the
# three keys are reset to undef and an empty return is made, so the same hash
# can be reused for the next file.
sub read_fasta_sequence {
    my ($IFIL, $seq_info) = @_;
    $seq_info->{seq} = undef; # clear out previous sequence
    # The header that ended the previous call starts this record. Tested
    # with defined so that headers such as "0" or "" are not lost.
    $seq_info->{header} = $seq_info->{next_header} if defined $seq_info->{next_header};
    my $file_not_empty = 0;
    while (my $line = <$IFIL>) {
        $file_not_empty = 1;
        next if $line =~ /^\s*$/; # skip blank lines
        chomp $line;
        if ($line =~ /^>/) { # fasta header line
            (my $h = $line) =~ s/^>//;
            if (defined $seq_info->{header}) {
                # Start of the following record: remember it and hand back
                # the record collected so far.
                $seq_info->{next_header} = $h;
                return $seq_info;
            }
            # first time through only
            $seq_info->{header} = $h;
        }
        else {
            $line =~ s/\s+//g; # remove ALL white space, not just the first run
            $seq_info->{seq} .= $line;
        }
    }
    if ($file_not_empty) {
        # Last record of the file.
        return $seq_info;
    }
    # Nothing left to read: clean everything up
    $seq_info->{header} = $seq_info->{seq} = $seq_info->{next_header} = undef;
    return;
}