parse_refseq.pl

#!/usr/local/bin/perl
use strict;
use warnings;

# ---------------------------------------------------------------------------
# Command line:
#   $ARGV[0]  accession table (tab-separated: taxid, name, accession);
#             read through "gunzip -c" when the file name ends in "gz"
#   $ARGV[1]  second accession table (accession in column 1, taxid in
#             column 4, name in column 6); taxinit() loads it first and the
#             first table only supplies accessions it did not provide
#   $ARGV[2]  output FASTA file; a log is written next to it as "<out>.log"
# ---------------------------------------------------------------------------
@ARGV >= 3
	or die "Usage: $0 <accn_file> <nraccn_file> <outfile>\n";

my $accn_file = $ARGV[0];
my $nraccn_file = $ARGV[1];
my $outfile = $ARGV[2];
my $logfile = "${outfile}.log";

# Taxonomy dump files, looked up relative to the current working directory.
my $nodes_file = "nodes.dmp";
my $merged_file = "merged.dmp";
my $lineages_file = "lineages.dmp";
my $names_file = "names.dmp";

# The handles below are package-level barewords because taxinit() reads and
# closes them by name.
open (NFIL, '<', $nodes_file) or die "Couldn't open file $nodes_file: $!\n";
open (MFIL, '<', $merged_file) or die "Couldn't open file $merged_file: $!\n";
open (LFIL, '<', $lineages_file) or die "Couldn't open file $lineages_file: $!\n";
open (NMFIL, '<', $names_file) or die "Couldn't open file $names_file: $!\n";

# Compressed tables are streamed through gunzip. The list form of the pipe
# open hands the file name to gunzip directly, so spaces or shell
# metacharacters in the path are never interpreted by a shell.
if ($accn_file =~ /gz$/){
	open (AFIL, '-|', 'gunzip', '-c', $accn_file) or die "Couldn't open pipe to $accn_file: $!\n";
}else{
	open (AFIL, '<', $accn_file) or die "Couldn't open file $accn_file: $!\n";
}

# Each table is tested on its own name: one may be gzipped while the other
# is plain text.
if ($nraccn_file =~ /gz$/){
	open (NRFIL, '-|', 'gunzip', '-c', $nraccn_file) or die "Couldn't open pipe to $nraccn_file: $!\n";
}else{
	open (NRFIL, '<', $nraccn_file) or die "Couldn't open file $nraccn_file: $!\n";
}

# Output FASTA and the log of skipped records
open (OFIL, '>', $outfile) or die "Couldn't write to file $outfile: $!\n";
open (LOG, '>', $logfile) or die "Couldn't write to file $logfile: $!\n";

# Lookup tables filled by taxinit() and shared with the rest of the script.
my %taxnodes;       # taxid => parent taxid
my %accn_hash;      # accession => { taxid => name }
my %rank_hash;      # taxid => rank
my %lin;            # first column of the lineages file => second column
my %genus_species;  # taxid of a selected rank => scientific name
taxinit();

# ===== Collect every file matching complete/*faa =====
my @files = glob("complete/*faa");

# Record buffer shared with read_fasta_sequence(); besides the current
# header/seq it carries the look-ahead header between successive calls.
my %sequence_data;

foreach my $infile (@files) {
	print "READING $infile\n";
	open (my $IFIL, '<', $infile) or die "Couldn't read from file $infile: $!\n";

	while (read_fasta_sequence($IFIL, \%sequence_data)) {
		my $header = $sequence_data{header};
		my $seq = $sequence_data{seq};

		# A record can come back without a header (sequence text before the
		# first ">") or without sequence lines; treat both as empty strings.
		$header = '' unless defined $header;
		$seq = '' unless defined $seq;

		# The accession is the leading "<word chars>.<digits>" token.
		my $accn = $header =~ /^(\w+\.\d+)/ ? $1 : '';

		# Records with no parsable accession are reported as unknown rather
		# than being looked up under an empty key.
		unless ($accn ne '' && exists $accn_hash{$accn}){
			print LOG "ACCN NOT STORED : skipping $header\n";
			next;
		}

		# One output record per taxid stored for this accession
		foreach my $tax (keys %{$accn_hash{$accn}}){
			my $name = $accn_hash{$accn}{$tax};
			$name = '' unless defined $name;

			unless ($tax =~ /^\d+$/){
				print LOG "NOT NUMERIC [$tax] : skipping $header\n";
				next;
			}

			# Climb from the taxid towards the root and take the first
			# ancestor that has a non-empty entry in %lin.
			my $lineage = '';
			foreach my $taxon_id (split(/,/, taxdump($tax))) {
				next unless exists $lin{$taxon_id};
				my $candidate = $lin{$taxon_id};
				next unless defined $candidate && $candidate ne '';
				$lineage = $candidate;
				last;
			}

			if ($lineage eq ''){
				print LOG "LINEAGE UNDEFINED [$tax - $name] : skipping $header\n";
				next;
			}

			# Cap the name at 25 characters, then collapse every run of
			# non-word characters to "_" and drop trailing underscores.
			$name = substr($name, 0, 25) if length($name) > 25;
			$name =~ s/\W+/_/g;
			$name =~ s/_+$//;

			print OFIL ">${accn}-${tax}-${name}-$lineage\n$seq\n";
		}
	}
	close $IFIL;
}

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the result of close is checked for both output files.
close OFIL or die "Couldn't finish writing $outfile: $!\n";
close LOG or die "Couldn't finish writing $logfile: $!\n";

# taxdump($taxid)
#
# Follows the parent links in %taxnodes starting at $taxid and returns the
# ids visited as a comma-separated string, beginning with $taxid itself.
# The walk stops after taxid 1 (treated as the root), when an id has no
# parent recorded in %taxnodes, or when an id comes up a second time.
# An undefined $taxid yields the empty string.
sub taxdump {
	my $taxid = shift;
	my @taxlinarr;
	my %seen;

	while (defined $taxid){
		# A parent chain that loops back on itself (without passing through
		# taxid 1) would otherwise never terminate.
		last if $seen{$taxid}++;

		push @taxlinarr, $taxid;

		# Only compare numerically when the id is all digits, so a
		# non-numeric id does not raise a warning.
		last if $taxid =~ /^\d+\z/ && $taxid == 1;

		$taxid = $taxnodes{$taxid};
	}

	return join(",", @taxlinarr);
}

# taxinit()
#
# Fills the file-level lookup tables from the input handles opened at the
# top of the script and closes each handle once it has been read:
#   NFIL   -> %taxnodes (taxid => parent taxid), %rank_hash (taxid => rank)
#   MFIL   -> the id in column 1 is given the parent and rank of the id in
#             column 2
#   NMFIL  -> scientific names for the ids selected in %genus_species
#   LFIL   -> %lin (column 1 => column 2)
#   NRFIL  -> %accn_hash; columns: accession, -, -, taxid, -, name
#   AFIL   -> %accn_hash; columns: taxid, name, accession; only used for
#             accessions NRFIL did not already provide
# Takes no arguments and returns nothing useful.
sub taxinit {

	while ( my $line = <NFIL> ) {
		chomp $line;
		# Ignore lines that are not "taxid | parent | rank |" records, so
		# undefined captures are never stored.
		next unless $line =~ /^(\d+)\t\|\t(\d+)\t\|\t([^\t]+)\t\|/;
		my ($taxid, $parentid, $rank) = ($1, $2, $3);

		$taxnodes{$taxid} = $parentid;
		$rank_hash{$taxid} = $rank;
	}
	close NFIL;

	while ( my $line = <MFIL> ) {
		chomp $line;
		next unless $line =~ /^(\d+)\t\|\t(\d+)\t\|/;
		my ($old_id, $new_id) = ($1, $2);

		# The old id is linked to the *parent* of the new id and shares its
		# rank.
		$taxnodes{$old_id} = $taxnodes{$new_id};
		$rank_hash{$old_id} = $rank_hash{$new_id};
	}
	close MFIL;

	# Select the ids whose names are used as labels: family/genus when taxid
	# 2 appears among the ancestors (after the id itself and before the last
	# element of the lineage), species/genus otherwise.
	foreach my $taxid (keys %taxnodes){
		my $rank = $rank_hash{$taxid};
		# No rank is known when column 2 of a merged row is not in the nodes file.
		next unless defined $rank;

		my $tax_tree = taxdump($taxid);
		my $wanted;
		if ($tax_tree =~ /,2,/){
			$wanted = ($rank eq 'genus' || $rank eq 'family');
		}else{
			$wanted = ($rank eq 'species' || $rank eq 'genus');
		}
		$genus_species{$taxid} = 1 if $wanted;
	}

	# Replace the placeholder 1 with the scientific name where one is listed.
	while ( my $line = <NMFIL> ) {
		chomp $line;
		next unless $line =~ /^(\d+)\t\|\t([^\t]+)\t.*\tscientific name\t/;
		my ($taxid, $name) = ($1, $2);
		$genus_species{$taxid} = $name if exists $genus_species{$taxid};
	}
	close NMFIL;

	while ( my $line = <LFIL> ) {
		chomp $line;
		my @col = split(/\t/, $line);
		next unless defined $col[0];   # blank line
		$lin{$col[0]} = $col[1];
	}
	close LFIL;

	while ( my $line = <NRFIL> ) {
		chomp $line;
		my @col = split(/\t/, $line);
		_store_accession($col[0], $col[3], $col[5]);
	}
	close NRFIL;

	while ( my $line = <AFIL> ) {
		chomp $line;
		my @col = split(/\t/, $line);
		my ($taxid, $name, $accn) = @col[0, 1, 2];

		# Accessions already stored (from NRFIL or an earlier AFIL row) win.
		next if defined $accn && exists $accn_hash{$accn};

		_store_accession($accn, $taxid, $name);
	}
	close AFIL;

	return;
}

# _store_accession($accn, $taxid, $name)
#
# Records one accession in %accn_hash. When $taxid is itself one of the ids
# selected in %genus_species, it is stored with the name supplied by the
# caller. Otherwise the first id on the way to the root that is in
# %genus_species is stored with the value held there. Rows lacking an
# accession or a taxid, and taxids with no selected ancestor, are dropped.
sub _store_accession {
	my ($accn, $taxid, $name) = @_;

	# Incomplete rows must not create an entry under an empty accession.
	return unless defined $accn && $accn ne '' && defined $taxid;

	if (exists $genus_species{$taxid}){
		$accn_hash{$accn}{$taxid} = $name;
		return;
	}

	foreach my $taxon_id (split(/,/, taxdump($taxid))) {
		next unless exists $genus_species{$taxon_id};
		$accn_hash{$accn}{$taxon_id} = $genus_species{$taxon_id};
		return;
	}

	return;
}

# read_fasta_sequence($IFIL, $seq_info)
#
# Reads the next FASTA record from the open handle $IFIL into the hash
# referenced by $seq_info:
#   header       header line without the leading ">"
#   seq          sequence lines joined together with all whitespace removed
#                (undef when the record has no sequence lines)
#   next_header  look-ahead header of the following record; the same hash
#                must therefore be passed on every call for a given file
# Returns $seq_info while at least one line could be read. Once the handle
# yields nothing more, all three fields are reset to undef and an empty
# list / undef is returned.
sub read_fasta_sequence {
   my ($IFIL, $seq_info) = @_;

   $seq_info->{seq} = undef; # clear out previous sequence

   # The header seen at the end of the previous call starts this record.
   # Test with defined so that a header of "0" or "" is not mistaken for
   # "no header".
   $seq_info->{header} = $seq_info->{next_header}
      if defined $seq_info->{next_header};

   my $file_not_empty = 0;
   while (my $line = <$IFIL>) {
      $file_not_empty = 1;
      next if $line =~ /^\s*$/;  # skip blank lines
      chomp $line;

      if ($line =~ /^>/) { # fasta header line
         (my $h = $line) =~ s/^>//;
         if (defined $seq_info->{header}) {
            # Start of the following record: remember it and hand back the
            # record collected so far.
            $seq_info->{next_header} = $h;
            return $seq_info;
         }
         # first header of the file
         $seq_info->{header} = $h;
      }
      else {
         # Strip every whitespace run, not just the first one, so that
         # space-grouped residues and stray "\r" never reach the sequence.
         $line =~ s/\s+//g;
         $seq_info->{seq} .= $line;
      }
   }

   # End of input: either the last record was just collected ...
   return $seq_info if $file_not_empty;

   # ... or nothing is left, so reset the buffer for the next file.
   $seq_info->{header} = $seq_info->{seq} = $seq_info->{next_header} = undef;
   return;
}