Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
custom-refseq/calc_dbstats.pl
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
154 lines (130 sloc)
3.48 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
use strict; | |
use warnings; | |
# ---------------------------------------------------------------------------
# Main script.
#
# Usage: calc_dbstats.pl <input_fasta> <output_file>
#
# Reads every FASTA record from <input_fasta>, extracts the numeric taxid
# from the record header, resolves it to a lineage label and a scientific
# name using the taxonomy tables loaded by taxinit(), and writes one
# tab-separated line per (lineage, taxid, species name) combination with the
# number of records seen for it:
#
#     lineage <TAB> taxid <TAB> species_name <TAB> count
#
# The taxonomy tables (nodes.dmp, merged.dmp, names.dmp, lineages.dmp) are
# read from the current working directory.
# ---------------------------------------------------------------------------
my $infile  = $ARGV[0];
my $outfile = $ARGV[1];
die "Usage: $0 <input_fasta> <output_file>\n"
    unless defined $infile && defined $outfile;

my $nodes_file  = "nodes.dmp";
my $merged_file = "merged.dmp";
my $names_file  = "names.dmp";
my $lin_file    = 'lineages.dmp';

# These bareword handles are consumed (and closed) by taxinit().
open (NFIL, '<', $nodes_file)  or die "Couldn't open file $nodes_file: $!\n";
open (MFIL, '<', $merged_file) or die "Couldn't open file $merged_file: $!\n";
open (SFIL, '<', $names_file)  or die "Couldn't open file $names_file: $!\n";
open (LFIL, '<', $lin_file)    or die "Couldn't open file $lin_file: $!\n";

# File-level lookup tables shared with taxinit() and taxdump().
my %taxnodes;    # taxid -> next taxid on the way to the root
my %sp_names;    # taxid -> scientific name
my %lin;         # taxid -> lineage label
taxinit();

open(my $IFIL, '<', $infile) or die "can’t open file $infile: $!\n";

my %refseq;           # lineage -> "taxid\tspecies" -> record count
my %sequence_data;    # reused record buffer for read_fasta_sequence()
while (read_fasta_sequence($IFIL, \%sequence_data)) {
    my $header = $sequence_data{header};

    # Headers are expected to be four dash-separated fields whose second
    # field is the numeric taxid. Never read $1 from a failed match: skip
    # the record instead of counting it under an undefined or stale taxid.
    my $tax;
    if (defined $header && $header =~ /^[^-]+-(\d+)-[^-]+-([^-]+)$/) {
        $tax = $1;
    }
    else {
        my $shown = defined $header ? $header : '(no header)';
        warn "Skipping record with unrecognised header: $shown\n";
        next;
    }

    my $species_name = exists $sp_names{$tax} ? $sp_names{$tax} : '';

    # Walk from the taxid towards the root and take the lineage label of
    # the first taxon that has a non-empty one; '' when none is found.
    my $lineage = '';
    foreach my $taxon_id (split(/,/, taxdump($tax))) {
        next unless exists $lin{$taxon_id};
        my $candidate = $lin{$taxon_id};
        next unless defined $candidate && $candidate ne '';
        $lineage = $candidate;
        last;
    }

    $refseq{$lineage}{"$tax\t$species_name"}++;
}
close $IFIL;

open(my $OFIL, '>', $outfile) or die "Couldn't write to file $outfile: $!\n";
# Sorted so that repeated runs on the same input produce identical files.
foreach my $lineage (sort keys %refseq) {
    foreach my $species (sort keys %{ $refseq{$lineage} }) {
        print {$OFIL} "$lineage\t$species\t$refseq{$lineage}{$species}\n";
    }
}
# Buffered write errors (e.g. disk full) only surface at close time.
close $OFIL or die "Couldn't finish writing file $outfile: $!\n";
# taxdump($taxid)
#
# Builds the chain of taxids obtained by repeatedly following the file-level
# %taxnodes map, starting at $taxid.
#
# The walk stops when:
#   * taxid 1 is reached (it is included in the result),
#   * the current taxid has no entry in %taxnodes, or
#   * a taxid is encountered a second time (a cycle or self-reference in
#     %taxnodes), which would otherwise loop forever.
#
# Returns the visited taxids joined with ',' in visiting order, or the empty
# string when $taxid is undefined.
sub taxdump {
    my $taxid = shift;

    my @taxlinarr;
    my %visited;

    while (defined $taxid && !$visited{$taxid}++) {
        push @taxlinarr, $taxid;

        # Only compare numerically when the value really is an integer, so
        # that a malformed taxid does not trigger "isn't numeric" warnings.
        last if $taxid =~ /\A\d+\z/ && $taxid == 1;

        $taxid = $taxnodes{$taxid};
    }

    return join(",", @taxlinarr);
}
# taxinit()
#
# Fills the file-level lookup hashes from the already-open file-level
# handles, closing each handle after it has been read:
#
#   SFIL -> %sp_names   field 1 => field 2, kept only when field 4 is
#                       'scientific name'
#   NFIL -> %taxnodes   field 1 => field 2 (the links taxdump() follows)
#   MFIL -> %taxnodes   field 1 => field 2, added to the same map and
#                       replacing any NFIL entry with the same key
#   LFIL -> %lin        first tab-separated column => second column
#
# Lines that do not have the expected shape are skipped; the regex capture
# variables are only read after a successful match.
#
# Takes no arguments and returns nothing.
sub taxinit {
    while ( my $line = <SFIL> ) {
        chomp $line;
        next unless $line =~ /^(\d+)\t\|\t([^\t]+)\t\|\t[^\t]*\t\|\t([^\t]+).*$/;
        $sp_names{$1} = $2 if $3 eq 'scientific name';
    }
    close SFIL;

    while ( my $line = <NFIL> ) {
        chomp $line;
        next unless $line =~ /^(\d+)\t\|\t(\d+)\t\|\t([^\t]+)\t\|/;
        $taxnodes{$1} = $2;
    }
    close NFIL;

    while ( my $line = <MFIL> ) {
        chomp $line;
        next unless $line =~ /^(\d+)\t\|\t(\d+)\t\|/;
        $taxnodes{$1} = $2;
    }
    close MFIL;

    while ( my $line = <LFIL> ) {
        chomp $line;
        my ($taxid, $lineage) = split("\t", $line);
        # Blank lines or lines with a single column carry no usable mapping.
        next unless defined $taxid && defined $lineage;
        $lin{$taxid} = $lineage;
    }
    close LFIL;

    return;
}
# read_fasta_sequence($IFIL, $seq_info)
#
# Reads the next FASTA record from the open filehandle $IFIL into the hash
# referenced by $seq_info. The same hash must be passed on every call for a
# given file, because it carries state between calls.
#
# Keys written:
#   header       header line of the record, without the leading '>' and
#                without the line ending
#   seq          concatenated sequence lines with ALL whitespace removed;
#                undef when the record has no sequence lines
#   next_header  internal: header of the following record, already consumed
#                from the file while looking for the end of the current one
#
# Returns $seq_info when a record was read. At end of input it resets all
# three keys to undef and returns nothing (false).
#
# Blank lines are ignored. A final header that has no sequence lines is
# still returned as a record, and input consisting solely of blank lines
# yields no record.
sub read_fasta_sequence {
    my ($IFIL, $seq_info) = @_;

    $seq_info->{seq} = undef;    # clear out previous sequence

    # Promote the header that the previous call read ahead. Tested with
    # defined() so that a header such as "0" is not mistaken for "none".
    my $have_record = 0;
    if (defined $seq_info->{next_header}) {
        $seq_info->{header}      = $seq_info->{next_header};
        $seq_info->{next_header} = undef;
        $have_record             = 1;
    }
    else {
        $seq_info->{header} = undef;
    }

    # A named lexical is used so the caller's $_ is left untouched.
    while (my $line = <$IFIL>) {
        next if $line =~ /^\s*$/;    # skip blank lines
        $line =~ s/\r?\n\z//;        # strip LF or CRLF line ending

        if ($line =~ s/^>//) {       # fasta header line
            if (defined $seq_info->{header}) {
                # Start of the following record: remember it and hand
                # back the record collected so far.
                $seq_info->{next_header} = $line;
                return $seq_info;
            }
            $seq_info->{header} = $line;    # first record of the file
            $have_record = 1;
        }
        else {
            $line =~ s/\s+//g;       # remove every run of white space
            $seq_info->{seq} .= $line;
            $have_record = 1;
        }
    }

    return $seq_info if $have_record;

    # End of input: clean everything up.
    $seq_info->{header} = $seq_info->{seq} = $seq_info->{next_header} = undef;
    return;
}