Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
custom-refseq/parse_refseq.pl
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
317 lines (258 sloc)
7.39 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl
use strict;
use warnings;

# Rewrites the FASTA records found under complete/*faa so that each header
# carries the accession, a taxid, a shortened taxon name and a lineage label.
#
# Usage: parse_refseq.pl <accn_file[.gz]> <nr_accn_file[.gz]> <outfile>
#
# The taxonomy tables nodes.dmp, merged.dmp, lineages.dmp and names.dmp are
# read from the current working directory.  A log of skipped records is
# written to <outfile>.log.
die "Usage: $0 <accn_file[.gz]> <nr_accn_file[.gz]> <outfile>\n"
    unless @ARGV >= 3;

my $accn_file   = $ARGV[0];
my $nraccn_file = $ARGV[1];
my $outfile     = $ARGV[2];
my $logfile     = "${outfile}.log";

my $nodes_file    = "nodes.dmp";
my $merged_file   = "merged.dmp";
my $lineages_file = "lineages.dmp";
my $names_file    = "names.dmp";

# The bareword handle names below are part of the contract with taxinit()
# and the main loop, which read from / write to them directly.
open (NFIL, '<', $nodes_file) or die "Couldn't open file $nodes_file: $!\n";
open (MFIL, '<', $merged_file) or die "Couldn't open file $merged_file: $!\n";
open (LFIL, '<', $lineages_file) or die "Couldn't open file $lineages_file: $!\n";
open (NMFIL, '<', $names_file) or die "Couldn't open file $names_file: $!\n";

# Gzipped inputs are streamed through gunzip.  The list form of the pipe
# open bypasses the shell, so odd characters in a file name cannot be
# interpreted as shell syntax.
if ($accn_file =~ /gz$/) {
    open (AFIL, '-|', 'gunzip', '-c', $accn_file)
        or die "Couldn't open pipe to $accn_file: $!\n";
}
else {
    open (AFIL, '<', $accn_file) or die "Couldn't open file $accn_file: $!\n";
}

# Decide on the nr file's own name, not on the name of the first file.
if ($nraccn_file =~ /gz$/) {
    open (NRFIL, '-|', 'gunzip', '-c', $nraccn_file)
        or die "Couldn't open pipe to $nraccn_file: $!\n";
}
else {
    open (NRFIL, '<', $nraccn_file) or die "Couldn't open file $nraccn_file: $!\n";
}

# Output FASTA and its companion log.
open (OFIL, '>', $outfile) or die "Couldn't write to file $outfile: $!\n";
open (LOG, '>', $logfile) or die "Couldn't write to file $logfile: $!\n";

# Lookup tables shared with the subroutines below; all are filled by taxinit().
my %taxnodes;         # taxid -> parent taxid
my %accn_hash;        # accession -> { taxid -> name }
my %rank_hash;        # taxid -> rank
my %lin;              # taxid -> lineage label
my %genus_species;    # taxid -> scientific name for the ranks used as labels

taxinit();
# ===== Rewrite every FASTA record found under complete/ =====
# For each record whose accession is known to %accn_hash, one output record
# is written per associated taxid, with the header
#   >ACCESSION-TAXID-NAME-LINEAGE
# Records that cannot be labelled are reported in the log and skipped.
my @files = glob("complete/*faa");
my %sequence_data;    # parser state carried between read_fasta_sequence() calls

foreach my $infile (@files) {
    print "READING $infile\n";
    open (my $IFIL, '<', $infile) or die "Couldn't read from file $infile: $!\n";

    RECORD:
    while (read_fasta_sequence($IFIL, \%sequence_data)) {
        # Default missing parts to '' so a malformed record produces a log
        # entry instead of "uninitialized value" warnings.
        my $header = defined $sequence_data{header} ? $sequence_data{header} : '';
        my $seq    = defined $sequence_data{seq}    ? $sequence_data{seq}    : '';

        # The accession is the leading "word.version" token of the header.
        my $accn = '';
        if ($header =~ /^(\w+\.\d+)/) {
            $accn = $1;
        }

        unless (exists $accn_hash{$accn}) {
            print LOG "ACCN NOT STORED : skipping $header\n";
            next RECORD;
        }

        # Sorted so that the output order is reproducible between runs.
        TAXON:
        foreach my $tax (sort keys %{ $accn_hash{$accn} }) {
            my $name = $accn_hash{$accn}{$tax};
            $name = '' unless defined $name;

            unless ($tax =~ /^\d+$/) {
                print LOG "NOT NUMERIC [$tax] : skipping $header\n";
                next TAXON;
            }

            # Walk from the taxid towards the root and take the first
            # ancestor that has a non-empty lineage label.
            my $lineage = '';
            foreach my $taxon_id (split(/,/, taxdump($tax))) {
                next unless defined $lin{$taxon_id} && $lin{$taxon_id} ne '';
                $lineage = $lin{$taxon_id};
                last;
            }
            if ($lineage eq '') {
                print LOG "LINEAGE UNDEFINED [$tax - $name] : skipping $header\n";
                next TAXON;
            }

            # Keep the name short and restricted to word characters.
            if (length($name) > 25) {
                $name = substr($name, 0, 25);
            }
            $name =~ s/\W+/_/g;
            $name =~ s/_+$//;

            my $final_string = "${accn}-${tax}-${name}-$lineage";
            print OFIL ">$final_string\n$seq\n";
        }
    }
    close $IFIL;
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close OFIL or die "Couldn't finish writing $outfile: $!\n";
close LOG or die "Couldn't finish writing $logfile: $!\n";
# taxdump($taxid)
# Returns the lineage of $taxid as a comma-separated string, starting with
# $taxid itself and following parent links in the file-level %taxnodes table.
# The walk stops after taxid 1 has been added, when an id has no parent
# entry, or when an id would be visited a second time.  That last guard
# keeps a self-parented or cyclic table from looping forever.
# An undefined $taxid yields the empty string.
sub taxdump {
    my $taxid = shift;
    my @taxlinarr;
    my %seen;

    while (defined $taxid && !$seen{$taxid}++) {
        push @taxlinarr, $taxid;
        # Only compare numerically when the id really is a number; ids taken
        # from header rows or malformed lines would otherwise raise warnings.
        last if $taxid =~ /^\d+$/ && $taxid == 1;
        $taxid = $taxnodes{$taxid};
    }

    return join(",", @taxlinarr);
}
# taxinit()
# Fills the file-level lookup tables from the already-open handles NFIL,
# MFIL, NMFIL, LFIL, NRFIL and AFIL, closing each handle once it is consumed:
#   %taxnodes       taxid -> parent taxid (nodes file, plus merged ids)
#   %rank_hash      taxid -> rank
#   %genus_species  taxid -> scientific name, only for the ranks used as labels
#   %lin            taxid -> second column of the lineages file
#   %accn_hash      accession -> { taxid -> name }
# Takes no arguments and returns nothing useful.
sub taxinit {
    # Nodes file: "taxid <tab>|<tab> parent <tab>|<tab> rank <tab>| ..."
    while ( my $line = <NFIL> ) {
        chomp $line;
        # Skip lines that do not parse instead of reusing stale captures.
        next unless $line =~ /^(\d+)\t\|\t(\d+)\t\|\t([^\t]+)\t\|/;
        my ($taxid, $parentid, $rank) = ($1, $2, $3);
        $taxnodes{$taxid}  = $parentid;
        $rank_hash{$taxid} = $rank;
    }
    close NFIL;

    # Merged file: a retired id inherits the parent and rank of its
    # replacement.
    while ( my $line = <MFIL> ) {
        chomp $line;
        next unless $line =~ /^(\d+)\t\|\t(\d+)\t\|/;
        my ($old_id, $new_id) = ($1, $2);
        $taxnodes{$old_id}  = $taxnodes{$new_id};
        $rank_hash{$old_id} = $rank_hash{$new_id};
    }
    close MFIL;

    # Choose which taxa may serve as labels.  Lineages passing through
    # taxid 2 use genus/family; everything else uses species/genus.
    # NOTE(review): taxid 2 is presumably NCBI "Bacteria" - confirm.
    foreach my $taxid (keys %taxnodes) {
        my $rank = $rank_hash{$taxid};
        next unless defined $rank;

        if ($rank eq 'genus') {
            # Genus qualifies on both branches, so no lineage walk is needed.
            $genus_species{$taxid} = 1;
        }
        elsif ($rank eq 'family' || $rank eq 'species') {
            my $through_two = taxdump($taxid) =~ /,2,/ ? 1 : 0;
            my $wanted = $rank eq 'family' ? $through_two : !$through_two;
            $genus_species{$taxid} = 1 if $wanted;
        }
    }

    # Replace the placeholder 1 with the scientific name where one exists.
    while ( my $line = <NMFIL> ) {
        chomp $line;
        if ($line =~ /^(\d+)\t\|\t([^\t]+)\t.*\tscientific name\t/) {
            my ($taxid, $name) = ($1, $2);
            if (exists $genus_species{$taxid}) {
                $genus_species{$taxid} = $name;
            }
        }
    }
    close NMFIL;

    # Lineages file: tab-separated, taxid in column 1, label in column 2.
    while ( my $line = <LFIL> ) {
        chomp $line;
        my @col = split(/\t/, $line);
        next unless defined $col[0] && defined $col[1];
        $lin{$col[0]} = $col[1];
    }
    close LFIL;

    # nr accession file: accession in column 1, taxid in column 4, name in
    # column 6.  Every line is considered.
    while ( my $line = <NRFIL> ) {
        chomp $line;
        my @col = split(/\t/, $line);
        _store_accession($col[0], $col[3], $col[5]);
    }
    close NRFIL;

    # Accession file: taxid, name, accession in columns 1-3.  Accessions
    # already recorded from the nr file take precedence.
    while ( my $line = <AFIL> ) {
        chomp $line;
        my @col = split(/\t/, $line);
        my ($taxid, $name, $accn) = @col[0, 1, 2];
        next unless defined $accn;
        next if exists $accn_hash{$accn};
        _store_accession($accn, $taxid, $name);
    }
    close AFIL;

    return;
}

# _store_accession($accn, $taxid, $name)
# Records $accn in %accn_hash.  If $taxid is itself a label taxon it is
# stored with $name; otherwise the nearest ancestor that is a label taxon is
# stored with that ancestor's scientific name.  Nothing is stored when no
# such taxon exists or when the accession/taxid is missing or non-numeric.
sub _store_accession {
    my ($accn, $taxid, $name) = @_;
    return unless defined $accn && defined $taxid;

    if (exists $genus_species{$taxid}) {
        $accn_hash{$accn}{$taxid} = defined $name ? $name : '';
        return;
    }

    # Non-numeric ids (header rows, damaged lines) cannot appear in the
    # taxonomy tables, so there is no lineage to search.
    return unless $taxid =~ /^\d+$/;

    foreach my $taxon_id (split(/,/, taxdump($taxid))) {
        next unless exists $genus_species{$taxon_id};
        $accn_hash{$accn}{$taxon_id} = $genus_species{$taxon_id};
        last;
    }
    return;
}
# read_fasta_sequence($IFIL, $seq_info)
# Reads the next FASTA record from the filehandle $IFIL into the hash
# referenced by $seq_info:
#   header       header line of the record, without the leading '>'
#   seq          concatenated sequence lines with ALL whitespace removed
#   next_header  look-ahead header of the following record (parser state;
#                callers must pass the same hash on every call)
# Returns $seq_info while at least one line could still be read.  Once the
# handle is exhausted all three fields are reset to undef and nothing is
# returned, so the same hash can be reused for another file.
sub read_fasta_sequence {
    my ($IFIL, $seq_info) = @_;

    $seq_info->{seq} = undef;    # clear out previous sequence

    # The header that ended the previous call starts this record.
    my $pending = $seq_info->{next_header};
    $seq_info->{header} = $pending if defined $pending && length $pending;

    my $file_not_empty = 0;
    # A lexical line variable keeps the caller's $_ intact.
    while (my $line = <$IFIL>) {
        $file_not_empty = 1;
        next if $line =~ /^\s*$/;    # skip blank lines
        chomp $line;
        $line =~ s/\r\z//;           # tolerate CRLF line endings

        if ($line =~ /^>/) {         # fasta header line
            (my $h = $line) =~ s/^>//;
            my $current = $seq_info->{header};
            if (defined $current && length $current) {
                # Start of the next record: remember it and hand back the
                # record collected so far.
                $seq_info->{next_header} = $h;
                return $seq_info;
            }
            $seq_info->{header} = $h;    # first time through only
        }
        else {
            $line =~ s/\s+//g;           # remove every run of white space
            $seq_info->{seq} .= $line;
        }
    }

    return $seq_info if $file_not_empty;

    # Nothing left to read: clean everything up.
    $seq_info->{header} = $seq_info->{seq} = $seq_info->{next_header} = undef;
    return;
}