Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
custom-refseq/make_refseq_dbs.sub
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
192 lines (156 sloc)
6.27 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
#SBATCH --job-name=mkrsdb | |
#SBATCH --mail-type=END,FAIL | |
#SBATCH --nodes=1 | |
#SBATCH --ntasks=1 | |
#SBATCH --cpus-per-task=20 | |
#SBATCH --mem=122G | |
#SBATCH --time=72:00:00 | |
#SBATCH --output=%x.o%j | |
#SBATCH --error=%x.e%j | |
# usage: sbatch [--mail-user='your_email@email.com' --account='your_queue'] make_refseq_dbs.sub | |
# must be run on halstead. Needs > 118 Gb memory | |
#### User specified variables | |
SCRIPTSDIR="${LAB_REPO}/custom-refseq" | |
MMETSP="/depot/jwisecav/data/ggavelis/greg_db/9_pep_for_cdhit/Sept12_938libraries_for_db.faa" | |
ONEKP="$LAB_DBS/onekp/onekp_proteins.fa" | |
CDHIT='/group/bioinfo/apps/apps/cd-hit-v4.6.5-2016-0304/cd-hit -M 90000' #-M to specify memory in MB | |
SFETCH='esl-sfetch' | |
DIAMOND='diamond' | |
CDHIT_THRESHOLD=0.95 | |
################################ | |
module load bioinfo parallel | |
CDHIT_ID=$(echo "$CDHIT_THRESHOLD" | cut -f 2 -d '.') | |
cd $LAB_DBS | |
# Make a new directory to hold database | |
now=`date +"%Y-%m-%d"` | |
DIR="$LAB_DBS/custom-refseq-${now}" | |
if [ ! -d $DIR ]; then mkdir $DIR; fi | |
cd $DIR | |
# copy the current lineages.dmp over from report | |
cp $SCRIPTSDIR/lineages.dmp . | |
################################ | |
# Download refseq data from NCBI | |
curl -O ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz | |
tar -zxvf taxdump.tar.gz | |
RLS=`curl ftp://ftp.ncbi.nlm.nih.gov/refseq//release/RELEASE_NUMBER` | |
curl -O ftp://ftp.ncbi.nlm.nih.gov/refseq//release/release-catalog/RefSeq-release${RLS}.catalog.gz | |
curl -O ftp://ftp.ncbi.nlm.nih.gov/refseq//release/release-catalog//release${RLS}.AutonomousProtein2Genomic.gz | |
rsync --exclude '**/*.fna.gz' --exclude '**/*.bna.gz' --exclude '**/*.gbff.gz' --exclude '**/*.gpff.gz' --copy-links --recursive --times --verbose rsync://ftp.ncbi.nlm.nih.gov/refseq/release/complete . | |
#wget –quiet ftp://ftp.ncbi.nlm.nih.gov/refseq/release/complete/*.faa.gz | |
gunzip complete/*faa.gz | |
################################ | |
# Filter and reformat refseq proteins | |
echo | |
(date +'%T - %Y-%m-%d') | |
echo -e "COMMAND : perl $SCRIPTSDIR/parse_refseq.pl RefSeq-release${RLS}.catalog.gz release${RLS}.AutonomousProtein2Genomic.gz refseq.fa \n\n" | |
perl $SCRIPTSDIR/parse_refseq.pl RefSeq-release${RLS}.catalog.gz release${RLS}.AutonomousProtein2Genomic.gz refseq.fa | |
################################ | |
# Perform indexing and calculate database statistics | |
echo | |
(date +'%T - %Y-%m-%d') | |
echo -e "COMMAND : $SFETCH --index refseq.fa \n\n" | |
$SFETCH --index refseq.fa | |
echo | |
(date +'%T - %Y-%m-%d') | |
echo -e "COMMAND : $DIAMOND makedb --in refseq.fa --db refseq \n\n" | |
$DIAMOND makedb --in refseq.fa --db refseq | |
echo | |
(date +'%T - %Y-%m-%d') | |
echo -e "COMMAND : perl $SCRIPTSDIR/calc_dbstats.pl refseq.fa refseq.stats.txt \n\n" | |
perl $SCRIPTSDIR/calc_dbstats.pl refseq.fa refseq.stats.txt | |
################################ | |
# Split up database by unique taxonomy id | |
OUTDIR="$DIR/uniq_taxids" | |
if [ ! -d uniq_taxids ]; then mkdir uniq_taxids; fi | |
echo | |
(date +'%T - %Y-%m-%d') | |
echo -e "COMMAND : perl $SCRIPTSDIR/parse_db_by_taxid.pl refseq.fa $OUTDIR \n\n" | |
perl $SCRIPTSDIR/parse_db_by_taxid.pl refseq.fa $OUTDIR | |
################################ | |
# Run cd-hit to collapse similar sequences within a taxonomy id | |
cd $OUTDIR | |
echo "Number of unique taxonomy ids in database" | |
ls *_pep.fa | wc -l | |
echo | |
(date +'%T - %Y-%m-%d') | |
echo -e "COMMAND : Performing cd-hit within each taxid subfile to remove redundant sequences\n\n" | |
export CDHIT CDHIT_THRESHOLD CDHIT_ID | |
runcdhit() { | |
file=$1 | |
taxid=$(echo "$file" | cut -f 1 -d '_') | |
echo $CDHIT -i $file -o ${taxid}_cd${CDHIT_ID}.fa -c $CDHIT_THRESHOLD | |
$CDHIT -i $file -o ${taxid}_cd${CDHIT_ID}.fa -c $CDHIT_THRESHOLD | |
} | |
export -f runcdhit | |
parallel runcdhit ::: *_pep.fa | |
# non parallel version | |
# for file in *_pep.fa; do | |
# echo $file | |
# taxid=$(echo "$file" | cut -f 1 -d '_') | |
# $CDHIT -i $file -o ${taxid}_cd${CDHIT_ID}.fa -c $CDHIT_THRESHOLD | |
# done | |
cd $DIR | |
echo | |
(date +'%T - %Y-%m-%d') | |
echo -e "COMMAND : cat $OUTDIR/*_cd95.fa > refseq_cd95.fa \n\n" | |
cat $OUTDIR/*_cd95.fa > refseq_cd95.fa | |
################################ | |
# Perform indexing and calculate reduced database statistics | |
echo | |
(date +'%T - %Y-%m-%d') | |
echo -e "COMMAND : $SFETCH --index refseq_cd95.fa \n\n" | |
$SFETCH --index refseq_cd95.fa | |
echo | |
(date +'%T - %Y-%m-%d') | |
echo -e "COMMAND : $DIAMOND makedb --in refseq_cd95.fa --db refseq_cd95 \n\n" | |
$DIAMOND makedb --in refseq_cd95.fa --db refseq_cd95 | |
echo | |
(date +'%T - %Y-%m-%d') | |
echo -e "COMMAND : $SCRIPTSDIR/perl calc_dbstats.pl refseq_cd95.fa refseq_cd95.stats.txt \n\n" | |
perl $SCRIPTSDIR/calc_dbstats.pl refseq_cd95.fa refseq_cd95.stats.txt | |
################################ | |
# Add supplemental sequences from mmetsp and onekp to unique taxonomy id files | |
echo | |
echo Adding supplemental sequences to refseq database | |
(date +'%T - %Y-%m-%d') | |
#create temporary file for timestamp | |
epoch=`date +'%s'` | |
touch /tmp/JHW$epoch | |
echo -e "COMMAND : perl $SCRIPTSDIR/parse_db_by_taxid.pl $MMETSP $OUTDIR \n\n" | |
perl $SCRIPTSDIR/parse_db_by_taxid.pl $MMETSP $OUTDIR | |
echo -e "COMMAND : perl $SCRIPTSDIR/parse_db_by_taxid.pl $ONEKP $OUTDIR \n\n" | |
perl $SCRIPTSDIR/parse_db_by_taxid.pl $ONEKP $OUTDIR | |
################################ | |
# Run cd-hit to collapse similar sequences within a taxonomy id | |
cd $OUTDIR | |
echo "Number of unique taxonomy ids in database" | |
ls *_pep.fa | wc -l | |
echo | |
(date +'%T - %Y-%m-%d') | |
echo -e "COMMAND : Performing cd-hit within each taxid subfile to remove redundant sequences\n\n" | |
find *pep.fa -newer /tmp/JHW$epoch | parallel runcdhit {} | |
# for file in `find . -newer /tmp/JHW$epoch -print`; do | |
# echo $file | |
# taxid=$(echo "$file" | cut -f 1 -d '_') | |
# $CDHIT -i $file -o ${taxid}_cd${CDHIT_ID}.fa -c $CDHIT_THRESHOLD | |
# done | |
cd $DIR | |
echo | |
(date +'%T - %Y-%m-%d') | |
echo -e "COMMAND : cat $OUTDIR/*_cd95.fa > refseq_w_supp_cd95.fa \n\n" | |
cat $OUTDIR/*_cd95.fa > refseq_w_supp_cd95.fa | |
################################ | |
# Perform indexing and calculate reduced database statistics | |
echo | |
(date +'%T - %Y-%m-%d') | |
echo -e "COMMAND : $SFETCH --index refseq_w_supp_cd95.fa \n\n" | |
$SFETCH --index refseq_w_supp_cd95.fa | |
echo | |
(date +'%T - %Y-%m-%d') | |
echo -e "COMMAND : $DIAMOND makedb --in refseq_w_supp_cd95.fa --db refseq_w_supp_cd95 \n\n" | |
$DIAMOND makedb --in refseq_w_supp_cd95.fa --db refseq_w_supp_cd95 | |
echo | |
(date +'%T - %Y-%m-%d') | |
echo -e "COMMAND : $SCRIPTSDIR/perl calc_dbstats.pl refseq_w_supp_cd95.fa refseq_w_supp_cd95.stats.txt \n\n" | |
perl $SCRIPTSDIR/calc_dbstats.pl refseq_w_supp_cd95.fa refseq_w_supp_cd95.stats.txt | |