Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/bin/bash
#SBATCH --job-name=mkrsdb
#SBATCH --mail-type=END,FAIL
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=20
#SBATCH --mem=122G
#SBATCH --time=72:00:00
#SBATCH --output=%x.o%j
#SBATCH --error=%x.e%j
# usage: sbatch [--mail-user='your_email@email.com' --account='your_queue'] make_refseq_dbs.sub
# must be run on halstead. Needs > 118 Gb memory
#### User specified variables
SCRIPTSDIR="${LAB_REPO}/custom-refseq"
MMETSP="/depot/jwisecav/data/ggavelis/greg_db/9_pep_for_cdhit/Sept12_938libraries_for_db.faa"
ONEKP="$LAB_DBS/onekp/onekp_proteins.fa"
CDHIT='/group/bioinfo/apps/apps/cd-hit-v4.6.5-2016-0304/cd-hit -M 90000' #-M to specify memory in MB
SFETCH='esl-sfetch'
DIAMOND='diamond'
CDHIT_THRESHOLD=0.95
################################
module load bioinfo parallel
CDHIT_ID=$(echo "$CDHIT_THRESHOLD" | cut -f 2 -d '.')
cd $LAB_DBS
# Make a new directory to hold database
now=`date +"%Y-%m-%d"`
DIR="$LAB_DBS/custom-refseq-${now}"
if [ ! -d $DIR ]; then mkdir $DIR; fi
cd $DIR
# copy the current lineages.dmp over from report
cp $SCRIPTSDIR/lineages.dmp .
################################
# Download refseq data from NCBI
curl -O ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
tar -zxvf taxdump.tar.gz
RLS=`curl ftp://ftp.ncbi.nlm.nih.gov/refseq//release/RELEASE_NUMBER`
curl -O ftp://ftp.ncbi.nlm.nih.gov/refseq//release/release-catalog/RefSeq-release${RLS}.catalog.gz
curl -O ftp://ftp.ncbi.nlm.nih.gov/refseq//release/release-catalog//release${RLS}.AutonomousProtein2Genomic.gz
rsync --exclude '**/*.fna.gz' --exclude '**/*.bna.gz' --exclude '**/*.gbff.gz' --exclude '**/*.gpff.gz' --copy-links --recursive --times --verbose rsync://ftp.ncbi.nlm.nih.gov/refseq/release/complete .
#wget –quiet ftp://ftp.ncbi.nlm.nih.gov/refseq/release/complete/*.faa.gz
gunzip complete/*faa.gz
################################
# Filter and reformat refseq proteins
echo
(date +'%T - %Y-%m-%d')
echo -e "COMMAND : perl $SCRIPTSDIR/parse_refseq.pl RefSeq-release${RLS}.catalog.gz release${RLS}.AutonomousProtein2Genomic.gz refseq.fa \n\n"
perl $SCRIPTSDIR/parse_refseq.pl RefSeq-release${RLS}.catalog.gz release${RLS}.AutonomousProtein2Genomic.gz refseq.fa
################################
# Perform indexing and calculate database statistics
echo
(date +'%T - %Y-%m-%d')
echo -e "COMMAND : $SFETCH --index refseq.fa \n\n"
$SFETCH --index refseq.fa
echo
(date +'%T - %Y-%m-%d')
echo -e "COMMAND : $DIAMOND makedb --in refseq.fa --db refseq \n\n"
$DIAMOND makedb --in refseq.fa --db refseq
echo
(date +'%T - %Y-%m-%d')
echo -e "COMMAND : perl $SCRIPTSDIR/calc_dbstats.pl refseq.fa refseq.stats.txt \n\n"
perl $SCRIPTSDIR/calc_dbstats.pl refseq.fa refseq.stats.txt
################################
# Split up database by unique taxonomy id
OUTDIR="$DIR/uniq_taxids"
if [ ! -d uniq_taxids ]; then mkdir uniq_taxids; fi
echo
(date +'%T - %Y-%m-%d')
echo -e "COMMAND : perl $SCRIPTSDIR/parse_db_by_taxid.pl refseq.fa $OUTDIR \n\n"
perl $SCRIPTSDIR/parse_db_by_taxid.pl refseq.fa $OUTDIR
################################
# Run cd-hit to collapse similar sequences within a taxonomy id
cd $OUTDIR
echo "Number of unique taxonomy ids in database"
ls *_pep.fa | wc -l
echo
(date +'%T - %Y-%m-%d')
echo -e "COMMAND : Performing cd-hit within each taxid subfile to remove redundant sequences\n\n"
export CDHIT CDHIT_THRESHOLD CDHIT_ID
runcdhit() {
file=$1
taxid=$(echo "$file" | cut -f 1 -d '_')
echo $CDHIT -i $file -o ${taxid}_cd${CDHIT_ID}.fa -c $CDHIT_THRESHOLD
$CDHIT -i $file -o ${taxid}_cd${CDHIT_ID}.fa -c $CDHIT_THRESHOLD
}
export -f runcdhit
parallel runcdhit ::: *_pep.fa
# non parallel version
# for file in *_pep.fa; do
# echo $file
# taxid=$(echo "$file" | cut -f 1 -d '_')
# $CDHIT -i $file -o ${taxid}_cd${CDHIT_ID}.fa -c $CDHIT_THRESHOLD
# done
cd $DIR
echo
(date +'%T - %Y-%m-%d')
echo -e "COMMAND : cat $OUTDIR/*_cd95.fa > refseq_cd95.fa \n\n"
cat $OUTDIR/*_cd95.fa > refseq_cd95.fa
################################
# Perform indexing and calculate reduced database statistics
echo
(date +'%T - %Y-%m-%d')
echo -e "COMMAND : $SFETCH --index refseq_cd95.fa \n\n"
$SFETCH --index refseq_cd95.fa
echo
(date +'%T - %Y-%m-%d')
echo -e "COMMAND : $DIAMOND makedb --in refseq_cd95.fa --db refseq_cd95 \n\n"
$DIAMOND makedb --in refseq_cd95.fa --db refseq_cd95
echo
(date +'%T - %Y-%m-%d')
echo -e "COMMAND : $SCRIPTSDIR/perl calc_dbstats.pl refseq_cd95.fa refseq_cd95.stats.txt \n\n"
perl $SCRIPTSDIR/calc_dbstats.pl refseq_cd95.fa refseq_cd95.stats.txt
################################
# Add supplemental sequences from mmetsp and onekp to unique taxonomy id files
echo
echo Adding supplemental sequences to refseq database
(date +'%T - %Y-%m-%d')
#create temporary file for timestamp
epoch=`date +'%s'`
touch /tmp/JHW$epoch
echo -e "COMMAND : perl $SCRIPTSDIR/parse_db_by_taxid.pl $MMETSP $OUTDIR \n\n"
perl $SCRIPTSDIR/parse_db_by_taxid.pl $MMETSP $OUTDIR
echo -e "COMMAND : perl $SCRIPTSDIR/parse_db_by_taxid.pl $ONEKP $OUTDIR \n\n"
perl $SCRIPTSDIR/parse_db_by_taxid.pl $ONEKP $OUTDIR
################################
# Run cd-hit to collapse similar sequences within a taxonomy id
cd $OUTDIR
echo "Number of unique taxonomy ids in database"
ls *_pep.fa | wc -l
echo
(date +'%T - %Y-%m-%d')
echo -e "COMMAND : Performing cd-hit within each taxid subfile to remove redundant sequences\n\n"
find *pep.fa -newer /tmp/JHW$epoch | parallel runcdhit {}
# for file in `find . -newer /tmp/JHW$epoch -print`; do
# echo $file
# taxid=$(echo "$file" | cut -f 1 -d '_')
# $CDHIT -i $file -o ${taxid}_cd${CDHIT_ID}.fa -c $CDHIT_THRESHOLD
# done
cd $DIR
echo
(date +'%T - %Y-%m-%d')
echo -e "COMMAND : cat $OUTDIR/*_cd95.fa > refseq_w_supp_cd95.fa \n\n"
cat $OUTDIR/*_cd95.fa > refseq_w_supp_cd95.fa
################################
# Perform indexing and calculate reduced database statistics
echo
(date +'%T - %Y-%m-%d')
echo -e "COMMAND : $SFETCH --index refseq_w_supp_cd95.fa \n\n"
$SFETCH --index refseq_w_supp_cd95.fa
echo
(date +'%T - %Y-%m-%d')
echo -e "COMMAND : $DIAMOND makedb --in refseq_w_supp_cd95.fa --db refseq_w_supp_cd95 \n\n"
$DIAMOND makedb --in refseq_w_supp_cd95.fa --db refseq_w_supp_cd95
echo
(date +'%T - %Y-%m-%d')
echo -e "COMMAND : $SCRIPTSDIR/perl calc_dbstats.pl refseq_w_supp_cd95.fa refseq_w_supp_cd95.stats.txt \n\n"
perl $SCRIPTSDIR/calc_dbstats.pl refseq_w_supp_cd95.fa refseq_w_supp_cd95.stats.txt