diff --git a/script/adj/adj_adjust_gtex.sh b/script/adj/adj_adjust_gtex.sh index 0b58cb4..ffe5ce7 100755 --- a/script/adj/adj_adjust_gtex.sh +++ b/script/adj/adj_adjust_gtex.sh @@ -21,9 +21,10 @@ cd $SIGNET_ROOT ## adjust expression by the covariates: top pcs, without PEER factors, Sex, Platform, Protocol $SIGNET_SCRIPT_ROOT/adj/gexp_cov_adjust.py --expr ${rest}_expression_normalized_igt2log_GTEx_${tissue}.expression.bed.gz \ --covf $SIGNET_TMP_ROOT/tmpa/all_covs_withoutigt_${tissue}_no_peer.combined_covariates.txt \ ---prefix ${resa}_rmpc > log_lung_withoutigt_no_peer +--prefix tmp_rmpc > log_lung_withoutigt_no_peer_rmpc $SIGNET_SCRIPT_ROOT/adj/gexp_cov_adjust.py --expr ${rest}_expression_normalized_igt2log_GTEx_${tissue}.expression.bed.gz \ --covf $SIGNET_TMP_ROOT/tmpa/all_covs_withoutigtpc_${tissue}_no_peer.combined_covariates.txt \ ---prefix ${resa} > log_lung_withoutigt_no_peer +--prefix tmp > log_lung_withoutigt_no_peer +Rscript protein_coding_gtex.R diff --git a/script/adj/protein_coding_gtex.R b/script/adj/protein_coding_gtex.R new file mode 100644 index 0000000..949a6df --- /dev/null +++ b/script/adj/protein_coding_gtex.R @@ -0,0 +1,37 @@ + +##Filter and order genes +library(data.table) +library(rtracklayer) +gtf <- rtracklayer::import(Sys.getenv(gtf)) +gtf <- as.data.frame(gtf) +gtf[, 1] <- substring(gtf[, 1], 4) + +data_rmpc <- as.matrix(fread("tmp_rmpc.gexp.data")) + +data <- as.matrix(fread("tmp.gexp.data")) + +gexp1 <- fread(paste0(Sys.getenv("rest"), "_expression_normalized_igt2log_GTEx_", Sys.getenv("tissue"), ".expression.bed.gz")) # gene by sample +POS <- gexp1[, c(1:4)] +names(POS) <-c("chr","start","end","gene") + +gtf_cut <- gtf[which(gtf$gene_id %in% as.matrix(POS[, 4])), ] +idx_pc <- (gtf_cut[ ,12]=="protein_coding")&(gtf_cut[ ,7]=="gene") + +POS <- unique(gtf_cut[idx_pc, c(10, 1, 2, 3)]) +names(POS) <- c("gene","chr","start","end") +##16762 protein coding genes + +data <- data[, match(as.matrix(POS[, 1]), as.matrix(gexp1[, 4]))] +data_rmpc <- data_rmpc[, match(as.matrix(POS[, 1]), as.matrix(gexp1[, 4]))] + +# remove genes whose size is greater than 2.3Mb +distance=POS$end-POS$start +idx <- which(distance<=2.3e6) + +POS <- POS[idx,] #16761 genes left +data <- data[,idx] +data_rmpc <- data_rmpc[, idx] + +fwrite(POS, paste0(Sys.getenv("resa"), "_gene_pos"),row.names=F,col.names=F,quote=F,sep=" ") +fwrite(data, paste0(Sys.getenv("resa"), "_gexp.data"),row.names=F,col.names=F,quote=F,sep=" ") +fwrite(data_rmpc, paste0(Sys.getenv("resa"), "_gexp_rmpc.data"),row.names=F,col.names=F,quote=F,sep=" ") diff --git a/script/adj_portal_gtex.sh b/script/adj_portal_gtex.sh index e53b7b4..9461e0a 100755 --- a/script/adj_portal_gtex.sh +++ b/script/adj_portal_gtex.sh @@ -9,12 +9,13 @@ usage() { exit -1 } -pheno=$($SIGNET_ROOT/signet -s --pheno | sed -r '/^\s*$/d') +pheno=$($SIGNET_ROOT/signet -s --pheno | sed -r '/^\s*$/d' | xargs readlink -f) +gtf=$($SIGNET_ROOT/signet -s --gtf | sed -r '/^\s*$/d' | xargs readlink -f) tissue=$($SIGNET_ROOT/signet -s --tissue | sed -r '/^\s*$/d') -anno=$($SIGNET_ROOT/signet -s --anno | sed -r '/^\s*$/d' | readlink -f ) -resa=$($SIGNET_ROOT/signet -s --resa.gtex | sed -r '/^\s*$/d' | readlink -f ) -resg=$($SIGNET_ROOT/signet -s --resg.gtex | sed -r '/^\s*$/d' | readlink -f) -rest=$($SIGNET_ROOT/signet -s --rest.gtex | sed -r '/^\s*$/d' | readlink -f) +anno=$($SIGNET_ROOT/signet -s --anno | sed -r '/^\s*$/d' | xargs readlink -f) +resa=$($SIGNET_ROOT/signet -s --resa.gtex | sed -r '/^\s*$/d' | xargs readlink -f) +resg=$($SIGNET_ROOT/signet -s --resg.gtex | sed -r '/^\s*$/d' | xargs readlink -f) +rest=$($SIGNET_ROOT/signet -s --rest.gtex | sed -r '/^\s*$/d' | xargs readlink -f) forcerm=$($SIGNET_ROOT/signet -s --forcerm | sed -r '/^\s*$/d') ARGS=`getopt -a -o a:r -l h:,pheno:,resa:,help -- "$@"` @@ -26,6 +27,7 @@ do case "$1" in --pheno) pheno=$2 + pheno=$(readlink -f $pheno) $SIGNET_ROOT/signet -s --pheno $pheno shift;; --resa) @@ -51,7 +53,7 @@ if [[ "$resa" == *"doesn't exist"* ]]; then exit -1 fi -var="pheno tissue anno rest resa" +var="pheno tissue gtf anno rest resa" for i in $var do export "${i}"