From f091749cbdc7be6c92310b1d5e5ad4c2c448dd08 Mon Sep 17 00:00:00 2001 From: Zhongli Jiang Date: Sat, 30 Apr 2022 18:48:42 -0400 Subject: [PATCH 01/19] add geno --- config.ini | 1 + config.ini.default | 1 + 2 files changed, 2 insertions(+) diff --git a/config.ini b/config.ini index 89dda43..201d87a 100755 --- a/config.ini +++ b/config.ini @@ -41,6 +41,7 @@ cli.file = ./data/clinical.tsv resa.tcga = res/resa/signet (GTEX) +pheno = pheno.txt resa.gtex = res/resa/signet [plink] diff --git a/config.ini.default b/config.ini.default index 89dda43..201d87a 100755 --- a/config.ini.default +++ b/config.ini.default @@ -41,6 +41,7 @@ cli.file = ./data/clinical.tsv resa.tcga = res/resa/signet (GTEX) +pheno = pheno.txt resa.gtex = res/resa/signet [plink] From 2c624fb05b83741d06e7672438d103ec3683e1ba Mon Sep 17 00:00:00 2001 From: Zhongli Jiang Date: Sat, 30 Apr 2022 18:52:00 -0400 Subject: [PATCH 02/19] add full path for adj of GTEx --- script/adj_portal_gtex.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/script/adj_portal_gtex.sh b/script/adj_portal_gtex.sh index 6c84f51..e53b7b4 100755 --- a/script/adj_portal_gtex.sh +++ b/script/adj_portal_gtex.sh @@ -11,10 +11,10 @@ usage() { pheno=$($SIGNET_ROOT/signet -s --pheno | sed -r '/^\s*$/d') tissue=$($SIGNET_ROOT/signet -s --tissue | sed -r '/^\s*$/d') -anno=$($SIGNET_ROOT/signet -s --anno | sed -r '/^\s*$/d') -resa=$($SIGNET_ROOT/signet -s --resa.gtex | sed -r '/^\s*$/d') -resg=$($SIGNET_ROOT/signet -s --resg.gtex | sed -r '/^\s*$/d') -rest=$($SIGNET_ROOT/signet -s --rest.gtex | sed -r '/^\s*$/d') +anno=$($SIGNET_ROOT/signet -s --anno | sed -r '/^\s*$/d' | readlink -f ) +resa=$($SIGNET_ROOT/signet -s --resa.gtex | sed -r '/^\s*$/d' | readlink -f ) +resg=$($SIGNET_ROOT/signet -s --resg.gtex | sed -r '/^\s*$/d' | readlink -f) +rest=$($SIGNET_ROOT/signet -s --rest.gtex | sed -r '/^\s*$/d' | readlink -f) forcerm=$($SIGNET_ROOT/signet -s --forcerm | sed -r '/^\s*$/d') ARGS=`getopt -a -o a:r -l h:,pheno:,resa:,help -- "$@"` From 3a6358ce37e46e3968f86f910973bcd69d0a39e5 Mon Sep 17 00:00:00 2001 From: Zhongli Jiang Date: Sun, 1 May 2022 01:30:33 -0400 Subject: [PATCH 03/19] protein coding for GTEx --- script/adj/adj_adjust_gtex.sh | 5 +++-- script/adj/protein_coding_gtex.R | 37 ++++++++++++++++++++++++++++++++ script/adj_portal_gtex.sh | 14 ++++++------ 3 files changed, 48 insertions(+), 8 deletions(-) create mode 100644 script/adj/protein_coding_gtex.R diff --git a/script/adj/adj_adjust_gtex.sh b/script/adj/adj_adjust_gtex.sh index 0b58cb4..ffe5ce7 100755 --- a/script/adj/adj_adjust_gtex.sh +++ b/script/adj/adj_adjust_gtex.sh @@ -21,9 +21,10 @@ cd $SIGNET_ROOT ## adjust expression by the covariates: top pcs, without PEER factors, Sex, Platform, Protocol $SIGNET_SCRIPT_ROOT/adj/gexp_cov_adjust.py --expr ${rest}_expression_normalized_igt2log_GTEx_${tissue}.expression.bed.gz \ --covf $SIGNET_TMP_ROOT/tmpa/all_covs_withoutigt_${tissue}_no_peer.combined_covariates.txt \ ---prefix ${resa}_rmpc > log_lung_withoutigt_no_peer +--prefix tmp_rmpc > log_lung_withoutigt_no_peer_rmpc $SIGNET_SCRIPT_ROOT/adj/gexp_cov_adjust.py --expr ${rest}_expression_normalized_igt2log_GTEx_${tissue}.expression.bed.gz \ --covf $SIGNET_TMP_ROOT/tmpa/all_covs_withoutigtpc_${tissue}_no_peer.combined_covariates.txt \ ---prefix ${resa} > log_lung_withoutigt_no_peer +--prefix tmp > log_lung_withoutigt_no_peer +Rscript protein_coding_gtex.R diff --git a/script/adj/protein_coding_gtex.R b/script/adj/protein_coding_gtex.R new file mode 100644 index 0000000..949a6df --- /dev/null +++ b/script/adj/protein_coding_gtex.R @@ -0,0 +1,37 @@ + +##Filter and order genes +library(data.table) +library(rtracklayer) +gtf <- rtracklayer::import(Sys.getenv(gtf)) +gtf <- as.data.frame(gtf) +gtf[, 1] <- substring(gtf[, 1], 4) + +data_rmpc <- as.matrix(fread("tmp_rmpc.gexp.data")) + +data <- as.matrix(fread("tmp.gexp.data")) + +gexp1 <- fread(paste0(Sys.getenv("rest"), "_expression_normalized_igt2log_GTEx_", Sys.getenv("tissue"), ".expression.bed.gz")) # gene by sample +POS <- gexp1[, c(1:4)] +names(POS) <-c("chr","start","end","gene") + +gtf_cut <- gtf[which(gtf$gene_id %in% as.matrix(POS[, 4])), ] +idx_pc <- (gtf_cut[ ,12]=="protein_coding")&(gtf_cut[ ,7]=="gene") + +POS <- unique(gtf_cut[idx_pc, c(10, 1, 2, 3)]) +names(POS) <- c("gene","chr","start","end") +##16762 protein coding genes + +data <- data[, match(as.matrix(POS[, 1]), as.matrix(gexp1[, 4]))] +data_rmpc <- data_rmpc[, match(as.matrix(POS[, 1]), as.matrix(gexp1[, 4]))] + +# remove genes whose size is greater than 2.3Mb +distance=POS$end-POS$start +idx <- which(distance<=2.3e6) + +POS <- POS[idx,] #16761 genes left +data <- data[,idx] +data_rmpc <- data_rmpc[, idx] + +fwrite(POS, paste0(Sys.getenv("resa"), "_gene_pos"),row.names=F,col.names=F,quote=F,sep=" ") +fwrite(data, paste0(Sys.getenv("resa"), "_gexp.data"),row.names=F,col.names=F,quote=F,sep=" ") +fwrite(data_rmpc, paste0(Sys.getenv("resa"), "_gexp_rmpc.data"),row.names=F,col.names=F,quote=F,sep=" ") diff --git a/script/adj_portal_gtex.sh b/script/adj_portal_gtex.sh index e53b7b4..9461e0a 100755 --- a/script/adj_portal_gtex.sh +++ b/script/adj_portal_gtex.sh @@ -9,12 +9,13 @@ usage() { exit -1 } -pheno=$($SIGNET_ROOT/signet -s --pheno | sed -r '/^\s*$/d') +pheno=$($SIGNET_ROOT/signet -s --pheno | sed -r '/^\s*$/d' | xargs readlink -f) +gtf=$($SIGNET_ROOT/signet -s --gtf | sed -r '/^\s*$/d' | xargs readlink -f) tissue=$($SIGNET_ROOT/signet -s --tissue | sed -r '/^\s*$/d') -anno=$($SIGNET_ROOT/signet -s --anno | sed -r '/^\s*$/d' | readlink -f ) -resa=$($SIGNET_ROOT/signet -s --resa.gtex | sed -r '/^\s*$/d' | readlink -f ) -resg=$($SIGNET_ROOT/signet -s --resg.gtex | sed -r '/^\s*$/d' | readlink -f) -rest=$($SIGNET_ROOT/signet -s --rest.gtex | sed -r '/^\s*$/d' | readlink -f) +anno=$($SIGNET_ROOT/signet -s --anno | sed -r '/^\s*$/d' | xargs readlink -f) +resa=$($SIGNET_ROOT/signet -s --resa.gtex | sed -r '/^\s*$/d' | xargs readlink -f) +resg=$($SIGNET_ROOT/signet -s --resg.gtex | sed -r '/^\s*$/d' | xargs readlink -f) +rest=$($SIGNET_ROOT/signet -s --rest.gtex | sed -r '/^\s*$/d' | xargs readlink -f) forcerm=$($SIGNET_ROOT/signet -s --forcerm | sed -r '/^\s*$/d') ARGS=`getopt -a -o a:r -l h:,pheno:,resa:,help -- "$@"` @@ -26,6 +27,7 @@ do case "$1" in --pheno) pheno=$2 + pheno=$(readlink -f $pheno) $SIGNET_ROOT/signet -s --pheno $pheno shift;; --resa) @@ -51,7 +53,7 @@ if [[ "$resa" == *"doesn't exist"* ]]; then exit -1 fi -var="pheno tissue anno rest resa" +var="pheno tissue gtf anno rest resa" for i in $var do export "${i}" From 0ea5df517ca39dee8a3974e69afaf7993e606baf Mon Sep 17 00:00:00 2001 From: Zhongli Jiang Date: Sun, 1 May 2022 02:15:26 -0400 Subject: [PATCH 04/19] protein coding --- script/adj/adj_adjust_gtex.sh | 4 ++-- script/adj/protein_coding_gtex.R | 3 ++- script/adj_portal_gtex.sh | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/script/adj/adj_adjust_gtex.sh b/script/adj/adj_adjust_gtex.sh index ffe5ce7..4c172cd 100755 --- a/script/adj/adj_adjust_gtex.sh +++ b/script/adj/adj_adjust_gtex.sh @@ -17,7 +17,7 @@ empty.txt \ all_covs_withoutigtpc_${tissue}_no_peer && echo -e 'Begin Preprocessing ...\n' -cd $SIGNET_ROOT +cd $SIGNET_TMP_ROOT/tmpa ## adjust expression by the covariates: top pcs, without PEER factors, Sex, Platform, Protocol $SIGNET_SCRIPT_ROOT/adj/gexp_cov_adjust.py --expr ${rest}_expression_normalized_igt2log_GTEx_${tissue}.expression.bed.gz \ --covf $SIGNET_TMP_ROOT/tmpa/all_covs_withoutigt_${tissue}_no_peer.combined_covariates.txt \ @@ -27,4 +27,4 @@ $SIGNET_SCRIPT_ROOT/adj/gexp_cov_adjust.py --expr ${rest}_expression_normalized_ --covf $SIGNET_TMP_ROOT/tmpa/all_covs_withoutigtpc_${tissue}_no_peer.combined_covariates.txt \ --prefix tmp > log_lung_withoutigt_no_peer -Rscript protein_coding_gtex.R +Rscript $SIGNET_SCRIPT_ROOT/adj/protein_coding_gtex.R diff --git a/script/adj/protein_coding_gtex.R b/script/adj/protein_coding_gtex.R index 949a6df..09a01d4 100644 --- a/script/adj/protein_coding_gtex.R +++ b/script/adj/protein_coding_gtex.R @@ -2,7 +2,8 @@ ##Filter and order genes library(data.table) library(rtracklayer) -gtf <- rtracklayer::import(Sys.getenv(gtf)) +print(Sys.getenv("gtf")) +gtf <- rtracklayer::import(Sys.getenv("gtf")) gtf <- as.data.frame(gtf) gtf[, 1] <- substring(gtf[, 1], 4) diff --git a/script/adj_portal_gtex.sh b/script/adj_portal_gtex.sh index 9461e0a..39b64ff 100755 --- a/script/adj_portal_gtex.sh +++ b/script/adj_portal_gtex.sh @@ -10,7 +10,7 @@ usage() { } pheno=$($SIGNET_ROOT/signet -s --pheno | sed -r '/^\s*$/d' | xargs readlink -f) -gtf=$($SIGNET_ROOT/signet -s --gtf | sed -r '/^\s*$/d' | xargs readlink -f) +gtf=$($SIGNET_ROOT/signet -s --gtf.file | sed -r '/^\s*$/d' | xargs readlink -f) tissue=$($SIGNET_ROOT/signet -s --tissue | sed -r '/^\s*$/d') anno=$($SIGNET_ROOT/signet -s --anno | sed -r '/^\s*$/d' | xargs readlink -f) resa=$($SIGNET_ROOT/signet -s --resa.gtex | sed -r '/^\s*$/d' | xargs readlink -f) @@ -59,6 +59,7 @@ do export "${i}" done + # check file existence input_file="pheno anno" for i in $input_file From 23401c47a5644aba93c7781915a7f359605c28f4 Mon Sep 17 00:00:00 2001 From: Zhongli Jiang Date: Sun, 1 May 2022 02:25:28 -0400 Subject: [PATCH 05/19] ensemble id to gene name --- script/adj/protein_coding_gtex.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/script/adj/protein_coding_gtex.R b/script/adj/protein_coding_gtex.R index 09a01d4..2ffba85 100644 --- a/script/adj/protein_coding_gtex.R +++ b/script/adj/protein_coding_gtex.R @@ -18,7 +18,10 @@ names(POS) <-c("chr","start","end","gene") gtf_cut <- gtf[which(gtf$gene_id %in% as.matrix(POS[, 4])), ] idx_pc <- (gtf_cut[ ,12]=="protein_coding")&(gtf_cut[ ,7]=="gene") -POS <- unique(gtf_cut[idx_pc, c(10, 1, 2, 3)]) +#ensembl id +#POS <- unique(gtf_cut[idx_pc, c(10, 1, 2, 3)]) +#gene name +POS <- unique(gtf_cut[idx_pc, c(13, 1, 2, 3)]) names(POS) <- c("gene","chr","start","end") ##16762 protein coding genes @@ -30,6 +33,7 @@ distance=POS$end-POS$start idx <- which(distance<=2.3e6) POS <- POS[idx,] #16761 genes left +POS[, 2] <- paste0("chr", POS[, 2]) data <- data[,idx] data_rmpc <- data_rmpc[, idx] From 3a89b9bfd3ae32b59a3df655b426eb6cc24f3978 Mon Sep 17 00:00:00 2001 From: Zhongli Jiang Date: Sun, 1 May 2022 02:36:15 -0400 Subject: [PATCH 06/19] remove print gtf --- script/adj/protein_coding_gtex.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/script/adj/protein_coding_gtex.R b/script/adj/protein_coding_gtex.R index 2ffba85..99a50a2 100644 --- a/script/adj/protein_coding_gtex.R +++ b/script/adj/protein_coding_gtex.R @@ -1,8 +1,7 @@ - ##Filter and order genes library(data.table) library(rtracklayer) -print(Sys.getenv("gtf")) + gtf <- rtracklayer::import(Sys.getenv("gtf")) gtf <- as.data.frame(gtf) gtf[, 1] <- substring(gtf[, 1], 4) From 69cc2a2b118e4c44c0cd1a2d2b8e614557c9a9c3 Mon Sep 17 00:00:00 2001 From: Zhongli Jiang Date: Sun, 1 May 2022 02:46:32 -0400 Subject: [PATCH 07/19] add info about ensmbl id --- script/adj/protein_coding_gtex.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/script/adj/protein_coding_gtex.R b/script/adj/protein_coding_gtex.R index 99a50a2..8f0f41f 100644 --- a/script/adj/protein_coding_gtex.R +++ b/script/adj/protein_coding_gtex.R @@ -18,14 +18,14 @@ gtf_cut <- gtf[which(gtf$gene_id %in% as.matrix(POS[, 4])), ] idx_pc <- (gtf_cut[ ,12]=="protein_coding")&(gtf_cut[ ,7]=="gene") #ensembl id -#POS <- unique(gtf_cut[idx_pc, c(10, 1, 2, 3)]) +POS_ensmbl <- unique(gtf_cut[idx_pc, c(10, 1, 2, 3)]) #gene name POS <- unique(gtf_cut[idx_pc, c(13, 1, 2, 3)]) -names(POS) <- c("gene","chr","start","end") +names(POS) <- names(POS_ensmbl) <- c("gene","chr","start","end") ##16762 protein coding genes -data <- data[, match(as.matrix(POS[, 1]), as.matrix(gexp1[, 4]))] -data_rmpc <- data_rmpc[, match(as.matrix(POS[, 1]), as.matrix(gexp1[, 4]))] +data <- data[, match(as.matrix(POS_ensmbl[, 1]), as.matrix(gexp1[, 4]))] +data_rmpc <- data_rmpc[, match(as.matrix(POS_ensmbl[, 1]), as.matrix(gexp1[, 4]))] # remove genes whose size is greater than 2.3Mb distance=POS$end-POS$start From dff6d2590ae0c795e1dc3d0bb2dbe37530e17430 Mon Sep 17 00:00:00 2001 From: Zhongli Jiang Date: Sun, 1 May 2022 03:05:32 -0400 Subject: [PATCH 08/19] message --- script/adj_portal_gtex.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/adj_portal_gtex.sh b/script/adj_portal_gtex.sh index 39b64ff..1f26598 100755 --- a/script/adj_portal_gtex.sh +++ b/script/adj_portal_gtex.sh @@ -67,6 +67,6 @@ do file_check $(eval "$(echo "echo \$${i}")") done -$SIGNET_SCRIPT_ROOT/adj/adj_gtex.sh && echo -e "Gene Expression and Genotype matching Finished\n" +$SIGNET_SCRIPT_ROOT/adj/adj_gtex.sh && echo -e "Adjusting for covariates finished\n" echo -e "Finish time: $(date)" From 7613b164344c9ee24c32e2910dd563f9607bc447 Mon Sep 17 00:00:00 2001 From: Zhongli Jiang Date: Sun, 1 May 2022 03:10:30 -0400 Subject: [PATCH 09/19] message update --- script/cis-eQTL/cis-eQTL.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/cis-eQTL/cis-eQTL.sh b/script/cis-eQTL/cis-eQTL.sh index 3457141..e495a0b 100755 --- a/script/cis-eQTL/cis-eQTL.sh +++ b/script/cis-eQTL/cis-eQTL.sh @@ -1,5 +1,5 @@ #!/bin/bash -echo -e "Splitting the snps by maf\n" +echo -e "Splitting the SNPs by maf\n" $SIGNET_SCRIPT_ROOT/cis-eQTL/snpsplit.sh && From 62e2bd455a039e448f90a51816595535b6a2da40 Mon Sep 17 00:00:00 2001 From: Zhongli Jiang Date: Sun, 1 May 2022 17:28:44 -0400 Subject: [PATCH 10/19] enable fread --- script/cis-eQTL/common.ciseQTL.r | 5 +++-- script/cis-eQTL/low.ciseQTL.r | 4 ++-- script/cis-eQTL/rare.ciseQTL.r | 5 +++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/script/cis-eQTL/common.ciseQTL.r b/script/cis-eQTL/common.ciseQTL.r index 26b8b90..88e9ab5 100755 --- a/script/cis-eQTL/common.ciseQTL.r +++ b/script/cis-eQTL/common.ciseQTL.r @@ -1,8 +1,9 @@ ### common.ciseQTL.R ### Calculate p-values of all common cis-eQTL # -y=read.table(Sys.getenv("gexp")) -x=read.table('common.Geno.data') +library(data.table) +y=fread(Sys.getenv("gexp")) +x=fread('common.Geno.data') y=as.matrix(y) x=as.matrix(x) y=scale(y) diff --git a/script/cis-eQTL/low.ciseQTL.r b/script/cis-eQTL/low.ciseQTL.r index 521b746..5062644 100755 --- a/script/cis-eQTL/low.ciseQTL.r +++ b/script/cis-eQTL/low.ciseQTL.r @@ -1,8 +1,8 @@ ### low.ciseQTL.R ### Calculate p-values of low-freq cis-eQTL via permutation # -y=read.table(Sys.getenv("gexp")) -x=read.table('low.Geno.data') +y=fread(Sys.getenv("gexp")) +x=fread('low.Geno.data') idx=read.table('low.cispair.idx') y=as.matrix(y) x=as.matrix(x) diff --git a/script/cis-eQTL/rare.ciseQTL.r b/script/cis-eQTL/rare.ciseQTL.r index 6b56754..bf7b2bf 100644 --- a/script/cis-eQTL/rare.ciseQTL.r +++ b/script/cis-eQTL/rare.ciseQTL.r @@ -1,8 +1,9 @@ ### rare.ciseQTL.R ### Calculate p-values of rare-freq cis-eQTL via permutation # -y=read.table(Sys.getenv("gexp")) -x=read.table('rare.Geno.data') +library(data.table) +y=fread(Sys.getenv("gexp")) +x=fread('rare.Geno.data') idx=read.table('rare.cispair.idx') y=as.matrix(y) x=as.matrix(x) From 7e54c1444e8f65af40137198a31d705a641cf88b Mon Sep 17 00:00:00 2001 From: Zhongli Jiang Date: Sun, 1 May 2022 18:28:40 -0400 Subject: [PATCH 11/19] speed improvement --- script/cis-eQTL/cis-SNPs.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/script/cis-eQTL/cis-SNPs.sh b/script/cis-eQTL/cis-SNPs.sh index a5d19f3..3bc9c84 100755 --- a/script/cis-eQTL/cis-SNPs.sh +++ b/script/cis-eQTL/cis-SNPs.sh @@ -11,11 +11,11 @@ awk -f low.snps.idx < $geno 1>low.Geno.data 2>err_low & awk -f common.snps.idx < $geno 1>common.Geno.data 2>err_common & ### SNP position (chr#, SNP pos) -awk '{print $1,$4}' $snps_map > all.SNPpos -awk '{print $1,$4}' rare.Geno.map > rare.SNPpos -awk '{print $1,$4}' low.Geno.map > low.SNPpos -awk '{print $1,$4}' common.Geno.map > common.SNPpos +awk '{print $1,$4}' $snps_map > all.SNPpos & +awk '{print $1,$4}' rare.Geno.map > rare.SNPpos & +awk '{print $1,$4}' low.Geno.map > low.SNPpos & +awk '{print $1,$4}' common.Geno.map > common.SNPpos & -Rscript $SIGNET_SCRIPT_ROOT/cis-eQTL/rare.cispair.r "upstream='$upstream'" "downstream='$downstream'" -Rscript $SIGNET_SCRIPT_ROOT/cis-eQTL/common.cispair.r "upstream='$upstream'" "downstream='$downstream'" -Rscript $SIGNET_SCRIPT_ROOT/cis-eQTL/low.cispair.r "upstream='$upstream'" "downstream='$downstream'" +Rscript $SIGNET_SCRIPT_ROOT/cis-eQTL/rare.cispair.r "upstream='$upstream'" "downstream='$downstream'" & +Rscript $SIGNET_SCRIPT_ROOT/cis-eQTL/common.cispair.r "upstream='$upstream'" "downstream='$downstream'" & +Rscript $SIGNET_SCRIPT_ROOT/cis-eQTL/low.cispair.r "upstream='$upstream'" "downstream='$downstream'" & From 92fc3e28f294ff74c75840e7f8cccd96c16310ec Mon Sep 17 00:00:00 2001 From: Zhongli Jiang Date: Sun, 1 May 2022 18:45:00 -0400 Subject: [PATCH 12/19] add wait command --- script/cis-eQTL/cis-SNPs.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/script/cis-eQTL/cis-SNPs.sh b/script/cis-eQTL/cis-SNPs.sh index 3bc9c84..71db4c6 100755 --- a/script/cis-eQTL/cis-SNPs.sh +++ b/script/cis-eQTL/cis-SNPs.sh @@ -11,11 +11,12 @@ awk -f low.snps.idx < $geno 1>low.Geno.data 2>err_low & awk -f common.snps.idx < $geno 1>common.Geno.data 2>err_common & ### SNP position (chr#, SNP pos) -awk '{print $1,$4}' $snps_map > all.SNPpos & -awk '{print $1,$4}' rare.Geno.map > rare.SNPpos & -awk '{print $1,$4}' low.Geno.map > low.SNPpos & -awk '{print $1,$4}' common.Geno.map > common.SNPpos & +awk '{print $1,$4}' $snps_map > all.SNPpos +awk '{print $1,$4}' rare.Geno.map > rare.SNPpos +awk '{print $1,$4}' low.Geno.map > low.SNPpos +awk '{print $1,$4}' common.Geno.map > common.SNPpos Rscript $SIGNET_SCRIPT_ROOT/cis-eQTL/rare.cispair.r "upstream='$upstream'" "downstream='$downstream'" & Rscript $SIGNET_SCRIPT_ROOT/cis-eQTL/common.cispair.r "upstream='$upstream'" "downstream='$downstream'" & Rscript $SIGNET_SCRIPT_ROOT/cis-eQTL/low.cispair.r "upstream='$upstream'" "downstream='$downstream'" & +wait From 37d5dbe08ecefb6cefbc7eb8b3e95f3be7473732 Mon Sep 17 00:00:00 2001 From: Zhongli Jiang Date: Sun, 1 May 2022 20:11:10 -0400 Subject: [PATCH 13/19] user output refine --- script/cis-eQTL/common.ciseQTL.r | 2 +- script/cis-eQTL/low.ciseQTL.r | 2 +- script/cis-eQTL/rare.ciseQTL.r | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/script/cis-eQTL/common.ciseQTL.r b/script/cis-eQTL/common.ciseQTL.r index 88e9ab5..3dd3b8d 100755 --- a/script/cis-eQTL/common.ciseQTL.r +++ b/script/cis-eQTL/common.ciseQTL.r @@ -19,7 +19,7 @@ for (i in 1:len){ fit=lm(y[,idx[i,1]]~x[,idx[i,2]]) p[i]=summary(fit)$coefficients[2,4] } - if( (i%%100)==0) print(i) + if( (i%%10000)==0) print(i) } p=cbind(idx,p) # Col 1 is index of gene, Col 2 is index of its cis-SNP, Col 3 is p-value diff --git a/script/cis-eQTL/low.ciseQTL.r b/script/cis-eQTL/low.ciseQTL.r index 5062644..2e9bab9 100755 --- a/script/cis-eQTL/low.ciseQTL.r +++ b/script/cis-eQTL/low.ciseQTL.r @@ -93,7 +93,7 @@ for (i in YYstartYY:YYendYY){ } } - if((i%%100)==0) print(i) + if((i%%10000)==0) print(i) } write.table(w,"low.ciseQTL.weightYYY",row.names=F,col.names=F,quote=F,sep=" ") diff --git a/script/cis-eQTL/rare.ciseQTL.r b/script/cis-eQTL/rare.ciseQTL.r index bf7b2bf..7e5997d 100644 --- a/script/cis-eQTL/rare.ciseQTL.r +++ b/script/cis-eQTL/rare.ciseQTL.r @@ -94,7 +94,7 @@ for (i in YYstartYY:YYendYY){ } } - if((i%%100)==0) print(i) + if((i%%10000)==0) print(i) } write.table(w,"rare.ciseQTL.weightYYY",row.names=F,col.names=F,quote=F,sep=" ") From e0ab66788bc5e4dc41e19925fd262f4aca97191f Mon Sep 17 00:00:00 2001 From: Zhongli Jiang Date: Sun, 1 May 2022 20:45:32 -0400 Subject: [PATCH 14/19] data.table --- script/cis-eQTL/low.ciseQTL.r | 1 + 1 file changed, 1 insertion(+) diff --git a/script/cis-eQTL/low.ciseQTL.r b/script/cis-eQTL/low.ciseQTL.r index 2e9bab9..84db0e9 100755 --- a/script/cis-eQTL/low.ciseQTL.r +++ b/script/cis-eQTL/low.ciseQTL.r @@ -1,6 +1,7 @@ ### low.ciseQTL.R ### Calculate p-values of low-freq cis-eQTL via permutation # +library(data.table) y=fread(Sys.getenv("gexp")) x=fread('low.Geno.data') idx=read.table('low.cispair.idx') From e6e28cfe635b7c1b52a3d4e5ecbf9ff302adfe17 Mon Sep 17 00:00:00 2001 From: Zhongli Jiang Date: Sun, 1 May 2022 20:53:17 -0400 Subject: [PATCH 15/19] welcome message --- signet | 1 + 1 file changed, 1 insertion(+) diff --git a/signet b/signet index 9c2fda8..70fe2d3 100755 --- a/signet +++ b/signet @@ -81,6 +81,7 @@ case "$operate" in -a|-adj) adj;; *) + echo -e "WELCOME TO SIGNET !!! \n" echo -e "Usage params: -g|-t|-n|-c|-s|-v|-a" ;; esac From 47721bb391f2c0fe985ba9c247f5964a8d8f6a6d Mon Sep 17 00:00:00 2001 From: "Jiang, Zhongli" Date: Sun, 1 May 2022 20:56:18 -0400 Subject: [PATCH 16/19] Update README.md --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index e369ce3..9a1a3ad 100644 --- a/README.md +++ b/README.md @@ -134,7 +134,7 @@ signet -v ### Settings -Settings command is used for look up and modify parameter in the configuration file config.ini. You don't have to modify the parameters at the very beginning, as you will have options to change your input parameters in each step. +`signet -s` command is used for look up and modify parameter in the configuration file config.ini. You don't have to modify the parameters at the very beginning, as you will have options to change your input parameters in each step. [click here](#config-file) for detailed introduction for configuration file. @@ -187,7 +187,7 @@ echo: Please check the file name ### Transcript-prep (TCGA) -This command will take the matrix of log2(x+1) transcriptome count data and preprocess it. +`signet -t` command will take the matrix of log2(x+1) transcriptome count data and preprocess it. #### Usage @@ -268,7 +268,7 @@ signet -t --reads data/gexp/GTEx_gene_reads.gct \ (TCGA) -`geno-prep` command provide the user the interface of preprocessing genotype data. We will do quality control, after which we will use IMPUTE2 for imputation. +`signet -g` command provide the user the interface of preprocessing genotype data. We will do quality control, after which we will use IMPUTE2 for imputation. #### Usage @@ -324,7 +324,7 @@ Output of `geno-prep` will be saved under `/res/resg`: (GTEx) -`geno-prep` command provide the user the interface of preprocessing genotype data. We will first extract the genotype data that has corresponding samples from gene expression data for a particular tissue. +`signet -g` command provide the user the interface of preprocessing genotype data. We will first extract the genotype data that has corresponding samples from gene expression data for a particular tissue. @@ -373,8 +373,8 @@ signet -g --vcf0 data/geno-prep/Geno_GTEx.vcf \ ### Adj -`adj` command provide users the interface of matching genotype and gene expression file and the calculation for minor allele frequency (MAF) -`adj` read the output from `geno-prep` and `gexp-prep` +`signet -a` command provide users the interface of matching genotype and gene expression file and the calculation for minor allele frequency (MAF) +`signet -a` read the output from `geno-prep` and `gexp-prep` output of `adj` will be saved under `/res/resa`: (TCGA) @@ -429,7 +429,7 @@ signet -a --pheno \ ### Cis-eqtl -`cis-eqtl` command provide the basic tool for cis-eQTL analysis. `cis-eqtl` command receive the input file from the previous preprocess step. +`signet -c` command provide the basic tool for cis-eQTL analysis. `signet -c` command receive the input file from the previous preprocess step. @@ -482,7 +482,7 @@ Output of `cie-eQTL` will be saved to `res/resc`: ### Network -`network` command provide the tools for constructing a gene regulatory network (GRN) following the two-stage penalized least squares (2SPLS) approach proposed by [D. Zhang, M. Zhang, Ren, and Chen](https://arxiv.org/abs/1511.00370). +`signet -n` command provide the tools for constructing a gene regulatory network (GRN) following the two-stage penalized least squares (2SPLS) approach proposed by [D. Zhang, M. Zhang, Ren, and Chen](https://arxiv.org/abs/1511.00370). `network` receive the input from the previous step, or it could be the output data from your own pipeline: @@ -543,7 +543,7 @@ signet -n --nboots 100 --queue standby --walltime 4:00:00 --memory 256 ### Netvis -`netvis` provide tools to visualize our constructed gene regulatory networks. Users can choose the bootstrap frequency threshold and number of subnetworks to visualize the network. +`signet -v` provide tools to visualize our constructed gene regulatory networks. Users can choose the bootstrap frequency threshold and number of subnetworks to visualize the network. You should first ssh -Y $(hostname) to a server with DISPLAY if you would like to use the singularity container, and the result can be viewed through a pop up firefox web browser From b8d7c1d71a9fb29867a5c2cb0b10122db4ab3ba5 Mon Sep 17 00:00:00 2001 From: jiang548 Date: Mon, 2 May 2022 10:04:44 -0400 Subject: [PATCH 17/19] Remove file management --- File management.md | 70 ---------------------------------------------- 1 file changed, 70 deletions(-) delete mode 100644 File management.md diff --git a/File management.md b/File management.md deleted file mode 100644 index a87e26a..0000000 --- a/File management.md +++ /dev/null @@ -1,70 +0,0 @@ -Gene Expression - -Input files - -``` ---g | --gexp set gene expression file ---p | --pmap set the genecode gtf file ---tmpt set the temporary file directory ---rest set the result file directory - - -Temporary files - -name - -gexp - -Result files - -gene_name File with name for all the genes - -gene_pos File with the physical location for all the genes - -gene_gexp File with the gene expression after preprocessing - -gexpID File with the ID for genes left after preprocessing -``` - -Genotype - -Input files - -``` - --r | --reads set the GTEx gene reads file in gct format - --t | --tpm set the gene tpm file - --g | --gtf set the genecode gtf file -``` - -``` - --p | --ped set ped file - --m | --map set map file - --mind set the missing per individual cutoff - --geno set the missing per markder cutoff - --hwe set Hardy-Weinberg equilibrium cutoff - --nchr set the chromosome number - --r | --ref set the reference file for imputation - --gmap set the genomic map file - --i | --int set the interval length for impute2 - --ncores set the number of cores -``` - -Temporary files - -intermediate ped files - -intermediate map files - -Result files - -Genotype.sampleID - -Geno - -# Adjust for Covariates - -Input files - -Temporary files - -Result files From d424dccc6b405847e9b350ad57222581175dbc95 Mon Sep 17 00:00:00 2001 From: jiang548 Date: Mon, 2 May 2022 10:05:27 -0400 Subject: [PATCH 18/19] remove readme_back --- README_back.md | 555 ------------------------------------------------- 1 file changed, 555 deletions(-) delete mode 100755 README_back.md diff --git a/README_back.md b/README_back.md deleted file mode 100755 index a57b7ef..0000000 --- a/README_back.md +++ /dev/null @@ -1,555 +0,0 @@ - - -# Documentation for TSPLS streamline project - - -## Getting started -First you should clone the directory to your path in server and add the path you installed the software to enable directly running the command without specifying a particular path -```bash -git clone https://github.itap.purdue.edu/jiang548/2SPLS.git - -export PATH=$PATH:/path/to/tslps -``` - -## Introduction - -This streamline project provide users easy linux user interface for constructing whole-genome gene regulatory networks using transciptomic data (frome RNA sequence) and genotypic data. - -Procedures of constructing gene regulatory networks can be split into five main steps: -1. genenotype preprocess -2. gene expression preprocess -3. cis-eQTL analysis -4. network analysis -5. network visualization - -To use this streamline tool, user need first to prepare the genetype data in xxx format and gene expression data in xxx format. Then set the configuration file properly, and run each step command seperately. - -**Comments** - -We have to think about how to organize the data, especially intermediate results and final results in general. - -**1.** Setting (or configuration) files should be put in the current directory; - -**2.** Intermediate results may be put in different subdirectory of current directory? Like in `./tmp/` (or inside it to have `./tmpt/` for transcriptomic preprocessing; `./tmpg/` for genotpic preprocessing; `./tmpc/` for cis-eQTL mapping; './tmpn/' for network construction)? - -**3.** Final results may be put in current directory or a subdirectory? Like in `./res/` (or inside it to have `./rest/` for transcriptomic preprocessing; `./resg/` for genotpic preprocessing; `./resc/` for cis-eQTL mapping; './resn/' for network construction)? - - -## Quit Start - -#### 1. Prepare the DataSet - -We highly recommand you to prepare the gene expression data and genotype data first, and place them to a specific data folder, to organize each step as it may involve many files: - -[Click here](#data-format) for more detail about genotype and genexpression dataset - -#### 2. Set configuration - -Here we set the number of chromosome to 22 - -```bash -tspls config -m nchr 22 -``` - -**Comments** - -**1.** I would like to change the whole package name of tspls to `signet` (for **Statistical Inference on Gene (or Global) Networks**, or even simpler with `sign`?), so in the following I will always use `signet` in my comments; - -**2.** What about use `-[character]` for each function? So we may replace `config` with `-s` (for settings). For example, we can use -```bash -signet -s --nchr 22 -``` -to set the #chromosome to 22. Otherwise, if we want to check the #chromosome, we can use -```bash -signet -s --nchr -``` -That is, when no value is provided, we will display the value of the specified parameter. We can also use -```bash -signet -s -``` -to display the values of all parameters. We may also provide a way to reset the value of one parameter or all parameters to default values? Like the following? -```bash -signet -s --d -``` -or -```bash -signet -s --nchar --d -``` - -**3.** We also need a parameter to record the number of cohorts (or groups) so we can later on incoporate the codes of ReDNet and NANOVA into this package: -```bash -signet -s --ngrp 1 -``` -If `ngrp` is set to be larger than 1, we have to decide how to manage the transcriptomic files and genotype files. For example, if we want to use one file for each group, we need a separate file to map files for different groups? - - - -#### 3. Genotype Preprocess - -```bash -tspls geno-preprocess -``` - -**Comments** - -I would suggest to take -```bash -signet -g -``` -for preprocessing genotype data - - -#### 4. Genexpression Preprocess - -```bash -tspls genexp-preprocess -``` - -**Comments** - -I would suggest to take -```bash -signet -t -``` -for preprocessing transcriptomic (gene expression) data - - - -#### 5. cis-eQTL Analysis - -```bash -tspls cis-eQTL -``` - -**Comments** - -I would suggest to take -```bash -signet -c -``` -for cis-eQTL analysis - - - -#### 6. Network Analysis - -```bash -tspls network --nboots 10 -``` - -**Comments** - -I would suggest to take -```bash -signet -n --nboots 10 -``` -for network construction. Is it necessary to separate the two stages of the network construction (like `-n1` and `-n2`)? Of cource, it will be better if we can take it in one step. - - -#### 7. Network Visualization -Based on the coefficient matrix we got in the network analysis part, we will visualize our constructed gene regulatory networks. - -```bash -tspls netvis --freq 0.8 --ncount 2 -``` - -**Comments** - -**1.** I would suggest to take -```bash -signet -v --freq 0.8 --ntop 2 -``` -for network visualization. - -**2.** We should unify the way to set up options (or configurations). Previously, we simply use, for example, `nchar`, to list/set up its value. Should we remove the double dashes here (`--`) or include the double dashes also for configurations? Is `ncount 2` for the top 2 networks? If so, I'd rather use `ntop 2`? - - -## Command Guide - -### config - -Config command is used for look up and modify parameter in the config file config.ini -[click here](#config-file) for detailed introduction for configuration file - -#### Usage -```bash -config [-l [SECTION,]PARAM][-m [SECTION,]PARAM VALUE] -``` - -**Comments** - -My proposed commands instead are -```bash -signet -s [PARAM [PARAM VALUE]] -``` -Please see related comments in the above. I would rather not use the section name. - - -#### Description -```bash --l: list parameter value (section name may not be necessary) --m: modify parameter value (section name may not be necessary) -``` - -**Comments** - -Following my suggested way, when a parameter value is provided, we reset the parameter value as given; otherwise, we display the specified parameter value. - - - -#### Example -```bash -# List the paramter -config -l Basic,nchr -## echo: 26 -config -l nchr -## echo: 26 - -# Modify the paramter -config -m Basic,nchr 24 -``` - -**Comments** - -Following my suggestion, we will have -```bash -# List the paramter -signet -s --nchr -## echo: 22 - -# Modify the paramter -signet -s --nchr 24 -``` - -### gexp-prep - - - - -### geno-prep - -`geno-prep` command provide the user the interface of preprocessing genotype data - -`geno-prep` receive the `map` file and `ped` file as input: -- `data.map`: includes SNP location information with four columns,i.e.,[chromosomeSNP_name genetic_distance locus] for each of p SNPs. -- `data.ped`: includes pedgree information, i.e.,[family_IDindividual_IDmother_IDfather_ID gender phenotype] in the first six columns, followed by 2p columns with two columns for each of p SNPs - -Output of `geno-prep` will be saved under `/data/geno-prep`: - -• `Geno`: each row is a sample and each column is a SNP, with the first column for Sample ID; -• `clean_Genotype.map`: the corresponding map file; -• `clean_Genotype_chr$i.map`: map file for ith chromosome, with i =1,2,··· - -#### Usage - -```bash -geno-prep [--map MAP_FILE] [--ped PED_FILE][--imputation] -``` - -**Comments** - -Following my previous suggestion, -```bash -signet -g [--map MAP_FILE] [--ped PED_FILE][--imput] -``` -Does ``--imput`` imply to impute the missing genotype values? Are there any other options on how to impute the missing genotypes? - - -#### Options - -``` --p | --ped, set ped file --m | --map, set map file --i | --imputaion, use 1000genome for imputation -``` - -**Comments** - -**1.** Do you mean to use `-p` instead of `--p`? Anyway, we should unify the way to set up options/configurations. - -**2.** Should the intermediate results be saved in `./tmp/tmpg/`? I would replace `./data/` with either `./tmp/[tmpg/]` or `./res/[tmpg/]`, depending on whether you want to save the results for users. - - - - -### match -`match` command provide users the interface of matching genotype and gene expression file and the calculation for maf - -`match` read the output from `geno-prep` and `gexp-prep` - -output of `match` will be saved under `/data/match`: -- `new.Geno`: genotype file; -- `new.Geno.idx`: index of SNPs selected to new.Geno; -- `new.Geno.map`: map file; -- `new.Geno.maf`: each element is the minor allele frequency (MAF) for each SNP. - -**Comments** - -**1.** `match` should be replaced by `signet -match` (or simply `signet -m`)? - -**2.** The output should be saved under `./res/` or `./tmp/`? - -**2.** Should we unify all files with `new.Geno*` by `matched.Geno*` (any better names)? - - -#### usage -```bash -match [--ma 5] -``` - - - - - -#### option - -```bash ---ma | -m, minor alleles threshold -``` - -**Comments** - -Have to separate it from `-match`: should we use `--amin` for *minimum number of alleles*? - - -### cis-eqtl - -`cis-eqtl` command provide the basic tool for cis-eQTL analysis. `cis-eqtl` command receive the input file from the previous preprocess step. We will automatically copy the following output from the precious preprocess step to `data/cis-eQTL`: -- `snps.map` : snp map data -- `snps.maf` : snp maf data from previous step -- `gexp.file` : gene expression file -- `matched.geno` : matched genotype file from previous step -- `gene.pos` : gene position file - - -The results of `cis-eqtl` are output in to the following files, and they are all saved under `/data/cis-eQTL`: - -* `net.Gexp.data`: is the expression data for genes with cis-eQTL; -* `net.genepos`: include the position for genes in `net.Gexp.data`; -* `[common|low|rare|all].eQTL.data`: includes the genotype data for marginally significant [ common | low | rare | all ] cis-eQTL; -* `[common|low|rare|all].sig.pValue_0.05`: includes the p-value of each pair of gene and its marginally significant [ common | low | rare | all ] cis-eQTL, where Column 1 is Gene Index, Column is SNP Index in `common.eQTL.data`, and Column 3 is p-Value. -* `[common|low|rare|all].sig.weight_0.05`: includes the weight of collapsed SNPs for marginally significant cis-eQTL. The first column is the gene index, the second column is the SNP index, the third column is the index of collapsed SNP group, and the fourth column is the weight of each SNP in its collapsed group (with value 1 or -1). - -**Comments** - -**1.** Why shouldn't we directly access the output from the previous steps instead as we should try to minimize the usage of space? - -**2.** Use `signet -c` instead of `cis-eqtl`? - -**3.** The outputs should be saved into `./tmp/[tmpc/]` or `./res/[resc/]`? - -**4.** While we may allow users to specify the output file names, we should have default file names at the same time so users may not have to specify. - - -#### Usage -```bash - cis-eqtl --alpha 0.5 --ncis 9 --maxcor 1 --nperms 5 --upstream 1000 --downstream 1000 --map ./data/cis-eQTL/snps.map --maf ./data/cis-eQTL/snps.maf - ``` - -**Comments** - -**1.** Would suggest: -```bash -signet -c --alpha 0.05 --ncis 5 --maxcor 0.8 --nperms 100 --up 1000 --down 1000 -``` - -**2.** Use -``` - --rmax MAX_COR maximum corr. coeff. b/w cis-eQTL of same gene - --up UP_STREAM upstream region to flank the genetic region - --down DOWN_STREAM downstream region to flank the genetic region - --map MAP_FILE snps map file path [file or path?] - --maf MAF_FILE snps maf file path [file or path?] - --gexp GEXP_FILE gene expression file path [file or path?] - --geno GENO_FILE genotype file path [file or path?] -``` - - -#### Options -``` - --alpha | -a significant level for cis-eQTL - --ncis NCIS maximum number of cis-eQTL for each gene - --maxcor MAX_COR maximum corr. coeff. b/w cis-eQTL of same gene - --nperms N_PERMS numer of permutations - --upstream UP_STREAM upstream region to flank the genetic region - --downstream DOWN_STREAM downstream region to flank the genetic region - --map MAP_FILE snps map file path - --maf MAF_FILE snps maf file path - --gexp GEXP_FILE gene expression file path - --geno GENO_FILE genotype file path - --help | -h user guide -``` - -#### Example -```bash -cis-eqtl -a 0.05 --upstream 1000 --map snp.map -``` - -### network - -`network` command provide the tools for constructing a gene regulatory network (GRN) following the two-stage penalized least squares (2SPLS) approach proposed by [D. Zhang, M. Zhang, Ren, and Chen](https://arxiv.org/abs/1511.00370). - -`network` receive the input from the previous step: - -* `net.Gexp.data`: output from `cis-eqtl`, is the expression data for genes with cis-eQTL: -* `all.eQTL.data`: output from `cis-eqtl`, includes the genotype data for marginally significant cis-eQTL: -* `all.sig.pValue_0.05`: output from `cis-eqtl`,includes the $p$-value of each pair of gene and its marginally significant (p-Value < 0.05) cis-eQTL, where Column 1 is Gene Index (in `net.Gexp.data`), Column is SNP Index (in `all.Geno.data`), and Column 3 is p-Value. - -The final output files of `network` will be saved under `/data/network/stage2`: -* `adjacency_matrix`: the adjancency matrix for the estimated regulatory effects; -* `coefficient_matrix`: the coefficient matrix for the estimated regulatory effects; - -#### usage -```bash -network [OPTION VAL] ... -``` - -**Comments** - -Would rather take the commands -```bash -signet -n [OPTION VAL] ... -``` - - - -#### description - -```bash - --ncis NCIS maximum number of cis-eQTL for each gene - -r MAX_COR maximum corr. coeff. b/w cis-eQTL of same gene - --nnodes N_NODE - --ncores N_CORES - --memory MEMORY - --walltime WALLTIME -``` - -**Comments** - -Possible changes: -```bash - --rmax MAX_COR maximum corr. coeff. b/w cis-eQTL of same gene -``` - - - -### netvis - -`netvis` provide tools to visualize our constructed gene regulatory networks. Users can choose the bootstrap frequency threshold and number of subnetworks to visualize the network. - -`netvis` will automatically read the output from `network` step: -* `adjacency_matrix`: the adjancency matrix for the estimated regulatory effects; -* `coefficient_matrix`: the coefficient matrix for the estimated regulatory effects; - -In addition, we also need user to provide node information file to identify transcription factor for visualization. - - -#### usage -```bash -netvis [OPTION VAL] ... -``` - -**Comments** -Possible changes: -```bash -signet -v [OPTION VAL] ... -``` - - -#### description - -```bash - --freq FREQENCY bootstrap frequecy for the visualization - --ncount NET_COUNT number of sub-networks - --ninfo NODE_INFO_FILE node information file - --h | --help usage -``` - - -## Appendix - -### Data Format -* Gene Expression Data (`GeneExpression/jpt.ge` \& `GeneExpression/jpt.gpos`): - + `jpt.ge` includes the sample IDs in the first column, and the rest is an $n\times p$ matrix with $p$ genes for each of $n$ individuals; - + `jpt.gpos` is a gene annotation file, including four columns with the first one for *Gene Symbol*, the second one for *Chromosome No.*, the third for *Start Position*, and the last for *End Position*. - -* Genotype Data (`Genotype/jpt.map` \& `Genotype/jpt.ped`): - + `jpt.ped` includes pedgree information, i.e., [family_ID individual_ID mother_ID father_ID gender phenotype] in the first six columns, followed by $2p$ columns with two columns for each of $p$ SNPs. - + `jpt.map` includes SNP location information with four columns, i.e., [chromosome SNP_name genetic_distance locus] for each of $p$ SNPs. - - -### config File - -config.ini file is under the main folder and saving the costomized parameters for the four stages of tspls process. Settings in config.ini are orgnized by different sections. - -Users can change the tspls process by modifying the paramter settings in the configuration file. - - -```bash -# basic settings -# basic settings -[BASIC] -nchr = 26 -storage.path = STORAGE_PATH - -[GENO] -ped.file = xxx -map.file = PATH/xxx.map - - -[GENEXP] -ge.file = PATH/xxx.ge -gpos.file = PATH/xxx.gpos - -[PLINK] -mind = 0.1 -ggeno = 0.1 -hwe = 0.0001 -recode = A - -[CISEQTL] -snps.map = ./data/cis-eQTL/snps.map -snps.maf = ./data/cis-eQTL/snps.maf -gexp.file = ./data/cis-eQTL/gexp_prostate -matched.geno = ./data/cis-eQTL/matched.Geno -gene.pos = ./data/cis-eQTL/prostate_gene_pos -alpha.cis = 0.05 -nperms = 100 -upstream = 1000 -downstream = 500 - -[NETWORK] -uncor.ncis = 3 -uncor.r = 0.5 -nboots = 5 -nnodes = 500 -ncores = 16 -memory = 64 -walltime = 4 - -[NETVIS] -freq = 0.8 -ncount = 2 -ninfo = ./data/netvis/mart_export_protein_coding_37.txt -``` - -### File Structure - - -```bash -# script folder save all the code -- script/ - - cis-eQTL/ - - network/ - - netvis/ - - cis_portal.sh # entrance for cis-eqtl analysis - - network_portal.sh # entrance for network analysis - - netvis_portal.sh #entrance for network visualization - - main.sh # main entrance -config.ini -# data folder save all the relavant data and intermediate data -- data/ - - cis-eQTL/ - - network/ - - netvis/ -``` From 60b397efc1d3fb612d3f12702a92be044def0c3a Mon Sep 17 00:00:00 2001 From: jiang548 Date: Tue, 3 May 2022 15:20:44 -0400 Subject: [PATCH 19/19] add some information --- README.md | 8 ++++---- script/network_portal.sh | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9a1a3ad..15623f0 100644 --- a/README.md +++ b/README.md @@ -508,14 +508,14 @@ signet -n [OPTION VAL] ... --nboots NBOOTS number of bootstraps datasets --memory MEMEORY memory in each node in GB --queue QUEUE queue name - --ncores number of cores for each node + --ncores number of cores to use for each node --walltime WALLTIME maximum walltime of the server in seconds --resn result prefix --sif singularity container ``` -* `net.gexp.data`: output from `cis-eqtl`, includes the expression data for genes with cis-eQTL. It's a n * p matrix, with each row encodes the gene expression data for each sample. -* `net.geno.data`: output from `cis-eqtl`, includes the genotype data for marginally significant cis-eQTL. It's a n * p matrix, with each row encodes the genotype data for each sample. -* `sig.pair`: output from `cis-eqtl`, includes the p-value of each pair of gene and its marginally significant (p-Value < 0.05) cis-eQTL, where Column 1 is Gene Index (in `net.Gexp.data`), Column is SNP Index (in `all.Geno.data`), and Column 3 is p-Value. +* `net.gexp.data`: output from `signet -c`, includes the expression data for genes with cis-eQTL. It's a n * p matrix, with each row encodes the gene expression data for each sample. +* `net.geno.data`: output from `signet -c`, includes the genotype data for marginally significant cis-eQTL. It's a n * p matrix, with each row encodes the genotype data for each sample. +* `sig.pair`: output from `signet -c`, includes the p-value of each pair of gene and its marginally significant (p-Value < 0.05) cis-eQTL, where Column 1 is Gene Index (in `net.Gexp.data`), Column is SNP Index (in `all.Geno.data`), and Column 3 is p-Value. ....The third column is the p value for each pair. * `net.genename`: includes information of gene name. It's a p * 1 vector. * `net.genepos`: includes information of gene position. It's a p * 4 matrix, with first column to be gene names, second columns chromosome index, e.g, "chr1", third and fourth columns are the start and end position of genes in the chromosome, respectly. diff --git a/script/network_portal.sh b/script/network_portal.sh index 596a1ae..5c2dd9f 100755 --- a/script/network_portal.sh +++ b/script/network_portal.sh @@ -35,7 +35,7 @@ function usage() { echo ' --nboots number of bootstraps datasets' echo ' --memory memory in each node in GB' echo ' --queue queue name' - echo ' --ncores number of cores for each node' + echo ' --ncores number of cores to use for each node' echo ' --walltime maximum walltime of the server in seconds' echo " --interactive T, F for interactive job scheduling or not" # echo " --filter To focus on a list of genes"