# display_cluster.r -- Display cluster
#
# Provenance (from the git patch this script was distributed in):
#   commit : c26f5a9c77be471fac61a898e356deec88452ba8
#   author : Sungchan Oh
#   date   : Wed, 10 Jul 2024 23:31:34 -0400
#   subject: [PATCH] Display cluster
#
# Purpose:
#   Reads RGB and HSI measurements in long format, reshapes them to one row
#   per (EXP.ID, POT_BARCODE, TREATMENT, VARIETY, DFP, GROWTH_STAGE) with one
#   column per concatenated variable name, and, for every TREATMENT x
#   GROWTH_STAGE subset, writes t-SNE scatter plots (PNG files in the working
#   directory) for a grid of perplexity / max_iter settings. Points are drawn
#   as VARIETY labels coloured by VARIETY.

library(dplyr)
library(reshape2)
library(Rtsne)


# Path to input and output data
path.rgb.long <- "./df_rgb_long.csv"
path.hsi.long <- "./df_hsi_long.csv"
path.rpt <- "./rpt_no_public.csv"

# List of varieties to be excluded, if any; otherwise set exclude.variety
# to "c()"
exclude.variety <- c("P1", "P2", "P3", "P4")

## Number of phenotypes demonstrating highest to n-th highest repeatability
## across all TREATMENT and GROWTH_STAGE
#n <- 30

case <- 1

# Columns that are merged into variable.concat and then dropped
drops.1 <- c("View", "frame_nr", "variable")

# Identifier columns removed before clustering (VARIETY is kept as the label)
if (case == 1) {
  drops.2 <- c("EXP.ID", "POT_BARCODE", "TREATMENT", "DFP", "GROWTH_STAGE")
}

# TODO only for byr
# Fixed VARIETY level order: "A".."Z" followed by "AA", "BB", ..., "NN".
variety.levels <- c(LETTERS, strrep(LETTERS[1:14], 2))

# t-SNE hyper-parameter grid
perplexities <- c(5, 10, 20, 40) ## pp in [5,50]
max.iters <- c(10, 15, 20, 30, 40, 50, 100, 200, 500, 1000, 2000, 5000)


# Draw one t-SNE embedding to a PNG file.
#
# embedding    : two-column matrix of coordinates (the `Y` element of the
#                Rtsne() result)
# labels       : one label per row of `embedding`, drawn in place of a point
# label.colors : one colour per row of `embedding`
# file         : path of the PNG file to write
# title        : main title of the plot
#
# Returns `file` invisibly. The graphics device is closed even if plotting
# fails, so an error cannot leave a dangling open device behind.
save.tsne.plot <- function(embedding, labels, label.colors, file, title) {
  png(file)
  on.exit(dev.off(), add = TRUE)
  par(mgp = c(2.5, 1, 0))
  plot(embedding, type = "n",
       main = title,
       xlab = "tSNE dimension 1",
       ylab = "tSNE dimension 2",
       cex.main = 2, cex.lab = 1.5)
  text(embedding, labels = labels, col = label.colors)
  invisible(file)
}


# Load RGB and HSI data in long format
message("Loading data...")
df.rgb <- read.csv(path.rgb.long)
df.hsi <- read.csv(path.hsi.long)
# NOTE(review): df.rpt is loaded but not used anywhere in this script; it
# looks related to the commented-out repeatability setting `n` above.
df.rpt <- read.csv(path.rpt)

# Exclude varieties, as needed
if (length(exclude.variety) > 0) {
  df.rgb <- df.rgb %>% filter(!VARIETY %in% exclude.variety)
  df.hsi <- df.hsi %>% filter(!VARIETY %in% exclude.variety)
}

# Combine RGB and HSI data
df <- rbind(df.rgb, df.hsi)

# Rename variables
df$variable.concat <- paste(df$View, df$frame_nr, df$variable, sep = "_")

# Remove columns used to define variables
df <- df[, !(names(df) %in% drops.1)]

# Reshape data (long -> wide, one column per variable.concat)
form <- paste0("EXP.ID+POT_BARCODE+TREATMENT+VARIETY",
               "+DFP+GROWTH_STAGE ~ variable.concat")
df <- reshape2::dcast(df, as.formula(form), value.var = "value")


for (treatment in unique(df$TREATMENT)) {
  for (growth.stage in unique(df$GROWTH_STAGE)) {
    print(paste0("Generating plots for ",
                 treatment, ", ", growth.stage, " case..."))

    # Subset data by treatment and growth stage
    df.temp <- df[which(df$TREATMENT == treatment &
                          df$GROWTH_STAGE == growth.stage), ]

    # Remove unnecessary columns for clustering
    df.temp <- df.temp[, !(names(df.temp) %in% drops.2), drop = FALSE]

    # Select complete columns (no NA/NaN, infinite) for analysis
    df.temp[] <- lapply(df.temp,
                        function(x) replace(x, is.infinite(x), NA))
    # drop = FALSE keeps a data frame even if a single column survives
    df.temp <- df.temp[, colSums(is.na(df.temp)) == 0, drop = FALSE]
    if (nrow(df.temp) < 5) next

    if (case == 1) {

      # Set label as factor (VARIETY) with the fixed level order
      df.temp$VARIETY <- factor(df.temp$VARIETY, levels = variety.levels)
      if (anyNA(df.temp$VARIETY)) {
        warning("Some VARIETY values are not in the fixed level list and ",
                "became NA for ", treatment, ", ", growth.stage,
                call. = FALSE)
      }

      # Get training data (80% of the rows, reproducible via the seed)
      num.train <- floor(0.8 * nrow(df.temp))
      set.seed(1)
      rows <- sample(seq_len(nrow(df.temp)), num.train)
      train <- df.temp[rows, ]

      # Features are every column except the label; selecting by name avoids
      # relying on VARIETY being the first column.
      features <- train[, names(train) != "VARIETY", drop = FALSE]

      # One colour per variety present in this subset, keyed by variety name
      varieties <- unique(df.temp$VARIETY)
      colors <- rainbow(length(varieties))
      names(colors) <- varieties
      # Index by name: subsetting with a factor would use its integer codes,
      # which do not line up with `colors` unless the varieties present are
      # exactly the first levels.
      label.colors <- colors[as.character(train$VARIETY)]

      # T-SNE
      for (pp in perplexities) {
        # Rtsne() stops with an error when nrow(X) - 1 < 3 * perplexity, so
        # skip settings that the subset is too small for.
        if (nrow(features) - 1 < 3 * pp) {
          message("Skipping perplexity ", pp, " for ", treatment, ", ",
                  growth.stage, ": only ", nrow(features), " training rows")
          next
        }
        for (iter in max.iters) {
          tsne <- Rtsne(features,
                        dims = 2,
                        perplexity = pp,
                        verbose = FALSE,
                        max_iter = iter)

          # Visualize clusters
          save.tsne.plot(
            tsne$Y,
            labels = train$VARIETY,
            label.colors = label.colors,
            file = paste0("./tsne_", treatment, "_", growth.stage, "_",
                          pp, "_", iter, ".png"),
            title = paste("tSNE", treatment, growth.stage, pp, iter)
          )
        }
      }
    }
  }
}


# EOF