From c26f5a9c77be471fac61a898e356deec88452ba8 Mon Sep 17 00:00:00 2001
From: Sungchan Oh <oh231@login07.negishi.rcac.purdue.edu>
Date: Wed, 10 Jul 2024 23:31:34 -0400
Subject: [PATCH] Display cluster

---
 display_cluster.r | 133 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 display_cluster.r

diff --git a/display_cluster.r b/display_cluster.r
new file mode 100644
index 0000000..925ce5d
--- /dev/null
+++ b/display_cluster.r
@@ -0,0 +1,133 @@
+library(dplyr)
+library(reshape2)
+library(Rtsne)
+
+
+
+# Paths to the input CSV files
+path.rgb.long <- "./df_rgb_long.csv"
+path.hsi.long <- "./df_hsi_long.csv"
+path.rpt <- "./rpt_no_public.csv"
+
+# Varieties to leave out of the analysis; set to c() to keep every variety
+exclude.variety <- c("P1", "P2", "P3", "P4")
+
+## Number of phenotypes demonstrating highest to n-th highest repeatability
+## across all TREATMENT and GROWTH_STAGE
+#n <- 30
+
+# Analysis mode; case 1 is the only mode this script handles
+case <- 1
+
+# Columns merged into variable.concat and removed afterwards
+drops.1 <- c("View", "frame_nr", "variable")
+
+# Identifier columns removed before clustering (only defined for case 1)
+if (case == 1) {
+    drops.2 <- c("EXP.ID", "POT_BARCODE", "TREATMENT", "DFP", "GROWTH_STAGE")
+}
+
+
+
+# Read the long-format RGB and HSI tables; df.rpt is loaded but not used below
+message("Loading data...")
+df.rgb <- read.csv(path.rgb.long)
+df.hsi <- read.csv(path.hsi.long)
+df.rpt <- read.csv(path.rpt)
+
+# Drop the excluded varieties from both tables, if any were listed
+if (length(exclude.variety) > 0) {
+    df.rgb <- filter(df.rgb, !(VARIETY %in% exclude.variety))
+    df.hsi <- filter(df.hsi, !(VARIETY %in% exclude.variety))
+}
+
+# Stack the two tables and build one variable name per
+# View / frame_nr / variable combination
+df <- rbind(df.rgb, df.hsi)
+df$variable.concat <- paste(df$View, df$frame_nr, df$variable, sep = "_")
+
+# The source columns of variable.concat are no longer needed
+df <- df[ , setdiff(names(df), drops.1)]
+
+# Long -> wide: one row per combination of the id columns
+id.cols <- c("EXP.ID", "POT_BARCODE", "TREATMENT", "VARIETY",
+             "DFP", "GROWTH_STAGE")
+form <- paste(paste(id.cols, collapse = "+"), "~ variable.concat")
+df <- reshape2::dcast(df, as.formula(form), value.var = "value")
+
+
+
+
+
+
+
+
+# t-SNE plots per TREATMENT x GROWTH_STAGE subset, one PNG per setting
+for (treatment in unique(df$TREATMENT)) {
+    for (growth.stage in unique(df$GROWTH_STAGE)) {
+        print(paste0("Generating plots for ",
+                     treatment, ", ", growth.stage, " case..."))
+
+        # Subset by treatment and growth stage, then drop the id columns
+        df.temp <- df[which(df$TREATMENT == treatment &
+                            df$GROWTH_STAGE == growth.stage), ]
+        df.temp <- df.temp[ , !(names(df.temp) %in% drops.2), drop = FALSE]
+
+        # Keep complete columns only (Inf counts as missing); drop = FALSE
+        # keeps a data frame even when a single column survives
+        df.temp <- do.call(data.frame,
+                           lapply(df.temp,
+                                  function(x) replace(x, is.infinite(x), NA)))
+        df.temp <- df.temp[ , colSums(is.na(df.temp)) == 0, drop = FALSE]
+        if (nrow(df.temp) < 5) next
+
+        if (case == 1) {
+            # TODO only for byr: fixed label set A..Z, AA..NN; a VARIETY
+            # outside this set becomes NA
+            df.temp$VARIETY <- factor(
+                df.temp$VARIETY,
+                levels = c(LETTERS, paste0(LETTERS[1:14], LETTERS[1:14])))
+
+            # Get training data: a seeded 80% sample of the rows
+            num.train <- floor(0.8 * nrow(df.temp))
+            set.seed(1)
+            rows <- sample(seq_len(nrow(df.temp)), num.train)
+            train <- df.temp[rows, , drop = FALSE]
+            features <- train[ , names(train) != "VARIETY", drop = FALSE]
+
+            # One colour per variety, looked up by name: indexing with the
+            # factor itself would use its integer level codes instead
+            colors <- rainbow(length(unique(df.temp$VARIETY)))
+            names(colors) <- as.character(unique(df.temp$VARIETY))
+            point.col <- colors[as.character(train$VARIETY)]
+
+            # T-SNE
+            for (pp in c(5, 10, 20, 40)) { ## pp in [5,50]
+                # Rtsne stops with an error when 3 * perplexity > nrow - 1
+                if (nrow(features) - 1 < 3 * pp) next
+                for (iter in c(10, 15, 20, 30, 40, 50,
+                               100, 200, 500, 1000, 2000, 5000)) {
+                    tsne <- Rtsne(features, dims = 2, perplexity = pp,
+                                  verbose = FALSE, max_iter = iter)
+                    # Visualize clusters
+                    png(paste0("./tsne_", treatment, "_", growth.stage, "_",
+                               pp, "_", iter, ".png"))
+                    par(mgp = c(2.5, 1, 0))
+                    plot(tsne$Y, type = "n",
+                         main = paste("tSNE", treatment, growth.stage, pp, iter),
+                         xlab = "tSNE dimension 1", ylab = "tSNE dimension 2",
+                         cex.main = 2, cex.lab = 1.5)
+                    text(tsne$Y, labels = train$VARIETY, col = point.col)
+                    dev.off()
+                }
+            }
+        }
+    }
+}
+
+
+
+
+
+
+# EOF