From bf1801d760a0498099224c80782c485760675f4a Mon Sep 17 00:00:00 2001 From: Sungchan Oh Date: Fri, 5 Jul 2024 18:47:13 -0400 Subject: [PATCH] Calculate repeatability --- calculate_repeatability.r | 102 +++++++++++++++++++++++++++ import_and_reshape.r | 144 ++++++++++++++------------------------ 2 files changed, 155 insertions(+), 91 deletions(-) create mode 100644 calculate_repeatability.r diff --git a/calculate_repeatability.r b/calculate_repeatability.r new file mode 100644 index 0000000..41a57cc --- /dev/null +++ b/calculate_repeatability.r @@ -0,0 +1,102 @@ +# calculate_repeatability.r + +library(lme4) +library(heritability) + + + +# Input long format data +path.rgb.long <- ("./df_rgb_long.csv") +path.hsi.long <- ("./df_hsi_long.csv") +path.rpt <- ("./rpt.csv") + +factor.col <- c('EXP.ID', + 'POT_BARCODE', + 'TREATMENT', + 'VARIETY', + 'GROWTH_STAGE', + 'View', + 'frame_nr') + + +# Load RGB and HSI data in long format +message("Loading data...") +df.rgb <- read.csv(path.rgb.long) +df.hsi <- read.csv(path.hsi.long) + +# Combine RGB and HSI data +df <- rbind(df.rgb, df.hsi) + +# Set factor columns +df[, factor.col] <- lapply(df[, factor.col], factor) + +# Create a dataframe to record repeatability +message("Finding unique conditions to evaluate repeatability...") +df.rpt <- unique(df[, c('TREATMENT', + 'GROWTH_STAGE', + 'View', + 'frame_nr', + 'variable')]) +df.rpt <- df.rpt[order(df.rpt$TREATMENT, + df.rpt$GROWTH_STAGE, + df.rpt$View, + df.rpt$frame_nr, + df.rpt$variable), ] +df.rpt$repeatability <- NA +df.rpt$gen.variance <- NA +df.rpt$res.variance <- NA + +# Calculate repeatability +message('Calculating repeatability...') +for(i in 1:nrow(df.rpt)){ + row <- df.rpt[i,] + # Subset data by condition by which repeatability is measured + message(paste(row$TREATMENT, + row$GROWTH_STAGE, + row$View, + row$frame_nr, + row$variable, + " (", i, "/", nrow(df.rpt), ")" sep=" ")) + df.temp <- df[which(df$TREATMENT==row$TREATMENT & + df$GROWTH_STAGE==row$GROWTH_STAGE & + df$View==row$View & + df$frame_nr==row$frame_nr & + df$variable==row$variable), ] + + # Count NA values and continue if excessive + num.na <- sum(is.na(df.temp$value)) + if (num.na/nrow(df.temp) > 0.3) next + + # Select valid data + df.temp <- df.temp[!is.na(df.temp$value) & is.finite(df.temp$value), ] + + # Evaluate repeatability (line repeatability=True) + rpt <- repeatability(df.temp$value, df.temp$VARIETY, line.repeatability=T, + covariates.frame = data.frame()) + + # Save data + df.rpt$repeatability[i] <- rpt$repeatability + df.rpt$gen.variance[i] <- rpt$gen.variance + df.rpt$res.variance[i] <- rpt$res.variance +} + + + + +# Export repeatability table +message('\nExporting repeatability measurements...') +write.csv(df.rpt, path.rpt, row.names=F) + + + + +# factor(EXP.ID, frame_nr, VARIETY, TREATMENT) +# value ~ 1|EXP.ID + VARIETY + TREATMENT + GROWTH_STAGE +# value ~ 1|EXP.ID + VARIETY + TREATMENT Seth +# value ~ 1|EXP.ID + VARIETY Current + + + + + +# EOF diff --git a/import_and_reshape.r b/import_and_reshape.r index a8b5fe7..a86b911 100644 --- a/import_and_reshape.r +++ b/import_and_reshape.r @@ -1,7 +1,12 @@ -# DataExplore.r -# Explore AAPF data products and get some insight -# Currently, developed for RGB and HSI masterfiles (.xlsx) +# import_and_reshape.r +# +# This R script import RGB and HSI masterfiles (tables) +# produced from Ag Alumni Seed Phenotyping Facility (AAPF), Purdue, +# then convert the data into long format compatible for +# data analysis using various R libraries and functions + +# Import libraries library(dplyr) library(readxl) library(data.table) @@ -34,6 +39,7 @@ c("/depot/smarterag/data/HSI/Master files/Hyperspectral_data_AAPF_experiment_374 "/depot/smarterag/data/HSI/Master files/Hyperspectral_data_AAPF_experiment_396.xlsx", "/depot/smarterag/data/HSI/Master files/Hyperspectral_data_AAPF_experiment_401.xlsx") +# Output filenames for converted long tables path.rgb.long <- ("./df_rgb_long.csv") path.hsi.long <- ("./df_hsi_long.csv") @@ -228,30 +234,55 @@ col.factor <- c("EXP ID", "VARIETY", "TREATMENT", -# Create empty rgb dataframe +# Create empty RGB dataframe mat <- matrix(ncol=0, nrow=0) df.rgb <- data.frame(mat) -# Import rgb data +# Import RGB data for (path.rgb in paths.rgb){ for (tab.rgb in excel_sheets(path = path.rgb)){ if (tab.rgb=="PPEW") next if (tab.rgb=="Information") next - # Read a rgb worksheet temp <- read_excel(path.rgb, sheet=tab.rgb) - # Remove hand-made columns cols <- grepl(regex.col.rgb, as.character(colnames(temp))) temp <- temp[cols] - # Remove unused columns temp <- temp[,!names(temp) %in% nouse.col.rgb] - # Simplify values under "View" column # Side{Bottom|Small|Full|Tall} to Side{Average|All} - if (tolower(tab.rgb) =="side average") temp["View"] <- "SideAverage" - if (tolower(tab.rgb) =="side all") temp["View"] <- "SideAll" + if (tolower(tab.rgb) == "side average") temp["View"] <- "SideAverage" + if (tolower(tab.rgb) == "side all") temp["View"] <- "SideAll" + # Re-index frame_nr with the largest plant volume in the side view + # + # Replace values under frame_nr (0, 1, 2, ..., 11) + # starting from the default pot orientation + # to new indexes (also 0, 1, 2, ..., 11) + # starting from the direction with the largest plant volume from side + #print(temp$frame_nr) + list.sideall <- unique(temp[, c('POT_BARCODE', 'SCAN_DATE', 'SCAN_TIME')]) + if (tolower(tab.rgb) == "side all"){ + for(i in 1:nrow(list.sideall)){ + row <- list.sideall[i,] + + # Get plant surface area from side view + where <- vector(length=12) + for (f in seq(1,12)){ + where[f] <- which(temp$POT_BARCODE==row$POT_BARCODE & + temp$SCAN_DATE==row$SCAN_DATE & + temp$SCAN_TIME==row$SCAN_TIME & + temp$frame_nr==(f-1)) + } + temp.surface <- temp[where, "Surface"] + # Find direction with largest plant surface area + max.f <- which.max(temp.surface$Surface) + # Update frame_nr (0: direction with largest plant surface) + for (f in seq(1,12)){ + temp[where[f], "frame_nr"] <- ((f-1)-max.f) %% 12 + } + } + } # Find id columns that does not exist cols.to.add <-id.vars[!id.vars %in% names(temp)] @@ -284,17 +315,13 @@ message('Succeeded importing RGB data...') - - -# Overview 1: Show summary of rgb data +# Overview 1: Show summary of RGB data df.rgb.perscan <- df.rgb[which(df.rgb$View=="Top" & df.rgb$variable=="Surface"), ] message('\nRGB Data Acquisiton Summary (Overview 1)\n') print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP)) - - # Overview 2: Round down scan date and time down to nearest 15 minute and display format <- "%Y-%m-%d %H:%M:%S" df.rgb.perscan["SCAN_DATETIME"] = @@ -310,9 +337,7 @@ message('Check df.unwanted.rgb in DataExplore.r to exclude such records...\n') - - -# Overview 3: Drop unwanted rgb data +# Overview 3: Drop unwanted RGB data df.rgb["SCAN_DATETIME"] = as.POSIXct(paste(as.Date(df.rgb$SCAN_DATE), df.rgb$SCAN_TIME), format=format) df.rgb["SCAN_DATETIME_BY15MIN"] = @@ -329,8 +354,6 @@ print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP)) - - # Overview 4: Convert treatment attributes for(i in 1:nrow(df.convert.treatment)){ row <- df.convert.treatment[i,] @@ -342,8 +365,6 @@ print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP)) - - # Overview 5: DFP is always smaller than 365 (to prevent case with wrong year input) df.rgb$DFP = df.rgb$DFP %% 365 df.rgb.perscan <- df.rgb[which(df.rgb$View=="Top" & df.rgb$variable=="Surface"), ] @@ -352,8 +373,6 @@ print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP)) - - # Overview 6: Add growth stage attribute df.rgb$GROWTH_STAGE <- NA for(i in 1:nrow(df.growth.stage)){ @@ -368,17 +387,13 @@ print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP, GROWTH_STAGE)) - - # Drop unused columns df.rgb <- df.rgb[ , !(names(df.rgb) %in% drops.col.rgb)] - # Change column order df.rgb <- df.rgb[, rgb.col.order] df.rgb[col.factor] <- lapply(df.rgb[col.factor], factor) - -# Export rgb dataframe -message('\nExporting rgb dataframe (long format)...') +# Export RGB dataframe +message('\nExporting RGB dataframe (long format)...') write.csv(df.rgb, path.rgb.long, row.names=F) @@ -390,44 +405,35 @@ write.csv(df.rgb, path.rgb.long, row.names=F) -# Create empty hsi dataframe +# Create empty HSI dataframe mat <- matrix(ncol=0, nrow=0) df.hsi <- data.frame(mat) -# Import hsi data +# Import HSI data for (path.hsi in paths.hsi){ for (tab.hsi in excel_sheets(path = path.hsi)){ if (tab.hsi=="PPEW") next if (tab.hsi=="Information") next - - # Read a hsi worksheet + # Read a HSI worksheet temp <- read_excel(path.hsi, sheet=tab.hsi) - # Remove erroneous ghost column temp <- select(temp, -starts_with("...")) - # Remove hand-made columns cols <- grepl(regex.col.hsi, as.character(colnames(temp))) temp <- temp[cols] - # Remove unused columns temp <- temp[,!names(temp) %in% nouse.col.hsi] - # Add "View" column temp["View"] <- NA if (grepl("side", tolower(tab.hsi), fixed=T)==T) temp["View"] <- "Side" if (grepl("top", tolower(tab.hsi), fixed=T)==T) temp["View"] <- "Top" - # Add "frame_nr" column temp["frame_nr"] = -1 - # Change table into long format temp.long <- reshape2::melt(temp, id.vars = id.vars, variable.name = "variable") - # Combine rows from different worksheets df.hsi <- rbind(df.hsi, temp.long) - ## Debugging #print(paste(path.hsi, " ", tab.hsi)) #print(unique(temp$View)) @@ -446,8 +452,6 @@ print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP)) - - # Overview 2: Round down scan date and time down to nearest 15 minute and display format <- "%Y-%m-%d %H:%M:%S" df.hsi.perscan["SCAN_DATETIME"] = @@ -463,9 +467,7 @@ message('Check df.unwanted.hsi in DataExplore.r to exclude such records...\n') - - -# Overview 3: Drop unwanted hsi data +# Overview 3: Drop unwanted HSI data df.hsi["SCAN_DATETIME"] = as.POSIXct(paste(as.Date(df.hsi$SCAN_DATE), df.hsi$SCAN_TIME), format=format) df.hsi["SCAN_DATETIME_BY15MIN"] = @@ -482,8 +484,6 @@ print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP)) - - # Overview 4: Convert treatment attributes for(i in 1:nrow(df.convert.treatment)){ row <- df.convert.treatment[i,] @@ -495,8 +495,6 @@ print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP)) - - # Overview 5: DFP is always smaller than 365 (to prevent case with wrong year input) df.hsi$DFP = df.hsi$DFP %% 365 df.hsi.perscan <- df.hsi[which(df.hsi$View=="Top" & df.hsi$variable=="NDVI_mean"), ] @@ -505,8 +503,6 @@ print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP)) - - # Overview 6: Add growth stage attribute df.hsi$GROWTH_STAGE <- NA for(i in 1:nrow(df.growth.stage)){ @@ -521,49 +517,15 @@ print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP, GROWTH_STAGE)) - - # Drop unused columns df.hsi <- df.hsi[ , !(names(df.hsi) %in% drops.col.hsi)] - # Change column order df.hsi <- df.hsi[, hsi.col.order] df.hsi[col.factor] <- lapply(df.hsi[col.factor], factor) - -# Export hsi dataframe -message('\nExporting hsi dataframe (long format)...') +# Export HSI dataframe +message('\nExporting HSI dataframe (long format)...') write.csv(df.hsi, path.hsi.long, row.names=F) - - - - - -# Combine rgb and hsi data using df <- rbind(df.rgb, df.hsi) - - - -# frame_nr 0-11 (normal) , 0-23 for one scan -# For "RGB-SideAll", change column names Frame0-11 to major, major+30... - - - - - - - - - - - - - - -# Check input variables with repeatability and ANOVA -# For "RGB-SideAll", visualize importance along the side angle - -# Visualize feature importance - -# RGB-visualization +# EOF