Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
############################################################################
# reshape_table_round2.r
############################################################################
#
# This R script processes RGB and HSI data (xlsx format) from
# the Ag Alumni Seed Phenotyping Facility (AAPF), Purdue.
# It transforms the data into a long format suitable for
# analysis using R packages.
# User input needed here
# Set file paths to the individual masterfiles-per-repetition
paths.rgb <-
c("../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_374.xlsx",
"../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_375.xlsx",
"../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_378.xlsx",
"../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_380.xlsx",
"../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_390.xlsx",
"../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_391.xlsx",
"../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_396.xlsx",
"../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_401.xlsx")
paths.hsi <-
c("../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_374.xlsx",
"../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_375.xlsx",
"../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_378.xlsx",
"../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_380.xlsx",
"../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_390.xlsx",
"../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_391.xlsx",
"../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_396.xlsx",
"../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_401.xlsx")
# Set file paths to the output tables in long format (R-compatible)
path.rgb.long <- ("./df_rgb_long.csv")
path.hsi.long <- ("./df_hsi_long.csv")
# Specify unwanted measurements
# Note: Unwanted data includes incomplete scan results, test scans,
# and data generated during AAPF tour demos, etc.
# Steps
# 1. Run this R script.
# 2. Identify potential unwanted measurements:
# Examine the script's output (Data Overview 1 & 2) for data with a
# significantly lower plant count compared to the available plants.
# 3. Review and select:
# Based on Data Overviews 1 & 2, determine which measurements
# to exclude from your analysis.
# 4. Locate variables:
# Find the df.unwanted.{rgb|hsi} variables within the script.
# 5. Add timestamps:
# Append the date and time (e.g., "2023-10-30 11:15:00") to the
# placeholder df.unwanted.{rgb|hsi}.
df.unwanted.rgb <- data.frame(
EXP_ID =c( 374, 374, 374, 374, 396),
TREATMENT=c("20 -1", "50-3", "50-3", "50-4", "20-1"),
SCAN_DATETIME_BY15MIN=c("2023-10-30 11:15:00",
"2023-09-28 09:15:00",
"2023-09-28 09:00:00",
"2023-09-28 09:45:00",
"2024-04-29 12:00:00"),
stringsAsFactors=FALSE)
df.unwanted.hsi <- data.frame(
EXP_ID=c(390),
TREATMENT=c("20-1"),
SCAN_DATETIME_BY30MIN=c("2024-03-08 10:30:00"),
stringsAsFactors=FALSE)
# Update treatment description
# If incorrect data is entered during the PPEW process,
# use df.convert.treatment to make corrections.
df.convert.treatment <- data.frame(
from=c("20 -1", "20-2", "50-3", "50-4", "20-1", "20-2"),
to =c("20", "20", "80", "80", "20", "20"),
stringAsFactors=FALSE)
# Assign growth stage according to
# (1) experiment ID, (2) treatment, and (3) days since planting (DFP).
# Use Data Overview 5 to identify the correct growth stage for
# each data point based on the given combination.
df.growth.stage <- data.frame(
EXP_ID =c(374, 374, 374, 374, 375, 375, 375, 375,
378, 378, 378, 378, 380, 380, 380, 380,
390, 390, 390, 390, 391, 391, 391, 391,
396, 396, 396, 396, 401, 401, 401, 401),
TREATMENT =c(20, 20, 80, 80, 20, 20, 80, 80,
20, 20, 80, 80, 20, 20, 80, 80,
20, 20, 80, 80, 20, 20, 80, 80,
20, 20, 80, 80, 20, 20, 80, 80),
DFP =c(33, 59, 32, 50, 35, 56, 28, 48,
35, 60, 28, 56, 34, 63, 28, 53,
35, 63, 28, 48, 35, 63, 28, 56,
42, 70, 42, 63, 42, 73, 43, 65),
GROWTH_STAGE=c('V8', 'VT', 'V8', 'VT', 'V8', 'VT', 'V8', 'VT',
'V8', 'VT', 'V8', 'VT', 'V8', 'VT', 'V8', 'VT',
'V8', 'VT', 'V8', 'VT', 'V8', 'VT', 'V8', 'VT',
'V8', 'VT', 'V8', 'VT', 'V8', 'VT', 'V8', 'VT'))
# No changes required in this section
# Columns in master files
regex.col.rgb <- paste("Filename", "EXP ID", "POT_BARCODE", "VARIETY",
"TREATMENT", "SCAN_TIME", "SCAN_DATE", "DFP", "View",
"frame_nr", "Width", "Height", "Surface", "Angle",
"Convex_hull", "Roundness", "Center_of_mass_distance",
"Center_of_mass_x", "Center_of_mass_y",
"Hue", "Saturation", "Intensity", "Fluorescence",
"[HSVF][[:digit:]]{1,3}", sep="|")
regex.col.hsi <- paste("EXP ID", "POT_BARCODE", "VARIETY", "TREATMENT",
"SCAN_TIME", "SCAN_DATE", "DFP",
"[[:alnum:]]_+(mean|max|min|std|p[[:digit:]]{1,2})",
"[[:digit:]]{3,4}(\\.[[:digit:]]{1,15})?",
sep="|")
# Columns to be removed after importing
nouse.col.rgb <- c("Filename")
nouse.col.hsi <- c("Filename-VNIR-SIDE", "Filename-VNIR-TOP",
"Filename-SWIR-SIDE", "Filename-SWIR-TOP")
# Columns used as interim identifier
id.vars <- c("EXP ID", "POT_BARCODE", "SCAN_TIME", "SCAN_DATE",
"VARIETY", "TREATMENT", "DFP", "View", "frame_nr")
# Columns to be removed while processing
drops.col.rgb <- c("SCAN_TIME", "SCAN_DATE",
"SCAN_DATETIME", "SCAN_DATETIME_BY15MIN")
drops.col.hsi <- c("SCAN_TIME", "SCAN_DATE",
"SCAN_DATETIME", "SCAN_DATETIME_BY30MIN")
# Column order of output table
rgb.col.order <- c("EXP ID", "POT_BARCODE", "TREATMENT", "VARIETY",
"DFP", "GROWTH_STAGE", "View", "frame_nr",
"variable", "value")
hsi.col.order <- c("EXP ID", "POT_BARCODE", "TREATMENT", "VARIETY",
"DFP", "GROWTH_STAGE", "View", "frame_nr",
"variable", "value")
# Column used as factor while processing
col.factor <- c("EXP ID", "VARIETY", "TREATMENT",
"View", "frame_nr", "GROWTH_STAGE")
# Main code starts here
# Import libraries
library(dplyr)
library(readxl)
library(data.table)
library(lubridate)
# Create empty RGB dataframe
mat <- matrix(ncol=0, nrow=0)
df.rgb <- data.frame(mat)
# Import RGB data
for (path.rgb in paths.rgb){
for (tab.rgb in excel_sheets(path = path.rgb)){
if (tab.rgb=="PPEW") next
if (tab.rgb=="Information") next
# Read a rgb worksheet
temp <- read_excel(path.rgb, sheet=tab.rgb)
# Remove hand-made columns
cols <- grepl(regex.col.rgb, as.character(colnames(temp)))
temp <- temp[cols]
# Remove unused columns
temp <- temp[,!names(temp) %in% nouse.col.rgb]
# Simplify "View" (Side{Bottom|Small|Full|Tall} to Side{Average|All})
if (tolower(tab.rgb) == "side average") temp["View"] <- "SideAverage"
if (tolower(tab.rgb) == "side all") temp["View"] <- "SideAll"
# Reorder frames (frame_nr)
# Steps
# 1. Identify the side view with the largest plant volume.
# 2. Reindex frame_nr values (0 to 11, from the default pot orientation)
# to match the new order starting from the side with the largest
# plant area
list.sideall <- unique(temp[, c('POT_BARCODE', 'SCAN_DATE', 'SCAN_TIME')])
if (tolower(tab.rgb) == "side all"){
for(i in 1:nrow(list.sideall)){
row <- list.sideall[i,]
# Get plant surface area
where <- vector(length=12)
for (f in seq(1,12)){
where[f] <- which(temp$POT_BARCODE==row$POT_BARCODE &
temp$SCAN_DATE==row$SCAN_DATE &
temp$SCAN_TIME==row$SCAN_TIME &
temp$frame_nr==(f-1))
}
temp.surface <- temp[where, "Surface"]
# Find direction with largest plant area
max.f <- which.max(temp.surface$Surface)
# Update frame_nr (0 as the view with the largest plant area)
for (f in seq(1,12)){
temp[where[f], "frame_nr"] <- ((f-1)-max.f) %% 12
}
}
}
# Find id columns that does not exist
cols.to.add <-id.vars[!id.vars %in% names(temp)]
# Assign values in the id columns that didn't exist
for (col.to.add in cols.to.add){
if (col.to.add == "View" && tolower(tab.rgb) =="top"){
temp["View"] = "Top"
}
if (col.to.add == "frame_nr"){
temp["frame_nr"] = -1
}
}
# Change table to long format
temp.long <- reshape2::melt(temp, id.vars = id.vars,
variable.name = "variable")
# Combine rows from different worksheets
df.rgb <- rbind(df.rgb, temp.long)
# Debugging
#print(paste(path.rgb, " ", tab.rgb))
#print(unique(temp$View))
#print(unique(temp$frame_nr))
#print(unique(temp.long$variable))
}
}
message('Succeeded importing RGB data...')
# Overview 1: Show summary of RGB data by date
df.rgb.perscan <- df.rgb[which(df.rgb$View=="Top" & df.rgb$variable=="Surface"), ]
message('\n* Data Overview 1 (RGB): Summary by Date\n')
print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP))
# Overview 2: Show summary of RGB data by date and time
format <- "%Y-%m-%d %H:%M:%S"
df.rgb.perscan["SCAN_DATETIME"] =
as.POSIXct(paste(as.Date(df.rgb.perscan$SCAN_DATE), df.rgb.perscan$SCAN_TIME),
format=format)
df.rgb.perscan["SCAN_DATETIME_BY15MIN"] =
lubridate::round_date(df.rgb.perscan$SCAN_DATETIME, "15 minutes")
message('\n* Data Overview 2 (RGB): Summary by Date and Time\n')
print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP, SCAN_DATETIME_BY15MIN))
message('\nThoroughly review the data to identify incomplete or test scans.')
message('Record the details of any unwanted data in ')
message('"EXP ID", "TREATMENT", and "SCAN_DATETIME_BY15MIN" formats')
message('and input these values into df.unwanted.rgb')
message('to exclude them from the analysis...\n')
if (!askYesNo('Proceed?')) stop()
# Overview 3: Show summary of RGB data after removing unwanted data
df.rgb["SCAN_DATETIME"] =
as.POSIXct(paste(as.Date(df.rgb$SCAN_DATE), df.rgb$SCAN_TIME), format=format)
df.rgb["SCAN_DATETIME_BY15MIN"] =
lubridate::round_date(df.rgb$SCAN_DATETIME, "15 minutes")
for(i in 1:nrow(df.unwanted.rgb)){
row <- df.unwanted.rgb[i,]
df.rgb <- df.rgb %>% filter(!(`EXP ID` == row$EXP_ID &
TREATMENT == row$TREATMENT &
SCAN_DATETIME_BY15MIN == row$SCAN_DATETIME_BY15MIN))
}
df.rgb.perscan <- df.rgb[which(df.rgb$View=="Top" & df.rgb$variable=="Surface"), ]
message('\n* Data Overview 3 (RGB): Summary After Filtering Unwanted Data\n')
print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP))
message('\nReview the data to identify incorrect TREATMENT values')
message('Input the incorrect and corrected values into df.convert.treatment...\n')
if (!askYesNo('Proceed?')) stop()
# Overview 4: Update treatment in the RGB data
for(i in 1:nrow(df.convert.treatment)){
row <- df.convert.treatment[i,]
df.rgb$TREATMENT[df.rgb$TREATMENT==row$from] <- row$to
}
df.rgb.perscan <- df.rgb[which(df.rgb$View=="Top" & df.rgb$variable=="Surface"), ]
message('\n* Data Overview 4 (RGB): Treatment Updated\n')
print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP))
# Overview 5: Update DFP in the RGB data (should be less than 365)
df.rgb$DFP = df.rgb$DFP %% 365
df.rgb.perscan <- df.rgb[which(df.rgb$View=="Top" & df.rgb$variable=="Surface"), ]
message('\n* Data Overview 5 (RGB): DFP Updated\n')
print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP))
message('\nReview the data to identify growth stage according to DFP.')
message('Record the details of growth stage information in ')
message('"EXP ID", "TREATMENT", "DFP", and "GROWTH_STAGE" formats')
message('and input these values into df.growth.stage...\n')
if (!askYesNo('Proceed?')) stop()
# Overview 6: Add growth stage in the RGB data
df.rgb$GROWTH_STAGE <- NA
for(i in 1:nrow(df.growth.stage)){
row <- df.growth.stage[i,]
df.rgb$GROWTH_STAGE[(df.rgb$`EXP ID` == row$EXP_ID) &
(df.rgb$TREATMENT == row$TREATMENT) &
(df.rgb$DFP == row$DFP)] <- row$GROWTH_STAGE
}
df.rgb.perscan <- df.rgb[which(df.rgb$View=="Top" & df.rgb$variable=="Surface"), ]
message('\n* Data Overview 6 (RGB): Growth Stage Added\n')
print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP, GROWTH_STAGE))
# Drop unused columns
df.rgb <- df.rgb[ , !(names(df.rgb) %in% drops.col.rgb)]
# Change column order
df.rgb <- df.rgb[, rgb.col.order]
df.rgb[col.factor] <- lapply(df.rgb[col.factor], factor)
# Export RGB dataframe
message('\nExporting RGB dataframe (long format)...')
write.csv(df.rgb, path.rgb.long, row.names=F)
# Create empty HSI dataframe
mat <- matrix(ncol=0, nrow=0)
df.hsi <- data.frame(mat)
# Import HSI data
for (path.hsi in paths.hsi){
for (tab.hsi in excel_sheets(path = path.hsi)){
if (tab.hsi=="PPEW") next
if (tab.hsi=="Information") next
# Read a HSI worksheet
temp <- read_excel(path.hsi, sheet=tab.hsi)
# Remove erroneous ghost column
temp <- select(temp, -starts_with("..."))
# Remove hand-made columns
cols <- grepl(regex.col.hsi, as.character(colnames(temp)))
temp <- temp[cols]
# Remove unused columns
temp <- temp[,!names(temp) %in% nouse.col.hsi]
# Add "View" column
temp["View"] <- NA
if (grepl("side", tolower(tab.hsi), fixed=T)==T) temp["View"] <- "Side"
if (grepl("top", tolower(tab.hsi), fixed=T)==T) temp["View"] <- "Top"
# Add "frame_nr" column
temp["frame_nr"] = -1
# Change table into long format
temp.long <- reshape2::melt(temp, id.vars = id.vars,
variable.name = "variable")
# Combine rows from different worksheets
df.hsi <- rbind(df.hsi, temp.long)
## Debugging
#print(paste(path.hsi, " ", tab.hsi))
#print(unique(temp$View))
#print(unique(temp$frame_nr))
#print(unique(temp.long$variable))
}
}
message('Succeeded importing HSI data...')
# Overview 1: Show summary of HSI data by date
df.hsi.perscan <- df.hsi[which(df.hsi$View=="Top" & df.hsi$variable=="NDVI_mean"), ]
message('\n* Data Overview 1 (HSI): Summary by Date\n')
print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP))
# Overview 2: Show summary of HSI data by date and time
format <- "%Y-%m-%d %H:%M:%S"
df.hsi.perscan["SCAN_DATETIME"] =
as.POSIXct(paste(as.Date(df.hsi.perscan$SCAN_DATE), df.hsi.perscan$SCAN_TIME),
format=format)
df.hsi.perscan["SCAN_DATETIME_BY30MIN"] =
lubridate::round_date(df.hsi.perscan$SCAN_DATETIME, "30 minutes")
message('\n* Data Overview 2 (HSI): Summary by Date and Time\n')
print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP, SCAN_DATETIME_BY30MIN))
message('\nThoroughly review the data to identify incomplete or test scans.')
message('Record the details of any unwanted data in ')
message('"EXP ID", "TREATMENT", and "SCAN_DATETIME_BY15MIN" formats')
message('and input these values into df.unwanted.hsi')
message('to exclude them from the analysis...\n')
if (!askYesNo('Proceed?')) stop()
# Overview 3: Show summary of HSI data after removing unwanted data
df.hsi["SCAN_DATETIME"] =
as.POSIXct(paste(as.Date(df.hsi$SCAN_DATE), df.hsi$SCAN_TIME), format=format)
df.hsi["SCAN_DATETIME_BY15MIN"] =
lubridate::round_date(df.hsi$SCAN_DATETIME, "30 minutes")
for(i in 1:nrow(df.unwanted.hsi)){
row <- df.unwanted.hsi[i,]
df.hsi <- df.hsi %>% filter(!(`EXP ID` == row$EXP_ID &
TREATMENT == row$TREATMENT &
SCAN_DATETIME_BY15MIN == row$SCAN_DATETIME_BY30MIN))
}
df.hsi.perscan <- df.hsi[which(df.hsi$View=="Top" & df.hsi$variable=="NDVI_mean"), ]
message('\n* Data Overview 3 (HSI): Summary After Filtering Unwanted Data\n')
print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP))
message('\nReview the data to identify incorrect TREATMENT values')
message('Input the incorrect and corrected values into df.convert.treatment...\n')
if (!askYesNo('Proceed?')) stop()
# Overview 4: Update treatment in the HSI data
for(i in 1:nrow(df.convert.treatment)){
row <- df.convert.treatment[i,]
df.hsi$TREATMENT[df.hsi$TREATMENT==row$from] <- row$to
}
df.hsi.perscan <- df.hsi[which(df.hsi$View=="Top" & df.hsi$variable=="NDVI_mean"), ]
message('\n* Data Overview 4 (HSI): Treatment Updated\n')
print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP))
# Overview 5: Update DFP in the HSI data (should be less than 365)
df.hsi$DFP = df.hsi$DFP %% 365
df.hsi.perscan <- df.hsi[which(df.hsi$View=="Top" & df.hsi$variable=="NDVI_mean"), ]
message('\n* Data Overview 5 (HSI): DFP Updated\n')
print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP))
message('\nReview the data to identify growth stage according to DFP.')
message('Record the details of growth stage information in ')
message('"EXP ID", "TREATMENT", "DFP", and "GROWTH_STAGE" formats')
message('and input these values into df.growth.stage...\n')
if (!askYesNo('Proceed?')) stop()
# Overview 6: Add growth stage in the HSI data
df.hsi$GROWTH_STAGE <- NA
for(i in 1:nrow(df.growth.stage)){
row <- df.growth.stage[i,]
df.hsi$GROWTH_STAGE[(df.hsi$`EXP ID` == row$EXP_ID) &
(df.hsi$TREATMENT == row$TREATMENT) &
(df.hsi$DFP == row$DFP)] <- row$GROWTH_STAGE
}
df.hsi.perscan <- df.hsi[which(df.hsi$View=="Top" & df.hsi$variable=="NDVI_mean"), ]
message('\n* Data Overview 6 (HSI): Growth Stage Added\n')
print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP, GROWTH_STAGE))
# Drop unused columns
df.hsi <- df.hsi[ , !(names(df.hsi) %in% drops.col.hsi)]
# Change column order
df.hsi <- df.hsi[, hsi.col.order]
df.hsi[col.factor] <- lapply(df.hsi[col.factor], factor)
# Export HSI dataframe
message('\nExporting HSI dataframe (long format)...')
write.csv(df.hsi, path.hsi.long, row.names=F)
# EOF