Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
DataExploration-AAPF/reshape_rgb_hsi.r
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
433 lines (383 sloc)
18.9 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################################ | |
# reshape_table_round2.r | |
############################################################################ | |
# | |
# This R script processes RGB and HSI data (xlsx format) from | |
# the Ag Alumni Seed Phenotyping Facility (AAPF), Purdue. | |
# It transforms the data into a long format suitable for | |
# analysis using R packages. | |
# User input needed here | |
# Set file paths to the individual masterfiles-per-repetition | |
paths.rgb <- | |
c("../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_374.xlsx", | |
"../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_375.xlsx", | |
"../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_378.xlsx", | |
"../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_380.xlsx", | |
"../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_390.xlsx", | |
"../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_391.xlsx", | |
"../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_396.xlsx", | |
"../1_masterfiles_per_repetition/round2_2023_2024/RGB_data_AAPF_experiment_401.xlsx") | |
paths.hsi <- | |
c("../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_374.xlsx", | |
"../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_375.xlsx", | |
"../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_378.xlsx", | |
"../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_380.xlsx", | |
"../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_390.xlsx", | |
"../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_391.xlsx", | |
"../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_396.xlsx", | |
"../1_masterfiles_per_repetition/round2_2023_2024/Hyperspectral_data_AAPF_experiment_401.xlsx") | |
# Set file paths to the output tables in long format (R-compatible) | |
path.rgb.long <- ("./df_rgb_long.csv") | |
path.hsi.long <- ("./df_hsi_long.csv") | |
# Specify unwanted measurements | |
# Note: Unwanted data includes incomplete scan results, test scans, | |
# and data generated during AAPF tour demos, etc. | |
# Steps | |
# 1. Run this R script. | |
# 2. Identify potential unwanted measurements: | |
# Examine the script's output (Data Overview 1 & 2) for data with a | |
# significantly lower plant count compared to the available plants. | |
# 3. Review and select: | |
# Based on Data Overviews 1 & 2, determine which measurements | |
# to exclude from your analysis. | |
# 4. Locate variables: | |
# Find the df.unwanted.{rgb|hsi} variables within the script. | |
# 5. Add timestamps: | |
# Append the date and time (e.g., "2023-10-30 11:15:00") to the | |
# placeholder df.unwanted.{rgb|hsi}. | |
df.unwanted.rgb <- data.frame( | |
EXP_ID =c( 374, 374, 374, 374, 396), | |
TREATMENT=c("20 -1", "50-3", "50-3", "50-4", "20-1"), | |
SCAN_DATETIME_BY15MIN=c("2023-10-30 11:15:00", | |
"2023-09-28 09:15:00", | |
"2023-09-28 09:00:00", | |
"2023-09-28 09:45:00", | |
"2024-04-29 12:00:00"), | |
stringsAsFactors=FALSE) | |
df.unwanted.hsi <- data.frame( | |
EXP_ID=c(390), | |
TREATMENT=c("20-1"), | |
SCAN_DATETIME_BY30MIN=c("2024-03-08 10:30:00"), | |
stringsAsFactors=FALSE) | |
# Update treatment description | |
# If incorrect data is entered during the PPEW process, | |
# use df.convert.treatment to make corrections. | |
df.convert.treatment <- data.frame( | |
from=c("20 -1", "20-2", "50-3", "50-4", "20-1", "20-2"), | |
to =c("20", "20", "80", "80", "20", "20"), | |
stringAsFactors=FALSE) | |
# Assign growth stage according to | |
# (1) experiment ID, (2) treatment, and (3) days since planting (DFP). | |
# Use Data Overview 5 to identify the correct growth stage for | |
# each data point based on the given combination. | |
df.growth.stage <- data.frame( | |
EXP_ID =c(374, 374, 374, 374, 375, 375, 375, 375, | |
378, 378, 378, 378, 380, 380, 380, 380, | |
390, 390, 390, 390, 391, 391, 391, 391, | |
396, 396, 396, 396, 401, 401, 401, 401), | |
TREATMENT =c(20, 20, 80, 80, 20, 20, 80, 80, | |
20, 20, 80, 80, 20, 20, 80, 80, | |
20, 20, 80, 80, 20, 20, 80, 80, | |
20, 20, 80, 80, 20, 20, 80, 80), | |
DFP =c(33, 59, 32, 50, 35, 56, 28, 48, | |
35, 60, 28, 56, 34, 63, 28, 53, | |
35, 63, 28, 48, 35, 63, 28, 56, | |
42, 70, 42, 63, 42, 73, 43, 65), | |
GROWTH_STAGE=c('V8', 'VT', 'V8', 'VT', 'V8', 'VT', 'V8', 'VT', | |
'V8', 'VT', 'V8', 'VT', 'V8', 'VT', 'V8', 'VT', | |
'V8', 'VT', 'V8', 'VT', 'V8', 'VT', 'V8', 'VT', | |
'V8', 'VT', 'V8', 'VT', 'V8', 'VT', 'V8', 'VT')) | |
# No changes required in this section | |
# Columns in master files | |
regex.col.rgb <- paste("Filename", "EXP ID", "POT_BARCODE", "VARIETY", | |
"TREATMENT", "SCAN_TIME", "SCAN_DATE", "DFP", "View", | |
"frame_nr", "Width", "Height", "Surface", "Angle", | |
"Convex_hull", "Roundness", "Center_of_mass_distance", | |
"Center_of_mass_x", "Center_of_mass_y", | |
"Hue", "Saturation", "Intensity", "Fluorescence", | |
"[HSVF][[:digit:]]{1,3}", sep="|") | |
regex.col.hsi <- paste("EXP ID", "POT_BARCODE", "VARIETY", "TREATMENT", | |
"SCAN_TIME", "SCAN_DATE", "DFP", | |
"[[:alnum:]]_+(mean|max|min|std|p[[:digit:]]{1,2})", | |
"[[:digit:]]{3,4}(\\.[[:digit:]]{1,15})?", | |
sep="|") | |
# Columns to be removed after importing | |
nouse.col.rgb <- c("Filename") | |
nouse.col.hsi <- c("Filename-VNIR-SIDE", "Filename-VNIR-TOP", | |
"Filename-SWIR-SIDE", "Filename-SWIR-TOP") | |
# Columns used as interim identifier | |
id.vars <- c("EXP ID", "POT_BARCODE", "SCAN_TIME", "SCAN_DATE", | |
"VARIETY", "TREATMENT", "DFP", "View", "frame_nr") | |
# Columns to be removed while processing | |
drops.col.rgb <- c("SCAN_TIME", "SCAN_DATE", | |
"SCAN_DATETIME", "SCAN_DATETIME_BY15MIN") | |
drops.col.hsi <- c("SCAN_TIME", "SCAN_DATE", | |
"SCAN_DATETIME", "SCAN_DATETIME_BY30MIN") | |
# Column order of output table | |
rgb.col.order <- c("EXP ID", "POT_BARCODE", "TREATMENT", "VARIETY", | |
"DFP", "GROWTH_STAGE", "View", "frame_nr", | |
"variable", "value") | |
hsi.col.order <- c("EXP ID", "POT_BARCODE", "TREATMENT", "VARIETY", | |
"DFP", "GROWTH_STAGE", "View", "frame_nr", | |
"variable", "value") | |
# Column used as factor while processing | |
col.factor <- c("EXP ID", "VARIETY", "TREATMENT", | |
"View", "frame_nr", "GROWTH_STAGE") | |
# Main code starts here | |
# Import libraries | |
library(dplyr) | |
library(readxl) | |
library(data.table) | |
library(lubridate) | |
# Create empty RGB dataframe | |
mat <- matrix(ncol=0, nrow=0) | |
df.rgb <- data.frame(mat) | |
# Import RGB data | |
for (path.rgb in paths.rgb){ | |
for (tab.rgb in excel_sheets(path = path.rgb)){ | |
if (tab.rgb=="PPEW") next | |
if (tab.rgb=="Information") next | |
# Read a rgb worksheet | |
temp <- read_excel(path.rgb, sheet=tab.rgb) | |
# Remove hand-made columns | |
cols <- grepl(regex.col.rgb, as.character(colnames(temp))) | |
temp <- temp[cols] | |
# Remove unused columns | |
temp <- temp[,!names(temp) %in% nouse.col.rgb] | |
# Simplify "View" (Side{Bottom|Small|Full|Tall} to Side{Average|All}) | |
if (tolower(tab.rgb) == "side average") temp["View"] <- "SideAverage" | |
if (tolower(tab.rgb) == "side all") temp["View"] <- "SideAll" | |
# Reorder frames (frame_nr) | |
# Steps | |
# 1. Identify the side view with the largest plant volume. | |
# 2. Reindex frame_nr values (0 to 11, from the default pot orientation) | |
# to match the new order starting from the side with the largest | |
# plant area | |
list.sideall <- unique(temp[, c('POT_BARCODE', 'SCAN_DATE', 'SCAN_TIME')]) | |
if (tolower(tab.rgb) == "side all"){ | |
for(i in 1:nrow(list.sideall)){ | |
row <- list.sideall[i,] | |
# Get plant surface area | |
where <- vector(length=12) | |
for (f in seq(1,12)){ | |
where[f] <- which(temp$POT_BARCODE==row$POT_BARCODE & | |
temp$SCAN_DATE==row$SCAN_DATE & | |
temp$SCAN_TIME==row$SCAN_TIME & | |
temp$frame_nr==(f-1)) | |
} | |
temp.surface <- temp[where, "Surface"] | |
# Find direction with largest plant area | |
max.f <- which.max(temp.surface$Surface) | |
# Update frame_nr (0 as the view with the largest plant area) | |
for (f in seq(1,12)){ | |
temp[where[f], "frame_nr"] <- ((f-1)-max.f) %% 12 | |
} | |
} | |
} | |
# Find id columns that does not exist | |
cols.to.add <-id.vars[!id.vars %in% names(temp)] | |
# Assign values in the id columns that didn't exist | |
for (col.to.add in cols.to.add){ | |
if (col.to.add == "View" && tolower(tab.rgb) =="top"){ | |
temp["View"] = "Top" | |
} | |
if (col.to.add == "frame_nr"){ | |
temp["frame_nr"] = -1 | |
} | |
} | |
# Change table to long format | |
temp.long <- reshape2::melt(temp, id.vars = id.vars, | |
variable.name = "variable") | |
# Combine rows from different worksheets | |
df.rgb <- rbind(df.rgb, temp.long) | |
# Debugging | |
#print(paste(path.rgb, " ", tab.rgb)) | |
#print(unique(temp$View)) | |
#print(unique(temp$frame_nr)) | |
#print(unique(temp.long$variable)) | |
} | |
} | |
message('Succeeded importing RGB data...') | |
# Overview 1: Show summary of RGB data by date | |
df.rgb.perscan <- df.rgb[which(df.rgb$View=="Top" & df.rgb$variable=="Surface"), ] | |
message('\n* Data Overview 1 (RGB): Summary by Date\n') | |
print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP)) | |
# Overview 2: Show summary of RGB data by date and time | |
format <- "%Y-%m-%d %H:%M:%S" | |
df.rgb.perscan["SCAN_DATETIME"] = | |
as.POSIXct(paste(as.Date(df.rgb.perscan$SCAN_DATE), df.rgb.perscan$SCAN_TIME), | |
format=format) | |
df.rgb.perscan["SCAN_DATETIME_BY15MIN"] = | |
lubridate::round_date(df.rgb.perscan$SCAN_DATETIME, "15 minutes") | |
message('\n* Data Overview 2 (RGB): Summary by Date and Time\n') | |
print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP, SCAN_DATETIME_BY15MIN)) | |
message('\nThoroughly review the data to identify incomplete or test scans.') | |
message('Record the details of any unwanted data in ') | |
message('"EXP ID", "TREATMENT", and "SCAN_DATETIME_BY15MIN" formats') | |
message('and input these values into df.unwanted.rgb') | |
message('to exclude them from the analysis...\n') | |
if (!askYesNo('Proceed?')) stop() | |
# Overview 3: Show summary of RGB data after removing unwanted data | |
df.rgb["SCAN_DATETIME"] = | |
as.POSIXct(paste(as.Date(df.rgb$SCAN_DATE), df.rgb$SCAN_TIME), format=format) | |
df.rgb["SCAN_DATETIME_BY15MIN"] = | |
lubridate::round_date(df.rgb$SCAN_DATETIME, "15 minutes") | |
for(i in 1:nrow(df.unwanted.rgb)){ | |
row <- df.unwanted.rgb[i,] | |
df.rgb <- df.rgb %>% filter(!(`EXP ID` == row$EXP_ID & | |
TREATMENT == row$TREATMENT & | |
SCAN_DATETIME_BY15MIN == row$SCAN_DATETIME_BY15MIN)) | |
} | |
df.rgb.perscan <- df.rgb[which(df.rgb$View=="Top" & df.rgb$variable=="Surface"), ] | |
message('\n* Data Overview 3 (RGB): Summary After Filtering Unwanted Data\n') | |
print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP)) | |
message('\nReview the data to identify incorrect TREATMENT values') | |
message('Input the incorrect and corrected values into df.convert.treatment...\n') | |
if (!askYesNo('Proceed?')) stop() | |
# Overview 4: Update treatment in the RGB data | |
for(i in 1:nrow(df.convert.treatment)){ | |
row <- df.convert.treatment[i,] | |
df.rgb$TREATMENT[df.rgb$TREATMENT==row$from] <- row$to | |
} | |
df.rgb.perscan <- df.rgb[which(df.rgb$View=="Top" & df.rgb$variable=="Surface"), ] | |
message('\n* Data Overview 4 (RGB): Treatment Updated\n') | |
print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP)) | |
# Overview 5: Update DFP in the RGB data (should be less than 365) | |
df.rgb$DFP = df.rgb$DFP %% 365 | |
df.rgb.perscan <- df.rgb[which(df.rgb$View=="Top" & df.rgb$variable=="Surface"), ] | |
message('\n* Data Overview 5 (RGB): DFP Updated\n') | |
print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP)) | |
message('\nReview the data to identify growth stage according to DFP.') | |
message('Record the details of growth stage information in ') | |
message('"EXP ID", "TREATMENT", "DFP", and "GROWTH_STAGE" formats') | |
message('and input these values into df.growth.stage...\n') | |
if (!askYesNo('Proceed?')) stop() | |
# Overview 6: Add growth stage in the RGB data | |
df.rgb$GROWTH_STAGE <- NA | |
for(i in 1:nrow(df.growth.stage)){ | |
row <- df.growth.stage[i,] | |
df.rgb$GROWTH_STAGE[(df.rgb$`EXP ID` == row$EXP_ID) & | |
(df.rgb$TREATMENT == row$TREATMENT) & | |
(df.rgb$DFP == row$DFP)] <- row$GROWTH_STAGE | |
} | |
df.rgb.perscan <- df.rgb[which(df.rgb$View=="Top" & df.rgb$variable=="Surface"), ] | |
message('\n* Data Overview 6 (RGB): Growth Stage Added\n') | |
print(count(df.rgb.perscan, `EXP ID`, TREATMENT, DFP, GROWTH_STAGE)) | |
# Drop unused columns | |
df.rgb <- df.rgb[ , !(names(df.rgb) %in% drops.col.rgb)] | |
# Change column order | |
df.rgb <- df.rgb[, rgb.col.order] | |
df.rgb[col.factor] <- lapply(df.rgb[col.factor], factor) | |
# Export RGB dataframe | |
message('\nExporting RGB dataframe (long format)...') | |
write.csv(df.rgb, path.rgb.long, row.names=F) | |
# Create empty HSI dataframe | |
mat <- matrix(ncol=0, nrow=0) | |
df.hsi <- data.frame(mat) | |
# Import HSI data | |
for (path.hsi in paths.hsi){ | |
for (tab.hsi in excel_sheets(path = path.hsi)){ | |
if (tab.hsi=="PPEW") next | |
if (tab.hsi=="Information") next | |
# Read a HSI worksheet | |
temp <- read_excel(path.hsi, sheet=tab.hsi) | |
# Remove erroneous ghost column | |
temp <- select(temp, -starts_with("...")) | |
# Remove hand-made columns | |
cols <- grepl(regex.col.hsi, as.character(colnames(temp))) | |
temp <- temp[cols] | |
# Remove unused columns | |
temp <- temp[,!names(temp) %in% nouse.col.hsi] | |
# Add "View" column | |
temp["View"] <- NA | |
if (grepl("side", tolower(tab.hsi), fixed=T)==T) temp["View"] <- "Side" | |
if (grepl("top", tolower(tab.hsi), fixed=T)==T) temp["View"] <- "Top" | |
# Add "frame_nr" column | |
temp["frame_nr"] = -1 | |
# Change table into long format | |
temp.long <- reshape2::melt(temp, id.vars = id.vars, | |
variable.name = "variable") | |
# Combine rows from different worksheets | |
df.hsi <- rbind(df.hsi, temp.long) | |
## Debugging | |
#print(paste(path.hsi, " ", tab.hsi)) | |
#print(unique(temp$View)) | |
#print(unique(temp$frame_nr)) | |
#print(unique(temp.long$variable)) | |
} | |
} | |
message('Succeeded importing HSI data...') | |
# Overview 1: Show summary of HSI data by date | |
df.hsi.perscan <- df.hsi[which(df.hsi$View=="Top" & df.hsi$variable=="NDVI_mean"), ] | |
message('\n* Data Overview 1 (HSI): Summary by Date\n') | |
print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP)) | |
# Overview 2: Show summary of HSI data by date and time | |
format <- "%Y-%m-%d %H:%M:%S" | |
df.hsi.perscan["SCAN_DATETIME"] = | |
as.POSIXct(paste(as.Date(df.hsi.perscan$SCAN_DATE), df.hsi.perscan$SCAN_TIME), | |
format=format) | |
df.hsi.perscan["SCAN_DATETIME_BY30MIN"] = | |
lubridate::round_date(df.hsi.perscan$SCAN_DATETIME, "30 minutes") | |
message('\n* Data Overview 2 (HSI): Summary by Date and Time\n') | |
print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP, SCAN_DATETIME_BY30MIN)) | |
message('\nThoroughly review the data to identify incomplete or test scans.') | |
message('Record the details of any unwanted data in ') | |
message('"EXP ID", "TREATMENT", and "SCAN_DATETIME_BY15MIN" formats') | |
message('and input these values into df.unwanted.hsi') | |
message('to exclude them from the analysis...\n') | |
if (!askYesNo('Proceed?')) stop() | |
# Overview 3: Show summary of HSI data after removing unwanted data | |
df.hsi["SCAN_DATETIME"] = | |
as.POSIXct(paste(as.Date(df.hsi$SCAN_DATE), df.hsi$SCAN_TIME), format=format) | |
df.hsi["SCAN_DATETIME_BY15MIN"] = | |
lubridate::round_date(df.hsi$SCAN_DATETIME, "30 minutes") | |
for(i in 1:nrow(df.unwanted.hsi)){ | |
row <- df.unwanted.hsi[i,] | |
df.hsi <- df.hsi %>% filter(!(`EXP ID` == row$EXP_ID & | |
TREATMENT == row$TREATMENT & | |
SCAN_DATETIME_BY15MIN == row$SCAN_DATETIME_BY30MIN)) | |
} | |
df.hsi.perscan <- df.hsi[which(df.hsi$View=="Top" & df.hsi$variable=="NDVI_mean"), ] | |
message('\n* Data Overview 3 (HSI): Summary After Filtering Unwanted Data\n') | |
print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP)) | |
message('\nReview the data to identify incorrect TREATMENT values') | |
message('Input the incorrect and corrected values into df.convert.treatment...\n') | |
if (!askYesNo('Proceed?')) stop() | |
# Overview 4: Update treatment in the HSI data | |
for(i in 1:nrow(df.convert.treatment)){ | |
row <- df.convert.treatment[i,] | |
df.hsi$TREATMENT[df.hsi$TREATMENT==row$from] <- row$to | |
} | |
df.hsi.perscan <- df.hsi[which(df.hsi$View=="Top" & df.hsi$variable=="NDVI_mean"), ] | |
message('\n* Data Overview 4 (HSI): Treatment Updated\n') | |
print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP)) | |
# Overview 5: Update DFP in the HSI data (should be less than 365) | |
df.hsi$DFP = df.hsi$DFP %% 365 | |
df.hsi.perscan <- df.hsi[which(df.hsi$View=="Top" & df.hsi$variable=="NDVI_mean"), ] | |
message('\n* Data Overview 5 (HSI): DFP Updated\n') | |
print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP)) | |
message('\nReview the data to identify growth stage according to DFP.') | |
message('Record the details of growth stage information in ') | |
message('"EXP ID", "TREATMENT", "DFP", and "GROWTH_STAGE" formats') | |
message('and input these values into df.growth.stage...\n') | |
if (!askYesNo('Proceed?')) stop() | |
# Overview 6: Add growth stage in the HSI data | |
df.hsi$GROWTH_STAGE <- NA | |
for(i in 1:nrow(df.growth.stage)){ | |
row <- df.growth.stage[i,] | |
df.hsi$GROWTH_STAGE[(df.hsi$`EXP ID` == row$EXP_ID) & | |
(df.hsi$TREATMENT == row$TREATMENT) & | |
(df.hsi$DFP == row$DFP)] <- row$GROWTH_STAGE | |
} | |
df.hsi.perscan <- df.hsi[which(df.hsi$View=="Top" & df.hsi$variable=="NDVI_mean"), ] | |
message('\n* Data Overview 6 (HSI): Growth Stage Added\n') | |
print(count(df.hsi.perscan, `EXP ID`, TREATMENT, DFP, GROWTH_STAGE)) | |
# Drop unused columns | |
df.hsi <- df.hsi[ , !(names(df.hsi) %in% drops.col.hsi)] | |
# Change column order | |
df.hsi <- df.hsi[, hsi.col.order] | |
df.hsi[col.factor] <- lapply(df.hsi[col.factor], factor) | |
# Export HSI dataframe | |
message('\nExporting HSI dataframe (long format)...') | |
write.csv(df.hsi, path.hsi.long, row.names=F) | |
# EOF |