diff --git a/DataExplore.r b/DataExplore.r
new file mode 100644
index 0000000..fe80e72
--- /dev/null
+++ b/DataExplore.r
@@ -0,0 +1,154 @@
+# DataExplore.r
+# Explore AAPF data products and get some insight.
+# Currently developed for the RGB and HSI master files (.xlsx).
+
+library(dplyr)
+library(readxl)
+library(data.table)
+
+
+# Configuration ----
+
+# File paths to the master files
+paths.rgb <- c("../RGB_Bayer_2.xlsx")
+paths.hsi <- c("../HS_Bayer_2.xlsx")
+
+# Column names in the generic master files, joined into one regex alternation.
+# NOTE(review): the alternatives are not anchored, so grepl() keeps any column
+# whose name merely contains one of them -- confirm that this is intended.
+regex.col.rgb <- paste("Filename", "EXP ID", "POT_BARCODE", "VARIETY",
+                       "TREATMENT", "SCAN_TIME", "SCAN_DATE", "DFP", "View",
+                       "frame_nr", "Width", "Height", "Surface", "Angle",
+                       "Convex_hull", "Roundness", "Center_of_mass_distance",
+                       "Center_of_mass_x", "Center_of_mass_y",
+                       "Hue", "Saturation", "Intensity", "Fluorescence",
+                       "[HSVF][[:digit:]]{1,3}", sep = "|")
+regex.col.hsi <- paste("EXP ID", "POT_BARCODE", "VARIETY", "TREATMENT",
+                       "SCAN_TIME", "SCAN_DATE", "DFP",
+                       "[[:alnum:]]_+(mean|max|min|std|p[[:digit:]]{1,2})",
+                       "[[:digit:]]{3,4}(\\.[[:digit:]]{1,15})?",
+                       sep = "|")
+
+# Column names to be disregarded
+nouse.col.rgb <- c("Filename", "SCAN_TIME", "SCAN_DATE")
+nouse.col.hsi <- c("Filename-VNIR-SIDE", "Filename-VNIR-TOP",
+                   "Filename-SWIR-SIDE", "Filename-SWIR-TOP",
+                   "SCAN_TIME", "SCAN_DATE")
+
+# Column names used as identifiers when melting to long format
+id.vars <- c("EXP ID", "POT_BARCODE", "VARIETY",
+             "TREATMENT", "DFP", "View", "frame_nr")
+
+
+# Helpers ----
+
+# Keep the columns whose names match keep.regex, then drop drop.cols.
+keep_master_columns <- function(sheet, keep.regex, drop.cols) {
+  sheet <- sheet[grepl(keep.regex, as.character(colnames(sheet)))]
+  sheet[, !names(sheet) %in% drop.cols]
+}
+
+# Melt one worksheet into long format keyed by id.vars.
+# Stops early, naming the file and worksheet, when an identifier column is
+# absent.
+melt_sheet <- function(sheet, path, tab) {
+  missing.ids <- setdiff(id.vars, names(sheet))
+  if (length(missing.ids) > 0) {
+    stop("Worksheet '", tab, "' in '", path, "' lacks id column(s): ",
+         paste(missing.ids, collapse = ", "), call. = FALSE)
+  }
+  reshape2::melt(sheet, id.vars = id.vars, variable.name = "variable")
+}
+
+# Read one RGB worksheet and return it in long format.
+read_rgb_sheet <- function(path, tab) {
+  tab.lower <- tolower(tab)
+  sheet <- read_excel(path, sheet = tab)
+
+  # Remove hand-made and unused columns
+  sheet <- keep_master_columns(sheet, regex.col.rgb, nouse.col.rgb)
+
+  # Simplify values under "View":
+  # Side{Bottom|Small|Full|Tall} to Side{Average|All}
+  if (tab.lower == "side average") sheet["View"] <- "SideAverage"
+  if (tab.lower == "side all") sheet["View"] <- "SideAll"
+
+  # Fill identifier columns the worksheet does not provide
+  if (!"View" %in% names(sheet)) {
+    if (tab.lower == "top") {
+      sheet["View"] <- "Top"
+    } else {
+      # Without a "View" column the melt below cannot run; keep the rows and
+      # flag them, mirroring the NA default used for HSI worksheets.
+      warning("No 'View' column in worksheet '", tab, "' of '", path,
+              "'; filling with NA", call. = FALSE)
+      sheet["View"] <- NA
+    }
+  }
+  if (!"frame_nr" %in% names(sheet)) sheet["frame_nr"] <- -1
+
+  melt_sheet(sheet, path, tab)
+}
+
+# Read one HSI worksheet and return it in long format.
+read_hsi_sheet <- function(path, tab) {
+  tab.lower <- tolower(tab)
+  sheet <- read_excel(path, sheet = tab)
+
+  # Remove erroneous ghost columns (names starting with "...")
+  sheet <- select(sheet, -starts_with("..."))
+
+  # Remove hand-made and unused columns
+  sheet <- keep_master_columns(sheet, regex.col.hsi, nouse.col.hsi)
+
+  # Derive "View" from the worksheet name; "top" wins when both match
+  sheet["View"] <- NA
+  if (grepl("side", tab.lower, fixed = TRUE)) sheet["View"] <- "Side"
+  if (grepl("top", tab.lower, fixed = TRUE)) sheet["View"] <- "Top"
+
+  # Add "frame_nr" column (always -1 for HSI worksheets)
+  sheet["frame_nr"] <- -1
+
+  melt_sheet(sheet, path, tab)
+}
+
+# Import every worksheet of every file in paths, except those named in
+# skip.tabs, with read.sheet(path, tab) and stack the long-format results.
+# Returns an empty data.frame when there is nothing to import.
+import_masterfiles <- function(paths, read.sheet, skip.tabs = "PPEW") {
+  per.path <- lapply(paths, function(path) {
+    tabs <- excel_sheets(path = path)
+    lapply(tabs[!tabs %in% skip.tabs], function(tab) read.sheet(path, tab))
+  })
+  sheets <- unlist(per.path, recursive = FALSE)
+  if (length(sheets) == 0) {
+    return(data.frame())
+  }
+  # Bind once instead of growing the data frame on every worksheet
+  do.call(rbind, sheets)
+}
+
+
+# Import ----
+
+df.rgb <- import_masterfiles(paths.rgb, read_rgb_sheet)
+message("Succeeded importing RGB data...")
+
+df.hsi <- import_masterfiles(paths.hsi, read_hsi_sheet)
+message("Succeeded importing HSI data...")
+
+
+# Planned work ----
+
+# Merge RGB and HSI data
+
+# For "RGB-SideAll", change column names Frame0-11 to major, major+30...
+
+# Change DFPs to growth stage
+
+# Check input variables with repeatability and ANOVA
+# For "RGB-SideAll", visualize importance along the side angle
+
+# Visualize feature importance
+
+# RGB-visualization
diff --git a/DataGuide_AAPF_RGB.md b/DataGuide_AAPF_RGB.md index 0b640c5..f3b507d 100644 --- a/DataGuide_AAPF_RGB.md +++ b/DataGuide_AAPF_RGB.md @@ -112,17 +112,26 @@ Both the master data file (.xlsx) and individual measurement files (.csv) share | Column Head | Description | |----------------------------------------|-------------------------------------------------------------------------------------------------| -| `frame_nr` | Frame number from 0-11 for side view data | +| `Filename` | Input spreadsheet filename (.csv) | +| `EXP ID` | Experiment number within AAPF | +| `POT_BARCODE` | Unique identifier for the plant pot | +| `VARIETY` | Variety assigned in PPEW | +| `TREATMENT` | Treatment applied to plant | +| `SCAN_TIME` | Scan start time | +| `SCAN_DATE` | Scan start date | +| `DFP` | Age of plant in days from planting at time of imaging | | `Angle` | Side view angle where the vegetation is most dispersed | +| `View` | TOP FRAME: Ignore this row
TOP AVG: Data from the top view
Side{Bottom, Small, Top, Full} FRAME 0 to 11: Data from various side view angles (0, 30, ..., 330 degrees)
Side{Bottom, Small, Top, Full} AVG: Average of Side view FRAME data | +| `frame_nr` | Frame number from 0-11 for side view data | | `Width`, `Height` | Dimensions of the smallest enclosing rectangle of the vegetation segments | | `Surface` | Area of the vegetation within the pixel-wise boundary of vegetation segments | | `Convex hull` | Area of the convex hull measured by the smallest enclosing polygon of the vegetation segments | -| `Roundedness` | Roundness of the pixel-wise boundary of vegetation segments (❗ calculation method unclear) | +| `Roundness` | Roundness of the pixel-wise boundary of vegetation segments (❗ calculation method unclear) | | `Center_of_mass_distance` | ❗ Unclear meaning | | `Center_of_mass_x`, `Center_of_mass_y` | Coordinates of the center of mass point | | `Hue, Saturation`, `Intensity` | Average values for these color properties in the image | | `Fluorescence` | Average fluorescence within the pixel-wise boundary of vegetation segments | | `H###`, `S##`, `V##` | Frequency of pixels with specific hue (###: 0-359), saturation (##: 0-99), and value (##: 0-99)
Check the definitions of hue, saturation, and value in [this link](https://changingminds.org/explanations/perception/visual/hsl.htm) | | `F##` | Frequency of pixels with a specific fluorescence value (##: 0-99) | -| `View` | TOP FRAME: Ignore this row
TOP AVG: Data from the top view
Side{Bottom, Small, Top, Full} FRAME 0 to 11: Data from various side view angles (0, 30, ..., 330 degrees)
Side{Bottom, Small, Top, Full} AVG: Average of Side view FRAME data | + diff --git a/README.md b/README.md index 420e27d..364af2e 100644 --- a/README.md +++ b/README.md @@ -3,52 +3,3 @@ Exploratory data analysis for AAPF dataset (RGB, HSI, XRAY) -# Change xlsx files to csv files - -# Load csv files -# Merge DFPs? range->text -# Replace DFPs with growth stage if available - - -# RGB, HSI -# Check input variables with repeatability and ANOVA - - - -# RGB-visualization - - - -# Column header -# frame_nr: frame number -# Angle: side view angle when vegetation is mostly dispersed along the plane -# Width: width measured by green box in MES file (see 2.1) -# Height: height measured by green box in MES file (see 2.1) -# Surface: vegetation area measured by red polygon (see 2.3) -# Convex hull: convex hull area measured by blue polygon (see 2.2) -# Roundedness: roundedness measured by red polygon (unsure, see 2.3) -# Center_of_mass_distance: unclear -# Center_of_mass_x: x coordinate of center of mass point (see 2.4) -# Center_of_mass_y: y coordinate of center of mass point (see 2.4) -# Hue: average hue value -# Saturation: average saturation value -# Intensity: average intensity value -# Fluorescene: average flurescence value -# H##: frequency of pixels with a hue value of ## (##: 0-359) -# S##: frequency of pixels with a saturation value of ## (##: 0-99) -# V##: frequency of pixels with a intensity value of ## (##: 0-99) -# F##: frequency of pixels with a flurescence value of ## (##: 0-99) -# -# Row index (view) -# TOP FRAME: Ignore this row -# TOP AVG: Top view -# SideBottom FRAME 0 to 11: Side view data from various angles (0, 30, ..., 330 degree) -# SideBottom AVG: average of SideBottom FRAME data - - - - - - - -