diff --git a/DataExplore.r b/DataExplore.r
new file mode 100644
index 0000000..fe80e72
--- /dev/null
+++ b/DataExplore.r
@@ -0,0 +1,172 @@
+# DataExplore.r
+# Explore AAPF data products and get some insight
+# Currently, developed for RGB and HSI masterfiles (.xlsx)
+
+library(dplyr)
+library(readxl)
+library(data.table)
+
+
+# Master workbooks to import; each variable may list several .xlsx paths
+paths.hsi <- "../HS_Bayer_2.xlsx"
+paths.rgb <- "../RGB_Bayer_2.xlsx"
+
+
+
+
+
+# Alternation patterns naming the master-file columns to keep (via grepl)
+regex.col.rgb <- paste(c(
+  "Filename", "EXP ID", "POT_BARCODE", "VARIETY", "TREATMENT",
+  "SCAN_TIME", "SCAN_DATE", "DFP", "View", "frame_nr",
+  "Width", "Height", "Surface", "Angle", "Convex_hull", "Roundness",
+  "Center_of_mass_distance", "Center_of_mass_x", "Center_of_mass_y",
+  "Hue", "Saturation", "Intensity", "Fluorescence",
+  "[HSVF][[:digit:]]{1,3}"), collapse = "|")
+regex.col.hsi <- paste(c(
+  "EXP ID", "POT_BARCODE", "VARIETY", "TREATMENT",
+  "SCAN_TIME", "SCAN_DATE", "DFP",
+  "[[:alnum:]]_+(mean|max|min|std|p[[:digit:]]{1,2})",
+  "[[:digit:]]{3,4}(\\.[[:digit:]]{1,15})?"), collapse = "|")
+
+# Columns removed after the pattern filter, whenever they are present
+nouse.col.rgb <- c("Filename", "SCAN_TIME", "SCAN_DATE")
+nouse.col.hsi <- c(
+  "Filename-VNIR-SIDE", "Filename-VNIR-TOP", "Filename-SWIR-SIDE",
+  "Filename-SWIR-TOP", "SCAN_TIME", "SCAN_DATE")
+
+# Identifier columns held fixed when a sheet is melted to long format
+id.vars <- c("EXP ID", "POT_BARCODE", "VARIETY", "TREATMENT",
+             "DFP", "View", "frame_nr")
+
+
+
+
+
+# Import every RGB worksheet, melt it to long format, and stack the results
+# into df.rgb (columns: id.vars, "variable", "value").
+rgb.sheets <- list()
+
+for (path.rgb in paths.rgb) {
+  for (tab.rgb in excel_sheets(path = path.rgb)) {
+    # The "PPEW" worksheet is skipped
+    if (tab.rgb == "PPEW") next
+    tab.lower <- tolower(tab.rgb)
+
+    # Read an RGB worksheet
+    temp <- read_excel(path.rgb, sheet = tab.rgb)
+
+    # Keep only columns matching the generic master-file names, then drop
+    # the ones that are not analysed
+    temp <- temp[grepl(regex.col.rgb, as.character(colnames(temp)))]
+    temp <- temp[, !names(temp) %in% nouse.col.rgb]
+
+    # Simplify values under "View":
+    # Side{Bottom|Small|Full|Tall} become Side{Average|All}
+    if (tab.lower == "side average") temp["View"] <- "SideAverage"
+    if (tab.lower == "side all") temp["View"] <- "SideAll"
+
+    # Fill in id columns the worksheet does not provide
+    cols.to.add <- id.vars[!id.vars %in% names(temp)]
+    if ("View" %in% cols.to.add && tab.lower == "top") {
+      temp["View"] <- "Top"
+    }
+    if ("frame_nr" %in% cols.to.add) {
+      # Placeholder value used when the sheet has no frame_nr column
+      temp["frame_nr"] <- -1
+    }
+
+    # Report the workbook and sheet when an id column is still missing;
+    # melt() would fail on it anyway (TODO: decide defaults for these)
+    cols.missing <- id.vars[!id.vars %in% names(temp)]
+    if (length(cols.missing) > 0) {
+      stop("Missing id column(s) in ", path.rgb, " [", tab.rgb, "]: ",
+           paste(cols.missing, collapse = ", "), call. = FALSE)
+    }
+
+    # Change table into long format
+    temp.long <- reshape2::melt(temp, id.vars = id.vars,
+                                variable.name = "variable")
+
+    # Collect the sheet; binding once after the loop avoids re-copying the
+    # accumulated rows on every iteration
+    rgb.sheets[[length(rgb.sheets) + 1]] <- temp.long
+  }
+}
+df.rgb <- do.call(rbind, c(list(data.frame()), rgb.sheets))
+message('Succeeded importing RGB data...')
+
+
+
+# Import every HSI worksheet, melt it to long format, and stack the results
+# into df.hsi (columns: id.vars, "variable", "value").
+hsi.sheets <- list()
+
+for (path.hsi in paths.hsi) {
+  for (tab.hsi in excel_sheets(path = path.hsi)) {
+    # The "PPEW" worksheet is skipped
+    if (tab.hsi == "PPEW") next
+    tab.lower <- tolower(tab.hsi)
+
+    # Read an HSI worksheet
+    temp <- read_excel(path.hsi, sheet = tab.hsi)
+
+    # Remove erroneous ghost columns (names starting with "...")
+    temp <- select(temp, -starts_with("..."))
+
+    # Keep master-file columns only, then drop the unused ones
+    temp <- temp[grepl(regex.col.hsi, as.character(colnames(temp)))]
+    temp <- temp[, !names(temp) %in% nouse.col.hsi]
+
+    # Derive "View" from the worksheet name; it stays NA when the name
+    # contains neither "side" nor "top", and "top" wins if both occur
+    temp["View"] <- NA
+    if (grepl("side", tab.lower, fixed = TRUE)) temp["View"] <- "Side"
+    if (grepl("top", tab.lower, fixed = TRUE)) temp["View"] <- "Top"
+
+    # Every HSI row gets the placeholder frame number -1
+    temp["frame_nr"] <- -1
+
+    # Report the sheet when an id column is missing (melt() would fail)
+    cols.missing <- id.vars[!id.vars %in% names(temp)]
+    if (length(cols.missing) > 0) {
+      stop("Missing id column(s) in ", path.hsi, " [", tab.hsi, "]: ",
+           paste(cols.missing, collapse = ", "), call. = FALSE)
+    }
+
+    # Change table into long format
+    temp.long <- reshape2::melt(temp, id.vars = id.vars,
+                                variable.name = "variable")
+    hsi.sheets[[length(hsi.sheets) + 1]] <- temp.long
+  }
+}
+df.hsi <- do.call(rbind, c(list(data.frame()), hsi.sheets))
+message('Succeeded importing HSI data...')
+
+
+
+
+
+
+
+
+
+
+# Merge RGB and HSI data
+
+# For "RGB-SideAll", change column names Frame0-11 to major, major+30...
+
+# Change DFPs to growth stage
+
+
+
+
+
+
+
+# Check input variables with repeatability and ANOVA
+# For "RGB-SideAll", visualize importance along the side angle
+
+# Visualize feature importance
+
+# RGB-visualization
diff --git a/DataGuide_AAPF_RGB.md b/DataGuide_AAPF_RGB.md
index 0b640c5..f3b507d 100644
--- a/DataGuide_AAPF_RGB.md
+++ b/DataGuide_AAPF_RGB.md
@@ -112,17 +112,26 @@ Both the master data file (.xlsx) and individual measurement files (.csv) share
| Column Head | Description |
|----------------------------------------|-------------------------------------------------------------------------------------------------|
-| `frame_nr` | Frame number from 0-11 for side view data |
+| `Filename` | Input spreadsheet filename (.csv) |
+| `EXP ID` | Experiment number within AAPF |
+| `POT_BARCODE` | Unique identifier for the plant pot |
+| `VARIETY` | Variety assigned in PPEW |
+| `TREATMENT` | Treatment applied to plant |
+| `SCAN_TIME` | Scan start time |
+| `SCAN_DATE` | Scan start date |
+| `DFP` | Age of plant in days from planting at time of imaging |
| `Angle` | Side view angle where the vegetation is most dispersed |
+| `View` | TOP FRAME: Ignore this row<br>TOP AVG: Data from the top view<br>Side{Bottom, Small, Top, Full} FRAME 0 to 11: Data from various side view angles (0, 30, ..., 330 degrees)<br>Side{Bottom, Small, Top, Full} AVG: Average of Side view FRAME data |
+| `frame_nr` | Frame number from 0-11 for side view data |
| `Width`, `Height` | Dimensions of the smallest enclosing rectangle of the vegetation segments |
| `Surface` | Area of the vegetation within the pixel-wise boundary of vegetation segments |
| `Convex hull` | Area of the convex hull measured by the smallest enclosing polygon of the vegetation segments |
-| `Roundedness` | Roundness of the pixel-wise boundary of vegetation segments (❗ calculation method unclear) |
+| `Roundness` | Roundness of the pixel-wise boundary of vegetation segments (❗ calculation method unclear) |
| `Center_of_mass_distance` | ❗ Unclear meaning |
| `Center_of_mass_x`, `Center_of_mass_y` | Coordinates of the center of mass point |
| `Hue, Saturation`, `Intensity` | Average values for these color properties in the image |
| `Fluorescence` | Average fluorescence within the pixel-wise boundary of vegetation segments |
| `H###`, `S##`, `V##` | Frequency of pixels with specific hue (###: 0-359), saturation (##: 0-99), and value (##: 0-99)
Check the definitions of hue, saturation, and value in [this link](https://changingminds.org/explanations/perception/visual/hsl.htm) |
| `F##` | Frequency of pixels with a specific fluorescence value (##: 0-99) |
-| `View` | TOP FRAME: Ignore this row
TOP AVG: Data from the top view
Side{Bottom, Small, Top, Full} FRAME 0 to 11: Data from various side view angles (0, 30, ..., 330 degrees)
Side{Bottom, Small, Top, Full} AVG: Average of Side view FRAME data |
+
diff --git a/README.md b/README.md
index 420e27d..364af2e 100644
--- a/README.md
+++ b/README.md
@@ -3,52 +3,3 @@ Exploratory data analysis for AAPF dataset (RGB, HSI, XRAY)
-# Change xlsx files to csv files
-
-# Load csv files
-# Merge DFPs? range->text
-# Replace DFPs with growth stage if available
-
-
-# RGB, HSI
-# Check input variables with repeatability and ANOVA
-
-
-
-# RGB-visualization
-
-
-
-# Column header
-# frame_nr: frame number
-# Angle: side view angle when vegetation is mostly dispersed along the plane
-# Width: width measured by green box in MES file (see 2.1)
-# Height: height measured by green box in MES file (see 2.1)
-# Surface: vegetation area measured by red polygon (see 2.3)
-# Convex hull: convex hull area measured by blue polygon (see 2.2)
-# Roundedness: roundedness measured by red polygon (unsure, see 2.3)
-# Center_of_mass_distance: unclear
-# Center_of_mass_x: x coordinate of center of mass point (see 2.4)
-# Center_of_mass_y: y coordinate of center of mass point (see 2.4)
-# Hue: average hue value
-# Saturation: average saturation value
-# Intensity: average intensity value
-# Fluorescene: average flurescence value
-# H##: frequency of pixels with a hue value of ## (##: 0-359)
-# S##: frequency of pixels with a saturation value of ## (##: 0-99)
-# V##: frequency of pixels with a intensity value of ## (##: 0-99)
-# F##: frequency of pixels with a flurescence value of ## (##: 0-99)
-#
-# Row index (view)
-# TOP FRAME: Ignore this row
-# TOP AVG: Top view
-# SideBottom FRAME 0 to 11: Side view data from various angles (0, 30, ..., 330 degree)
-# SideBottom AVG: average of SideBottom FRAME data
-
-
-
-
-
-
-
-