From f4418e203c84013ac1713ea57589dffcf186591c Mon Sep 17 00:00:00 2001
From: Dawith Lim <lim185@purdue.edu>
Date: Wed, 1 Oct 2025 15:01:39 -0400
Subject: [PATCH] Load data via etl()/read() for new pipeline; fix split ratios

---
 analysis.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/analysis.py b/analysis.py
index f8b7cfa..ec6a808 100644
--- a/analysis.py
+++ b/analysis.py
@@ -45,7 +45,12 @@ def pca(data, features):
 
 if __name__ == "__main__":
     spark = SparkSession.builder.appName("train").getOrCreate()
-    data = load(spark, split=[0.9, 0.5, 0.5])
+    SPLIT = [0.9, 0.05, 0.05]
+    load_from_scratch = False
+    if load_from_scratch:
+        data = etl(spark, split=SPLIT)
+    else:
+        data = read(spark)
 
     pca(data)
     category_distribution(data)