From f4418e203c84013ac1713ea57589dffcf186591c Mon Sep 17 00:00:00 2001
From: Dawith Lim
Date: Wed, 1 Oct 2025 15:01:39 -0400
Subject: [PATCH] Data read modified to work with the new pipeline

---
Reviewer notes (commentary placed after the "---" separator, i.e. outside
the commit message and before the first "diff --git" line):

  * The hunk replaces the single load(spark, split=...) call in the
    __main__ block with a choice between etl(spark, split=SPLIT) and
    read(spark), selected by the local flag load_from_scratch.
  * load_from_scratch is hard-coded to False, so with this patch applied
    the read(spark) branch is the one that runs and SPLIT is not used on
    that path.
  * The split values change from [0.9, 0.5, 0.5] (sums to 1.9) to
    [0.9, 0.05, 0.05] (sums to 1.0). Presumably these are
    train/validation/test fractions; confirm against etl().
  * NOTE(review): the hunk header shows "def pca(data, features):" as the
    nearest preceding definition, while the unchanged context line calls
    pca(data) with one argument. This patch does not touch that call;
    verify it against the current signature of pca.
  * NOTE(review): etl and read are not defined or imported inside this
    hunk; confirm both names are in scope in analysis.py.
  * NOTE(review): the hunk header declares 7 old / 12 new lines, which
    implies two whitespace-only context lines that are not recoverable
    from the collapsed text. They are placed below as the first and last
    context lines of the hunk; TODO confirm against blob f8b7cfa.

 analysis.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/analysis.py b/analysis.py
index f8b7cfa..ec6a808 100644
--- a/analysis.py
+++ b/analysis.py
@@ -45,7 +45,12 @@ def pca(data, features):
 
 if __name__ == "__main__":
     spark = SparkSession.builder.appName("train").getOrCreate()
-    data = load(spark, split=[0.9, 0.5, 0.5])
+    SPLIT = [0.9, 0.05, 0.05]
+    load_from_scratch = False
+    if load_from_scratch:
+        data = etl(spark, split=SPLIT)
+    else:
+        data = read(spark)
     pca(data)
     category_distribution(data)
 