Skip to content

Commit

Permalink
The split parameter is now properly used in read()
Browse files Browse the repository at this point in the history
  • Loading branch information
Dawith committed Oct 21, 2025
1 parent 4779324 commit 125d6d4
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions pipe/etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def etl(spark: SparkSession, split: list=None) -> DataFrame:
data = split_sets(data, split=split)
return data

def read(spark: SparkSession) -> DataFrame:
def read(spark: SparkSession, split=None) -> DataFrame:
"""
Reads the processed data from a Parquet file and splits it into training,
validation, and test sets.
Expand All @@ -51,7 +51,7 @@ def read(spark: SparkSession) -> DataFrame:
"""

data = spark.read.parquet("/app/workdir/parquet/data.parquet")
data = split_sets(data)
data = split_sets(data, split=split)
return data

def split_sets(data: DataFrame, split=[0.99, 0.005, 0.005]) -> tuple:
Expand Down

0 comments on commit 125d6d4

Please sign in to comment.