Function description added

Nolte-Group · Sep 30, 2025 · 940d227 · 940d227
1 parent fe00fc6
commit 940d227
Showing 1 changed file with 9 additions and 16 deletions.
diff --git a/pipe/etl.py b/pipe/etl.py
@@ -10,33 +10,26 @@
 import matplotlib.pyplot as plt
 import numpy as np
 from pathlib import Path
-from pyspark.sql import SparkSession, functions, types, Row
+from pyspark.sql import DataFrame, SparkSession
 from sklearn.metrics import confusion_matrix
 import tensorflow as tf
 
 from pipe.extract import extract
 from pipe.transform import transform
 from pipe.load import load
 
-def etl(spark):
+def etl(spark: SparkSession) -> DataFrame:
     """
-    Performs the ETL process in series.
+    Runs extract, transform, and load in series and returns the result.
+
+    Args:
+        spark (SparkSession): Session passed to `extract` and `transform`.
+
+    Returns:
+        DataFrame: Output of `load`; presumed a Spark DataFrame (confirm).
     """
     data = extract(spark)
     data = transform(spark, data, keys=["treatment", "target"])
     data = load(data)
     return data
 
-def visualize_data_distribution(data):
-    for category in ["treatment", "target"]:
-        select = data.select(category) \
-                .groupby(category) \
-                .count()
-        plt.barh(
-            np.array(select.select(category).collect()).squeeze(),
-            np.array(select.select("count").collect()).astype("float") \
-                .squeeze())
-        plt.xlabel("Count")
-        plt.ylabel(category)
-        plt.savefig(f"{category}_counts.png", bbox_inches="tight")
-        plt.close()