From 825ff186f2a41c06e7eb9c67ccd4097703c196d4 Mon Sep 17 00:00:00 2001 From: Dawith Lim Date: Tue, 30 Sep 2025 13:00:36 -0400 Subject: [PATCH] Function descriptions added --- pipe/extract.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/pipe/extract.py b/pipe/extract.py index 9415ee9..849246e 100644 --- a/pipe/extract.py +++ b/pipe/extract.py @@ -14,11 +14,13 @@ from pyspark.sql import SparkSession, Row, DataFrame import scipy as sp -def extract(spark): +def extract(spark: SparkSession) -> DataFrame: """ First step of the ETL pipeline. It reads the list of .mat files from a CSV list, opens and pulls the spectrogram from each respective file. + Args: + spark (SparkSession): Spark session object. """ path = Path("/app/workdir") @@ -33,13 +35,34 @@ def extract(spark): def image_pipe(spark: SparkSession, imagepath: Path, namepattern: str, stacksize: int) -> np.ndarray: - images = np.zeros((stacksize, 800,800)) + """ + Loads a stack of images from a path based on the given name pattern. + + Args: + imagepath (Path): Path to the image files. + namepattern (str): Name pattern for the image files. + stacksize (int): Number of images in the stack. + + Returns: + images: 3D numpy array of stacked images. + """ + + images = np.zeros((stacksize, 800, 800)) for i in range(stacksize): images[i,:,:] = cv.imread(imagepath/namepattern.format(i), -1) return images class SpectrogramReader: + """ + Class to read spectrograms and metadata from different file formats based + on user specified filetype. + + Args: + spark (SparkSession): Spark session object. + filetype (str): File format type. Supported types are 'hdf5', + 'shards', and 'matfiles'. + """ def __init__(self, spark: SparkSession, filetype: str = "hdf5"): self.spark = spark @@ -86,7 +109,8 @@ def spectrogram_read_matfiles(self, specpath: Path, labels:list, labels (list): List of target labels. Returns: - + DataFrame: Spark DataFrame containing the spectrograms and + associated metadata. """ spectrograms = [] row = {} @@ -127,6 +151,8 @@ def spectrogram_read_hdf5(self, specpath: Path, labels: list, specpath (Path): Path to the spectrogram files. Returns: + DataFrame: Spark DataFrame containing the spectrograms and + associated metadata. """ metadata = self.metadata_pipe(specpath, labels) @@ -156,6 +182,8 @@ def spectrogram_read_shards(self, specpath: Path, namepattern: str, spectrogram. Returns: + DataFrame: Spark DataFrame containing the spectrograms and + associated metadata. """ return