Skip to content

Commit

Permalink
Working on cosine similarity matrix
Browse files Browse the repository at this point in the history
  • Loading branch information
Dawith committed Oct 21, 2025
1 parent 9851cc9 commit 3f43970
Showing 1 changed file with 24 additions and 1 deletion.
25 changes: 24 additions & 1 deletion inspect_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"""

import numpy as np
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import col, udf, array, lit
from pyspark.sql.types import ArrayType, FloatType
import pyspark

Expand Down Expand Up @@ -64,6 +64,10 @@ def unpack_timeseriesdata(dataframe):

return spectra, bb, ncnt, nsd

@udf(FloatType())
def cos_sim(a, b):
    """
    Spark UDF returning the cosine similarity of two numeric arrays.

    Computes dot(a, b) / (|a| * |b|) with NumPy and returns it as a Python
    float so it fits the declared FloatType.

    Returns None (SQL NULL) when either input is null or when either
    vector has zero norm, because the cosine similarity is undefined in
    those cases. Without these guards a null input raises inside the
    worker and a zero vector produces NaN/inf from a division by zero.
    """
    if a is None or b is None:
        return None

    norm_product = float(np.linalg.norm(a) * np.linalg.norm(b))
    if norm_product == 0.0:
        return None

    return float(np.dot(a, b) / norm_product)

def unpack_constants(dataframe):
"""
"""
Expand All @@ -76,6 +80,23 @@ def unpack_constants(dataframe):

return t0, tf, dt, f1, f2

def cosine_similarity(dataframe, column):
    """
    Compute the cosine similarity between the timeseries of every pair of
    rows in ``dataframe``.

    Parameters
    ----------
    dataframe : pyspark.sql.DataFrame
        Must contain ``column`` and a "timeseries" column.
        NOTE(review): "timeseries" is assumed to be an array of numbers of
        equal length in every row -- confirm against the loader.
    column : str
        Name of the label column carried along with each timeseries
        (e.g. "treatment").

    Returns
    -------
    pyspark.sql.DataFrame
        One row per ordered pair of input rows with columns
        ``column + "0"``, "timeseries0", ``column``, "timeseries" and
        "dot_product" (the cosine similarity), sorted by the two label
        columns.
    """
    base = dataframe.select(column, "timeseries")

    # Rename BOTH columns on the left side of the self cross join; renaming
    # only "timeseries" would leave two columns called `column` in the
    # result, which makes any later reference to it ambiguous.
    left = base.withColumnRenamed(column, column + "0") \
               .withColumnRenamed("timeseries", "timeseries0")
    pairs = left.crossJoin(base)

    # NumPy cannot operate on Spark Column expressions, so the per-row
    # computation is delegated to the cos_sim UDF defined in this module.
    df_cos_sim = pairs.withColumn(
        "dot_product",
        cos_sim(col("timeseries"), col("timeseries0"))
    )

    # DataFrames are immutable: orderBy returns a new DataFrame, so its
    # result has to be kept. Sorting after the join is what fixes the
    # order of the returned rows.
    df_cos_sim = df_cos_sim.orderBy(column + "0", column)

    # head(0) always returns an empty list; show a few rows instead.
    print(df_cos_sim.head(5))

    return df_cos_sim

if __name__ == "__main__":
spark = pyspark.sql.SparkSession \
.builder \
Expand All @@ -84,6 +105,8 @@ def unpack_constants(dataframe):

# Load data
data = extract(spark)
cosine_similarity(data, "treatment")
exit()
labels = ["spectra", "backscatter_brightness",
"n_count", "normalized_standard_deviation"]
for dataframe, label in zip(unpack_timeseriesdata(data), labels):
Expand Down

0 comments on commit 3f43970

Please sign in to comment.