Skip to content

Commit

Permalink
data rejection added temporarily
Browse files (browse the repository at this point in the history)
  • Loading branch information
Dawith committed Oct 21, 2025
1 parent fbdceb2 commit 3fee715
Showing 1 changed file with 45 additions and 39 deletions.
84 changes: 45 additions & 39 deletions pipe/extract.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -138,46 +138,52 @@ def read_matfiles(self, specpath: Path,

for label in labels:
matdata = sp.io.loadmat(specpath/"matfiles"/label)
header = matdata["header"]
constants_array = np.zeros((1,5), dtype="float16")
constants_array[0,0] = header["f1"][0][0][0]
constants_array[0,1] = header["f2"][0][0][0]
constants_array[0,2] = header["t0"][0][0][0]
constants_array[0,4] = header["dt"][0][0][0]
timeseries_array = np.zeros((nloops, nfreq+3), dtype="float16")
spec = matdata["SP"]
for _ in range(len(spec.shape)-2):
spec = spec[0]
spec[np.abs(spec) == np.inf] = pad_value
spec[np.isnan(spec)] = pad_value
try:
constants_array[0,3] = header["tf"][0][0][0]
except ValueError:
constants_array[0,3] = float(spec.shape[-2])
mean += np.mean(spec)
maxv += max(np.max(spec), maxv)
minv += min(np.min(spec), minv)
time_offset = int(nloops-constants_array[0,3])
timeseries_array[time_offset:, :130] = \
(spec - spec_meanshift) / spec_scale
timeseries_array[:time_offset, :130] = \
timeseries_array[time_offset, :130]
timeseries_array[time_offset:, 130] = \
(np.log10(matdata["BB"][0]) - bb_meanshift) / bb_scale
timeseries_array[time_offset:, 131] = \
(matdata["NSD"][0] - nsd_meanshift)
timeseries_array[time_offset:, 132] = \
np.log10(matdata["NCNT"][0]) / ncnt_scale
row["timeseries"] = timeseries_array.tolist()

if DataKind.TREATMENT in datakinds:
row["treatment"] = matdata["header"]["drug"][0][0][0].lower()
if DataKind.TARGET in datakinds:
ncnt = np.log10(matdata["NCNT"][0])
if np.min(ncnt) < 2:
print(f"Skipping file {label} due to low counts.")
print(np.min(ncnt))
continue
else:
header = matdata["header"]
constants_array = np.zeros((1,5), dtype="float16")
constants_array[0,0] = header["f1"][0][0][0]
constants_array[0,1] = header["f2"][0][0][0]
constants_array[0,2] = header["t0"][0][0][0]
constants_array[0,4] = header["dt"][0][0][0]
timeseries_array = np.zeros((nloops, nfreq+3), dtype="float16")
spec = matdata["SP"]
for _ in range(len(spec.shape)-2):
spec = spec[0]
spec[np.abs(spec) == np.inf] = pad_value
spec[np.isnan(spec)] = pad_value
try:
row["target"] = matdata["header"]["cell"][0][0][0].lower()
except:
row["target"] = "unknown"
data.append(Row(**row))
constants_array[0,3] = header["tf"][0][0][0]
except ValueError:
constants_array[0,3] = float(spec.shape[-2])
mean += np.mean(spec)
maxv += max(np.max(spec), maxv)
minv += min(np.min(spec), minv)
time_offset = int(nloops-constants_array[0,3])
timeseries_array[time_offset:, :130] = \
(spec - spec_meanshift) / spec_scale
timeseries_array[:time_offset, :130] = \
timeseries_array[time_offset, :130]
timeseries_array[time_offset:, 130] = \
(np.log10(matdata["BB"][0]) - bb_meanshift) / bb_scale
timeseries_array[time_offset:, 131] = \
(matdata["NSD"][0] - nsd_meanshift)
timeseries_array[time_offset:, 132] = \
np.log10(matdata["NCNT"][0]) / ncnt_scale
row["timeseries"] = timeseries_array.tolist()

if DataKind.TREATMENT in datakinds:
row["treatment"] = matdata["header"]["drug"][0][0][0].lower()
if DataKind.TARGET in datakinds:
try:
row["target"] = matdata["header"]["cell"][0][0][0].lower()
except:
row["target"] = "unknown"
data.append(Row(**row))

return self.spark.createDataFrame(data)

Expand Down

0 comments on commit 3fee715

Please sign in to comment.