diff --git a/pipe/extract.py b/pipe/extract.py index 40c650a..d905972 100644 --- a/pipe/extract.py +++ b/pipe/extract.py @@ -138,46 +138,52 @@ def read_matfiles(self, specpath: Path, for label in labels: matdata = sp.io.loadmat(specpath/"matfiles"/label) - header = matdata["header"] - constants_array = np.zeros((1,5), dtype="float16") - constants_array[0,0] = header["f1"][0][0][0] - constants_array[0,1] = header["f2"][0][0][0] - constants_array[0,2] = header["t0"][0][0][0] - constants_array[0,4] = header["dt"][0][0][0] - timeseries_array = np.zeros((nloops, nfreq+3), dtype="float16") - spec = matdata["SP"] - for _ in range(len(spec.shape)-2): - spec = spec[0] - spec[np.abs(spec) == np.inf] = pad_value - spec[np.isnan(spec)] = pad_value - try: - constants_array[0,3] = header["tf"][0][0][0] - except ValueError: - constants_array[0,3] = float(spec.shape[-2]) - mean += np.mean(spec) - maxv += max(np.max(spec), maxv) - minv += min(np.min(spec), minv) - time_offset = int(nloops-constants_array[0,3]) - timeseries_array[time_offset:, :130] = \ - (spec - spec_meanshift) / spec_scale - timeseries_array[:time_offset, :130] = \ - timeseries_array[time_offset, :130] - timeseries_array[time_offset:, 130] = \ - (np.log10(matdata["BB"][0]) - bb_meanshift) / bb_scale - timeseries_array[time_offset:, 131] = \ - (matdata["NSD"][0] - nsd_meanshift) - timeseries_array[time_offset:, 132] = \ - np.log10(matdata["NCNT"][0]) / ncnt_scale - row["timeseries"] = timeseries_array.tolist() - - if DataKind.TREATMENT in datakinds: - row["treatment"] = matdata["header"]["drug"][0][0][0].lower() - if DataKind.TARGET in datakinds: + ncnt = np.log10(matdata["NCNT"][0]) + if np.min(ncnt) < 2: + print(f"Skipping file {label} due to low counts.") + print(np.min(ncnt)) + continue + else: + header = matdata["header"] + constants_array = np.zeros((1,5), dtype="float16") + constants_array[0,0] = header["f1"][0][0][0] + constants_array[0,1] = header["f2"][0][0][0] + constants_array[0,2] = header["t0"][0][0][0] + constants_array[0,4] = header["dt"][0][0][0] + timeseries_array = np.zeros((nloops, nfreq+3), dtype="float16") + spec = matdata["SP"] + for _ in range(len(spec.shape)-2): + spec = spec[0] + spec[np.abs(spec) == np.inf] = pad_value + spec[np.isnan(spec)] = pad_value try: - row["target"] = matdata["header"]["cell"][0][0][0].lower() - except: - row["target"] = "unknown" - data.append(Row(**row)) + constants_array[0,3] = header["tf"][0][0][0] + except ValueError: + constants_array[0,3] = float(spec.shape[-2]) + mean += np.mean(spec) + maxv += max(np.max(spec), maxv) + minv += min(np.min(spec), minv) + time_offset = int(nloops-constants_array[0,3]) + timeseries_array[time_offset:, :130] = \ + (spec - spec_meanshift) / spec_scale + timeseries_array[:time_offset, :130] = \ + timeseries_array[time_offset, :130] + timeseries_array[time_offset:, 130] = \ + (np.log10(matdata["BB"][0]) - bb_meanshift) / bb_scale + timeseries_array[time_offset:, 131] = \ + (matdata["NSD"][0] - nsd_meanshift) + timeseries_array[time_offset:, 132] = \ + np.log10(matdata["NCNT"][0]) / ncnt_scale + row["timeseries"] = timeseries_array.tolist() + + if DataKind.TREATMENT in datakinds: + row["treatment"] = matdata["header"]["drug"][0][0][0].lower() + if DataKind.TARGET in datakinds: + try: + row["target"] = matdata["header"]["cell"][0][0][0].lower() + except: + row["target"] = "unknown" + data.append(Row(**row)) return self.spark.createDataFrame(data)