Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
b0ba2c9
Make data upload work for all registered equipment
ngholiza Jun 15, 2026
1e2cdcf
Address Codex review: hint-row skip, identity, outputs, metrics
ngholiza Jun 15, 2026
4b09124
Address Codex review round 2: templates, project match, FAIR metadata
ngholiza Jun 15, 2026
9f07290
Address Codex review round 3: non-finite cells, fleet-wide metrics
ngholiza Jun 15, 2026
91b5092
Address Codex review round 4: export scope, timestamp validation
ngholiza Jun 15, 2026
5f3a6dc
Address Codex review round 5: required schema, typed cells, FAIR config
ngholiza Jun 15, 2026
046cb7b
Address Codex review round 6: stable ids, bool render, ontology, config
ngholiza Jun 15, 2026
970451e
Address Codex review round 7: FAIR fallback + hint-row detection
ngholiza Jun 15, 2026
7b25522
Address Codex review round 8: consistent required-column derivation
ngholiza Jun 15, 2026
45e47a7
Address Codex review round 9: optional-output upload robustness
ngholiza Jun 15, 2026
d0bd9af
Address Codex review round 10: etcher schema + process response contract
ngholiza Jun 15, 2026
f6a9547
Address Codex review round 11: reject fractional integer-column values
ngholiza Jun 15, 2026
1a14bbd
Address Codex review round 12: declared template types + etcher identity
ngholiza Jun 15, 2026
26d8b89
Address Codex review round 13: canonical etcher identity + template/o…
ngholiza Jun 15, 2026
cbe3b2e
Address Codex review round 14: include generic runs in dashboard stats
ngholiza Jun 15, 2026
82475ab
Address Codex review round 15: include parameters in FAIR ontology map
ngholiza Jun 15, 2026
4a6975b
Address Codex review round 16: preserve registered hardware in FAIR m…
ngholiza Jun 15, 2026
f33dc08
Address Codex review round 17: chronological merge + malformed-row re…
ngholiza Jun 15, 2026
4b5c92f
Address Codex review round 18: parse declared integers without float …
ngholiza Jun 15, 2026
d0a3477
Address Codex review round 19: exact int decimals + normalize file co…
ngholiza Jun 15, 2026
6e5e9be
Address Codex review round 20: honor separately-stored column definit…
ngholiza Jun 15, 2026
b085b6a
Address Codex review round 21: honor limit=0 in merged pagination
ngholiza Jun 15, 2026
5c93f24
Address Codex review round 22: preserve uploaded run quality flags
ngholiza Jun 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
366 changes: 348 additions & 18 deletions api/data_loader_pg.py

Large diffs are not rendered by default.

147 changes: 124 additions & 23 deletions api/fair_archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,9 @@ def _load_domain_config(equipment_id: str) -> dict[str, Any] | None:
"""
try:
from domain_configs import load_domain_config
return load_domain_config(equipment_id)
config = load_domain_config(equipment_id)
if config:
return _normalize_pg_domain_config(config)
except Exception:
pass

Expand All @@ -168,12 +170,58 @@ def _load_domain_config(equipment_id: str) -> dict[str, Any] | None:
config_path = configs_dir / f"{equipment_id}.json"
if config_path.exists():
try:
return json.loads(config_path.read_text())
return _normalize_pg_domain_config(json.loads(config_path.read_text()))
except Exception as exc:
logger.debug("Could not load domain config %s: %s", equipment_id, exc)

# Fallback: dynamically registered equipment have no JSON file on disk;
# their config lives in PostgreSQL. Pull config_json so FAIR metadata still
# carries registered units, ontology, hardware, and owner info.
if equipment_id and equipment_id != "unknown":
try:
from metadata_pg import get_equipment_pg

equipment = get_equipment_pg(equipment_id)
if equipment:
config = equipment.get("config_json")
if isinstance(config, dict) and config:
return _normalize_pg_domain_config(config)
except Exception as exc:
logger.debug("Could not load PG config for %s: %s", equipment_id, exc)
return None


def _normalize_pg_domain_config(config: dict[str, Any]) -> dict[str, Any]:
"""Normalize a registered equipment config (whether loaded from its JSON
snapshot or from PostgreSQL) so the FAIR metadata generator (which reads
hardware/identity fields from ``config["domain"]``) sees the fields
``build_domain_config()`` stores at the config's top level.
Returns a shallow copy with a ``domain`` dict that backfills (without
overwriting) the registered hardware/identity fields. A config that already
carries everything under ``domain`` (e.g. the canonical etcher) is unchanged.
"""
if not isinstance(config, dict):
return config
normalized = dict(config)
domain = dict(normalized.get("domain") or {})
for field in (
"manufacturer",
"model",
"serial_number",
"location",
"equipment_type",
"sosa_type",
"ontology_uri",
"id",
"name",
):
if not domain.get(field) and config.get(field):
domain[field] = config[field]
normalized["domain"] = domain
return normalized


def _get_feature_ontology_map(domain_config: dict[str, Any] | None) -> dict[str, dict[str, str]]:
"""
Build a mapping from feature/target IDs to their QUDT ontology URIs
Expand Down Expand Up @@ -212,6 +260,36 @@ def _get_feature_ontology_map(domain_config: dict[str, Any] | None) -> dict[str,
entry["qudt_unit"] = target["qudt_unit"]
mapping[tid] = entry

# Registered input parameters (set-points captured at registration; always
# input). Generic equipment often declare their inputs here rather than under
# ``features``, so without this their FAIR metadata would lose units/provenance.
for parameter in domain_config.get("parameters", []):
if not isinstance(parameter, dict):
continue
pid = parameter.get("name") or parameter.get("id")
if not pid:
continue
entry = {"unit": parameter.get("unit", ""), "prov_direction": "input"}
if parameter.get("qudt_quantity_kind"):
entry["qudt_quantity_kind"] = parameter["qudt_quantity_kind"]
if parameter.get("qudt_unit"):
entry["qudt_unit"] = parameter["qudt_unit"]
mapping.setdefault(str(pid), entry)

# Registered outputs (measured results captured at registration; always output)
for output in domain_config.get("outputs", []):
if not isinstance(output, dict):
continue
oid = output.get("name") or output.get("id")
if not oid:
continue
entry = {"unit": output.get("unit", ""), "prov_direction": "output"}
if output.get("qudt_quantity_kind"):
entry["qudt_quantity_kind"] = output["qudt_quantity_kind"]
if output.get("qudt_unit"):
entry["qudt_unit"] = output["qudt_unit"]
mapping.setdefault(str(oid), entry)

return mapping


Expand Down Expand Up @@ -764,6 +842,9 @@ def run_to_jsonld(run: dict[str, Any]) -> dict[str, Any]:
project_name = run.get("project_name", "Unknown Project")
equipment_name = run.get("equipment_name", "Unknown Equipment")
equipment_id = run.get("equipment_id", "unknown")
# Label the dataset by its actual equipment so non-etcher runs aren't
# misrepresented as etcher runs in FAIR metadata.
run_label = "Etcher" if equipment_id == "etcher" else (equipment_name or "Equipment")

# Load domain config for ontology URIs
domain_config = _load_domain_config(equipment_id)
Expand Down Expand Up @@ -796,26 +877,46 @@ def run_to_jsonld(run: dict[str, Any]) -> dict[str, Any]:
prop["prov:qualifiedGeneration"] = "prov:generated"
variables_measured.append(prop)

# Build ontology-enriched target list
# Build ontology-enriched result list. The canonical etcher uses typed
# columns; every other equipment reports measurements under ``outputs``
# (which may legitimately be empty for an inputs-only upload).
generic_outputs = run.get("outputs") or {}
results = []
for target_id, target_label, target_key in [
("AvgEtchRate", "Etch Rate", "avg_etch_rate"),
("RangeEtchRate", "Thickness Range", "range_etch_rate"),
]:
onto = ontology_map.get(target_id, {})
result_prop: dict[str, Any] = {
"@type": "PropertyValue",
"name": target_id,
"value": run.get(target_key),
}
if onto.get("unit"):
result_prop["unitText"] = onto["unit"]
if onto.get("qudt_quantity_kind"):
result_prop["qudt:hasQuantityKind"] = {"@id": onto["qudt_quantity_kind"]}
if onto.get("qudt_unit"):
result_prop["qudt:hasUnit"] = {"@id": onto["qudt_unit"]}
result_prop["prov:qualifiedGeneration"] = "prov:generated"
results.append(result_prop)
if equipment_id != "etcher":
for key, value in (generic_outputs if isinstance(generic_outputs, dict) else {}).items():
onto = ontology_map.get(key, {})
result_prop = {
"@type": "PropertyValue",
"name": key,
"value": value,
}
if onto.get("unit"):
result_prop["unitText"] = onto["unit"]
if onto.get("qudt_quantity_kind"):
result_prop["qudt:hasQuantityKind"] = {"@id": onto["qudt_quantity_kind"]}
if onto.get("qudt_unit"):
result_prop["qudt:hasUnit"] = {"@id": onto["qudt_unit"]}
result_prop["prov:qualifiedGeneration"] = "prov:generated"
results.append(result_prop)
else:
for target_id, target_label, target_key in [
("AvgEtchRate", "Etch Rate", "avg_etch_rate"),
("RangeEtchRate", "Thickness Range", "range_etch_rate"),
]:
onto = ontology_map.get(target_id, {})
result_prop: dict[str, Any] = {
"@type": "PropertyValue",
"name": target_id,
"value": run.get(target_key),
}
if onto.get("unit"):
result_prop["unitText"] = onto["unit"]
if onto.get("qudt_quantity_kind"):
result_prop["qudt:hasQuantityKind"] = {"@id": onto["qudt_quantity_kind"]}
if onto.get("qudt_unit"):
result_prop["qudt:hasUnit"] = {"@id": onto["qudt_unit"]}
result_prop["prov:qualifiedGeneration"] = "prov:generated"
results.append(result_prop)

# Build equipment entity with SAREF metadata
equipment_entity: dict[str, Any] = {
Expand Down Expand Up @@ -864,11 +965,11 @@ def run_to_jsonld(run: dict[str, Any]) -> dict[str, Any]:
},
],
"@type": "Dataset",
"name": f"Etcher Run {run.get('run_id', 'unknown')}",
"name": f"{run_label} Run {run.get('run_id', 'unknown')}",
"identifier": str(run.get("run_id", "")),
"dateCreated": run.get("run_date") or run.get("created_at") or "",
"description": (
f"Etcher run {run.get('run_id')} from lot {run.get('lot_name', 'N/A')}. "
f"{run_label} run {run.get('run_id')} from lot {run.get('lot_name', 'N/A')}. "
f"Equipment: {equipment_name}. Project: {project_name}."
),
"license": {"@id": DEFAULT_LICENSE},
Expand Down
2 changes: 2 additions & 0 deletions api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ def _run_startup_migrations() -> None:
"""
from data_loader_pg import get_pg_superuser_connection
from data_loader_pg import ensure_etcher_run_file_refs_pg
from data_loader_pg import ensure_equipment_runs_pg
from metadata_pg import (
ensure_experiment_proposal_generation_columns_pg,
ensure_experiment_type_reuse_columns_pg,
Expand All @@ -165,6 +166,7 @@ def _run_startup_migrations() -> None:
lock_conn.commit()

ensure_etcher_run_file_refs_pg()
ensure_equipment_runs_pg()
ensure_experiment_proposal_generation_columns_pg()
ensure_experiment_type_reuse_columns_pg()
ensure_experiment_type_versioning_columns_pg()
Expand Down
67 changes: 51 additions & 16 deletions api/metadata_pg.py
Original file line number Diff line number Diff line change
Expand Up @@ -1078,22 +1078,57 @@ def _serialize_equipment(
}


def _fetch_equipment_run_metrics(conn) -> dict[str, dict[str, Any]]:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute(
"""
SELECT
p.equipment_id,
COUNT(r.idruns)::int AS total_runs,
MAX(COALESCE(r.run_date, r.created_at)) AS last_data_at
FROM projects p
JOIN etcher_runs r ON r.project_id = p.id
WHERE p.equipment_id IS NOT NULL
AND p.equipment_id <> ''
GROUP BY p.equipment_id
"""
)
rows = cur.fetchall()
def _fetch_equipment_run_metrics(_conn=None) -> dict[str, dict[str, Any]]:
"""Aggregate fleet-wide run counts and latest-data timestamps per equipment.
Combines the canonical etcher_runs (attributed via the project's equipment)
with the generic equipment_runs table (attributed via the run's own
equipment_id) so manually uploaded data for any registered equipment is
reflected in the inventory metrics.
Runs on the superuser connection so these non-sensitive aggregate counts are
complete (fleet-wide) regardless of the caller's project membership. The
RLS-scoped connection passed by callers would otherwise hide runs in
private/shared projects, even from members and admins, and undercount.
"""
conn = get_pg_superuser_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute(
"""
SELECT
equipment_id,
SUM(total_runs)::int AS total_runs,
MAX(last_data_at) AS last_data_at
FROM (
-- etcher_runs is canonical etcher data: attribute every row
-- to the constant 'etcher' id regardless of the project's
-- equipment association (which may be empty or another tool),
-- so canonical runs are never dropped or miscredited.
SELECT
'etcher'::text AS equipment_id,
COUNT(r.idruns) AS total_runs,
MAX(COALESCE(r.run_date, r.created_at)) AS last_data_at
FROM etcher_runs r
UNION ALL
SELECT
er.equipment_id AS equipment_id,
COUNT(er.id) AS total_runs,
MAX(COALESCE(er.run_date, er.created_at)) AS last_data_at
FROM equipment_runs er
WHERE er.equipment_id IS NOT NULL
AND er.equipment_id <> ''
GROUP BY er.equipment_id
) combined
GROUP BY equipment_id
"""
)
rows = cur.fetchall()
conn.commit()
finally:
conn.close()
return {row["equipment_id"]: dict(row) for row in rows}


Expand Down
Loading