Deploy Taipy app via scripts/manage_space.py
Browse files- .env +2 -0
- dist/luxury_lakehouse-0.3.0-py3-none-any.whl +2 -2
- requirements.txt +2 -2
- src/pages/workflows.py +9 -0
- src/queries/workflows.py +58 -8
- src/state/workflows.py +16 -4
- src/state/workflows_stats.py +69 -4
- src/template.py +10 -0
.env
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
LAKEBASE_HOST=ep-spring-rain-d2i6lozx.database.us-east-1.cloud.databricks.com
|
| 2 |
+
LAKEBASE_ENDPOINT_NAME=projects/soccer-analytics-dev/branches/production/endpoints/primary
|
dist/luxury_lakehouse-0.3.0-py3-none-any.whl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:59695805377583cb29800da3688cff9b709baa8dae6a135f6e847b22f67aecf2
|
| 3 |
+
size 381689
|
requirements.txt
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# This file was autogenerated by uv via the following command:
|
| 2 |
-
# uv pip compile pyproject.toml --extra taipy-app --python-version 3.10 --python-platform linux -o D:\Development\karstenskyt__luxury-lakehouse
|
| 3 |
aniso8601==10.0.1
|
| 4 |
# via flask-restful
|
| 5 |
annotated-doc==0.0.4
|
|
@@ -266,7 +266,7 @@ requests==2.33.1
|
|
| 266 |
# requests-cache
|
| 267 |
requests-cache==1.3.1
|
| 268 |
# via luxury-lakehouse (pyproject.toml)
|
| 269 |
-
rich==14.3.
|
| 270 |
# via
|
| 271 |
# cookiecutter
|
| 272 |
# typer
|
|
|
|
| 1 |
# This file was autogenerated by uv via the following command:
|
| 2 |
+
# uv pip compile pyproject.toml --extra taipy-app --python-version 3.10 --python-platform linux -o D:\Development\karstenskyt__luxury-lakehouse\hf_taipy_app\requirements.txt
|
| 3 |
aniso8601==10.0.1
|
| 4 |
# via flask-restful
|
| 5 |
annotated-doc==0.0.4
|
|
|
|
| 266 |
# requests-cache
|
| 267 |
requests-cache==1.3.1
|
| 268 |
# via luxury-lakehouse (pyproject.toml)
|
| 269 |
+
rich==14.3.4
|
| 270 |
# via
|
| 271 |
# cookiecutter
|
| 272 |
# typer
|
src/pages/workflows.py
CHANGED
|
@@ -57,6 +57,15 @@ page_config = PageConfig(
|
|
| 57 |
"wf_run_volume_detail",
|
| 58 |
help_text="Total pipeline runs in the last 30 days. Detail shows daily rate and average cost per run.",
|
| 59 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
],
|
| 61 |
content=[
|
| 62 |
ContentRow(
|
|
|
|
| 57 |
"wf_run_volume_detail",
|
| 58 |
help_text="Total pipeline runs in the last 30 days. Detail shows daily rate and average cost per run.",
|
| 59 |
),
|
| 60 |
+
StatCard(
|
| 61 |
+
"Avg Cold Start",
|
| 62 |
+
"wf_avg_cold_start",
|
| 63 |
+
"wf_cold_start_detail",
|
| 64 |
+
help_text=(
|
| 65 |
+
"Average serverless environment setup time (spin-up, wheel install, imports) "
|
| 66 |
+
"before pipeline work begins. Lower is better. Range shows fastest to slowest workflow."
|
| 67 |
+
),
|
| 68 |
+
),
|
| 69 |
],
|
| 70 |
content=[
|
| 71 |
ContentRow(
|
src/queries/workflows.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
"""Workflow-related queries — extracted from state/workflows.py.
|
| 2 |
|
| 3 |
-
|
| 4 |
-
(
|
| 5 |
-
not database queries.
|
| 6 |
"""
|
| 7 |
|
| 8 |
from __future__ import annotations
|
|
@@ -17,20 +17,39 @@ from queries.common import execute_query, t, ttl_cache
|
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
| 19 |
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
def fetch_cold_costs() -> pd.DataFrame:
|
| 22 |
"""30-day aggregated costs from fct_workflow_costs_synced (cold tier).
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
|
|
|
| 26 |
"""
|
| 27 |
-
_empty = pd.DataFrame(columns=pd.Index(
|
| 28 |
try:
|
| 29 |
tbl = t("fct_workflow_costs_synced")
|
| 30 |
return execute_query(
|
| 31 |
f"SELECT COALESCE(workflow_id, task_key) AS workflow_id, " # noqa: S608
|
| 32 |
f" task_key, "
|
| 33 |
-
f" SUM(
|
| 34 |
f" SUM(attributed_dbu) AS total_dbu, "
|
| 35 |
f" COUNT(DISTINCT job_run_id) AS run_count "
|
| 36 |
f"FROM {tbl} "
|
|
@@ -44,6 +63,37 @@ def fetch_cold_costs() -> pd.DataFrame:
|
|
| 44 |
return _empty
|
| 45 |
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
@ttl_cache(ttl=120)
|
| 48 |
def fetch_warm_costs() -> pd.DataFrame:
|
| 49 |
"""Recent cost estimates from workflow_cost_live_synced (warm tier).
|
|
|
|
| 1 |
"""Workflow-related queries — extracted from state/workflows.py.
|
| 2 |
|
| 3 |
+
Cold costs (30-day aggregate), warm costs (live), and latest-run metrics
|
| 4 |
+
(most recent run per workflow). Non-SQL data fetching (Jobs API, HF Hub)
|
| 5 |
+
remains in state/workflows.py as those are API calls, not database queries.
|
| 6 |
"""
|
| 7 |
|
| 8 |
from __future__ import annotations
|
|
|
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
| 19 |
|
| 20 |
+
_COLD_COST_COLS = [
|
| 21 |
+
"workflow_id",
|
| 22 |
+
"task_key",
|
| 23 |
+
"total_cost_usd",
|
| 24 |
+
"total_dbu",
|
| 25 |
+
"run_count",
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
_LATEST_RUN_COLS = [
|
| 29 |
+
"workflow_id",
|
| 30 |
+
"cold_start_seconds",
|
| 31 |
+
"duration_seconds",
|
| 32 |
+
"entity_count",
|
| 33 |
+
"row_count",
|
| 34 |
+
"pipeline_state",
|
| 35 |
+
]
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@ttl_cache(ttl=600)
|
| 39 |
def fetch_cold_costs() -> pd.DataFrame:
|
| 40 |
"""30-day aggregated costs from fct_workflow_costs_synced (cold tier).
|
| 41 |
|
| 42 |
+
Cost aggregates only — timing and entity data comes from
|
| 43 |
+
fetch_latest_run_metrics() to avoid diluting recent values with
|
| 44 |
+
historical zeros.
|
| 45 |
"""
|
| 46 |
+
_empty = pd.DataFrame(columns=pd.Index(_COLD_COST_COLS))
|
| 47 |
try:
|
| 48 |
tbl = t("fct_workflow_costs_synced")
|
| 49 |
return execute_query(
|
| 50 |
f"SELECT COALESCE(workflow_id, task_key) AS workflow_id, " # noqa: S608
|
| 51 |
f" task_key, "
|
| 52 |
+
f" SUM(effective_cost_usd) AS total_cost_usd, "
|
| 53 |
f" SUM(attributed_dbu) AS total_dbu, "
|
| 54 |
f" COUNT(DISTINCT job_run_id) AS run_count "
|
| 55 |
f"FROM {tbl} "
|
|
|
|
| 63 |
return _empty
|
| 64 |
|
| 65 |
|
| 66 |
+
@ttl_cache(ttl=600)
|
| 67 |
+
def fetch_latest_run_metrics() -> pd.DataFrame:
|
| 68 |
+
"""Most recent run per workflow from fct_workflow_costs_synced.
|
| 69 |
+
|
| 70 |
+
Returns one row per workflow with cold_start_seconds, duration_seconds,
|
| 71 |
+
entity_count, row_count, and pipeline_state from the latest run.
|
| 72 |
+
"""
|
| 73 |
+
_empty = pd.DataFrame(columns=pd.Index(_LATEST_RUN_COLS))
|
| 74 |
+
try:
|
| 75 |
+
tbl = t("fct_workflow_costs_synced")
|
| 76 |
+
return execute_query(
|
| 77 |
+
f"SELECT workflow_id, cold_start_seconds, duration_seconds, " # noqa: S608
|
| 78 |
+
f" entity_count, row_count, pipeline_state "
|
| 79 |
+
f"FROM ( "
|
| 80 |
+
f" SELECT COALESCE(workflow_id, task_key) AS workflow_id, "
|
| 81 |
+
f" cold_start_seconds, duration_seconds, entity_count, "
|
| 82 |
+
f" row_count, pipeline_state, "
|
| 83 |
+
f" ROW_NUMBER() OVER ("
|
| 84 |
+
f" PARTITION BY COALESCE(workflow_id, task_key) "
|
| 85 |
+
f" ORDER BY usage_date DESC, job_run_id DESC"
|
| 86 |
+
f" ) AS rn "
|
| 87 |
+
f" FROM {tbl} "
|
| 88 |
+
f" WHERE pipeline_state IS NOT NULL "
|
| 89 |
+
f") sub "
|
| 90 |
+
f"WHERE rn = 1",
|
| 91 |
+
)
|
| 92 |
+
except Exception:
|
| 93 |
+
logger.warning("Latest run metrics query failed", exc_info=True)
|
| 94 |
+
return _empty
|
| 95 |
+
|
| 96 |
+
|
| 97 |
@ttl_cache(ttl=120)
|
| 98 |
def fetch_warm_costs() -> pd.DataFrame:
|
| 99 |
"""Recent cost estimates from workflow_cost_live_synced (warm tier).
|
src/state/workflows.py
CHANGED
|
@@ -18,7 +18,7 @@ from typing import Any
|
|
| 18 |
|
| 19 |
import pandas as pd
|
| 20 |
from cache import ttl_cache
|
| 21 |
-
from queries.workflows import fetch_cold_costs, fetch_warm_costs
|
| 22 |
|
| 23 |
from state.shared import register_page_refresher, register_page_teardown
|
| 24 |
from state.workflows_dag import (
|
|
@@ -74,6 +74,8 @@ wf_total_cost_30d: str = "$0.00"
|
|
| 74 |
wf_cost_detail: RawHtml = RawHtml("")
|
| 75 |
wf_run_volume: str = "0"
|
| 76 |
wf_run_volume_detail: str = ""
|
|
|
|
|
|
|
| 77 |
|
| 78 |
wf_table_data: pd.DataFrame = pd.DataFrame(columns=pd.Index(WF_TABLE_COLS))
|
| 79 |
|
|
@@ -133,6 +135,8 @@ __all__ = [
|
|
| 133 |
"wf_cost_detail",
|
| 134 |
"wf_run_volume",
|
| 135 |
"wf_run_volume_detail",
|
|
|
|
|
|
|
| 136 |
"wf_table_data",
|
| 137 |
"wf_type_filter",
|
| 138 |
"wf_type_lov",
|
|
@@ -441,6 +445,7 @@ def _refresh_table(state: Any) -> None:
|
|
| 441 |
global _wf_card_ids
|
| 442 |
|
| 443 |
cold = fetch_cold_costs()
|
|
|
|
| 444 |
jobs = _fetch_job_runs()
|
| 445 |
hf_costs = _fetch_hf_cost_history()
|
| 446 |
|
|
@@ -491,6 +496,7 @@ def _refresh_table(state: Any) -> None:
|
|
| 491 |
state.wf_runtime_filter,
|
| 492 |
state.wf_freshness_filter,
|
| 493 |
hf_costs=hf_costs,
|
|
|
|
| 494 |
)
|
| 495 |
state.wf_table_data = table_df
|
| 496 |
_wf_card_ids = card_ids
|
|
@@ -505,6 +511,7 @@ def _refresh_table(state: Any) -> None:
|
|
| 505 |
jobs,
|
| 506 |
visible_card_ids=matched_ids if not all_filters_default else None,
|
| 507 |
hf_costs=hf_costs,
|
|
|
|
| 508 |
)
|
| 509 |
|
| 510 |
|
|
@@ -550,6 +557,7 @@ def _wf_auto_refresh_tick(state: Any) -> None:
|
|
| 550 |
|
| 551 |
logger.debug("Auto-refresh tick")
|
| 552 |
cold = fetch_cold_costs()
|
|
|
|
| 553 |
warm = fetch_warm_costs()
|
| 554 |
jobs = _fetch_job_runs()
|
| 555 |
hf_costs = _fetch_hf_cost_history()
|
|
@@ -562,10 +570,11 @@ def _wf_auto_refresh_tick(state: Any) -> None:
|
|
| 562 |
state.wf_runtime_filter,
|
| 563 |
state.wf_freshness_filter,
|
| 564 |
hf_costs=hf_costs,
|
|
|
|
| 565 |
)
|
| 566 |
state.wf_table_data = table_df
|
| 567 |
_wf_card_ids = card_ids
|
| 568 |
-
compute_stats(state, _cards, cold, warm, jobs, hf_costs=hf_costs)
|
| 569 |
|
| 570 |
|
| 571 |
# ---------------------------------------------------------------------------
|
|
@@ -608,6 +617,7 @@ def wf_refresh(state: Any) -> None:
|
|
| 608 |
|
| 609 |
# Query costs + job runs (job_runs already re-keyed to workflow_id)
|
| 610 |
cold = fetch_cold_costs()
|
|
|
|
| 611 |
warm = fetch_warm_costs()
|
| 612 |
jobs = _fetch_job_runs()
|
| 613 |
|
|
@@ -629,12 +639,14 @@ def wf_refresh(state: Any) -> None:
|
|
| 629 |
hf_costs = _fetch_hf_cost_history()
|
| 630 |
|
| 631 |
# Build table
|
| 632 |
-
table_df, card_ids = build_table_data(
|
|
|
|
|
|
|
| 633 |
state.wf_table_data = table_df
|
| 634 |
_wf_card_ids = card_ids
|
| 635 |
|
| 636 |
# Stats (uses jobs for freshness, cold for cost, hf_costs for HF data)
|
| 637 |
-
compute_stats(state, _cards, cold, warm, jobs, hf_costs=hf_costs)
|
| 638 |
|
| 639 |
# Clear detail state (dashboard mode)
|
| 640 |
state.wf_selected_workflow = None
|
|
|
|
| 18 |
|
| 19 |
import pandas as pd
|
| 20 |
from cache import ttl_cache
|
| 21 |
+
from queries.workflows import fetch_cold_costs, fetch_latest_run_metrics, fetch_warm_costs
|
| 22 |
|
| 23 |
from state.shared import register_page_refresher, register_page_teardown
|
| 24 |
from state.workflows_dag import (
|
|
|
|
| 74 |
wf_cost_detail: RawHtml = RawHtml("")
|
| 75 |
wf_run_volume: str = "0"
|
| 76 |
wf_run_volume_detail: str = ""
|
| 77 |
+
wf_avg_cold_start: str = "\u2014"
|
| 78 |
+
wf_cold_start_detail: str = ""
|
| 79 |
|
| 80 |
wf_table_data: pd.DataFrame = pd.DataFrame(columns=pd.Index(WF_TABLE_COLS))
|
| 81 |
|
|
|
|
| 135 |
"wf_cost_detail",
|
| 136 |
"wf_run_volume",
|
| 137 |
"wf_run_volume_detail",
|
| 138 |
+
"wf_avg_cold_start",
|
| 139 |
+
"wf_cold_start_detail",
|
| 140 |
"wf_table_data",
|
| 141 |
"wf_type_filter",
|
| 142 |
"wf_type_lov",
|
|
|
|
| 445 |
global _wf_card_ids
|
| 446 |
|
| 447 |
cold = fetch_cold_costs()
|
| 448 |
+
latest = fetch_latest_run_metrics()
|
| 449 |
jobs = _fetch_job_runs()
|
| 450 |
hf_costs = _fetch_hf_cost_history()
|
| 451 |
|
|
|
|
| 496 |
state.wf_runtime_filter,
|
| 497 |
state.wf_freshness_filter,
|
| 498 |
hf_costs=hf_costs,
|
| 499 |
+
latest_run_metrics=latest,
|
| 500 |
)
|
| 501 |
state.wf_table_data = table_df
|
| 502 |
_wf_card_ids = card_ids
|
|
|
|
| 511 |
jobs,
|
| 512 |
visible_card_ids=matched_ids if not all_filters_default else None,
|
| 513 |
hf_costs=hf_costs,
|
| 514 |
+
latest_run_metrics=latest,
|
| 515 |
)
|
| 516 |
|
| 517 |
|
|
|
|
| 557 |
|
| 558 |
logger.debug("Auto-refresh tick")
|
| 559 |
cold = fetch_cold_costs()
|
| 560 |
+
latest = fetch_latest_run_metrics()
|
| 561 |
warm = fetch_warm_costs()
|
| 562 |
jobs = _fetch_job_runs()
|
| 563 |
hf_costs = _fetch_hf_cost_history()
|
|
|
|
| 570 |
state.wf_runtime_filter,
|
| 571 |
state.wf_freshness_filter,
|
| 572 |
hf_costs=hf_costs,
|
| 573 |
+
latest_run_metrics=latest,
|
| 574 |
)
|
| 575 |
state.wf_table_data = table_df
|
| 576 |
_wf_card_ids = card_ids
|
| 577 |
+
compute_stats(state, _cards, cold, warm, jobs, hf_costs=hf_costs, latest_run_metrics=latest)
|
| 578 |
|
| 579 |
|
| 580 |
# ---------------------------------------------------------------------------
|
|
|
|
| 617 |
|
| 618 |
# Query costs + job runs (job_runs already re-keyed to workflow_id)
|
| 619 |
cold = fetch_cold_costs()
|
| 620 |
+
latest = fetch_latest_run_metrics()
|
| 621 |
warm = fetch_warm_costs()
|
| 622 |
jobs = _fetch_job_runs()
|
| 623 |
|
|
|
|
| 639 |
hf_costs = _fetch_hf_cost_history()
|
| 640 |
|
| 641 |
# Build table
|
| 642 |
+
table_df, card_ids = build_table_data(
|
| 643 |
+
_cards, cold, jobs, "All", "All", "All", hf_costs=hf_costs, latest_run_metrics=latest
|
| 644 |
+
)
|
| 645 |
state.wf_table_data = table_df
|
| 646 |
_wf_card_ids = card_ids
|
| 647 |
|
| 648 |
# Stats (uses jobs for freshness, cold for cost, hf_costs for HF data)
|
| 649 |
+
compute_stats(state, _cards, cold, warm, jobs, hf_costs=hf_costs, latest_run_metrics=latest)
|
| 650 |
|
| 651 |
# Clear detail state (dashboard mode)
|
| 652 |
state.wf_selected_workflow = None
|
src/state/workflows_stats.py
CHANGED
|
@@ -235,6 +235,8 @@ WF_TABLE_COLS = [
|
|
| 235 |
"Status",
|
| 236 |
"Last Run",
|
| 237 |
"Last Duration",
|
|
|
|
|
|
|
| 238 |
"Cost (30d)",
|
| 239 |
"Avg/Run",
|
| 240 |
"Freshness",
|
|
@@ -249,6 +251,7 @@ def build_table_data(
|
|
| 249 |
runtime_filter: str | None = "All",
|
| 250 |
freshness_filter: str | None = "All",
|
| 251 |
hf_costs: dict[str, HFCostData] | None = None,
|
|
|
|
| 252 |
) -> tuple[pd.DataFrame, list[str]]:
|
| 253 |
"""Build dashboard table DataFrame from cards + cost data.
|
| 254 |
|
|
@@ -256,21 +259,32 @@ def build_table_data(
|
|
| 256 |
- cold_costs: DB cold-tier costs (workflow_id column)
|
| 257 |
- job_runs: Databricks Jobs API (re-keyed to workflow_id)
|
| 258 |
- hf_costs: HF Hub cost history (keyed by workflow_id)
|
|
|
|
| 259 |
|
| 260 |
Returns (DataFrame, card_ids) where card_ids is parallel to rows
|
| 261 |
for mapping row index to card ID.
|
| 262 |
"""
|
| 263 |
card_ids: list[str] = []
|
| 264 |
hf = hf_costs or {}
|
|
|
|
| 265 |
|
| 266 |
# Build cost lookups keyed by workflow_id
|
| 267 |
cold_cost_lookup: dict[str, float] = {}
|
| 268 |
cold_run_count_lookup: dict[str, int] = {}
|
| 269 |
if not cold_costs.empty and "workflow_id" in cold_costs.columns:
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
)
|
| 273 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
rows = []
|
| 276 |
for card_id, card in cards.items():
|
|
@@ -358,6 +372,16 @@ def build_table_data(
|
|
| 358 |
# --- Status ---
|
| 359 |
status_str = _resolve_status(hf_data, job_run, jobs_last_run_ts, hf_last_run_ts)
|
| 360 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
rows.append(
|
| 362 |
{
|
| 363 |
"Name": card.get("name", card_id),
|
|
@@ -367,6 +391,8 @@ def build_table_data(
|
|
| 367 |
"Status": status_str,
|
| 368 |
"Last Run": last_run_str,
|
| 369 |
"Last Duration": duration_str,
|
|
|
|
|
|
|
| 370 |
"Cost (30d)": cost_val,
|
| 371 |
"Avg/Run": avg_run_val,
|
| 372 |
"Freshness": freshness_str,
|
|
@@ -493,6 +519,7 @@ def compute_stats(
|
|
| 493 |
jobs: dict[str, dict[str, Any]],
|
| 494 |
visible_card_ids: set[str] | None = None,
|
| 495 |
hf_costs: dict[str, HFCostData] | None = None,
|
|
|
|
| 496 |
) -> None:
|
| 497 |
"""Compute stats bar metrics.
|
| 498 |
|
|
@@ -571,6 +598,12 @@ def compute_stats(
|
|
| 571 |
else:
|
| 572 |
state.wf_run_volume_detail = ""
|
| 573 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
|
| 575 |
def _compute_hf_cost(
|
| 576 |
cards_subset: dict[str, dict[str, Any]],
|
|
@@ -669,6 +702,38 @@ def _compute_freshness_stats(
|
|
| 669 |
state.wf_freshness_detail = RawHtml("")
|
| 670 |
|
| 671 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 672 |
__all__ = [
|
| 673 |
"HFCostData",
|
| 674 |
"WF_TABLE_COLS",
|
|
|
|
| 235 |
"Status",
|
| 236 |
"Last Run",
|
| 237 |
"Last Duration",
|
| 238 |
+
"Cold Start",
|
| 239 |
+
"Entities",
|
| 240 |
"Cost (30d)",
|
| 241 |
"Avg/Run",
|
| 242 |
"Freshness",
|
|
|
|
| 251 |
runtime_filter: str | None = "All",
|
| 252 |
freshness_filter: str | None = "All",
|
| 253 |
hf_costs: dict[str, HFCostData] | None = None,
|
| 254 |
+
latest_run_metrics: pd.DataFrame | None = None,
|
| 255 |
) -> tuple[pd.DataFrame, list[str]]:
|
| 256 |
"""Build dashboard table DataFrame from cards + cost data.
|
| 257 |
|
|
|
|
| 259 |
- cold_costs: DB cold-tier costs (workflow_id column)
|
| 260 |
- job_runs: Databricks Jobs API (re-keyed to workflow_id)
|
| 261 |
- hf_costs: HF Hub cost history (keyed by workflow_id)
|
| 262 |
+
- latest_run_metrics: most recent run per workflow (cold_start, entities)
|
| 263 |
|
| 264 |
Returns (DataFrame, card_ids) where card_ids is parallel to rows
|
| 265 |
for mapping row index to card ID.
|
| 266 |
"""
|
| 267 |
card_ids: list[str] = []
|
| 268 |
hf = hf_costs or {}
|
| 269 |
+
lrm = latest_run_metrics if latest_run_metrics is not None else pd.DataFrame()
|
| 270 |
|
| 271 |
# Build cost lookups keyed by workflow_id
|
| 272 |
cold_cost_lookup: dict[str, float] = {}
|
| 273 |
cold_run_count_lookup: dict[str, int] = {}
|
| 274 |
if not cold_costs.empty and "workflow_id" in cold_costs.columns:
|
| 275 |
+
idx = cold_costs.set_index("workflow_id")
|
| 276 |
+
cold_cost_lookup = idx["total_cost_usd"].apply(lambda x: float(x or 0)).to_dict()
|
| 277 |
+
cold_run_count_lookup = idx["run_count"].apply(lambda x: int(x or 0)).to_dict()
|
| 278 |
+
|
| 279 |
+
# Build latest-run lookups keyed by workflow_id
|
| 280 |
+
cold_start_lookup: dict[str, int] = {}
|
| 281 |
+
entity_count_lookup: dict[str, int] = {}
|
| 282 |
+
if not lrm.empty and "workflow_id" in lrm.columns:
|
| 283 |
+
lrm_idx = lrm.set_index("workflow_id")
|
| 284 |
+
if "cold_start_seconds" in lrm_idx.columns:
|
| 285 |
+
cold_start_lookup = lrm_idx["cold_start_seconds"].dropna().apply(int).to_dict()
|
| 286 |
+
if "entity_count" in lrm_idx.columns:
|
| 287 |
+
entity_count_lookup = lrm_idx["entity_count"].dropna().apply(int).to_dict()
|
| 288 |
|
| 289 |
rows = []
|
| 290 |
for card_id, card in cards.items():
|
|
|
|
| 372 |
# --- Status ---
|
| 373 |
status_str = _resolve_status(hf_data, job_run, jobs_last_run_ts, hf_last_run_ts)
|
| 374 |
|
| 375 |
+
# --- Cold start + Entities (from enriched cold tier) ---
|
| 376 |
+
cs = cold_start_lookup.get(card_id)
|
| 377 |
+
cold_start_str = "\u2014"
|
| 378 |
+
if cs is not None:
|
| 379 |
+
cs_mins, cs_secs = divmod(int(cs), 60)
|
| 380 |
+
cold_start_str = f"{cs_mins}m {cs_secs}s" if cs_mins else f"{cs_secs}s"
|
| 381 |
+
|
| 382 |
+
ent = entity_count_lookup.get(card_id)
|
| 383 |
+
entity_str = f"{int(ent):,}" if ent is not None else "\u2014"
|
| 384 |
+
|
| 385 |
rows.append(
|
| 386 |
{
|
| 387 |
"Name": card.get("name", card_id),
|
|
|
|
| 391 |
"Status": status_str,
|
| 392 |
"Last Run": last_run_str,
|
| 393 |
"Last Duration": duration_str,
|
| 394 |
+
"Cold Start": cold_start_str,
|
| 395 |
+
"Entities": entity_str,
|
| 396 |
"Cost (30d)": cost_val,
|
| 397 |
"Avg/Run": avg_run_val,
|
| 398 |
"Freshness": freshness_str,
|
|
|
|
| 519 |
jobs: dict[str, dict[str, Any]],
|
| 520 |
visible_card_ids: set[str] | None = None,
|
| 521 |
hf_costs: dict[str, HFCostData] | None = None,
|
| 522 |
+
latest_run_metrics: pd.DataFrame | None = None,
|
| 523 |
) -> None:
|
| 524 |
"""Compute stats bar metrics.
|
| 525 |
|
|
|
|
| 598 |
else:
|
| 599 |
state.wf_run_volume_detail = ""
|
| 600 |
|
| 601 |
+
# Cold start: from latest run per workflow (not averaged across 30 days)
|
| 602 |
+
lrm = latest_run_metrics if latest_run_metrics is not None else pd.DataFrame()
|
| 603 |
+
if not lrm.empty and visible_card_ids is not None:
|
| 604 |
+
lrm = lrm[lrm["workflow_id"].isin(list(visible_card_ids))]
|
| 605 |
+
_compute_cold_start_stats(state, lrm)
|
| 606 |
+
|
| 607 |
|
| 608 |
def _compute_hf_cost(
|
| 609 |
cards_subset: dict[str, dict[str, Any]],
|
|
|
|
| 702 |
state.wf_freshness_detail = RawHtml("")
|
| 703 |
|
| 704 |
|
| 705 |
+
def _compute_cold_start_stats(state: Any, latest_run: pd.DataFrame) -> None:
|
| 706 |
+
"""Compute cold start stat from latest-run-per-workflow metrics.
|
| 707 |
+
|
| 708 |
+
Uses median (robust to outliers) across workflows' most recent runs.
|
| 709 |
+
"""
|
| 710 |
+
if latest_run.empty or "cold_start_seconds" not in latest_run.columns:
|
| 711 |
+
state.wf_avg_cold_start = "\u2014"
|
| 712 |
+
state.wf_cold_start_detail = ""
|
| 713 |
+
return
|
| 714 |
+
|
| 715 |
+
valid = latest_run[latest_run["cold_start_seconds"].notna()]
|
| 716 |
+
if valid.empty:
|
| 717 |
+
state.wf_avg_cold_start = "\u2014"
|
| 718 |
+
state.wf_cold_start_detail = "No enrichment data yet"
|
| 719 |
+
return
|
| 720 |
+
|
| 721 |
+
median_cs = float(valid["cold_start_seconds"].median())
|
| 722 |
+
max_cs = float(valid["cold_start_seconds"].max())
|
| 723 |
+
min_cs = float(valid["cold_start_seconds"].min())
|
| 724 |
+
|
| 725 |
+
mins, secs = divmod(int(median_cs), 60)
|
| 726 |
+
state.wf_avg_cold_start = f"{mins}m {secs}s" if mins else f"{secs}s"
|
| 727 |
+
|
| 728 |
+
# Detail: range across workflows
|
| 729 |
+
min_m, min_s = divmod(int(min_cs), 60)
|
| 730 |
+
max_m, max_s = divmod(int(max_cs), 60)
|
| 731 |
+
min_str = f"{min_m}m {min_s}s" if min_m else f"{min_s}s"
|
| 732 |
+
max_str = f"{max_m}m {max_s}s" if max_m else f"{max_s}s"
|
| 733 |
+
|
| 734 |
+
state.wf_cold_start_detail = f"range {min_str}\u2013{max_str}"
|
| 735 |
+
|
| 736 |
+
|
| 737 |
__all__ = [
|
| 738 |
"HFCostData",
|
| 739 |
"WF_TABLE_COLS",
|
src/template.py
CHANGED
|
@@ -128,6 +128,14 @@ GLOSSARY: dict[str, str] = {
|
|
| 128 |
"FAILED = last run errored. SKIPPED = last run was skipped."
|
| 129 |
),
|
| 130 |
"Trigger": ("How a workflow is initiated. Scheduled = runs on a cron interval. Manual = triggered by a human."),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
"Passes with Value": (
|
| 132 |
"Passes where the off-ball scoring opportunity (actual OBSO) was greater than zero. "
|
| 133 |
"Used as a quality proxy for 'successful' passes when pass outcome data is unavailable."
|
|
@@ -258,6 +266,8 @@ PAGE_TERMS: dict[str, list[str]] = {
|
|
| 258 |
"Workflow Card",
|
| 259 |
"Workflow Status",
|
| 260 |
"Trigger",
|
|
|
|
|
|
|
| 261 |
],
|
| 262 |
"Conversion-Funnel": ["A3 Entry", "Conversion Rate", "Possession"],
|
| 263 |
}
|
|
|
|
| 128 |
"FAILED = last run errored. SKIPPED = last run was skipped."
|
| 129 |
),
|
| 130 |
"Trigger": ("How a workflow is initiated. Scheduled = runs on a cron interval. Manual = triggered by a human."),
|
| 131 |
+
"Cold Start": (
|
| 132 |
+
"Serverless environment setup time before pipeline work begins. "
|
| 133 |
+
"Includes compute spin-up, wheel installation, and Python imports. Lower is better."
|
| 134 |
+
),
|
| 135 |
+
"Entities": (
|
| 136 |
+
"Average number of input entities (e.g., matches, players) processed per run. "
|
| 137 |
+
"From the pipeline's skip guard, which determines what work is needed."
|
| 138 |
+
),
|
| 139 |
"Passes with Value": (
|
| 140 |
"Passes where the off-ball scoring opportunity (actual OBSO) was greater than zero. "
|
| 141 |
"Used as a quality proxy for 'successful' passes when pass outcome data is unavailable."
|
|
|
|
| 266 |
"Workflow Card",
|
| 267 |
"Workflow Status",
|
| 268 |
"Trigger",
|
| 269 |
+
"Cold Start",
|
| 270 |
+
"Entities",
|
| 271 |
],
|
| 272 |
"Conversion-Funnel": ["A3 Entry", "Conversion Rate", "Possession"],
|
| 273 |
}
|