Spaces:

luxury-lakehouse
/

soccer-analytics-app

Running

App Files Community

karstenskyt committed on Apr 12

Commit

c485f29

verified ·

1 Parent(s): c724d07

Deploy Taipy app via scripts/manage_space.py

Browse files

Files changed (8) hide show

.env +2 -0
dist/luxury_lakehouse-0.3.0-py3-none-any.whl +2 -2
requirements.txt +2 -2
src/pages/workflows.py +9 -0
src/queries/workflows.py +58 -8
src/state/workflows.py +16 -4
src/state/workflows_stats.py +69 -4
src/template.py +10 -0

.env ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ LAKEBASE_HOST=ep-spring-rain-d2i6lozx.database.us-east-1.cloud.databricks.com
2	+ LAKEBASE_ENDPOINT_NAME=projects/soccer-analytics-dev/branches/production/endpoints/primary

dist/luxury_lakehouse-0.3.0-py3-none-any.whl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3058358a8b38dc4ecd1fe414617ba9b06ee4e3e96c93ed0e3bdc94fd3b19f6c0
-size 380959

 version https://git-lfs.github.com/spec/v1
+oid sha256:59695805377583cb29800da3688cff9b709baa8dae6a135f6e847b22f67aecf2
+size 381689

requirements.txt CHANGED Viewed

@@ -1,5 +1,5 @@
 # This file was autogenerated by uv via the following command:
-#    uv pip compile pyproject.toml --extra taipy-app --python-version 3.10 --python-platform linux -o D:\Development\karstenskyt__luxury-lakehouse-d32\hf_taipy_app\requirements.txt
 aniso8601==10.0.1
     # via flask-restful
 annotated-doc==0.0.4
@@ -266,7 +266,7 @@ requests==2.33.1
     #   requests-cache
 requests-cache==1.3.1
     # via luxury-lakehouse (pyproject.toml)
-rich==14.3.3
     # via
     #   cookiecutter
     #   typer

 # This file was autogenerated by uv via the following command:
+#    uv pip compile pyproject.toml --extra taipy-app --python-version 3.10 --python-platform linux -o D:\Development\karstenskyt__luxury-lakehouse\hf_taipy_app\requirements.txt
 aniso8601==10.0.1
     # via flask-restful
 annotated-doc==0.0.4
     #   requests-cache
 requests-cache==1.3.1
     # via luxury-lakehouse (pyproject.toml)
+rich==14.3.4
     # via
     #   cookiecutter
     #   typer

src/pages/workflows.py CHANGED Viewed

@@ -57,6 +57,15 @@ page_config = PageConfig(
             "wf_run_volume_detail",
             help_text="Total pipeline runs in the last 30 days. Detail shows daily rate and average cost per run.",
         ),
     ],
     content=[
         ContentRow(

             "wf_run_volume_detail",
             help_text="Total pipeline runs in the last 30 days. Detail shows daily rate and average cost per run.",
         ),
+        StatCard(
+            "Avg Cold Start",
+            "wf_avg_cold_start",
+            "wf_cold_start_detail",
+            help_text=(
+                "Average serverless environment setup time (spin-up, wheel install, imports) "
+                "before pipeline work begins. Lower is better. Range shows fastest to slowest workflow."
+            ),
+        ),
     ],
     content=[
         ContentRow(

src/queries/workflows.py CHANGED Viewed

@@ -1,8 +1,8 @@
 """Workflow-related queries — extracted from state/workflows.py.
-Only the 2 SQL queries (cold costs + warm costs). Non-SQL data fetching
-(Jobs API, HF Hub) remains in state/workflows.py as those are API calls,
-not database queries.
 """
 from __future__ import annotations
@@ -17,20 +17,39 @@ from queries.common import execute_query, t, ttl_cache
 logger = logging.getLogger(__name__)
-@ttl_cache(ttl=3600)
 def fetch_cold_costs() -> pd.DataFrame:
     """30-day aggregated costs from fct_workflow_costs_synced (cold tier).
-    Expected columns: workflow_id, task_key, total_cost_usd, total_dbu, run_count.
-    Grouped by workflow_id (falls back to task_key when workflow_id is NULL).
     """
-    _empty = pd.DataFrame(columns=pd.Index(["workflow_id", "task_key", "total_cost_usd", "total_dbu", "run_count"]))
     try:
         tbl = t("fct_workflow_costs_synced")
         return execute_query(
             f"SELECT COALESCE(workflow_id, task_key) AS workflow_id, "  # noqa: S608
             f"  task_key, "
-            f"  SUM(attributed_cost_usd) AS total_cost_usd, "
             f"  SUM(attributed_dbu) AS total_dbu, "
             f"  COUNT(DISTINCT job_run_id) AS run_count "
             f"FROM {tbl} "
@@ -44,6 +63,37 @@ def fetch_cold_costs() -> pd.DataFrame:
         return _empty
 @ttl_cache(ttl=120)
 def fetch_warm_costs() -> pd.DataFrame:
     """Recent cost estimates from workflow_cost_live_synced (warm tier).

 """Workflow-related queries — extracted from state/workflows.py.
+Cold costs (30-day aggregate), warm costs (live), and latest-run metrics
+(most recent run per workflow). Non-SQL data fetching (Jobs API, HF Hub)
+remains in state/workflows.py as those are API calls, not database queries.
 """
 from __future__ import annotations
 logger = logging.getLogger(__name__)
+_COLD_COST_COLS = [
+    "workflow_id",
+    "task_key",
+    "total_cost_usd",
+    "total_dbu",
+    "run_count",
+]
+_LATEST_RUN_COLS = [
+    "workflow_id",
+    "cold_start_seconds",
+    "duration_seconds",
+    "entity_count",
+    "row_count",
+    "pipeline_state",
+]
+@ttl_cache(ttl=600)
 def fetch_cold_costs() -> pd.DataFrame:
     """30-day aggregated costs from fct_workflow_costs_synced (cold tier).
+    Cost aggregates only — timing and entity data comes from
+    fetch_latest_run_metrics() to avoid diluting recent values with
+    historical zeros.
     """
+    _empty = pd.DataFrame(columns=pd.Index(_COLD_COST_COLS))
     try:
         tbl = t("fct_workflow_costs_synced")
         return execute_query(
             f"SELECT COALESCE(workflow_id, task_key) AS workflow_id, "  # noqa: S608
             f"  task_key, "
+            f"  SUM(effective_cost_usd) AS total_cost_usd, "
             f"  SUM(attributed_dbu) AS total_dbu, "
             f"  COUNT(DISTINCT job_run_id) AS run_count "
             f"FROM {tbl} "
         return _empty
+@ttl_cache(ttl=600)
+def fetch_latest_run_metrics() -> pd.DataFrame:
+    """Most recent run with a non-NULL pipeline_state per workflow, from fct_workflow_costs_synced.
+    Returns one row per workflow with cold_start_seconds, duration_seconds,
+    entity_count, row_count, and pipeline_state from the latest run.
+    """
+    _empty = pd.DataFrame(columns=pd.Index(_LATEST_RUN_COLS))
+    try:
+        tbl = t("fct_workflow_costs_synced")
+        return execute_query(
+            f"SELECT workflow_id, cold_start_seconds, duration_seconds, "  # noqa: S608
+            f"  entity_count, row_count, pipeline_state "
+            f"FROM ( "
+            f"  SELECT COALESCE(workflow_id, task_key) AS workflow_id, "
+            f"    cold_start_seconds, duration_seconds, entity_count, "
+            f"    row_count, pipeline_state, "
+            f"    ROW_NUMBER() OVER ("
+            f"      PARTITION BY COALESCE(workflow_id, task_key) "
+            f"      ORDER BY usage_date DESC, job_run_id DESC"
+            f"    ) AS rn "
+            f"  FROM {tbl} "
+            f"  WHERE pipeline_state IS NOT NULL "
+            f") sub "
+            f"WHERE rn = 1",
+        )
+    except Exception:
+        logger.warning("Latest run metrics query failed", exc_info=True)
+        return _empty
 @ttl_cache(ttl=120)
 def fetch_warm_costs() -> pd.DataFrame:
     """Recent cost estimates from workflow_cost_live_synced (warm tier).

src/state/workflows.py CHANGED Viewed

@@ -18,7 +18,7 @@ from typing import Any
 import pandas as pd
 from cache import ttl_cache
-from queries.workflows import fetch_cold_costs, fetch_warm_costs
 from state.shared import register_page_refresher, register_page_teardown
 from state.workflows_dag import (
@@ -74,6 +74,8 @@ wf_total_cost_30d: str = "$0.00"
 wf_cost_detail: RawHtml = RawHtml("")
 wf_run_volume: str = "0"
 wf_run_volume_detail: str = ""
 wf_table_data: pd.DataFrame = pd.DataFrame(columns=pd.Index(WF_TABLE_COLS))
@@ -133,6 +135,8 @@ __all__ = [
     "wf_cost_detail",
     "wf_run_volume",
     "wf_run_volume_detail",
     "wf_table_data",
     "wf_type_filter",
     "wf_type_lov",
@@ -441,6 +445,7 @@ def _refresh_table(state: Any) -> None:
     global _wf_card_ids
     cold = fetch_cold_costs()
     jobs = _fetch_job_runs()
     hf_costs = _fetch_hf_cost_history()
@@ -491,6 +496,7 @@ def _refresh_table(state: Any) -> None:
         state.wf_runtime_filter,
         state.wf_freshness_filter,
         hf_costs=hf_costs,
     )
     state.wf_table_data = table_df
     _wf_card_ids = card_ids
@@ -505,6 +511,7 @@ def _refresh_table(state: Any) -> None:
         jobs,
         visible_card_ids=matched_ids if not all_filters_default else None,
         hf_costs=hf_costs,
     )
@@ -550,6 +557,7 @@ def _wf_auto_refresh_tick(state: Any) -> None:
     logger.debug("Auto-refresh tick")
     cold = fetch_cold_costs()
     warm = fetch_warm_costs()
     jobs = _fetch_job_runs()
     hf_costs = _fetch_hf_cost_history()
@@ -562,10 +570,11 @@ def _wf_auto_refresh_tick(state: Any) -> None:
         state.wf_runtime_filter,
         state.wf_freshness_filter,
         hf_costs=hf_costs,
     )
     state.wf_table_data = table_df
     _wf_card_ids = card_ids
-    compute_stats(state, _cards, cold, warm, jobs, hf_costs=hf_costs)
 # ---------------------------------------------------------------------------
@@ -608,6 +617,7 @@ def wf_refresh(state: Any) -> None:
     # Query costs + job runs (job_runs already re-keyed to workflow_id)
     cold = fetch_cold_costs()
     warm = fetch_warm_costs()
     jobs = _fetch_job_runs()
@@ -629,12 +639,14 @@ def wf_refresh(state: Any) -> None:
     hf_costs = _fetch_hf_cost_history()
     # Build table
-    table_df, card_ids = build_table_data(_cards, cold, jobs, "All", "All", "All", hf_costs=hf_costs)
     state.wf_table_data = table_df
     _wf_card_ids = card_ids
     # Stats (uses jobs for freshness, cold for cost, hf_costs for HF data)
-    compute_stats(state, _cards, cold, warm, jobs, hf_costs=hf_costs)
     # Clear detail state (dashboard mode)
     state.wf_selected_workflow = None

 import pandas as pd
 from cache import ttl_cache
+from queries.workflows import fetch_cold_costs, fetch_latest_run_metrics, fetch_warm_costs
 from state.shared import register_page_refresher, register_page_teardown
 from state.workflows_dag import (
 wf_cost_detail: RawHtml = RawHtml("")
 wf_run_volume: str = "0"
 wf_run_volume_detail: str = ""
+wf_avg_cold_start: str = "\u2014"
+wf_cold_start_detail: str = ""
 wf_table_data: pd.DataFrame = pd.DataFrame(columns=pd.Index(WF_TABLE_COLS))
     "wf_cost_detail",
     "wf_run_volume",
     "wf_run_volume_detail",
+    "wf_avg_cold_start",
+    "wf_cold_start_detail",
     "wf_table_data",
     "wf_type_filter",
     "wf_type_lov",
     global _wf_card_ids
     cold = fetch_cold_costs()
+    latest = fetch_latest_run_metrics()
     jobs = _fetch_job_runs()
     hf_costs = _fetch_hf_cost_history()
         state.wf_runtime_filter,
         state.wf_freshness_filter,
         hf_costs=hf_costs,
+        latest_run_metrics=latest,
     )
     state.wf_table_data = table_df
     _wf_card_ids = card_ids
         jobs,
         visible_card_ids=matched_ids if not all_filters_default else None,
         hf_costs=hf_costs,
+        latest_run_metrics=latest,
     )
     logger.debug("Auto-refresh tick")
     cold = fetch_cold_costs()
+    latest = fetch_latest_run_metrics()
     warm = fetch_warm_costs()
     jobs = _fetch_job_runs()
     hf_costs = _fetch_hf_cost_history()
         state.wf_runtime_filter,
         state.wf_freshness_filter,
         hf_costs=hf_costs,
+        latest_run_metrics=latest,
     )
     state.wf_table_data = table_df
     _wf_card_ids = card_ids
+    compute_stats(state, _cards, cold, warm, jobs, hf_costs=hf_costs, latest_run_metrics=latest)
 # ---------------------------------------------------------------------------
     # Query costs + job runs (job_runs already re-keyed to workflow_id)
     cold = fetch_cold_costs()
+    latest = fetch_latest_run_metrics()
     warm = fetch_warm_costs()
     jobs = _fetch_job_runs()
     hf_costs = _fetch_hf_cost_history()
     # Build table
+    table_df, card_ids = build_table_data(
+        _cards, cold, jobs, "All", "All", "All", hf_costs=hf_costs, latest_run_metrics=latest
+    )
     state.wf_table_data = table_df
     _wf_card_ids = card_ids
     # Stats (uses jobs for freshness, cold for cost, hf_costs for HF data)
+    compute_stats(state, _cards, cold, warm, jobs, hf_costs=hf_costs, latest_run_metrics=latest)
     # Clear detail state (dashboard mode)
     state.wf_selected_workflow = None

src/state/workflows_stats.py CHANGED Viewed

@@ -235,6 +235,8 @@ WF_TABLE_COLS = [
     "Status",
     "Last Run",
     "Last Duration",
     "Cost (30d)",
     "Avg/Run",
     "Freshness",
@@ -249,6 +251,7 @@ def build_table_data(
     runtime_filter: str | None = "All",
     freshness_filter: str | None = "All",
     hf_costs: dict[str, HFCostData] | None = None,
 ) -> tuple[pd.DataFrame, list[str]]:
     """Build dashboard table DataFrame from cards + cost data.
@@ -256,21 +259,32 @@ def build_table_data(
     - cold_costs: DB cold-tier costs (workflow_id column)
     - job_runs: Databricks Jobs API (re-keyed to workflow_id)
     - hf_costs: HF Hub cost history (keyed by workflow_id)
     Returns (DataFrame, card_ids) where card_ids is parallel to rows
     for mapping row index to card ID.
     """
     card_ids: list[str] = []
     hf = hf_costs or {}
     # Build cost lookups keyed by workflow_id
     cold_cost_lookup: dict[str, float] = {}
     cold_run_count_lookup: dict[str, int] = {}
     if not cold_costs.empty and "workflow_id" in cold_costs.columns:
-        cold_cost_lookup = (
-            cold_costs.set_index("workflow_id")["total_cost_usd"].apply(lambda x: float(x or 0)).to_dict()
-        )
-        cold_run_count_lookup = cold_costs.set_index("workflow_id")["run_count"].apply(lambda x: int(x or 0)).to_dict()
     rows = []
     for card_id, card in cards.items():
@@ -358,6 +372,16 @@ def build_table_data(
         # --- Status ---
         status_str = _resolve_status(hf_data, job_run, jobs_last_run_ts, hf_last_run_ts)
         rows.append(
             {
                 "Name": card.get("name", card_id),
@@ -367,6 +391,8 @@ def build_table_data(
                 "Status": status_str,
                 "Last Run": last_run_str,
                 "Last Duration": duration_str,
                 "Cost (30d)": cost_val,
                 "Avg/Run": avg_run_val,
                 "Freshness": freshness_str,
@@ -493,6 +519,7 @@ def compute_stats(
     jobs: dict[str, dict[str, Any]],
     visible_card_ids: set[str] | None = None,
     hf_costs: dict[str, HFCostData] | None = None,
 ) -> None:
     """Compute stats bar metrics.
@@ -571,6 +598,12 @@ def compute_stats(
         else:
             state.wf_run_volume_detail = ""
 def _compute_hf_cost(
     cards_subset: dict[str, dict[str, Any]],
@@ -669,6 +702,38 @@ def _compute_freshness_stats(
         state.wf_freshness_detail = RawHtml("")
 __all__ = [
     "HFCostData",
     "WF_TABLE_COLS",

     "Status",
     "Last Run",
     "Last Duration",
+    "Cold Start",
+    "Entities",
     "Cost (30d)",
     "Avg/Run",
     "Freshness",
     runtime_filter: str | None = "All",
     freshness_filter: str | None = "All",
     hf_costs: dict[str, HFCostData] | None = None,
+    latest_run_metrics: pd.DataFrame | None = None,
 ) -> tuple[pd.DataFrame, list[str]]:
     """Build dashboard table DataFrame from cards + cost data.
     - cold_costs: DB cold-tier costs (workflow_id column)
     - job_runs: Databricks Jobs API (re-keyed to workflow_id)
     - hf_costs: HF Hub cost history (keyed by workflow_id)
+    - latest_run_metrics: most recent run per workflow (cold_start, entities)
     Returns (DataFrame, card_ids) where card_ids is parallel to rows
     for mapping row index to card ID.
     """
     card_ids: list[str] = []
     hf = hf_costs or {}
+    lrm = latest_run_metrics if latest_run_metrics is not None else pd.DataFrame()
     # Build cost lookups keyed by workflow_id
     cold_cost_lookup: dict[str, float] = {}
     cold_run_count_lookup: dict[str, int] = {}
     if not cold_costs.empty and "workflow_id" in cold_costs.columns:
+        idx = cold_costs.set_index("workflow_id")
+        cold_cost_lookup = idx["total_cost_usd"].apply(lambda x: float(x or 0)).to_dict()
+        cold_run_count_lookup = idx["run_count"].apply(lambda x: int(x or 0)).to_dict()
+    # Build latest-run lookups keyed by workflow_id
+    cold_start_lookup: dict[str, int] = {}
+    entity_count_lookup: dict[str, int] = {}
+    if not lrm.empty and "workflow_id" in lrm.columns:
+        lrm_idx = lrm.set_index("workflow_id")
+        if "cold_start_seconds" in lrm_idx.columns:
+            cold_start_lookup = lrm_idx["cold_start_seconds"].dropna().apply(int).to_dict()
+        if "entity_count" in lrm_idx.columns:
+            entity_count_lookup = lrm_idx["entity_count"].dropna().apply(int).to_dict()
     rows = []
     for card_id, card in cards.items():
         # --- Status ---
         status_str = _resolve_status(hf_data, job_run, jobs_last_run_ts, hf_last_run_ts)
+        # --- Cold start + Entities (from latest-run metrics, keyed by workflow_id) ---
+        cs = cold_start_lookup.get(card_id)
+        cold_start_str = "\u2014"
+        if cs is not None:
+            cs_mins, cs_secs = divmod(int(cs), 60)
+            cold_start_str = f"{cs_mins}m {cs_secs}s" if cs_mins else f"{cs_secs}s"
+        ent = entity_count_lookup.get(card_id)
+        entity_str = f"{int(ent):,}" if ent is not None else "\u2014"
         rows.append(
             {
                 "Name": card.get("name", card_id),
                 "Status": status_str,
                 "Last Run": last_run_str,
                 "Last Duration": duration_str,
+                "Cold Start": cold_start_str,
+                "Entities": entity_str,
                 "Cost (30d)": cost_val,
                 "Avg/Run": avg_run_val,
                 "Freshness": freshness_str,
     jobs: dict[str, dict[str, Any]],
     visible_card_ids: set[str] | None = None,
     hf_costs: dict[str, HFCostData] | None = None,
+    latest_run_metrics: pd.DataFrame | None = None,
 ) -> None:
     """Compute stats bar metrics.
         else:
             state.wf_run_volume_detail = ""
+    # Cold start: from latest run per workflow (not averaged across 30 days)
+    lrm = latest_run_metrics if latest_run_metrics is not None else pd.DataFrame()
+    if not lrm.empty and visible_card_ids is not None:
+        lrm = lrm[lrm["workflow_id"].isin(list(visible_card_ids))]
+    _compute_cold_start_stats(state, lrm)
 def _compute_hf_cost(
     cards_subset: dict[str, dict[str, Any]],
         state.wf_freshness_detail = RawHtml("")
+def _compute_cold_start_stats(state: Any, latest_run: pd.DataFrame) -> None:
+    """Compute cold start stat from latest-run-per-workflow metrics.
+    Uses median (robust to outliers) across workflows' most recent runs.
+    """
+    if latest_run.empty or "cold_start_seconds" not in latest_run.columns:
+        state.wf_avg_cold_start = "\u2014"
+        state.wf_cold_start_detail = ""
+        return
+    valid = latest_run[latest_run["cold_start_seconds"].notna()]
+    if valid.empty:
+        state.wf_avg_cold_start = "\u2014"
+        state.wf_cold_start_detail = "No enrichment data yet"
+        return
+    median_cs = float(valid["cold_start_seconds"].median())
+    max_cs = float(valid["cold_start_seconds"].max())
+    min_cs = float(valid["cold_start_seconds"].min())
+    mins, secs = divmod(int(median_cs), 60)
+    state.wf_avg_cold_start = f"{mins}m {secs}s" if mins else f"{secs}s"
+    # Detail: range across workflows
+    min_m, min_s = divmod(int(min_cs), 60)
+    max_m, max_s = divmod(int(max_cs), 60)
+    min_str = f"{min_m}m {min_s}s" if min_m else f"{min_s}s"
+    max_str = f"{max_m}m {max_s}s" if max_m else f"{max_s}s"
+    state.wf_cold_start_detail = f"range {min_str}\u2013{max_str}"
 __all__ = [
     "HFCostData",
     "WF_TABLE_COLS",

src/template.py CHANGED Viewed

@@ -128,6 +128,14 @@ GLOSSARY: dict[str, str] = {
         "FAILED = last run errored. SKIPPED = last run was skipped."
     ),
     "Trigger": ("How a workflow is initiated. Scheduled = runs on a cron interval. Manual = triggered by a human."),
     "Passes with Value": (
         "Passes where the off-ball scoring opportunity (actual OBSO) was greater than zero. "
         "Used as a quality proxy for 'successful' passes when pass outcome data is unavailable."
@@ -258,6 +266,8 @@ PAGE_TERMS: dict[str, list[str]] = {
         "Workflow Card",
         "Workflow Status",
         "Trigger",
     ],
     "Conversion-Funnel": ["A3 Entry", "Conversion Rate", "Possession"],
 }

         "FAILED = last run errored. SKIPPED = last run was skipped."
     ),
     "Trigger": ("How a workflow is initiated. Scheduled = runs on a cron interval. Manual = triggered by a human."),
+    "Cold Start": (
+        "Serverless environment setup time before pipeline work begins. "
+        "Includes compute spin-up, wheel installation, and Python imports. Lower is better."
+    ),
+    "Entities": (
+        "Average number of input entities (e.g., matches, players) processed per run. "
+        "From the pipeline's skip guard, which determines what work is needed."
+    ),
     "Passes with Value": (
         "Passes where the off-ball scoring opportunity (actual OBSO) was greater than zero. "
         "Used as a quality proxy for 'successful' passes when pass outcome data is unavailable."
         "Workflow Card",
         "Workflow Status",
         "Trigger",
+        "Cold Start",
+        "Entities",
     ],
     "Conversion-Funnel": ["A3 Entry", "Conversion Rate", "Possession"],
 }