From 2931a74fcbc708842909a0665b8f52ba778fdf5c Mon Sep 17 00:00:00 2001 From: Jeff Larson Date: Mon, 8 Jun 2026 00:36:10 -0700 Subject: [PATCH] api: bound unbounded trace endpoints; temp traceparent debug log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit list_traces and service_red did a full-table GROUP BY when called with no time window (the UI always passes one, but a bare API call would scan the whole spans table and hit statement_timeout → 500). Default the lower bound to now()-24h when absent so they can't blow up; explicit ranges are unchanged. Two tests that queried unbounded over ancient timestamps now use recent data / expect the recent-window default. Also add a temporary debug log in otel_request_span recording whether an incoming `traceparent` reached the app — to settle whether the missing traefik→watcher edge is traefik not injecting it vs the linkerd sidecar stripping it. Removed in a follow-up once diagnosed. Co-Authored-By: Claude Opus 4.8 --- server/src/api.rs | 8 ++++++-- server/src/lib.rs | 9 +++++++++ server/tests/smoke.rs | 12 ++++++++---- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/server/src/api.rs b/server/src/api.rs index cb57bde..463ecda 100644 --- a/server/src/api.rs +++ b/server/src/api.rs @@ -77,7 +77,9 @@ pub async fn list_traces( count(*) FILTER (WHERE status_code = 2) AS error_count FROM spans WHERE ($1::text IS NULL OR service = $1) - AND ($2::timestamptz IS NULL OR start_time >= $2) + -- default to a recent window when unbounded so this can't turn into a + -- full-table GROUP BY (which times out); the UI always passes a range. + AND start_time >= COALESCE($2::timestamptz, now() - interval '24 hours') AND ($3::timestamptz IS NULL OR start_time <= $3) AND ($6::jsonb IS NULL OR trace_id IN (SELECT trace_id FROM spans WHERE attributes @> $6)) @@ -866,7 +868,9 @@ pub async fn service_red( percentile_cont(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99_ms FROM spans WHERE service IS NOT NULL - AND ($1::timestamptz IS NULL OR start_time >= $1) + -- default to a recent window when unbounded so the per-service + -- percentile aggregate can't full-scan the spans table and time out. + AND start_time >= COALESCE($1::timestamptz, now() - interval '24 hours') AND ($2::timestamptz IS NULL OR start_time <= $2) GROUP BY service ORDER BY spans DESC", diff --git a/server/src/lib.rs b/server/src/lib.rs index db1fc40..6a2ec54 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -45,6 +45,15 @@ fn otel_request_span(req: &Request) -> tracing::Span { let parent = opentelemetry::global::get_text_map_propagator(|p| { p.extract(&HeaderExtractor(req.headers())) }); + // TEMP DEBUG (remove after diagnosing the missing traefik→watcher edge): + // log whether an upstream traceparent actually reached the app, to tell apart + // "traefik isn't injecting it" from "the linkerd sidecar strips it". + tracing::info!( + target: "tp_debug", + path = %req.uri().path(), + traceparent_present = req.headers().contains_key("traceparent"), + "incoming request trace-context" + ); span.set_parent(parent); span } diff --git a/server/tests/smoke.rs b/server/tests/smoke.rs index e76f0ef..7c13001 100644 --- a/server/tests/smoke.rs +++ b/server/tests/smoke.rs @@ -250,6 +250,7 @@ async fn ingest_and_query_a_trace() { }; let router = app(pool); + let start = nanos_ago(5); let req = ExportTraceServiceRequest { resource_spans: vec![ResourceSpans { resource: Some(Resource { @@ -261,8 +262,9 @@ async fn ingest_and_query_a_trace() { trace_id: vec![1u8; 16], span_id: vec![2u8; 8], name: "GET /checkout".to_string(), - start_time_unix_nano: 1_000_000_000, - end_time_unix_nano: 1_050_000_000, + // recent (within the list's default window), +50ms duration + start_time_unix_nano: start, + end_time_unix_nano: start + 50_000_000, ..Default::default() }], ..Default::default() @@ -1668,9 +1670,11 @@ async fn time_window_filters_traces_and_logs() { let (_, logs) = get_json(&router, &format!("/api/logs?from={from}")).await; assert_eq!(logs.as_array().unwrap().len(), 1, "only the recent log"); - // No window → both. + // No window → defaults to the recent (24h) window, so the 3-day-old trace is + // excluded and only the recent one comes back. let (_, all) = get_json(&router, "/api/traces").await; - assert_eq!(all.as_array().unwrap().len(), 2); + assert_eq!(all.as_array().unwrap().len(), 1); + assert_eq!(all[0]["trace_id"], "recent"); } #[tokio::test]