Skip to content

Commit f3f0a2c

Browse files
authored
fix(enterprise/coderd/x/chatd): harden TestSubscribeRelayEstablishedMidStream against CI flakes (#24108)
Fixes coder/internal#1455

Three changes to eliminate the timing-sensitive flake in `TestSubscribeRelayEstablishedMidStream`:

1. **Reduce `PendingChatAcquireInterval` from `time.Hour` to `time.Second`.** The primary trigger is still `signalWake()` from `SendMessage`, but a short fallback poll ensures the worker picks up the pending chat even under heavy CI goroutine scheduling contention.
2. **Increase context timeout from `WaitLong` (25s) to `WaitSuperLong` (60s).** The worker pipeline (model resolution, message loading, LLM call) involves multiple DB round-trips that can be slow when PostgreSQL is shared with many parallel test packages.
3. **Add a status-polling loop while waiting for the streaming request.** If the worker errors out during chat processing, the test now fails immediately with the error status and message instead of silently timing out.

> Generated by Coder Agents
1 parent 5453a6c commit f3f0a2c

1 file changed

Lines changed: 36 additions & 9 deletions

File tree

enterprise/coderd/x/chatd/chatd_test.go

Lines changed: 36 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1451,15 +1451,17 @@ func TestSubscribeRelayEstablishedMidStream(t *testing.T) {
14511451
)
14521452
})
14531453

1454-
// Worker with a 1-hour acquire interval; only processes when
1455-
// explicitly woken.
1454+
// Worker with a short fallback poll interval. The primary
1455+
// trigger is signalWake() from SendMessage, but under heavy
1456+
// CI load the wake goroutine may be delayed. A short poll
1457+
// ensures the worker always picks up the pending chat.
14561458
workerLogger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
14571459
worker := osschatd.New(osschatd.Config{
14581460
Logger: workerLogger,
14591461
Database: db,
14601462
ReplicaID: workerID,
14611463
Pubsub: ps,
1462-
PendingChatAcquireInterval: time.Hour,
1464+
PendingChatAcquireInterval: time.Second,
14631465
InFlightChatStaleAfter: testutil.WaitSuperLong,
14641466
})
14651467
t.Cleanup(func() {
@@ -1489,7 +1491,11 @@ func TestSubscribeRelayEstablishedMidStream(t *testing.T) {
14891491
return snapshot, relayEvents, cancel, nil
14901492
}, nil)
14911493

1492-
ctx := testutil.Context(t, testutil.WaitLong)
1494+
// Use WaitSuperLong so the test survives heavy CI contention.
1495+
// The worker pipeline (model resolution, message loading, LLM
1496+
// call) involves multiple DB round-trips that can be slow under
1497+
// load.
1498+
ctx := testutil.Context(t, testutil.WaitSuperLong)
14931499
user, model := seedChatDependencies(ctx, t, db)
14941500
setOpenAIProviderBaseURL(ctx, t, db, openAIURL)
14951501

@@ -1509,11 +1515,32 @@ func TestSubscribeRelayEstablishedMidStream(t *testing.T) {
15091515
})
15101516
require.NoError(t, err)
15111517

1512-
// Wait for the worker to reach the LLM (first streaming request).
1513-
select {
1514-
case <-firstChunkEmitted:
1515-
case <-ctx.Done():
1516-
t.Fatal("timed out waiting for worker to start streaming")
1518+
// Wait for the worker to reach the LLM (first streaming
1519+
// request). Also poll the chat status so we fail fast with a
1520+
// clear message if the worker errors out instead of timing
1521+
// out silently.
1522+
ticker := time.NewTicker(250 * time.Millisecond)
1523+
defer ticker.Stop()
1524+
waitForStream:
1525+
for {
1526+
select {
1527+
case <-firstChunkEmitted:
1528+
break waitForStream
1529+
case <-ticker.C:
1530+
currentChat, dbErr := db.GetChatByID(ctx, chat.ID)
1531+
if dbErr == nil && currentChat.Status == database.ChatStatusError {
1532+
t.Fatalf("worker failed to process chat: status=%s last_error=%s",
1533+
currentChat.Status, currentChat.LastError.String)
1534+
}
1535+
case <-ctx.Done():
1536+
// Dump the final chat status for debugging.
1537+
currentChat, dbErr := db.GetChatByID(context.Background(), chat.ID)
1538+
if dbErr == nil {
1539+
t.Fatalf("timed out waiting for worker to start streaming (chat status=%s, last_error=%q)",
1540+
currentChat.Status, currentChat.LastError.String)
1541+
}
1542+
t.Fatal("timed out waiting for worker to start streaming")
1543+
}
15171544
}
15181545

15191546
// Wait for the subscriber to receive the running status, which

0 commit comments

Comments
 (0)