Skip to content

Commit 0fc3be8

Browse files
committed
session: fix watchdog blind spots and cancel propagation
* Restructure watchdogTick so idle-detection sweeps SessionActivity.list() independently of stuck-tool query results — fixes Bug anomalyco#4 (idle gated behind stuck-tool results) and Bug anomalyco#5 (leaf.length === 0 early return skipping idle detection entirely)
* Add pre-cancel map cleanup in the idle sweep to prevent unbounded growth of stale entries
* Define CancelRequested bus event in processor and publish from abortChildren so child sessions get in-memory cancel alongside the DB updates (Bug anomalyco#7)
* Add SessionPrompt.init() subscribing to the event, called from bootstrap alongside SessionActivity
* Wrap AI SDK iterator consumption in try/finally to call iter.return() on abort, preventing leaked HTTP connections (Bug anomalyco#8)
* Check abort signal before retry continue to stop cancelled sessions from restarting loops (Bug anomalyco#9)
* Remove shadowed abort promise in permission-check loop so pre-aborted signals reject immediately instead of hanging (Bug anomalyco#10)
* Update watchdog tests for independent idle sweep and add cancel-propagation event tests
1 parent 7dbd4ca commit 0fc3be8

File tree

5 files changed

+447
-391
lines changed

5 files changed

+447
-391
lines changed

packages/opencode/src/project/bootstrap.ts

Lines changed: 76 additions & 87 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ export async function InstanceBootstrap() {
3636
Snapshot.init()
3737
Truncate.init()
3838
SessionActivity.init()
39+
SessionPrompt.init()
3940
cleanupOrphanedParts()
4041
watchdog()
4142

@@ -110,105 +111,93 @@ export function watchdogTick(cutoff: number, idle?: number) {
110111
AND json_extract(${PartTable.data}, '$.state.time.start') < ${cutoff}`,
111112
)
112113
.all()
113-
if (stuck.length === 0) return
114114

115-
// Sessions that contain at least one stuck tool
116-
const stuckSessions = new Set(stuck.map((r) => r.session_id))
117-
118-
// A task tool whose child session also has stuck tools is just
119-
// waiting — it will resolve once the child is cancelled.
120-
// Everything else (non-task tools, or task tools whose child has
121-
// no stuck tools) is a leaf that we must force-error.
122-
const leaf = stuck.filter((r) => {
123-
if (r.tool !== "task") return true
124-
if (!r.child) return true
125-
return !stuckSessions.has(r.child)
126-
})
115+
const cancelled = new Set<string>()
127116

128-
log.warn("watchdog: found stuck tool parts", {
129-
total: stuck.length,
130-
leaf: leaf.length,
131-
ids: stuck.map((r) => r.id),
132-
})
117+
if (stuck.length > 0) {
118+
// Sessions that contain at least one stuck tool
119+
const stuckSessions = new Set(stuck.map((r) => r.session_id))
120+
121+
// A task tool whose child session also has stuck tools is just
122+
// waiting — it will resolve once the child is cancelled.
123+
// Everything else (non-task tools, or task tools whose child has
124+
// no stuck tools) is a leaf that we must force-error.
125+
const leaf = stuck.filter((r) => {
126+
if (r.tool !== "task") return true
127+
if (!r.child) return true
128+
return !stuckSessions.has(r.child)
129+
})
133130

134-
if (leaf.length === 0) return
131+
log.warn("watchdog: found stuck tool parts", {
132+
total: stuck.length,
133+
leaf: leaf.length,
134+
ids: stuck.map((r) => r.id),
135+
})
135136

136-
// For task-tool leaves, cancel the *child* session so the task tool's
137-
// normal error-propagation path runs: child cancel → SessionPrompt.prompt()
138-
// resolves → task tool returns structured TIMEOUT to the parent LLM.
139-
// For non-task leaves, cancel the owning session directly.
140-
const cancelled = new Set<string>()
141-
for (const r of leaf) {
142-
if (r.tool === "task" && r.child) {
143-
if (cancelled.has(r.child)) continue
144-
cancelled.add(r.child)
145-
log.warn("watchdog: cancelling stuck child session", { child: r.child, parent: r.session_id })
146-
SessionPrompt.cancel(r.child)
147-
} else {
148-
if (cancelled.has(r.session_id)) continue
149-
cancelled.add(r.session_id)
150-
log.warn("watchdog: cancelling stuck session", { sessionID: r.session_id })
151-
SessionPrompt.cancel(r.session_id)
137+
if (leaf.length > 0) {
138+
// For task-tool leaves, cancel the *child* session so the task tool's
139+
// normal error-propagation path runs: child cancel → SessionPrompt.prompt()
140+
// resolves → task tool returns structured TIMEOUT to the parent LLM.
141+
// For non-task leaves, cancel the owning session directly.
142+
for (const r of leaf) {
143+
if (r.tool === "task" && r.child) {
144+
if (cancelled.has(r.child)) continue
145+
cancelled.add(r.child)
146+
log.warn("watchdog: cancelling stuck child session", { child: r.child, parent: r.session_id })
147+
SessionPrompt.cancel(r.child)
148+
} else {
149+
if (cancelled.has(r.session_id)) continue
150+
cancelled.add(r.session_id)
151+
log.warn("watchdog: cancelling stuck session", { sessionID: r.session_id })
152+
SessionPrompt.cancel(r.session_id)
153+
}
154+
}
155+
156+
// DB update as redundant safety net — only for leaf tools
157+
const now = Date.now()
158+
for (const r of leaf) {
159+
db.update(PartTable)
160+
.set({
161+
data: sql`json_set(
162+
json_set(
163+
json_set(${PartTable.data}, '$.state.status', 'error'),
164+
'$.state.error', 'Tool execution exceeded maximum allowed duration (watchdog)'
165+
),
166+
'$.state.time.end', ${now}
167+
)`,
168+
})
169+
.where(
170+
sql`${PartTable.id} = ${r.id}
171+
AND json_extract(${PartTable.data}, '$.state.status') = 'running'`,
172+
)
173+
.run()
174+
}
152175
}
153176
}
154177

155-
// DB update as redundant safety net — only for leaf tools
156-
const now = Date.now()
157-
for (const r of leaf) {
158-
db.update(PartTable)
159-
.set({
160-
data: sql`json_set(
161-
json_set(
162-
json_set(${PartTable.data}, '$.state.status', 'error'),
163-
'$.state.error', 'Tool execution exceeded maximum allowed duration (watchdog)'
164-
),
165-
'$.state.time.end', ${now}
166-
)`,
167-
})
168-
.where(
169-
sql`${PartTable.id} = ${r.id}
170-
AND json_extract(${PartTable.data}, '$.state.status') = 'running'`,
171-
)
172-
.run()
173-
}
174-
175-
// --- Idle detection for subagent sessions ---
176-
// A session is "idle" when it has recorded activity (stream started)
177-
// but nothing has happened for longer than the idle threshold.
178-
// Only subagent sessions (those with a parent task tool among the
179-
// stuck set) are candidates — root/interactive sessions are exempt.
178+
// --- Independent idle detection sweep ---
179+
// Runs on every tick when idle param is provided, regardless of
180+
// whether any stuck tool parts were found above.
181+
// Iterates ALL tracked sessions in SessionActivity, not just
182+
// children of stuck task tools.
180183
if (idle) {
181-
// Collect child session IDs referenced by stuck task tools
182-
const children = new Set(stuck.filter((r) => r.tool === "task" && r.child).map((r) => r.child!))
183-
184-
// Sessions that currently have running tools are NOT idle — the tool
185-
// is doing work even though no Bus events are firing (e.g. a long
186-
// bash command, a web fetch, a large file read). Query once and
187-
// build a set so we skip them cheaply.
188-
const running = new Set(
189-
db
190-
.select({ sid: PartTable.session_id })
191-
.from(PartTable)
192-
.where(
193-
sql`json_extract(${PartTable.data}, '$.type') = 'tool'
194-
AND json_extract(${PartTable.data}, '$.state.status') = 'running'`,
195-
)
196-
.all()
197-
.map((r) => r.sid),
198-
)
184+
// Clean up stale pre-cancel entries that were never consumed
185+
const now = Date.now()
186+
for (const [id, ts] of SessionPrompt._precancelled) {
187+
if (now - ts > idle) SessionPrompt._precancelled.delete(id)
188+
}
199189

200-
for (const child of children) {
201-
if (cancelled.has(child)) continue
202-
if (running.has(child)) continue
203-
if (!SessionActivity.stale(child, idle)) continue
204-
const ts = SessionActivity.last(child)
205-
log.warn("watchdog: idle subagent detected", {
206-
sessionID: child,
190+
for (const [id] of Object.entries(SessionActivity.list())) {
191+
if (cancelled.has(id)) continue
192+
if (!SessionActivity.stale(id, idle)) continue
193+
const ts = SessionActivity.last(id)
194+
log.warn("watchdog: idle session detected", {
195+
sessionID: id,
207196
last: ts,
208197
threshold: idle,
209198
})
210-
cancelled.add(child)
211-
SessionPrompt.cancel(child)
199+
cancelled.add(id)
200+
SessionPrompt.cancel(id)
212201
}
213202
}
214203
})

0 commit comments

Comments
 (0)