@@ -36,6 +36,7 @@ export async function InstanceBootstrap() {
3636 Snapshot . init ( )
3737 Truncate . init ( )
3838 SessionActivity . init ( )
39+ SessionPrompt . init ( )
3940 cleanupOrphanedParts ( )
4041 watchdog ( )
4142
@@ -110,105 +111,93 @@ export function watchdogTick(cutoff: number, idle?: number) {
110111 AND json_extract(${ PartTable . data } , '$.state.time.start') < ${ cutoff } ` ,
111112 )
112113 . all ( )
113- if ( stuck . length === 0 ) return
114114
115- // Sessions that contain at least one stuck tool
116- const stuckSessions = new Set ( stuck . map ( ( r ) => r . session_id ) )
117-
118- // A task tool whose child session also has stuck tools is just
119- // waiting — it will resolve once the child is cancelled.
120- // Everything else (non-task tools, or task tools whose child has
121- // no stuck tools) is a leaf that we must force-error.
122- const leaf = stuck . filter ( ( r ) => {
123- if ( r . tool !== "task" ) return true
124- if ( ! r . child ) return true
125- return ! stuckSessions . has ( r . child )
126- } )
115+ const cancelled = new Set < string > ( )
127116
128- log . warn ( "watchdog: found stuck tool parts" , {
129- total : stuck . length ,
130- leaf : leaf . length ,
131- ids : stuck . map ( ( r ) => r . id ) ,
132- } )
117+ if ( stuck . length > 0 ) {
118+ // Sessions that contain at least one stuck tool
119+ const stuckSessions = new Set ( stuck . map ( ( r ) => r . session_id ) )
120+
121+ // A task tool whose child session also has stuck tools is just
122+ // waiting — it will resolve once the child is cancelled.
123+ // Everything else (non-task tools, or task tools whose child has
124+ // no stuck tools) is a leaf that we must force-error.
125+ const leaf = stuck . filter ( ( r ) => {
126+ if ( r . tool !== "task" ) return true
127+ if ( ! r . child ) return true
128+ return ! stuckSessions . has ( r . child )
129+ } )
133130
134- if ( leaf . length === 0 ) return
131+ log . warn ( "watchdog: found stuck tool parts" , {
132+ total : stuck . length ,
133+ leaf : leaf . length ,
134+ ids : stuck . map ( ( r ) => r . id ) ,
135+ } )
135136
136- // For task-tool leaves, cancel the *child* session so the task tool's
137- // normal error-propagation path runs: child cancel → SessionPrompt.prompt()
138- // resolves → task tool returns structured TIMEOUT to the parent LLM.
139- // For non-task leaves, cancel the owning session directly.
140- const cancelled = new Set < string > ( )
141- for ( const r of leaf ) {
142- if ( r . tool === "task" && r . child ) {
143- if ( cancelled . has ( r . child ) ) continue
144- cancelled . add ( r . child )
145- log . warn ( "watchdog: cancelling stuck child session" , { child : r . child , parent : r . session_id } )
146- SessionPrompt . cancel ( r . child )
147- } else {
148- if ( cancelled . has ( r . session_id ) ) continue
149- cancelled . add ( r . session_id )
150- log . warn ( "watchdog: cancelling stuck session" , { sessionID : r . session_id } )
151- SessionPrompt . cancel ( r . session_id )
137+ if ( leaf . length > 0 ) {
138+ // For task-tool leaves, cancel the *child* session so the task tool's
139+ // normal error-propagation path runs: child cancel → SessionPrompt.prompt()
140+ // resolves → task tool returns structured TIMEOUT to the parent LLM.
141+ // For non-task leaves, cancel the owning session directly.
142+ for ( const r of leaf ) {
143+ if ( r . tool === "task" && r . child ) {
144+ if ( cancelled . has ( r . child ) ) continue
145+ cancelled . add ( r . child )
146+ log . warn ( "watchdog: cancelling stuck child session" , { child : r . child , parent : r . session_id } )
147+ SessionPrompt . cancel ( r . child )
148+ } else {
149+ if ( cancelled . has ( r . session_id ) ) continue
150+ cancelled . add ( r . session_id )
151+ log . warn ( "watchdog: cancelling stuck session" , { sessionID : r . session_id } )
152+ SessionPrompt . cancel ( r . session_id )
153+ }
154+ }
155+
156+ // DB update as redundant safety net — only for leaf tools
157+ const now = Date . now ( )
158+ for ( const r of leaf ) {
159+ db . update ( PartTable )
160+ . set ( {
161+ data : sql `json_set(
162+ json_set(
163+ json_set(${ PartTable . data } , '$.state.status', 'error'),
164+ '$.state.error', 'Tool execution exceeded maximum allowed duration (watchdog)'
165+ ),
166+ '$.state.time.end', ${ now }
167+ )` ,
168+ } )
169+ . where (
170+ sql `${ PartTable . id } = ${ r . id }
171+ AND json_extract(${ PartTable . data } , '$.state.status') = 'running'` ,
172+ )
173+ . run ( )
174+ }
152175 }
153176 }
154177
155- // DB update as redundant safety net — only for leaf tools
156- const now = Date . now ( )
157- for ( const r of leaf ) {
158- db . update ( PartTable )
159- . set ( {
160- data : sql `json_set(
161- json_set(
162- json_set(${ PartTable . data } , '$.state.status', 'error'),
163- '$.state.error', 'Tool execution exceeded maximum allowed duration (watchdog)'
164- ),
165- '$.state.time.end', ${ now }
166- )` ,
167- } )
168- . where (
169- sql `${ PartTable . id } = ${ r . id }
170- AND json_extract(${ PartTable . data } , '$.state.status') = 'running'` ,
171- )
172- . run ( )
173- }
174-
175- // --- Idle detection for subagent sessions ---
176- // A session is "idle" when it has recorded activity (stream started)
177- // but nothing has happened for longer than the idle threshold.
178- // Only subagent sessions (those with a parent task tool among the
179- // stuck set) are candidates — root/interactive sessions are exempt.
178+ // --- Independent idle detection sweep ---
179+ // Runs on every tick when idle param is provided, regardless of
180+ // whether any stuck tool parts were found above.
181+ // Iterates ALL tracked sessions in SessionActivity, not just
182+ // children of stuck task tools.
180183 if ( idle ) {
181- // Collect child session IDs referenced by stuck task tools
182- const children = new Set ( stuck . filter ( ( r ) => r . tool === "task" && r . child ) . map ( ( r ) => r . child ! ) )
183-
184- // Sessions that currently have running tools are NOT idle — the tool
185- // is doing work even though no Bus events are firing (e.g. a long
186- // bash command, a web fetch, a large file read). Query once and
187- // build a set so we skip them cheaply.
188- const running = new Set (
189- db
190- . select ( { sid : PartTable . session_id } )
191- . from ( PartTable )
192- . where (
193- sql `json_extract(${ PartTable . data } , '$.type') = 'tool'
194- AND json_extract(${ PartTable . data } , '$.state.status') = 'running'` ,
195- )
196- . all ( )
197- . map ( ( r ) => r . sid ) ,
198- )
184+ // Clean up stale pre-cancel entries that were never consumed
185+ const now = Date . now ( )
186+ for ( const [ id , ts ] of SessionPrompt . _precancelled ) {
187+ if ( now - ts > idle ) SessionPrompt . _precancelled . delete ( id )
188+ }
199189
200- for ( const child of children ) {
201- if ( cancelled . has ( child ) ) continue
202- if ( running . has ( child ) ) continue
203- if ( ! SessionActivity . stale ( child , idle ) ) continue
204- const ts = SessionActivity . last ( child )
205- log . warn ( "watchdog: idle subagent detected" , {
206- sessionID : child ,
190+ for ( const [ id ] of Object . entries ( SessionActivity . list ( ) ) ) {
191+ if ( cancelled . has ( id ) ) continue
192+ if ( ! SessionActivity . stale ( id , idle ) ) continue
193+ const ts = SessionActivity . last ( id )
194+ log . warn ( "watchdog: idle session detected" , {
195+ sessionID : id ,
207196 last : ts ,
208197 threshold : idle ,
209198 } )
210- cancelled . add ( child )
211- SessionPrompt . cancel ( child )
199+ cancelled . add ( id )
200+ SessionPrompt . cancel ( id )
212201 }
213202 }
214203 } )
0 commit comments