forked from sanbuphy/learn-coding-agent
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbridgeDebug.ts
More file actions
135 lines (124 loc) · 4.81 KB
/
bridgeDebug.ts
File metadata and controls
135 lines (124 loc) · 4.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import { logForDebugging } from '../utils/debug.js'
import { BridgeFatalError } from './bridgeApi.js'
import type { BridgeApiClient } from './types.js'
/**
* Ant-only fault injection for manually testing bridge recovery paths.
*
* Real failure modes this targets (BQ 2026-03-12, 7-day window):
* poll 404 not_found_error — 147K sessions/week, dead onEnvironmentLost gate
* ws_closed 1002/1006 — 22K sessions/week, zombie poll after close
* register transient failure — residual: network blips during doReconnect
*
* Usage: /bridge-kick <subcommand> from the REPL while Remote Control is
* connected, then tail debug.log to watch the recovery machinery react.
*
* Module-level state is intentional here: one bridge per REPL process, the
* /bridge-kick slash command has no other way to reach into initBridgeCore's
* closures, and teardown clears the slot.
*/
/** One-shot fault to inject on the next matching api call. */
type BridgeFault = {
method:
| 'pollForWork'
| 'registerBridgeEnvironment'
| 'reconnectSession'
| 'heartbeatWork'
/** Fatal errors go through handleErrorStatus → BridgeFatalError. Transient
* errors surface as plain axios rejections (5xx / network). Recovery code
* distinguishes the two: fatal → teardown, transient → retry/backoff. */
kind: 'fatal' | 'transient'
status: number
errorType?: string
/** Remaining injections. Decremented on consume; removed at 0. */
count: number
}
export type BridgeDebugHandle = {
/** Invoke the transport's permanent-close handler directly. Tests the
* ws_closed → reconnectEnvironmentWithSession escalation (#22148). */
fireClose: (code: number) => void
/** Call reconnectEnvironmentWithSession() — same as SIGUSR2 but
* reachable from the slash command. */
forceReconnect: () => void
/** Queue a fault for the next N calls to the named api method. */
injectFault: (fault: BridgeFault) => void
/** Abort the at-capacity sleep so an injected poll fault lands
* immediately instead of up to 10min later. */
wakePollLoop: () => void
/** env/session IDs for the debug.log grep. */
describe: () => string
}
let debugHandle: BridgeDebugHandle | null = null
const faultQueue: BridgeFault[] = []
export function registerBridgeDebugHandle(h: BridgeDebugHandle): void {
debugHandle = h
}
export function clearBridgeDebugHandle(): void {
debugHandle = null
faultQueue.length = 0
}
export function getBridgeDebugHandle(): BridgeDebugHandle | null {
return debugHandle
}
export function injectBridgeFault(fault: BridgeFault): void {
faultQueue.push(fault)
logForDebugging(
`[bridge:debug] Queued fault: ${fault.method} ${fault.kind}/${fault.status}${fault.errorType ? `/${fault.errorType}` : ''} ×${fault.count}`,
)
}
/**
* Wrap a BridgeApiClient so each call first checks the fault queue. If a
* matching fault is queued, throw the specified error instead of calling
* through. Delegates everything else to the real client.
*
* Only called when USER_TYPE === 'ant' — zero overhead in external builds.
*/
export function wrapApiForFaultInjection(
api: BridgeApiClient,
): BridgeApiClient {
function consume(method: BridgeFault['method']): BridgeFault | null {
const idx = faultQueue.findIndex(f => f.method === method)
if (idx === -1) return null
const fault = faultQueue[idx]!
fault.count--
if (fault.count <= 0) faultQueue.splice(idx, 1)
return fault
}
function throwFault(fault: BridgeFault, context: string): never {
logForDebugging(
`[bridge:debug] Injecting ${fault.kind} fault into ${context}: status=${fault.status} errorType=${fault.errorType ?? 'none'}`,
)
if (fault.kind === 'fatal') {
throw new BridgeFatalError(
`[injected] ${context} ${fault.status}`,
fault.status,
fault.errorType,
)
}
// Transient: mimic an axios rejection (5xx / network). No .status on
// the error itself — that's how the catch blocks distinguish.
throw new Error(`[injected transient] ${context} ${fault.status}`)
}
return {
...api,
async pollForWork(envId, secret, signal, reclaimMs) {
const f = consume('pollForWork')
if (f) throwFault(f, 'Poll')
return api.pollForWork(envId, secret, signal, reclaimMs)
},
async registerBridgeEnvironment(config) {
const f = consume('registerBridgeEnvironment')
if (f) throwFault(f, 'Registration')
return api.registerBridgeEnvironment(config)
},
async reconnectSession(envId, sessionId) {
const f = consume('reconnectSession')
if (f) throwFault(f, 'ReconnectSession')
return api.reconnectSession(envId, sessionId)
},
async heartbeatWork(envId, workId, token) {
const f = consume('heartbeatWork')
if (f) throwFault(f, 'Heartbeat')
return api.heartbeatWork(envId, workId, token)
},
}
}