forked from sanbuphy/learn-coding-agent
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcontext.ts
More file actions
221 lines (198 loc) · 6.71 KB
/
context.ts
File metadata and controls
221 lines (198 loc) · 6.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
// biome-ignore-all assist/source/organizeImports: ANT-ONLY import markers must not be reordered
import { CONTEXT_1M_BETA_HEADER } from '../constants/betas.js'
import { getGlobalConfig } from './config.js'
import { isEnvTruthy } from './envUtils.js'
import { getCanonicalName } from './model/model.js'
import { getModelCapability } from './model/modelCapabilities.js'
// Default model context window size in tokens (200k). This is the fallback
// returned by getContextWindowForModel when no override, [1m] opt-in,
// capability data, or 1M beta/experiment applies.
export const MODEL_CONTEXT_WINDOW_DEFAULT = 200_000
// Maximum output tokens for compact operations
export const COMPACT_MAX_OUTPUT_TOKENS = 20_000
// Fallback default max output tokens, used when a model matches no known
// family (or an ant model entry does not specify its own default)
const MAX_OUTPUT_TOKENS_DEFAULT = 32_000
// Fallback upper limit for max output tokens, used in the same situations
const MAX_OUTPUT_TOKENS_UPPER_LIMIT = 64_000
// Capped default for slot-reservation optimization. BQ p99 output = 4,911
// tokens, so 32k/64k defaults over-reserve 8-16× slot capacity. With the cap
// enabled, <1% of requests hit the limit; those get one clean retry at 64k
// (see query.ts max_output_tokens_escalate). Cap is applied in
// claude.ts:getMaxOutputTokensForModel to avoid the growthbook→betas→context
// import cycle.
export const CAPPED_DEFAULT_MAX_TOKENS = 8_000
// NOTE(review): presumably the 64k budget used for the escalated retry
// described above — confirm against query.ts.
export const ESCALATED_MAX_TOKENS = 64_000
/**
 * Reports whether 1M context has been switched off through the
 * CLAUDE_CODE_DISABLE_1M_CONTEXT environment variable.
 * Used by C4E admins to disable 1M context for HIPAA compliance.
 *
 * @returns true when the variable holds a value that isEnvTruthy accepts.
 */
export function is1mContextDisabled(): boolean {
  const disableFlag = process.env.CLAUDE_CODE_DISABLE_1M_CONTEXT
  return isEnvTruthy(disableFlag)
}
/**
 * Reports whether the model string carries the explicit `[1m]` marker
 * (matched case-insensitively, anywhere in the string).
 *
 * Always false when 1M context is disabled via the environment.
 *
 * @param model - Model identifier as supplied by the caller.
 */
export function has1mContext(model: string): boolean {
  // Short-circuits exactly like the guard form: the regex only runs when
  // 1M context has not been disabled.
  return !is1mContextDisabled() && /\[1m\]/i.test(model)
}
// @[MODEL LAUNCH]: Update this pattern if the new model supports 1M context
/**
 * Reports whether the model's canonical name belongs to a family listed
 * here as 1M-capable.
 *
 * Always false when 1M context is disabled via the environment.
 *
 * @param model - Model identifier; canonicalized via getCanonicalName.
 */
export function modelSupports1M(model: string): boolean {
  if (is1mContextDisabled()) {
    return false
  }
  const canonicalName = getCanonicalName(model)
  const supportedFragments = ['claude-sonnet-4', 'opus-4-6']
  return supportedFragments.some(fragment => canonicalName.includes(fragment))
}
/**
 * Resolves the context window size (in tokens) to assume for a model.
 *
 * Resolution order — the first rule that applies wins:
 *  1. ant-only CLAUDE_CODE_MAX_CONTEXT_TOKENS override (positive integer)
 *  2. explicit `[1m]` marker in the model string
 *  3. max_input_tokens from getModelCapability (only when >= 100k), clamped
 *     to the default when 1M context is disabled
 *  4. 1M beta header on a model that supports 1M
 *  5. sonnet 1M experiment treatment
 *  6. ant-only contextWindow from resolveAntModel
 *  7. MODEL_CONTEXT_WINDOW_DEFAULT
 *
 * @param model - Model identifier.
 * @param betas - Optional list of beta headers in effect for the request.
 * @returns The context window size in tokens.
 */
export function getContextWindowForModel(
  model: string,
  betas?: string[],
): number {
  // Allow override via environment variable (ant-only).
  // This takes precedence over all other context window resolution, including
  // 1M detection, so users can cap the effective context window for local
  // decisions (auto-compact, etc.) while still using a 1M-capable endpoint.
  if (
    process.env.USER_TYPE === 'ant' &&
    process.env.CLAUDE_CODE_MAX_CONTEXT_TOKENS
  ) {
    const parsedOverride = parseInt(
      process.env.CLAUDE_CODE_MAX_CONTEXT_TOKENS,
      10,
    )
    if (!isNaN(parsedOverride) && parsedOverride > 0) return parsedOverride
  }

  // [1m] suffix — explicit client-side opt-in, respected over all detection
  if (has1mContext(model)) return 1_000_000

  const capability = getModelCapability(model)
  if (capability?.max_input_tokens && capability.max_input_tokens >= 100_000) {
    // A capability above the default window is clamped back to the default
    // when 1M context has been disabled.
    const clampToDefault =
      capability.max_input_tokens > MODEL_CONTEXT_WINDOW_DEFAULT &&
      is1mContextDisabled()
    return clampToDefault
      ? MODEL_CONTEXT_WINDOW_DEFAULT
      : capability.max_input_tokens
  }

  if (betas?.includes(CONTEXT_1M_BETA_HEADER) && modelSupports1M(model)) {
    return 1_000_000
  }

  if (getSonnet1mExpTreatmentEnabled(model)) return 1_000_000

  // The USER_TYPE comparison is kept inline (not cached in a local) so the
  // ant-only branch stays a literal `process.env` check.
  if (process.env.USER_TYPE === 'ant') {
    const antModel = resolveAntModel(model)
    if (antModel?.contextWindow) return antModel.contextWindow
  }

  return MODEL_CONTEXT_WINDOW_DEFAULT
}
/**
 * Reports whether the sonnet 1M experiment treatment is active for a model.
 *
 * All of the following must hold:
 *  - 1M context is not disabled via the environment
 *  - the model string has no explicit `[1m]` marker
 *  - the canonical model name contains `sonnet-4-6`
 *  - the global config's clientDataCache has `coral_reef_sonnet` set to the
 *    string `'true'`
 *
 * @param model - Model identifier.
 */
export function getSonnet1mExpTreatmentEnabled(model: string): boolean {
  // Evaluated left to right with short-circuiting, so the global config is
  // only read for sonnet 4.6 models without an explicit [1m] suffix.
  return (
    !is1mContextDisabled() &&
    !has1mContext(model) &&
    getCanonicalName(model).includes('sonnet-4-6') &&
    getGlobalConfig().clientDataCache?.['coral_reef_sonnet'] === 'true'
  )
}
/**
 * Calculate context window usage percentages from token usage data.
 *
 * The used percentage is the sum of input, cache-creation and cache-read
 * tokens divided by the context window size, rounded to the nearest integer
 * and clamped to the range 0–100. `remaining` is `100 - used`.
 *
 * @param currentUsage - Token counts for the current request, or null when
 *   no usage data is available.
 * @param contextWindowSize - Size of the context window in tokens.
 * @returns Used and remaining percentages. Both are null when there is no
 *   usage data, when `contextWindowSize` is not a positive number (zero,
 *   negative or NaN), or when the inputs do not yield a numeric percentage.
 */
export function calculateContextPercentages(
  currentUsage: {
    input_tokens: number
    cache_creation_input_tokens: number
    cache_read_input_tokens: number
  } | null,
  contextWindowSize: number,
): { used: number | null; remaining: number | null } {
  // `!(x > 0)` rejects zero, negatives and NaN in one comparison. Without
  // this guard a zero window yields 0/0 = NaN, which Math.min/Math.max
  // propagate into the result.
  if (!currentUsage || !(contextWindowSize > 0)) {
    return { used: null, remaining: null }
  }

  const totalInputTokens =
    currentUsage.input_tokens +
    currentUsage.cache_creation_input_tokens +
    currentUsage.cache_read_input_tokens

  const usedPercentage = Math.round(
    (totalInputTokens / contextWindowSize) * 100,
  )

  // NaN token counts (or Infinity / Infinity) cannot be expressed as a
  // percentage; report "unknown" instead of leaking NaN to callers.
  if (Number.isNaN(usedPercentage)) {
    return { used: null, remaining: null }
  }

  const clampedUsed = Math.min(100, Math.max(0, usedPercentage))

  return {
    used: clampedUsed,
    remaining: 100 - clampedUsed,
  }
}
/**
 * Looks up the built-in default/upper output-token limits for a canonical
 * model name by substring match. Checks run from most to least specific;
 * the first match wins, so the order of the checks is significant.
 */
function outputTokenLimitsForFamily(canonical: string): {
  default: number
  upperLimit: number
} {
  const has = (fragment: string): boolean => canonical.includes(fragment)

  if (has('opus-4-6')) {
    return { default: 64_000, upperLimit: 128_000 }
  }
  if (has('sonnet-4-6')) {
    return { default: 32_000, upperLimit: 128_000 }
  }
  if (has('opus-4-5') || has('sonnet-4') || has('haiku-4')) {
    return { default: 32_000, upperLimit: 64_000 }
  }
  if (has('opus-4-1') || has('opus-4')) {
    return { default: 32_000, upperLimit: 32_000 }
  }
  if (has('claude-3-opus')) {
    return { default: 4_096, upperLimit: 4_096 }
  }
  if (has('claude-3-sonnet')) {
    return { default: 8_192, upperLimit: 8_192 }
  }
  if (has('claude-3-haiku')) {
    return { default: 4_096, upperLimit: 4_096 }
  }
  if (has('3-5-sonnet') || has('3-5-haiku')) {
    return { default: 8_192, upperLimit: 8_192 }
  }
  if (has('3-7-sonnet')) {
    return { default: 32_000, upperLimit: 64_000 }
  }
  return {
    default: MAX_OUTPUT_TOKENS_DEFAULT,
    upperLimit: MAX_OUTPUT_TOKENS_UPPER_LIMIT,
  }
}

/**
 * Returns the model's default and upper limit for max output tokens.
 *
 * For ant users, a model resolved by resolveAntModel supplies both values
 * (falling back to the module defaults for any field it leaves unset) and
 * nothing else is consulted. Otherwise the limits come from the canonical
 * model name's family, after which a max_tokens value of at least 4,096
 * from getModelCapability replaces the upper limit and caps the default.
 *
 * @param model - Model identifier.
 * @returns The default and upper-limit output token counts.
 */
export function getModelMaxOutputTokens(model: string): {
  default: number
  upperLimit: number
} {
  if (process.env.USER_TYPE === 'ant') {
    const antModel = resolveAntModel(model.toLowerCase())
    if (antModel) {
      return {
        default: antModel.defaultMaxTokens ?? MAX_OUTPUT_TOKENS_DEFAULT,
        upperLimit:
          antModel.upperMaxTokensLimit ?? MAX_OUTPUT_TOKENS_UPPER_LIMIT,
      }
    }
  }

  const familyLimits = outputTokenLimitsForFamily(getCanonicalName(model))

  const capability = getModelCapability(model)
  if (capability?.max_tokens && capability.max_tokens >= 4_096) {
    // The reported capability becomes the upper limit; the default is
    // lowered if it would otherwise exceed it.
    return {
      default: Math.min(familyLimits.default, capability.max_tokens),
      upperLimit: capability.max_tokens,
    }
  }

  return familyLimits
}
/**
 * Returns the max thinking budget tokens for a given model: one token less
 * than the model's upper output-token limit, since the thinking budget
 * should be strictly less than the max output tokens.
 *
 * Deprecated since newer models use adaptive thinking rather than a
 * strict thinking token budget.
 *
 * @param model - Model identifier.
 */
export function getMaxThinkingTokensForModel(model: string): number {
  const { upperLimit } = getModelMaxOutputTokens(model)
  return upperLimit - 1
}