#!/bin/bash
set -euo pipefail
cd "$(dirname "$0")"
AGENT_ID="${1:-$(whoami)}"
DATA_DIR="data/agents/${AGENT_ID}"
ADAPTER_DIR="${DATA_DIR}/adapter"
ADAPTER_DATA="${ADAPTER_DIR}/data"
# Training hyperparameters (override via environment)
ITERS=${TRAIN_ITERS:-1000}                    # total training iterations
NUM_LAYERS=${TRAIN_NUM_LAYERS:-16}            # number of layers to apply LoRA to
LEARNING_RATE=${TRAIN_LR:-1e-5}               # optimizer learning rate
BATCH_SIZE=${TRAIN_BATCH_SIZE:-1}             # examples per training step
MAX_SEQ_LENGTH=${TRAIN_MAX_SEQ_LENGTH:-2048}  # maximum tokens per example
SAVE_EVERY=${TRAIN_SAVE_EVERY:-200}           # checkpoint every N iterations
# Check training data exists
if [ ! -f "${ADAPTER_DATA}/train.jsonl" ]; then
echo "No training data at ${ADAPTER_DATA}/train.jsonl"
echo "Run the training pipeline first to generate Q&A pairs:"
echo " uv run python -m memory_server.training.train --agent-id ${AGENT_ID}"
exit 1
fi
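# Each line of train.jsonl is one JSON training example. mlx-lm accepts
# several dataset schemas; chat-style records pair naturally with the
# --mask-prompt flag used below (illustrative shape, not verified against
# this pipeline's output):
#   {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}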
PAIRS=$(wc -l < "${ADAPTER_DATA}/train.jsonl" | tr -d ' ')  # tr strips BSD wc's leading padding
echo "Training LoRA adapter for agent '${AGENT_ID}'"
echo " Data: ${PAIRS} Q&A pairs"
echo " Iterations: ${ITERS}, Layers: ${NUM_LAYERS}, LR: ${LEARNING_RATE}"
echo " Batch: ${BATCH_SIZE}, Max seq: ${MAX_SEQ_LENGTH}"
echo ""
# Train. --mask-prompt computes the loss only on completion tokens,
# --grad-checkpoint trades extra compute for lower memory use, and
# --num-layers restricts LoRA to the last N layers of the model.
uv run python -m mlx_lm lora \
--model NexVeridian/Qwen3.5-35B-A3B-4bit \
--train \
--data "${ADAPTER_DATA}" \
--adapter-path "${ADAPTER_DIR}" \
--batch-size "${BATCH_SIZE}" \
--num-layers "${NUM_LAYERS}" \
--iters "${ITERS}" \
--learning-rate "${LEARNING_RATE}" \
--save-every "${SAVE_EVERY}" \
--steps-per-report 10 \
--grad-checkpoint \
--max-seq-length "${MAX_SEQ_LENGTH}" \
--mask-prompt
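# Optional smoke test of the trained adapter (a sketch; assumes mlx-lm's
# generate subcommand, invoked the same way as lora above):
#   uv run python -m mlx_lm generate \
#     --model NexVeridian/Qwen3.5-35B-A3B-4bit \
#     --adapter-path "${ADAPTER_DIR}" \
#     --prompt "Sanity-check question here"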
# Remove intermediate checkpoints (one is written every ${SAVE_EVERY}
# iterations); only the final adapters.safetensors is kept
rm -f "${ADAPTER_DIR}"/0*_adapters.safetensors
echo ""
echo "Done! Adapter at: ${ADAPTER_DIR}/adapters.safetensors"
echo "mlx-lm server will pick it up on next request — no restart needed."