@@ -286,8 +286,7 @@ def forward(

         # Forward iterations
         z_H, z_L = carry.z_H, carry.z_L
-        z_L = self.L_level(z_L, z_H + input_embeddings, **seq_info)
-        z_H = self.L_level(z_H, z_L, **seq_info)
+        z_H = self.L_level(z_L, z_H + input_embeddings, **seq_info)

         # LM Outputs
         new_carry = TinyRecursiveReasoningModel_ACTV1InnerCarry(
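This hunk collapses the final refinement into a single call: rather than updating `z_L` from `(z_L, z_H + input_embeddings)` and then `z_H` from `(z_H, z_L)`, the answer latent `z_H` is now produced directly from `(z_L, z_H + input_embeddings)` through the shared `L_level` network. A minimal sketch of how the single-call update could sit inside the surrounding recursion (illustrative only; `n_latent_steps` and the loop shape are assumptions, not the repo's exact code):

```python
# Sketch, not the actual repo loop: z_L is the reasoning latent, z_H the answer latent.
for _ in range(n_latent_steps):  # hypothetical inner iteration count
    z_L = self.L_level(z_L, z_H + input_embeddings, **seq_info)  # refine reasoning latent
z_H = self.L_level(z_L, z_H + input_embeddings, **seq_info)  # single answer update (this commit)
```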
@@ -356,6 +355,39 @@ def forward(

         halted = is_last_step

+        # If training and ACT is enabled
+        if self.training and (self.config.halt_max_steps > 1):
+
+            # Halt signal
+            # NOTE: During evaluation, always use max steps; this guarantees the same number of halting steps inside a batch, for batching purposes.
+
+            if self.config.no_ACT_continue:
+                halted = halted | (q_halt_logits > 0)
+            else:
+                halted = halted | (q_halt_logits > q_continue_logits)
+
+            # Exploration
+            min_halt_steps = (
+                torch.rand_like(q_halt_logits) < self.config.halt_exploration_prob
+            ) * torch.randint_like(new_steps, low=2, high=self.config.halt_max_steps + 1)
+            halted = halted & (new_steps >= min_halt_steps)
+
+            if not self.config.no_ACT_continue:
+                # Compute target Q
+                # NOTE: No replay buffer and no target network for computing the target Q-value.
+                # As batch_size is large, there are many parallel envs.
+                # Similar concept as PQN: https://arxiv.org/abs/2407.04811
+                _, _, (next_q_halt_logits, next_q_continue_logits), _, _ = self.inner(
+                    new_inner_carry, new_current_data
+                )
+                outputs["target_q_continue"] = torch.sigmoid(
+                    torch.where(
+                        is_last_step,
+                        next_q_halt_logits,
+                        torch.maximum(next_q_halt_logits, next_q_continue_logits),
+                    )
+                )
+
         return (
             TinyRecursiveReasoningModel_ACTV1Carry(
                 new_inner_carry, new_steps, halted, new_current_data
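During training with ACT enabled, a sequence halts as soon as its halt head wins (against 0 in the `no_ACT_continue` variant, otherwise against the continue head), and an exploration term occasionally forces a random minimum number of steps so the Q-heads see longer rollouts. A self-contained sketch of that masking with made-up tensors (shapes and values are illustrative):

```python
import torch

q_halt_logits = torch.tensor([2.0, -1.0, 3.0, 0.5])  # per-sequence halt scores
new_steps = torch.tensor([1, 5, 2, 7])  # steps taken so far
halt_max_steps, halt_exploration_prob = 8, 0.5

halted = q_halt_logits > 0  # halt signal (no_ACT_continue variant)
# With prob. halt_exploration_prob, require a random minimum step count in [2, halt_max_steps];
# the boolean mask zeroes out min_halt_steps for non-exploring sequences.
min_halt_steps = (
    torch.rand_like(q_halt_logits) < halt_exploration_prob
) * torch.randint_like(new_steps, low=2, high=halt_max_steps + 1)
halted = halted & (new_steps >= min_halt_steps)
```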
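The bootstrapped target for the continue head follows the PQN idea referenced in the comment: no replay buffer or target network, just a second forward pass to obtain next-step Q-values, taking only the halt value when the step limit is reached and the greedy max otherwise. A worked example with standalone tensors (in the diff, the `next_q_*` logits come from the extra `self.inner` call):

```python
import torch

is_last_step = torch.tensor([False, True, False])
next_q_halt_logits = torch.tensor([0.2, 1.5, -0.3])
next_q_continue_logits = torch.tensor([0.8, -2.0, 0.1])

target_q_continue = torch.sigmoid(
    torch.where(
        is_last_step,
        next_q_halt_logits,  # at the step limit, continuing is impossible
        torch.maximum(next_q_halt_logits, next_q_continue_logits),  # greedy bootstrap
    )
)
print(target_q_continue)  # sigmoid of [0.8, 1.5, 0.1]
```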