Add thinking

Diegi97 · Diegi97 · commit 69c65beda145 · 2025-03-12T17:36:21.000+01:00
diff --git a/back/back/apps/broker/serializers/rpc.py b/back/back/apps/broker/serializers/rpc.py
@@ -91,6 +91,16 @@ class CacheConfigSerializer(serializers.Serializer):
     name = serializers.CharField(required=False, allow_null=True)
 
 
+class ThinkingField(serializers.Field):
+    """Pass-through field that accepts either a string or a dictionary."""
+    def to_internal_value(self, data):
+        # No validation or conversion: the incoming value is returned unchanged.
+        return data
+
+    def to_representation(self, value):
+        return value
+
+
 class RPCLLMRequestSerializer(serializers.Serializer):
     """
     Represents the LLM requests coming from the RPC server
@@ -112,6 +122,8 @@ class RPCLLMRequestSerializer(serializers.Serializer):
         The seed to use in the LLM
     stream: bool
         Whether the LLM response should be streamed or not
+    thinking: str or Dict
+        The reasoning ("thinking") setting to pass to the LLM, given as either a string or a dict
     """
 
     llm_config_name = serializers.CharField(required=True, allow_blank=False, allow_null=False)
@@ -121,6 +133,7 @@ class RPCLLMRequestSerializer(serializers.Serializer):
     temperature = serializers.FloatField(default=0.7, required=False)
     max_tokens = serializers.IntegerField(default=1024, required=False)
     seed = serializers.IntegerField(default=42, required=False)
+    thinking = ThinkingField(default=None, required=False, allow_null=True)
     tools = serializers.ListField(
         child=serializers.DictField(), allow_empty=True, required=False, allow_null=True
     )
diff --git a/back/back/apps/language_model/consumers/__init__.py b/back/back/apps/language_model/consumers/__init__.py
@@ -1,7 +1,7 @@
 import json
 import uuid
 from logging import getLogger
-from typing import Awaitable, Callable, Dict, List, Optional
+from typing import Awaitable, Callable, Dict, List, Optional, Union
 
 from channels.db import database_sync_to_async
 from channels.generic.websocket import AsyncJsonWebsocketConsumer
@@ -205,6 +205,7 @@ async def query_llm(
     temperature: float = 0.7,
     max_tokens: int = 1024,
     seed: int = 42,
+    thinking: Union[str, Dict] = None,
     tools: List[Dict] = None,
     tool_choice: str = None,
     use_conversation_context: bool = True,
@@ -303,6 +304,7 @@ async def query_llm(
                 temperature=temperature,
                 max_tokens=max_tokens,
                 seed=seed,
+                thinking=thinking,
                 cache_config=cache_config,
             )
             async for res in response:
@@ -321,6 +323,7 @@ async def query_llm(
                 temperature=temperature,
                 max_tokens=max_tokens,
                 seed=seed,
+                thinking=thinking,
                 tools=tools,
                 tool_choice=tool_choice,
                 cache_config=cache_config,
@@ -432,6 +435,7 @@ async def process_llm_request(self, data):
             data.get("temperature"),
             data.get("max_tokens"),
             data.get("seed"),
+            data.get("thinking"),
             data.get("tools"),
             data.get("tool_choice"),
             data.get("use_conversation_context"),
diff --git a/chat_rag/chat_rag/llms/claude_client.py b/chat_rag/chat_rag/llms/claude_client.py
@@ -11,7 +11,7 @@
 
 
 class ClaudeChatModel(LLM):
-    def __init__(self, llm_name: str = "claude-3-opus-20240229", **kwargs) -> None:
+    def __init__(self, llm_name: str = "claude-3-7-sonnet-latest", **kwargs) -> None:
         self.llm_name = llm_name
         self.client = Anthropic(
             api_key=os.environ.get("ANTHROPIC_API_KEY"),
@@ -85,7 +85,7 @@ def format_content(message: Union[Dict, Message]):
             return content_list
 
         messages_formatted = [
-            {"role": message["role"], "content": format_content(message)}
+            {"role": message["role"] if isinstance(message, Dict) else message.role, "content": format_content(message)}
             for message in messages
         ]
 
@@ -102,6 +102,7 @@ def _map_anthropic_message(self, message) -> Message:
         """
         content_list = []
         for part in message.content:
+            # TODO: Handle thinking output block types
             if part.type == "text":
                 content_list.append(
                     Content(
@@ -135,6 +136,7 @@ def stream(
         temperature: float = 0.2,
         max_tokens: int = 1024,
         seed: int = None,
+        thinking: dict = None,
         **kwargs,
     ):
         """
@@ -157,18 +159,24 @@ def stream(
             temperature=temperature,
             max_tokens=max_tokens,
             stream=True,
+            thinking=thinking if thinking else NOT_GIVEN,
         )
 
         for event in stream:
             if event.type == "content_block_delta":
-                yield event.delta.text
+                if event.delta.type == "thinking_delta":
+                    pass # Pass for now until I figure out a common interface for thinking
+                    # yield event.delta.thinking
+                elif event.delta.type == "text_delta":
+                    yield event.delta.text
 
     async def astream(
         self,
         messages: List[Union[Dict, Message]],
         temperature: float = 0.2,
         max_tokens: int = 1024,
         seed: int = None,
+        thinking: dict = None,
         **kwargs,
     ):
         """
@@ -191,18 +199,24 @@ async def astream(
             temperature=temperature,
             max_tokens=max_tokens,
             stream=True,
+            thinking=thinking if thinking else NOT_GIVEN,
         )
 
         async for event in stream:
             if event.type == "content_block_delta":
-                yield event.delta.text
+                if event.delta.type == "thinking_delta":
+                    pass # Pass for now until I figure out a common interface for thinking
+                    # yield event.delta.thinking
+                elif event.delta.type == "text_delta":
+                    yield event.delta.text
 
     def generate(
         self,
         messages: List[Union[Dict, Message]],
         temperature: float = 0.2,
         max_tokens: int = 1024,
         seed: int = None,
+        thinking: dict = None,
         tools: List[Union[Callable, Dict]] = None,
         tool_choice: str = None,
         **kwargs,
@@ -232,6 +246,7 @@ def generate(
             temperature=temperature,
             max_tokens=max_tokens,
             **tool_kwargs,
+            thinking=thinking if thinking else NOT_GIVEN,
         )
 
         return self._map_anthropic_message(message)
@@ -242,6 +257,7 @@ async def agenerate(
         temperature: float = 0.2,
         max_tokens: int = 1024,
         seed: int = None,
+        thinking: dict = None,
         tools: List[Union[Callable, Dict]] = None,
         tool_choice: str = None,
         **kwargs,
@@ -271,6 +287,7 @@ async def agenerate(
             temperature=temperature,
             max_tokens=max_tokens,
             **tool_kwargs,
+            thinking=thinking if thinking else NOT_GIVEN,
         )
 
         return self._map_anthropic_message(message)
diff --git a/chat_rag/chat_rag/llms/openai_client.py b/chat_rag/chat_rag/llms/openai_client.py
@@ -1,7 +1,7 @@
 import json
 from typing import Callable, Dict, List, Union
 
-from openai import AsyncOpenAI, OpenAI
+from openai import AsyncOpenAI, OpenAI, NOT_GIVEN
 from openai.lib._pydantic import _ensure_strict_json_schema
 
 from chat_rag.llms.types import Content, Message, ToolUse, Usage
@@ -142,6 +142,7 @@ def stream(
         temperature: float = 1.0,
         max_tokens: int = 1024,
         seed: int = None,
+        thinking: str = NOT_GIVEN,
         **kwargs,
     ):
         """
@@ -161,10 +162,11 @@ def stream(
             model=self.llm_name,
             messages=messages,
             temperature=temperature,
-            max_tokens=max_tokens,
+            max_completion_tokens=max_tokens,
             seed=seed,
             n=1,
             stream=True,
+            reasoning_effort=thinking,
         )
         for chunk in response:
             if chunk.choices[0].finish_reason == "stop":
@@ -178,6 +180,7 @@ async def astream(
         temperature: float = 1.0,
         max_tokens: int = 1024,
         seed: int = None,
+        thinking: str = NOT_GIVEN,
         **kwargs,
     ):
         """
@@ -196,10 +199,11 @@ async def astream(
             model=self.llm_name,
             messages=messages,
             temperature=temperature,
-            max_tokens=max_tokens,
+            max_completion_tokens=max_tokens,
             seed=seed,
             n=1,
             stream=True,
+            reasoning_effort=thinking,
         )
         async for chunk in response:
             if chunk.choices[0].finish_reason == "stop":
@@ -213,6 +217,7 @@ def generate(
         temperature: float = 1.0,
         max_tokens: int = 1024,
         seed: int = None,
+        thinking: str = NOT_GIVEN,
         tools: List[Union[Callable, Dict]] = None,
         tool_choice: str = None,
         **kwargs,
@@ -237,8 +242,9 @@ def generate(
             model=self.llm_name,
             messages=messages,
             temperature=temperature,
-            max_tokens=max_tokens,
+            max_completion_tokens=max_tokens,
             seed=seed,
+            reasoning_effort=thinking,
             n=1,
             tools=tools,
             tool_choice=tool_choice,
@@ -253,6 +259,7 @@ async def agenerate(
         temperature: float = 1.0,
         max_tokens: int = 1024,
         seed: int = None,
+        thinking: str = NOT_GIVEN,
         tools: List[Union[Callable, Dict]] = None,
         tool_choice: str = None,
         **kwargs,
@@ -276,8 +283,9 @@ async def agenerate(
             model=self.llm_name,
             messages=messages,
             temperature=temperature,
-            max_tokens=max_tokens,
+            max_completion_tokens=max_tokens,
             seed=seed,
+            reasoning_effort=thinking,
             n=1,
             tools=tools,
             tool_choice=tool_choice,
diff --git a/chat_rag/poetry.lock b/chat_rag/poetry.lock
diff --git a/chat_rag/pyproject.toml b/chat_rag/pyproject.toml
@@ -17,8 +17,8 @@ certifi = "^2023.7.22"
 urllib3 = "^1.26.18"
 aiohttp = "^3.8.5"
 cryptography = "^41.0.4"
-openai = "^1.33.0"
-anthropic = "0.28.0"
+openai = "1.66.2"
+anthropic = "0.49.0"
 mistralai = "0.4.0"
 docstring-parser = "^0.16"
 torch = {version = "2.3.0", optional = true}
diff --git a/sdk/chatfaq_sdk/__init__.py b/sdk/chatfaq_sdk/__init__.py
@@ -328,6 +328,7 @@ async def send_llm_request(
         temperature,
         max_tokens,
         seed,
+        thinking,
         tools,
         tool_choice,
         conversation_id,
@@ -353,6 +354,7 @@ async def send_llm_request(
                         "temperature": temperature,
                         "max_tokens": max_tokens,
                         "seed": seed,
+                        "thinking": thinking,
                         "tools": tools,
                         "tool_choice": tool_choice,
                         "use_conversation_context": use_conversation_context,
diff --git a/sdk/chatfaq_sdk/clients/__init__.py b/sdk/chatfaq_sdk/clients/__init__.py
diff --git a/sdk/chatfaq_sdk/clients/agent.py b/sdk/chatfaq_sdk/clients/agent.py