1
+ import json
1
2
from enum import Enum
3
+ import llm
2
4
from llm import (
3
5
AsyncKeyModel ,
4
6
KeyModel ,
@@ -174,6 +176,7 @@ def __init__(
174
176
if reasoning :
175
177
options .append (ReasoningOptions )
176
178
self .Options = combine_options (* options )
179
+ self .supports_tools = True
177
180
178
181
def __str__ (self ):
179
182
return f"OpenAI: { self .model_id } "
@@ -219,9 +222,30 @@ def _build_messages(self, prompt, conversation):
219
222
messages .append (
220
223
{"role" : "user" , "content" : prev_response .prompt .prompt }
221
224
)
222
- messages .append (
223
- {"role" : "assistant" , "content" : prev_response .text_or_raise ()}
224
- )
225
+ for tool_result in getattr (prev_response .prompt , "tool_results" , []):
226
+ if not tool_result .tool_call_id :
227
+ continue
228
+ messages .append (
229
+ {
230
+ "type" : "function_call_output" ,
231
+ "call_id" : tool_result .tool_call_id ,
232
+ "output" : tool_result .output ,
233
+ }
234
+ )
235
+ prev_text = prev_response .text_or_raise ()
236
+ if prev_text :
237
+ messages .append ({"role" : "assistant" , "content" : prev_text })
238
+ tool_calls = prev_response .tool_calls_or_raise ()
239
+ if tool_calls :
240
+ for tool_call in tool_calls :
241
+ messages .append (
242
+ {
243
+ "type" : "function_call" ,
244
+ "call_id" : tool_call .tool_call_id ,
245
+ "name" : tool_call .name ,
246
+ "arguments" : json .dumps (tool_call .arguments ),
247
+ }
248
+ )
225
249
if prompt .system and prompt .system != current_system :
226
250
messages .append ({"role" : "system" , "content" : prompt .system })
227
251
if not prompt .attachments :
@@ -233,6 +257,16 @@ def _build_messages(self, prompt, conversation):
233
257
for attachment in prompt .attachments :
234
258
attachment_message .append (_attachment (attachment , image_detail ))
235
259
messages .append ({"role" : "user" , "content" : attachment_message })
260
+ for tool_result in getattr (prompt , "tool_results" , []):
261
+ if not tool_result .tool_call_id :
262
+ continue
263
+ messages .append (
264
+ {
265
+ "type" : "function_call_output" ,
266
+ "call_id" : tool_result .tool_call_id ,
267
+ "output" : tool_result .output ,
268
+ }
269
+ )
236
270
return messages
237
271
238
272
def _build_kwargs (self , prompt , conversation ):
@@ -248,6 +282,27 @@ def _build_kwargs(self, prompt, conversation):
248
282
value = getattr (prompt .options , option , None )
249
283
if value is not None :
250
284
kwargs [option ] = value
285
+
286
+ if prompt .tools :
287
+ tool_defs = []
288
+ for tool in prompt .tools :
289
+ if not getattr (tool , "name" , None ):
290
+ continue
291
+ parameters = tool .input_schema or {
292
+ "type" : "object" ,
293
+ "properties" : {},
294
+ }
295
+ tool_defs .append (
296
+ {
297
+ "type" : "function" ,
298
+ "name" : tool .name ,
299
+ "description" : tool .description or None ,
300
+ "parameters" : parameters ,
301
+ "strict" : False ,
302
+ }
303
+ )
304
+ if tool_defs :
305
+ kwargs ["tools" ] = tool_defs
251
306
if self .supports_schema and prompt .schema :
252
307
kwargs ["text" ] = {
253
308
"format" : {
@@ -258,17 +313,149 @@ def _build_kwargs(self, prompt, conversation):
258
313
}
259
314
return kwargs
260
315
261
- def _handle_event (self , event , response ):
262
- if event .type == "response.output_text.delta" :
316
+ def _update_tool_call_from_event (self , event , _tc_buf ):
317
+ """
318
+ Accumulate streaming tool-call args by tool_call_id.
319
+ _tc_buf is a dict[id] -> {"name": str, "arguments": str}
320
+ """
321
+ et = getattr (event , "type" , None )
322
+ # Python SDK surfaces rich objects; also support dict fallbacks:
323
+ obj = getattr (event , "to_dict" , None )
324
+ if callable (obj ):
325
+ payload = event .to_dict ()
326
+ else :
327
+ payload = getattr (event , "__dict__" , {}) or {}
328
+
329
+ # The SDK emits specific typed events for tool-calls; normalize:
330
+ # Expected shapes (SDK may differ slightly by version):
331
+ # - response.tool_call.delta => { "id", "type":"function", "name"?, "arguments_delta" }
332
+ # - response.tool_call.completed => { "id", "type":"function", "name", "arguments" }
333
+ # Keep this resilient by checking common fields.
334
+ item = payload .get ("response" , payload .get ("data" , payload ))
335
+ tool = item .get ("tool_call" ) if isinstance (item , dict ) else None
336
+ if not tool and "tool_call" in payload :
337
+ tool = payload ["tool_call" ]
338
+
339
+ # Some SDKs put fields at top-level for tool events:
340
+ if (
341
+ tool is None
342
+ and ("tool_call_id" in payload or "id" in payload )
343
+ and (
344
+ "arguments_delta" in payload
345
+ or "arguments" in payload
346
+ or "name" in payload
347
+ )
348
+ ):
349
+ tool = payload
350
+
351
+ if not tool :
352
+ return None
353
+
354
+ tool_id = tool .get ("id" ) or tool .get ("tool_call_id" )
355
+ if not tool_id :
356
+ return None
357
+
358
+ entry = _tc_buf .setdefault (tool_id , {"name" : None , "arguments" : "" })
359
+
360
+ # Name may arrive early or only at completion:
361
+ if tool .get ("name" ):
362
+ entry ["name" ] = tool ["name" ]
363
+
364
+ # Streaming deltas:
365
+ if "arguments_delta" in tool and tool ["arguments_delta" ]:
366
+ entry ["arguments" ] += tool ["arguments_delta" ]
367
+
368
+ # Completion:
369
+ if (
370
+ "arguments" in tool
371
+ and tool ["arguments" ]
372
+ and not tool .get ("arguments_delta" )
373
+ ):
374
+ entry ["arguments" ] = tool ["arguments" ]
375
+ return tool_id # signal completion for this id
376
+
377
+ return None
378
+
379
+ def _finalize_streaming_tool_calls (self , response , _tc_buf ):
380
+ # Called when we know streaming has finished or when a tool_call completed event fires.
381
+ for tool_id , data in list (_tc_buf .items ()):
382
+ if data .get ("name" ) and data .get ("arguments" ) is not None :
383
+ self ._add_tool_call (
384
+ response ,
385
+ tool_id ,
386
+ data .get ("name" ),
387
+ data .get ("arguments" ),
388
+ )
389
+ del _tc_buf [tool_id ]
390
+
391
def _add_tool_call(self, response, tool_id, name, arguments):
    """
    Record a single tool call on *response*.

    *arguments* is the raw JSON string from the API; it is parsed when
    possible, otherwise attached verbatim so the caller still sees it.
    """
    try:
        parsed_arguments = json.loads(arguments or "{}")
    except (TypeError, ValueError):
        # Narrowed from a bare `except Exception`: ValueError covers
        # json.JSONDecodeError; TypeError covers non-string payloads
        # some SDK versions hand back.
        parsed_arguments = arguments or ""
    response.add_tool_call(
        llm.ToolCall(
            tool_call_id=tool_id,
            name=name or "unknown_tool",
            arguments=parsed_arguments,
        )
    )
403
+
404
+ def _add_tool_calls_from_output (self , response , output ):
405
+ if not output :
406
+ return
407
+ for item in output :
408
+ if hasattr (item , "model_dump" ):
409
+ data = item .model_dump ()
410
+ elif isinstance (item , dict ):
411
+ data = item
412
+ else :
413
+ data = getattr (item , "__dict__" , {}) or {}
414
+
415
+ itype = data .get ("type" )
416
+ if itype not in {"tool_call" , "function_call" }:
417
+ continue
418
+
419
+ tool_id = (
420
+ data .get ("call_id" )
421
+ or data .get ("id" )
422
+ or data .get ("tool_call_id" )
423
+ or f"call_{ len (output )} "
424
+ )
425
+ name = data .get ("name" ) or "unknown_tool"
426
+ arguments = data .get ("arguments" ) or "{}"
427
+ self ._add_tool_call (response , tool_id , name , arguments )
428
+
429
+ def _handle_event (self , event , response , _tc_buf = None ):
430
+ et = getattr (event , "type" , None )
431
+ if et == "response.output_text.delta" :
263
432
return event .delta
264
- elif event .type == "response.completed" :
433
+
434
+ # Accumulate tool-call pieces if provided
435
+ if _tc_buf is not None and et and "tool_call" in et :
436
+ completed_id = self ._update_tool_call_from_event (event , _tc_buf )
437
+ if completed_id :
438
+ # finalize this single tool call immediately
439
+ entry = _tc_buf .pop (completed_id )
440
+ self ._finalize_streaming_tool_calls (response , {completed_id : entry })
441
+
442
+ if et == "response.completed" :
265
443
response .response_json = event .response .model_dump ()
266
444
self .set_usage (response , event .response .usage )
445
+ self ._add_tool_calls_from_output (
446
+ response , getattr (event .response , "output" , None )
447
+ )
448
+ # finalize any remaining buffered tool-calls
449
+ if _tc_buf :
450
+ self ._finalize_streaming_tool_calls (response , _tc_buf )
267
451
return None
268
452
269
453
def _finish_non_streaming_response (self , response , client_response ):
270
454
response .response_json = client_response .model_dump ()
271
455
self .set_usage (response , client_response .usage )
456
+ self ._add_tool_calls_from_output (
457
+ response , getattr (client_response , "output" , None )
458
+ )
272
459
273
460
274
461
class ResponsesModel (_SharedResponses , KeyModel ):
@@ -284,13 +471,17 @@ def execute(
284
471
kwargs = self ._build_kwargs (prompt , conversation )
285
472
kwargs ["stream" ] = stream
286
473
if stream :
474
+ # Buffer for assembling tool-call deltas across events
475
+ _tc_buf = {}
287
476
for event in client .responses .create (** kwargs ):
288
- delta = self ._handle_event (event , response )
477
+ delta = self ._handle_event (event , response , _tc_buf )
289
478
if delta is not None :
290
479
yield delta
291
480
else :
292
481
client_response = client .responses .create (** kwargs )
293
- yield client_response .output_text
482
+ text = getattr (client_response , "output_text" , None )
483
+ if text :
484
+ yield text
294
485
self ._finish_non_streaming_response (response , client_response )
295
486
296
487
@@ -307,13 +498,16 @@ async def execute(
307
498
kwargs = self ._build_kwargs (prompt , conversation )
308
499
kwargs ["stream" ] = stream
309
500
if stream :
501
+ _tc_buf = {}
310
502
async for event in await client .responses .create (** kwargs ):
311
- delta = self ._handle_event (event , response )
503
+ delta = self ._handle_event (event , response , _tc_buf )
312
504
if delta is not None :
313
505
yield delta
314
506
else :
315
507
client_response = await client .responses .create (** kwargs )
316
- yield client_response .output_text
508
+ text = getattr (client_response , "output_text" , None )
509
+ if text :
510
+ yield text
317
511
self ._finish_non_streaming_response (response , client_response )
318
512
319
513
0 commit comments