docs: add example for running gpt-oss

donderom · donderom · commit a49dafc71dec · 2025-08-08T12:43:44.000+02:00
diff --git a/README.md b/README.md
@@ -86,7 +86,7 @@ llm.embeddings(prompt).foreach: embeddings =>
 llm.close()
 ```
 
-#### Self-contained [Scala CLI](https://scala-cli.virtuslab.org) example:
+#### Self-contained [Scala CLI](https://scala-cli.virtuslab.org) example (with basic [Llama 3](https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF) model):
 
 `Run.scala`:
 ```scala
@@ -113,3 +113,33 @@ object Main extends App:
 ```sh
 scala-cli Run.scala
 ```
+
+#### Self-contained [Scala CLI](https://scala-cli.virtuslab.org) example (with configured [gpt-oss](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) model):
+
+`Run.scala`:
+```scala
+//> using scala 3.3.0
+//> using jvm adoptium:17
+//> using java-opt --add-modules=jdk.incubator.foreign
+//> using java-opt --enable-native-access=ALL-UNNAMED
+//> using dep com.donderom::llm4s:0.13.0-b6109
+
+import com.donderom.llm4s.{ContextParams, Llm, LlmParams}
+import java.nio.file.Paths
+import scala.util.Using
+
+object Main extends App:
+  // System.load only accepts absolute paths, so resolve the relative one first
+  System.load(Paths.get("./build/bin/libllama.dylib").toAbsolutePath.toString)
+  val model = Paths.get("gpt-oss-20b-mxfp4.gguf")
+  val prompt = "What is LLM?"
+  // Enable flash attention; size = 0 uses the context size provided by the model
+  val params = LlmParams(context = ContextParams(size = 0, flashAttention = true))
+  Using(Llm(model)): llm =>                // llm : com.donderom.llm4s.Llm
+    llm(prompt, params).foreach: stream => // stream : LazyList[String]
+      stream.foreach(print)                // prints every String token
+```
+
+```sh
+scala-cli Run.scala
+```