Commit 3247632
Do not set the ctx-size by default
llama.cpp defaults to a ctx-size of 4096, but we were hard-coding 2048, which means we were never using the upstream default. This PR changes the default to ctx-size=0: the option is only added to the command when the value is > 0, so otherwise the llama-server default is used. We also hard-coded cache_reuse=256 with no way for the user to override it; this PR adds support for setting cache_reuse in ramalama.conf and on the command line.

Signed-off-by: Daniel J Walsh <[email protected]>
1 parent b0e1226 · commit 3247632

File tree: 9 files changed, +43 -13 lines
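
The intended usage is unchanged; only the defaults become overridable. A hedged sketch of the two paths (MODEL is a placeholder; the flag spellings come from the docs and tests changed below):

```bash
# Let llama-server pick its own context size; cache-reuse falls back to the
# ramalama.conf default of 256
ramalama serve MODEL

# Override both values explicitly on the command line (as the system test below does)
ramalama --dryrun run --cache-reuse 512 -c 4096 MODEL
```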

docs/ramalama-perplexity.1.md

Lines changed: 4 additions & 1 deletion

```diff
@@ -29,8 +29,11 @@ URL support means if a model is on a web site or even on your local system, you
 #### **--authfile**=*password*
 path of the authentication file for OCI registries
 
+#### **--cache-reuse**=256
+Min chunk size to attempt reusing from the cache via KV shifting
+
 #### **--ctx-size**, **-c**
-size of the prompt context. This option is also available as **--max-model-len**. Applies to llama.cpp and vllm regardless of alias (default: 2048, 0 = loaded from model)
+size of the prompt context. This option is also available as **--max-model-len**. Applies to llama.cpp and vllm regardless of alias (default: 4096, 0 = loaded from model)
 
 #### **--device**
 Add a host device to the container. Optional permissions parameter can
```

docs/ramalama-run.1.md

Lines changed: 4 additions & 1 deletion

```diff
@@ -33,12 +33,15 @@ The default can be overridden in the ramalama.conf file.
 #### **--authfile**=*password*
 path of the authentication file for OCI registries
 
+#### **--cache-reuse**=256
+Min chunk size to attempt reusing from the cache via KV shifting
+
 #### **--color**
 Indicate whether or not to use color in the chat.
 Possible values are "never", "always" and "auto". (default: auto)
 
 #### **--ctx-size**, **-c**
-size of the prompt context. This option is also available as **--max-model-len**. Applies to llama.cpp and vllm regardless of alias (default: 2048, 0 = loaded from model)
+size of the prompt context. This option is also available as **--max-model-len**. Applies to llama.cpp and vllm regardless of alias (default: 4096, 0 = loaded from model)
 
 #### **--device**
 Add a host device to the container. Optional permissions parameter can
```

docs/ramalama-serve.1.md

Lines changed: 5 additions & 2 deletions

```diff
@@ -57,8 +57,11 @@ The default can be overridden in the ramalama.conf file.
 #### **--authfile**=*password*
 Path of the authentication file for OCI registries
 
+#### **--cache-reuse**=256
+Min chunk size to attempt reusing from the cache via KV shifting
+
 #### **--ctx-size**, **-c**
-size of the prompt context. This option is also available as **--max-model-len**. Applies to llama.cpp and vllm regardless of alias (default: 2048, 0 = loaded from model)
+size of the prompt context. This option is also available as **--max-model-len**. Applies to llama.cpp and vllm regardless of alias (default: 4096, 0 = loaded from model)
 
 #### **--detach**, **-d**
 Run the container in the background and print the new container ID.
@@ -426,7 +429,7 @@ spec:
       - name: model-server
         image: quay.io/ramalama/ramalama:0.8
         command: ["llama-server"]
-        args: ['--port', '8081', '--model', '/mnt/models/model.file', '--alias', 'quay.io/rhatdan/granite:latest', '--ctx-size', 2048, '--temp', '0.8', '--jinja', '--cache-reuse', '256', '-v', '--threads', 16, '--host', '127.0.0.1']
+        args: ['--port', '8081', '--model', '/mnt/models/model.file', '--alias', 'quay.io/rhatdan/granite:latest', '--temp', '0.8', '--jinja', '--cache-reuse', '256', '-v', '--threads', 16, '--host', '127.0.0.1']
         securityContext:
           allowPrivilegeEscalation: false
           capabilities:
```

docs/ramalama.conf

Lines changed: 5 additions & 2 deletions

```diff
@@ -31,10 +31,13 @@
 #
 #container = true
 
-#size of the prompt context (0 = loaded from model)
+#Min chunk size to attempt reusing from the cache via KV shifting
 #
-#ctx_size=2048
+#cache_reuse=256
 
+#size of the prompt context (0 = loaded from model)
+#
+#ctx_size=0
 
 # Run RamaLama using the specified container engine.
 #
```
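
As a hedged companion to the commented defaults above, this is roughly what a user override could look like in ramalama.conf; the `[ramalama]` table name is assumed here, the values are illustrative, and only the `cache_reuse` and `ctx_size` keys come from this change:

```toml
[ramalama]
# Reuse KV-cache chunks of at least 512 tokens instead of the shipped default of 256
cache_reuse = 512
# Pin the prompt context instead of deferring to the runtime's own default (0)
ctx_size = 8192
```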

docs/ramalama.conf.5.md

Lines changed: 5 additions & 1 deletion

```diff
@@ -71,12 +71,16 @@ OCI model car image
 
 Image to be used when building and pushing --type=car models
 
+**cache_reuse**=256
+
+Min chunk size to attempt reusing from the cache via KV shifting
+
 **container**=true
 
 Run RamaLama in the default container.
 RAMALAMA_IN_CONTAINER environment variable overrides this field.
 
-**ctx_size**=2048
+**ctx_size**=0
 
 Size of the prompt context (0 = loaded from model)
 
```
ramalama/cli.py

Lines changed: 10 additions & 0 deletions

```diff
@@ -776,17 +776,27 @@ def runtime_options(parser, command):
     )
     parser.add_argument("--authfile", help="path of the authentication file")
     if command in ["run", "perplexity", "serve"]:
+        parser.add_argument(
+            "--cache-reuse",
+            dest="cache_reuse",
+            type=int,
+            default=CONFIG.cache_reuse,
+            help="min chunk size to attempt reusing from the cache via KV shifting",
+            completer=suppressCompleter,
+        )
         parser.add_argument(
             "-c",
             "--ctx-size",
             dest="context",
+            type=int,
             default=CONFIG.ctx_size,
             help="size of the prompt context (0 = loaded from model)",
             completer=suppressCompleter,
         )
         parser.add_argument(
             "--max-model-len",
             dest="context",
+            type=int,
             default=CONFIG.ctx_size,
             help=argparse.SUPPRESS,
             completer=suppressCompleter,
```
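
The `type=int` additions matter because argparse returns option values as strings unless told otherwise, and the new `args.context > 0` check in model.py would raise a TypeError on a string. A minimal standalone sketch (CONFIG and suppressCompleter are stubbed out; this is not the real parser):

```python
import argparse

# Stand-ins for CONFIG.ctx_size and CONFIG.cache_reuse
DEFAULT_CTX_SIZE = 0
DEFAULT_CACHE_REUSE = 256

parser = argparse.ArgumentParser()
# Without type=int, "--ctx-size 4096" would arrive as the string "4096" and the
# later "args.context > 0" comparison would fail; with type=int it compares cleanly.
parser.add_argument("-c", "--ctx-size", dest="context", type=int, default=DEFAULT_CTX_SIZE)
parser.add_argument("--cache-reuse", dest="cache_reuse", type=int, default=DEFAULT_CACHE_REUSE)

args = parser.parse_args(["--ctx-size", "4096", "--cache-reuse", "512"])
assert args.context > 0 and args.cache_reuse == 512
```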

ramalama/config.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -69,7 +69,8 @@ class BaseConfig:
     api: str = "none"
     carimage: str = "registry.access.redhat.com/ubi10-micro:latest"
     container: bool = None  # type: ignore
-    ctx_size: int = 2048
+    ctx_size: int = 0
+    cache_reuse: int = 256
     default_image: str = DEFAULT_IMAGE
     dryrun: bool = False
     engine: SUPPORTED_ENGINES | None = field(default_factory=get_default_engine)
```
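
A hedged sketch of how these dataclass defaults are meant to interact with ramalama.conf: values present in the file win, anything missing keeps the new defaults. RamaLama's real loader differs; the path, table name, and helper below are illustrative only.

```python
import tomllib  # Python 3.11+
from dataclasses import dataclass, fields


@dataclass
class BaseConfig:
    ctx_size: int = 0        # 0 = defer to the runtime's own context-size default
    cache_reuse: int = 256   # minimum KV-cache chunk size to attempt to reuse


def load_config(path: str = "ramalama.conf") -> BaseConfig:
    """Overlay values from the [ramalama] table onto the dataclass defaults."""
    cfg = BaseConfig()
    try:
        with open(path, "rb") as fh:
            values = tomllib.load(fh).get("ramalama", {})
    except FileNotFoundError:
        return cfg
    for f in fields(cfg):
        if f.name in values:
            setattr(cfg, f.name, values[f.name])
    return cfg
```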

ramalama/model.py

Lines changed: 4 additions & 3 deletions

```diff
@@ -657,13 +657,14 @@ def llama_serve(self, args):
         exec_args += [
             "--alias",
             self.model,
-            "--ctx-size",
-            f"{args.context}",
             "--temp",
             f"{args.temp}",
             "--cache-reuse",
-            "256",
+            f"{args.cache_reuse}",
         ]
+        if args.context > 0:
+            exec_args += ["--ctx-size", f"{args.context}"]
+
         exec_args += args.runtime_args
 
         if draft_model_path:
```
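
The effect of the model.py change, isolated as a self-contained sketch (the function name is made up; the Namespace fields mirror the diff): `--ctx-size` is appended only when the user supplied a non-zero context, while `--cache-reuse` now flows from the parsed arguments instead of a hard-coded 256.

```python
from argparse import Namespace


def build_llama_server_args(model: str, args: Namespace) -> list[str]:
    """Build the llama-server option list, adding --ctx-size only on request."""
    exec_args = [
        "--alias", model,
        "--temp", f"{args.temp}",
        "--cache-reuse", f"{args.cache_reuse}",
    ]
    if args.context > 0:  # 0 means "let llama-server use its own default"
        exec_args += ["--ctx-size", f"{args.context}"]
    return exec_args


# With context left at 0, no --ctx-size appears in the generated command
print(build_llama_server_args("granite", Namespace(temp=0.8, cache_reuse=256, context=0)))
```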

test/system/030-run.bats

Lines changed: 4 additions & 2 deletions

```diff
@@ -20,7 +20,8 @@ EOF
     run_ramalama -q --dryrun run ${MODEL}
     is "$output" "${verify_begin}.*"
     is "$output" ".*${MODEL}" "verify model name"
-    is "$output" ".*--ctx-size 2048" "verify model name"
+    is "$output" ".*--cache-reuse 256" "verify cache-reuse is being set"
+    assert "$output" !~ ".*--ctx-size" "assert ctx-size is not shown by default"
     assert "$output" !~ ".*--seed" "assert seed does not show by default"
     assert "$output" !~ ".*-t -i" "assert -t -i not present without tty"
 
@@ -38,10 +39,11 @@ EOF
     run_ramalama -q --dryrun run --oci-runtime foobar ${MODEL}
     is "$output" ".*--runtime foobar" "dryrun correct with --oci-runtime"
 
-    RAMALAMA_CONFIG=/dev/null run_ramalama -q --dryrun run --seed 9876 -c 4096 --net bridge --name foobar ${MODEL}
+    RAMALAMA_CONFIG=/dev/null run_ramalama -q --dryrun run --cache-reuse 512 --seed 9876 -c 4096 --net bridge --name foobar ${MODEL}
     is "$output" ".*--network bridge.*" "dryrun correct with --name"
     is "$output" ".*${MODEL}" "verify model name"
     is "$output" ".*--ctx-size 4096" "verify ctx-size is set"
+    is "$output" ".*--cache-reuse 512" "verify cache-reuse is being set"
     is "$output" ".*--temp 0.8" "verify temp is set"
     is "$output" ".*--seed 9876" "verify seed is set"
     if not_docker; then
```
