5 changes: 4 additions & 1 deletion docs/ramalama-perplexity.1.md
@@ -29,8 +29,11 @@ URL support means if a model is on a web site or even on your local system, you
#### **--authfile**=*password*
path of the authentication file for OCI registries

#### **--cache-reuse**=256
Min chunk size to attempt reusing from the cache via KV shifting

#### **--ctx-size**, **-c**
size of the prompt context. This option is also available as **--max-model-len**. Applies to llama.cpp and vllm regardless of alias (default: 2048, 0 = loaded from model)
size of the prompt context. This option is also available as **--max-model-len**. Applies to llama.cpp and vllm regardless of alias (default: 4096, 0 = loaded from model)
Review comment (medium):

The PR description mentions that llama.cpp defaults to a context size of 4098, but the documentation here states the default is 4096. To ensure accuracy, could you please verify the current default ctx-size in llama.cpp and update the documentation accordingly? This will help avoid confusion for users relying on the default behavior.


#### **--device**
Add a host device to the container. Optional permissions parameter can
5 changes: 4 additions & 1 deletion docs/ramalama-run.1.md
@@ -33,12 +33,15 @@ The default can be overridden in the ramalama.conf file.
#### **--authfile**=*password*
path of the authentication file for OCI registries

#### **--cache-reuse**=256
Min chunk size to attempt reusing from the cache via KV shifting

#### **--color**
Indicate whether or not to use color in the chat.
Possible values are "never", "always" and "auto". (default: auto)

#### **--ctx-size**, **-c**
size of the prompt context. This option is also available as **--max-model-len**. Applies to llama.cpp and vllm regardless of alias (default: 2048, 0 = loaded from model)
size of the prompt context. This option is also available as **--max-model-len**. Applies to llama.cpp and vllm regardless of alias (default: 4096, 0 = loaded from model)
Review comment (medium):

The PR description mentions that llama.cpp defaults to a context size of 4098, but the documentation here states the default is 4096. To ensure accuracy, could you please verify the current default ctx-size in llama.cpp and update the documentation accordingly? This will help avoid confusion for users relying on the default behavior.


#### **--device**
Add a host device to the container. Optional permissions parameter can
7 changes: 5 additions & 2 deletions docs/ramalama-serve.1.md
@@ -57,8 +57,11 @@ The default can be overridden in the ramalama.conf file.
#### **--authfile**=*password*
Path of the authentication file for OCI registries

#### **--cache-reuse**=256
Min chunk size to attempt reusing from the cache via KV shifting

#### **--ctx-size**, **-c**
size of the prompt context. This option is also available as **--max-model-len**. Applies to llama.cpp and vllm regardless of alias (default: 2048, 0 = loaded from model)
size of the prompt context. This option is also available as **--max-model-len**. Applies to llama.cpp and vllm regardless of alias (default: 4096, 0 = loaded from model)
Review comment (medium):

The PR description mentions that llama.cpp defaults to a context size of 4098, but the documentation here states the default is 4096. To ensure accuracy, could you please verify the current default ctx-size in llama.cpp and update the documentation accordingly? This will help avoid confusion for users relying on the default behavior.


#### **--detach**, **-d**
Run the container in the background and print the new container ID.
@@ -426,7 +429,7 @@ spec:
- name: model-server
image: quay.io/ramalama/ramalama:0.8
command: ["llama-server"]
args: ['--port', '8081', '--model', '/mnt/models/model.file', '--alias', 'quay.io/rhatdan/granite:latest', '--ctx-size', 2048, '--temp', '0.8', '--jinja', '--cache-reuse', '256', '-v', '--threads', 16, '--host', '127.0.0.1']
args: ['--port', '8081', '--model', '/mnt/models/model.file', '--alias', 'quay.io/rhatdan/granite:latest', '--temp', '0.8', '--jinja', '--cache-reuse', '256', '-v', '--threads', 16, '--host', '127.0.0.1']
securityContext:
allowPrivilegeEscalation: false
capabilities:
7 changes: 5 additions & 2 deletions docs/ramalama.conf
@@ -31,10 +31,13 @@
#
#container = true

#size of the prompt context (0 = loaded from model)
#Min chunk size to attempt reusing from the cache via KV shifting
#
#ctx_size=2048
#cache_reuse=256

#size of the prompt context (0 = loaded from model)
#
#ctx_size=0

# Run RamaLama using the specified container engine.
#
6 changes: 5 additions & 1 deletion docs/ramalama.conf.5.md
@@ -71,12 +71,16 @@ OCI model car image

Image to be used when building and pushing --type=car models

**cache_reuse**=256

Min chunk size to attempt reusing from the cache via KV shifting

**container**=true

Run RamaLama in the default container.
RAMALAMA_IN_CONTAINER environment variable overrides this field.

**ctx_size**=2048
**ctx_size**=0

Size of the prompt context (0 = loaded from model)

10 changes: 10 additions & 0 deletions ramalama/cli.py
@@ -776,17 +776,27 @@ def runtime_options(parser, command):
)
parser.add_argument("--authfile", help="path of the authentication file")
if command in ["run", "perplexity", "serve"]:
parser.add_argument(
"--cache-reuse",
dest="cache_reuse",
type=int,
default=CONFIG.cache_reuse,
help="min chunk size to attempt reusing from the cache via KV shifting",
suggestion: Consider clarifying the unit for --cache-reuse in the help text.

Specifying the unit in the help text will make it clearer for users and prevent misunderstandings.

Suggested change
help="min chunk size to attempt reusing from the cache via KV shifting",
help="min chunk size (in bytes) to attempt reusing from the cache via KV shifting",

completer=suppressCompleter,
)
parser.add_argument(
"-c",
"--ctx-size",
dest="context",
type=int,
default=CONFIG.ctx_size,
help="size of the prompt context (0 = loaded from model)",
completer=suppressCompleter,
)
parser.add_argument(
"--max-model-len",
dest="context",
type=int,
default=CONFIG.ctx_size,
help=argparse.SUPPRESS,
completer=suppressCompleter,
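Both `--ctx-size` and `--max-model-len` write to `dest="context"`, so either spelling populates the same attribute. The standalone sketch below illustrates that alias behavior and one possible way to reject negative values for `--cache-reuse`, in the spirit of the review comment on `ramalama/model.py` further down; the `non_negative_int` helper is hypothetical and not part of RamaLama's code.

```python
import argparse


def non_negative_int(value: str) -> int:
    """Hypothetical validator: reject negative sizes for --cache-reuse / --ctx-size."""
    parsed = int(value)
    if parsed < 0:
        raise argparse.ArgumentTypeError(f"expected a non-negative integer, got {value!r}")
    return parsed


parser = argparse.ArgumentParser(prog="ramalama-options-sketch")
parser.add_argument("--cache-reuse", dest="cache_reuse", type=non_negative_int, default=256)
# Both spellings share dest="context", so whichever appears last on the command line wins.
parser.add_argument("-c", "--ctx-size", dest="context", type=non_negative_int, default=0)
parser.add_argument("--max-model-len", dest="context", type=non_negative_int, default=0)

args = parser.parse_args(["--cache-reuse", "512", "--max-model-len", "8192"])
assert args.cache_reuse == 512 and args.context == 8192
```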
3 changes: 2 additions & 1 deletion ramalama/config.py
@@ -69,7 +69,8 @@ class BaseConfig:
api: str = "none"
carimage: str = "registry.access.redhat.com/ubi10-micro:latest"
container: bool = None # type: ignore
ctx_size: int = 2048
ctx_size: int = 0
cache_reuse: int = 256
default_image: str = DEFAULT_IMAGE
dryrun: bool = False
engine: SUPPORTED_ENGINES | None = field(default_factory=get_default_engine)
7 changes: 4 additions & 3 deletions ramalama/model.py
@@ -657,13 +657,14 @@ def llama_serve(self, args):
exec_args += [
"--alias",
self.model,
"--ctx-size",
f"{args.context}",
"--temp",
f"{args.temp}",
"--cache-reuse",
"256",
Comment on lines 660 to 665:
suggestion: Switching --cache-reuse from a hardcoded value to a parameter increases flexibility but may require validation.

Please add validation for the --cache-reuse parameter to prevent invalid values and potential performance issues.

f"{args.cache_reuse}",
]
if args.context > 0:
exec_args += ["--ctx-size", f"{args.context}"]

exec_args += args.runtime_args

if draft_model_path:
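The net effect of this hunk: `--cache-reuse` is now taken from the parsed arguments, and `--ctx-size` is forwarded to llama-server only when a positive value is set, so the new default of 0 lets llama.cpp apply its own context-size handling. A minimal sketch of that assembly logic in isolation, assuming the argument names from the diff (this is not the full `llama_serve` method, and it omits `--alias` and the other flags):

```python
def runtime_flags(context: int, temp: str, cache_reuse: int) -> list[str]:
    """Sketch of the conditional flag assembly shown in the diff above."""
    flags = ["--temp", f"{temp}", "--cache-reuse", f"{cache_reuse}"]
    if context > 0:  # 0 means "let llama.cpp decide / load from the model"
        flags += ["--ctx-size", f"{context}"]
    return flags


# With the new defaults (ctx_size=0, cache_reuse=256) no --ctx-size flag is emitted:
assert runtime_flags(0, "0.8", 256) == ["--temp", "0.8", "--cache-reuse", "256"]
assert runtime_flags(4096, "0.8", 512)[-2:] == ["--ctx-size", "4096"]
```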
10 changes: 6 additions & 4 deletions test/system/030-run.bats
@@ -20,7 +20,8 @@ EOF
run_ramalama -q --dryrun run ${MODEL}
is "$output" "${verify_begin}.*"
is "$output" ".*${MODEL}" "verify model name"
is "$output" ".*--ctx-size 2048" "verify model name"
is "$output" ".*--cache-reuse 256" "verify cache-reuse is being set"
assert "$output" !~ ".*--ctx-size" "assert ctx-size is not show by default"
assert "$output" !~ ".*--seed" "assert seed does not show by default"
assert "$output" !~ ".*-t -i" "assert -t -i not present without tty"

@@ -38,10 +39,11 @@ EOF
run_ramalama -q --dryrun run --oci-runtime foobar ${MODEL}
is "$output" ".*--runtime foobar" "dryrun correct with --oci-runtime"

RAMALAMA_CONFIG=/dev/null run_ramalama -q --dryrun run --seed 9876 -c 4096 --net bridge --name foobar ${MODEL}
RAMALAMA_CONFIG=/dev/null run_ramalama -q --dryrun run --cache-reuse 512 --seed 9876 -c 4096 --net bridge --name foobar ${MODEL}
is "$output" ".*--network bridge.*" "dryrun correct with --name"
is "$output" ".*${MODEL}" "verify model name"
is "$output" ".*--ctx-size 4096" "verify ctx-size is set"
is "$output" ".*--cache-reuse 512" "verify cache-reuse is being set"
is "$output" ".*--temp 0.8" "verify temp is set"
is "$output" ".*--seed 9876" "verify seed is set"
if not_docker; then
@@ -90,8 +92,8 @@ EOF

else
run_ramalama -q --dryrun run --ctx-size 4096 ${MODEL}
is "$output" '.*serve.*--ctx-size 4096 --temp 0.8.*' "dryrun correct"
is "$output" ".*--ctx-size 4096" "verify model name"
is "$output" '.*--ctx-size 4096.*' "verify ctx-size is set"
is "$output" '.*--cache-reuse 256.*' "assert cache-reuse is set by default to 256"

run_ramalama 22 run --ctx-size=4096 --name foobar ${MODEL}
is "${lines[0]}" "Error: --nocontainer and --name options conflict. The --name option requires a container." "conflict between nocontainer and --name line"
3 changes: 2 additions & 1 deletion test/unit/test_config.py
@@ -12,7 +12,8 @@ def test_correct_config_defaults(monkeypatch):

assert cfg.carimage == "registry.access.redhat.com/ubi10-micro:latest"
assert cfg.container in [True, False] # depends on env/system
assert cfg.ctx_size == 2048
assert cfg.ctx_size == 0
assert cfg.cache_reuse == 256
assert cfg.engine in ["podman", "docker", None]
assert cfg.env == []
assert cfg.host == "0.0.0.0"