From 5f75d0f87e5980570534efa4e9befd93052e8c1b Mon Sep 17 00:00:00 2001 From: Lllama <34464159+pi6am@users.noreply.github.com> Date: Tue, 14 Apr 2026 00:46:59 -0700 Subject: [PATCH 1/2] Pass img_min_params and img_max_params to ctx_clip_params These values determine the minimum and maximum size (in tokens) of vision embeddings. The default value of -1 uses a model-dependent default size, for example for Gemma 4 the default is a 280 token embedding. For higher quality results (at the cost of using more memory and slower speed) you can increase the size of the embedding to 1120 tokens. --- expose.h | 2 ++ gpttype_adapter.cpp | 6 ++++-- koboldcpp.py | 14 ++++++++++++++ otherarch/otherarch.h | 2 ++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/expose.h b/expose.h index 871661ace87..9093b595df9 100644 --- a/expose.h +++ b/expose.h @@ -47,6 +47,8 @@ struct load_model_inputs const char * mmproj_filename = nullptr; const bool mmproj_cpu = false; const int visionmaxres = 2048; + const int image_min_tokens = -1; + const int image_max_tokens = -1; const bool use_mmap = false; const bool use_mlock = false; const bool use_smartcontext = false; diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index a41cc69dad0..83368e0fad2 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -2154,6 +2154,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in kcpp_pipeline_parallelism = inputs.pipelineparallel; kcpp_data->n_batch = GetBatchSize(inputs.batchsize, in_file_format); kcpp_data->n_ubatch = kcpp_data->n_batch; + kcpp_data->image_min_tokens = inputs.image_min_tokens; + kcpp_data->image_max_tokens = inputs.image_max_tokens; if(isGguf && kcpp_pipeline_parallelism) { //double the logical batch, while keeping the physical batch the same, pipeline parallel set GGML_SCHED_MAX_COPIES to 2 @@ -2743,8 +2745,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in clip_context_params ctx_clip_params { /* 
use_gpu */ true, /* flash_attn_type */ clip_fa, - /* image_min_tokens */ -1, - /* image_max_tokens */ -1, + /* image_min_tokens */ kcpp_data->image_min_tokens, + /* image_max_tokens */ kcpp_data->image_max_tokens, }; clip_init_result cres = clip_init(mmproj_filename.c_str(), ctx_clip_params); clp_ctx_v = cres.ctx_v; diff --git a/koboldcpp.py b/koboldcpp.py index dfd503dd48a..b463b76649d 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -218,6 +218,8 @@ class load_model_inputs(ctypes.Structure): ("mmproj_filename", ctypes.c_char_p), ("mmproj_cpu", ctypes.c_bool), ("visionmaxres", ctypes.c_int), + ("image_min_tokens", ctypes.c_int), + ("image_max_tokens", ctypes.c_int), ("use_mmap", ctypes.c_bool), ("use_mlock", ctypes.c_bool), ("use_smartcontext", ctypes.c_bool), @@ -1769,6 +1771,8 @@ def load_model(model_filename): inputs.mmproj_filename = args.mmproj.encode("UTF-8") if args.mmproj else "".encode("UTF-8") inputs.mmproj_cpu = (True if args.mmprojcpu else False) inputs.visionmaxres = (512 if args.visionmaxres < 512 else (2048 if args.visionmaxres > 2048 else args.visionmaxres)) + inputs.image_min_tokens = args.image_min_tokens + inputs.image_max_tokens = args.image_max_tokens inputs.use_smartcontext = args.smartcontext inputs.use_contextshift = (0 if args.noshift else 1) inputs.use_fastforward = (0 if args.nofastforward else 1) @@ -6897,6 +6901,8 @@ def hide_tooltip(event): mmproj_var = ctk.StringVar() mmprojcpu_var = ctk.IntVar(value=0) visionmaxres_var = ctk.StringVar(value=str(default_visionmaxres)) + image_min_tokens_var = ctk.StringVar(value="-1") + image_max_tokens_var = ctk.StringVar(value="-1") draftmodel_var = ctk.StringVar() draftamount_var = ctk.StringVar(value=str(default_draft_amount)) draftgpulayers_var = ctk.StringVar(value=str(999)) @@ -8033,6 +8039,8 @@ def export_vars(): args.mmproj = None if mmproj_var.get() == "" else mmproj_var.get() args.mmprojcpu = (mmprojcpu_var.get()==1) args.visionmaxres = int(visionmaxres_var.get()) if 
visionmaxres_var.get()!="" else default_visionmaxres + args.image_min_tokens = int(image_min_tokens_var.get()) if image_min_tokens_var.get()!="" else -1 + args.image_max_tokens = int(image_max_tokens_var.get()) if image_max_tokens_var.get()!="" else -1 args.draftmodel = None if draftmodel_var.get() == "" else draftmodel_var.get() args.draftamount = int(draftamount_var.get()) if draftamount_var.get()!="" else default_draft_amount args.draftgpulayers = int(draftgpulayers_var.get()) if draftgpulayers_var.get()!="" else 999 @@ -8301,6 +8309,10 @@ def import_vars(dict): mmprojcpu_var.set(1 if ("mmprojcpu" in dict and dict["mmprojcpu"]) else 0) if "visionmaxres" in dict and dict["visionmaxres"]: visionmaxres_var.set(dict["visionmaxres"]) + if "image_min_tokens" in dict and dict["image_min_tokens"]: + image_min_tokens_var.set(dict["image_min_tokens"]) + if "image_max_tokens" in dict and dict["image_max_tokens"]: + image_max_tokens_var.set(dict["image_max_tokens"]) draftmodel_var.set(dict["draftmodel"] if ("draftmodel" in dict and dict["draftmodel"]) else "") if "draftamount" in dict: draftamount_var.set(dict["draftamount"]) @@ -10741,6 +10753,8 @@ def range_checker(arg: str): advparser.add_argument("--mmproj", metavar=('[filename]'), help="Select a multimodal projector file for vision models like LLaVA.", default="") advparser.add_argument("--mmprojcpu","--no-mmproj-offload", help="Force CLIP for Vision mmproj always on CPU.", action='store_true') advparser.add_argument("--visionmaxres", metavar=('[max px]'), help="Clamp MMProj vision maximum allowed resolution. 
Allowed values are between 512 to 2048 px (default 1024).", type=int, default=default_visionmaxres) + advparser.add_argument("--image-min-tokens", metavar=('[tokens]'), help="Override the minimum tokens for the MMProj embedding (default -1).", type=int, default=-1) + advparser.add_argument("--image-max-tokens", metavar=('[tokens]'), help="Override the maximum tokens for the MMProj embedding (default -1).", type=int, default=-1) advparser.add_argument("--draftmodel","--model-draft","-md", metavar=('[filename]'), help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="") advparser.add_argument("--draftamount","--draft-max","--draft-n", metavar=('[tokens]'), help="How many tokens to draft per chunk before verifying results", type=int, default=default_draft_amount) advparser.add_argument("--draftgpulayers","--gpu-layers-draft","--n-gpu-layers-draft","-ngld", metavar=('[layers]'), help="How many layers to offload to GPU for the draft model (default=full offload)", type=int, default=999) diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h index e8a85f86749..ee47cfe93fa 100644 --- a/otherarch/otherarch.h +++ b/otherarch/otherarch.h @@ -21,6 +21,8 @@ struct kcpp_params { int32_t n_ctx = 0; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) + int image_min_tokens = -1; // Minimum image embedding tokens + int image_max_tokens = -1; // Maximum image embedding tokens int n_threads = -1; int n_blasthreads = -1; From fc9a29cdbc8b3eab6de6c00fc3b9802336c9f700 Mon Sep 17 00:00:00 2001 From: Lllama <34464159+pi6am@users.noreply.github.com> Date: Tue, 14 Apr 2026 09:55:53 -0700 Subject: [PATCH 2/2] Change dict to mydict to match change to method --- koboldcpp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/koboldcpp.py 
b/koboldcpp.py index 0ab760c99fb..45dd54e6dec 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -8687,10 +8687,10 @@ def import_vars(mydict): mmprojcpu_var.set(1 if ("mmprojcpu" in mydict and mydict["mmprojcpu"]) else 0) if "visionmaxres" in mydict and mydict["visionmaxres"]: visionmaxres_var.set(mydict["visionmaxres"]) - if "image_min_tokens" in dict and dict["image_min_tokens"]: - image_min_tokens_var.set(dict["image_min_tokens"]) - if "image_max_tokens" in dict and dict["image_max_tokens"]: - image_max_tokens_var.set(dict["image_max_tokens"]) + if "image_min_tokens" in mydict and mydict["image_min_tokens"]: + image_min_tokens_var.set(mydict["image_min_tokens"]) + if "image_max_tokens" in mydict and mydict["image_max_tokens"]: + image_max_tokens_var.set(mydict["image_max_tokens"]) draftmodel_var.set(mydict["draftmodel"] if ("draftmodel" in mydict and mydict["draftmodel"]) else "") if "draftamount" in mydict: draftamount_var.set(mydict["draftamount"])