2 changes: 2 additions & 0 deletions expose.h
@@ -47,6 +47,8 @@ struct load_model_inputs
const char * mmproj_filename = nullptr;
const bool mmproj_cpu = false;
const int visionmaxres = 2048;
const int image_min_tokens = -1;
const int image_max_tokens = -1;
const bool use_mmap = false;
const bool use_mlock = false;
const bool use_smartcontext = false;
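The two new fields sit between visionmaxres and use_mmap in load_model_inputs, and the ctypes mirror in koboldcpp.py (shown further down) inserts them at the same position. ctypes matches fields by declaration order rather than by name, so both sides must stay in sync. A minimal, runnable sketch of that constraint; load_model_inputs_demo is a trimmed stand-in for the real struct, not part of this change:

# Trimmed stand-in for load_model_inputs: ctypes maps fields by declaration
# order, so the new ints must appear in the same position as in expose.h.
import ctypes

class load_model_inputs_demo(ctypes.Structure):
    _fields_ = [
        ("visionmaxres", ctypes.c_int),
        ("image_min_tokens", ctypes.c_int),
        ("image_max_tokens", ctypes.c_int),
        ("use_mmap", ctypes.c_bool),
    ]

inputs = load_model_inputs_demo()
inputs.image_min_tokens = -1    # -1 keeps the previous hardcoded default
inputs.image_max_tokens = 1024  # example override for the maximum image tokens
print(inputs.image_min_tokens, inputs.image_max_tokens)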
6 changes: 4 additions & 2 deletions gpttype_adapter.cpp
@@ -2159,6 +2159,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
kcpp_pipeline_parallelism = inputs.pipelineparallel;
kcpp_data->n_batch = GetBatchSize(inputs.batchsize, in_file_format);
kcpp_data->n_ubatch = kcpp_data->n_batch;
kcpp_data->image_min_tokens = inputs.image_min_tokens;
kcpp_data->image_max_tokens = inputs.image_max_tokens;
if(isGguf && kcpp_pipeline_parallelism)
{
//double the logical batch, while keeping the physical batch the same, pipeline parallel set GGML_SCHED_MAX_COPIES to 2
@@ -2748,8 +2750,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
clip_context_params ctx_clip_params {
/* use_gpu */ true,
/* flash_attn_type */ clip_fa,
/* image_min_tokens */ -1,
/* image_max_tokens */ -1,
/* image_min_tokens */ kcpp_data->image_min_tokens,
/* image_max_tokens */ kcpp_data->image_max_tokens,
};
clip_init_result cres = clip_init(mmproj_filename.c_str(), ctx_clip_params);
clp_ctx_v = cres.ctx_v;
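The overrides are copied into clip_context_params unchanged, with -1 reproducing the values that were previously hardcoded (let the projector choose its own limits). Nothing in this hunk validates that a user-supplied minimum stays below the maximum. A hedged sketch of the kind of check a caller could run on the Python side before filling load_model_inputs; sanitize_image_token_limits is hypothetical and not part of this PR:

# Hypothetical helper (not in this PR): normalise the overrides, where -1
# (or any non-positive value) means "no override, use the model default".
def sanitize_image_token_limits(min_tokens, max_tokens):
    min_tokens = -1 if min_tokens is None or min_tokens <= 0 else min_tokens
    max_tokens = -1 if max_tokens is None or max_tokens <= 0 else max_tokens
    if min_tokens > 0 and max_tokens > 0 and min_tokens > max_tokens:
        # a minimum above the maximum cannot be satisfied; swap instead of failing
        min_tokens, max_tokens = max_tokens, min_tokens
    return min_tokens, max_tokens

print(sanitize_image_token_limits(-1, -1))     # (-1, -1): keep model defaults
print(sanitize_image_token_limits(2048, 256))  # (256, 2048): swapped into order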
14 changes: 14 additions & 0 deletions koboldcpp.py
@@ -231,6 +231,8 @@ class load_model_inputs(ctypes.Structure):
("mmproj_filename", ctypes.c_char_p),
("mmproj_cpu", ctypes.c_bool),
("visionmaxres", ctypes.c_int),
("image_min_tokens", ctypes.c_int),
("image_max_tokens", ctypes.c_int),
("use_mmap", ctypes.c_bool),
("use_mlock", ctypes.c_bool),
("use_smartcontext", ctypes.c_bool),
@@ -1885,6 +1887,8 @@ def load_model(model_filename):
inputs.mmproj_filename = args.mmproj.encode("UTF-8") if args.mmproj else "".encode("UTF-8")
inputs.mmproj_cpu = (True if args.mmprojcpu else False)
inputs.visionmaxres = (512 if args.visionmaxres < 512 else (2048 if args.visionmaxres > 2048 else args.visionmaxres))
inputs.image_min_tokens = args.image_min_tokens
inputs.image_max_tokens = args.image_max_tokens
inputs.use_smartcontext = args.smartcontext
inputs.use_contextshift = (0 if args.noshift else 1)
inputs.use_fastforward = (0 if args.nofastforward else 1)
@@ -7268,6 +7272,8 @@ def hide_tooltip(event):
mmproj_var = ctk.StringVar()
mmprojcpu_var = ctk.IntVar(value=0)
visionmaxres_var = ctk.StringVar(value=str(default_visionmaxres))
image_min_tokens_var = ctk.StringVar(value="-1")
image_max_tokens_var = ctk.StringVar(value="-1")
draftmodel_var = ctk.StringVar()
draftamount_var = ctk.StringVar(value=str(default_draft_amount))
draftgpulayers_var = ctk.StringVar(value=str(999))
@@ -8411,6 +8417,8 @@ def export_vars():
args.mmproj = None if mmproj_var.get() == "" else mmproj_var.get()
args.mmprojcpu = (mmprojcpu_var.get()==1)
args.visionmaxres = int(visionmaxres_var.get()) if visionmaxres_var.get()!="" else default_visionmaxres
args.image_min_tokens = int(image_min_tokens_var.get()) if image_min_tokens_var.get()!="" else -1
args.image_max_tokens = int(image_max_tokens_var.get()) if image_max_tokens_var.get()!="" else -1
args.draftmodel = None if draftmodel_var.get() == "" else draftmodel_var.get()
args.draftamount = int(draftamount_var.get()) if draftamount_var.get()!="" else default_draft_amount
args.draftgpulayers = int(draftgpulayers_var.get()) if draftgpulayers_var.get()!="" else 999
@@ -8679,6 +8687,10 @@ def import_vars(mydict):
mmprojcpu_var.set(1 if ("mmprojcpu" in mydict and mydict["mmprojcpu"]) else 0)
if "visionmaxres" in mydict and mydict["visionmaxres"]:
visionmaxres_var.set(mydict["visionmaxres"])
if "image_min_tokens" in mydict and mydict["image_min_tokens"]:
image_min_tokens_var.set(mydict["image_min_tokens"])
if "image_max_tokens" in mydict and mydict["image_max_tokens"]:
image_max_tokens_var.set(mydict["image_max_tokens"])
draftmodel_var.set(mydict["draftmodel"] if ("draftmodel" in mydict and mydict["draftmodel"]) else "")
if "draftamount" in mydict:
draftamount_var.set(mydict["draftamount"])
@@ -11122,6 +11134,8 @@ def range_checker(arg: str):
advparser.add_argument("--mmproj", metavar=('[filename]'), help="Select a multimodal projector file for vision models like LLaVA.", default="")
advparser.add_argument("--mmprojcpu","--no-mmproj-offload", help="Force CLIP for Vision mmproj always on CPU.", action='store_true')
advparser.add_argument("--visionmaxres", metavar=('[max px]'), help="Clamp MMProj vision maximum allowed resolution. Allowed values are between 512 to 2048 px (default 1024).", type=int, default=default_visionmaxres)
advparser.add_argument("--image-min-tokens", metavar=('[tokens]'), help="Override the minimum tokens for the MMProj embedding (default -1).", type=int, default=-1)
advparser.add_argument("--image-max-tokens", metavar=('[tokens]'), help="Override the maximum tokens for the MMProj embedding (default -1).", type=int, default=-1)
advparser.add_argument("--draftmodel","--model-draft","-md", metavar=('[filename]'), help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="")
advparser.add_argument("--draftamount","--draft-max","--draft-n", metavar=('[tokens]'), help="How many tokens to draft per chunk before verifying results", type=int, default=default_draft_amount)
advparser.add_argument("--draftgpulayers","--gpu-layers-draft","--n-gpu-layers-draft","-ngld", metavar=('[layers]'), help="How many layers to offload to GPU for the draft model (default=full offload)", type=int, default=999)
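The flags are spelled with dashes on the command line, but argparse exposes them as args.image_min_tokens and args.image_max_tokens, which is also the key spelling that export_vars and import_vars use for saved settings. A small, self-contained illustration of that mapping; the parser here is a stand-in, not koboldcpp's real argument parser:

# Stand-in parser: dashed flags surface as underscore attribute names,
# matching the dictionary keys read and written by export_vars/import_vars.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--image-min-tokens", type=int, default=-1)
parser.add_argument("--image-max-tokens", type=int, default=-1)

args = parser.parse_args(["--image-max-tokens", "1024"])
print(args.image_min_tokens, args.image_max_tokens)  # -1 1024
print(vars(args))  # {'image_min_tokens': -1, 'image_max_tokens': 1024}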
2 changes: 2 additions & 0 deletions otherarch/otherarch.h
@@ -21,6 +21,8 @@ struct kcpp_params {
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int image_min_tokens = -1; // Minimum image embedding tokens
int image_max_tokens = -1; // Maximum image embedding tokens
int n_threads = -1;
int n_blasthreads = -1;
