Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions lmdeploy/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def pipeline(model_path: str,
chat_template_config: ChatTemplateConfig | None = None,
log_level: str = 'WARNING',
max_log_len: int | None = None,
trust_remote_code: bool = False,
speculative_config: SpeculativeConfig | None = None,
**kwargs):
"""Create a pipeline for inference.
Expand All @@ -41,6 +42,7 @@ def pipeline(model_path: str,
``WARNING``, ``INFO``, ``DEBUG``]
max_log_len: Max number of prompt characters or prompt tokens
being printed in log.
trust_remote_code: whether to trust remote code from model repositories.
speculative_config: speculative decoding configuration.
**kwargs: additional keyword arguments passed to the pipeline.

Expand Down Expand Up @@ -73,6 +75,7 @@ def pipeline(model_path: str,
chat_template_config=chat_template_config,
log_level=log_level,
max_log_len=max_log_len,
trust_remote_code=trust_remote_code,
speculative_config=speculative_config,
**kwargs)

Expand Down
10 changes: 5 additions & 5 deletions lmdeploy/archs.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,14 +128,14 @@ def check_vl_llm(backend: str, config: dict) -> bool:
return False


def get_task(backend: str, model_path: str):
def get_task(backend: str, model_path: str, trust_remote_code: bool = False):
"""Get pipeline type and pipeline class from model config."""
from lmdeploy.serve.core import AsyncEngine

if os.path.exists(os.path.join(model_path, 'triton_models', 'weights')):
# workspace model
return 'llm', AsyncEngine
_, config = get_model_arch(model_path)
_, config = get_model_arch(model_path, trust_remote_code=trust_remote_code)
if check_vl_llm(backend, config.to_dict()):
from lmdeploy.serve.core import VLAsyncEngine
return 'vlm', VLAsyncEngine
Expand All @@ -144,17 +144,17 @@ def get_task(backend: str, model_path: str):
return 'llm', AsyncEngine


def get_model_arch(model_path: str):
def get_model_arch(model_path: str, trust_remote_code: bool = False):
"""Get a model's architecture and configuration.

Args:
model_path(str): the model path
trust_remote_code(bool): whether to trust remote code from model repositories. Defaults to False.
try:
cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code)
except Exception as e: # noqa
from transformers import PretrainedConfig
cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=True)
cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code)

_cfg = cfg.to_dict()
if _cfg.get('architectures', None):
Expand Down
5 changes: 5 additions & 0 deletions lmdeploy/cli/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ def add_parser_api_server():
default=['*'],
help='A list of allowed http headers for cors')
parser.add_argument('--proxy-url', type=str, default=None, help='The proxy url for api server.')
parser.add_argument('--trust-remote-code',
action='store_true',
help='Whether to trust remote code from model repositories.')
parser.add_argument('--max-concurrent-requests',
type=int,
default=None,
Expand Down Expand Up @@ -303,6 +306,7 @@ def api_server(args):
max_log_len=args.max_log_len,
disable_fastapi_docs=args.disable_fastapi_docs,
max_concurrent_requests=args.max_concurrent_requests,
trust_remote_code=args.trust_remote_code,
reasoning_parser=args.reasoning_parser,
tool_call_parser=args.tool_call_parser,
speculative_config=speculative_config,
Expand Down Expand Up @@ -334,6 +338,7 @@ def api_server(args):
max_log_len=args.max_log_len,
disable_fastapi_docs=args.disable_fastapi_docs,
max_concurrent_requests=args.max_concurrent_requests,
trust_remote_code=args.trust_remote_code,
reasoning_parser=args.reasoning_parser,
tool_call_parser=args.tool_call_parser,
speculative_config=speculative_config,
Expand Down
5 changes: 4 additions & 1 deletion lmdeploy/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def __init__(self,
chat_template_config: ChatTemplateConfig | None = None,
log_level: str = 'WARNING',
max_log_len: int | None = None,
trust_remote_code: bool = False,
speculative_config: SpeculativeConfig | None = None,
**kwargs):
"""Initialize Pipeline.
Expand All @@ -49,6 +50,7 @@ def __init__(self,
chat_template_config: Chat template configuration.
log_level: Log level.
max_log_len: Max number of prompt characters or prompt tokens being printed in log.
trust_remote_code: Whether to trust remote code from model repositories.
speculative_config: Speculative decoding configuration.
**kwargs: Additional keyword arguments.
"""
Expand All @@ -69,12 +71,13 @@ def __init__(self,

# Create inference engine
backend, backend_config = autoget_backend_config(model_path, backend_config)
_, pipeline_class = get_task(backend, model_path)
_, pipeline_class = get_task(backend, model_path, trust_remote_code=trust_remote_code)
self.async_engine = pipeline_class(model_path,
backend=backend,
backend_config=backend_config,
chat_template_config=chat_template_config,
max_log_len=max_log_len,
trust_remote_code=trust_remote_code,
speculative_config=speculative_config,
**kwargs)
self.internal_thread = _EventLoopThread(daemon=True)
Expand Down
5 changes: 3 additions & 2 deletions lmdeploy/pytorch/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ def get_head_size(self):
def from_pretrained(
cls,
pretrained_model_name_or_path: str,
trust_remote_code: bool = True,
trust_remote_code: bool = False,
dtype: str = 'auto',
dist_config: DistConfig = None,
hf_overrides: dict[str, Any] = None,
Expand Down Expand Up @@ -563,10 +563,11 @@ def from_config(
target_cache_cfg: CacheConfig,
target_model: str = None,
dtype: str = 'auto',
trust_remote_code: bool = False,
):
model = model or target_model
model_config = ModelConfig.from_pretrained(model,
trust_remote_code=True,
trust_remote_code=trust_remote_code,
dtype=dtype,
is_draft_model=True,
spec_method=method,
Expand Down
3 changes: 2 additions & 1 deletion lmdeploy/pytorch/engine/config_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def build_misc_config(engine_config: PytorchEngineConfig):

@staticmethod
def build_specdecode_config(target_model, speculative_config: SpeculativeConfig, engine_config: PytorchEngineConfig,
cache_config: CacheConfig):
cache_config: CacheConfig, trust_remote_code: bool = False):
"""Build spec decode config."""
specdecode_config = None
if speculative_config is not None:
Expand All @@ -113,5 +113,6 @@ def build_specdecode_config(target_model, speculative_config: SpeculativeConfig,
target_model=target_model,
target_cache_cfg=cache_config,
dtype=engine_config.dtype,
trust_remote_code=trust_remote_code,
)
return specdecode_config
7 changes: 4 additions & 3 deletions lmdeploy/pytorch/engine/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def __init__(
self,
model_path: str,
engine_config: PytorchEngineConfig = None,
trust_remote_code: bool = True,
trust_remote_code: bool = False,
speculative_config: SpeculativeConfig = None,
) -> None:
# make sure engine config exist
Expand Down Expand Up @@ -133,7 +133,7 @@ def __init__(
misc_config = ConfigBuilder.build_misc_config(engine_config)
# spec decode
self.specdecode_config = ConfigBuilder.build_specdecode_config(model_path, speculative_config, engine_config,
cache_config)
cache_config, trust_remote_code)

# build model agent
self.executor = build_executor(
Expand All @@ -147,6 +147,7 @@ def __init__(
distributed_executor_backend=engine_config.distributed_executor_backend,
dtype=engine_config.dtype,
specdecode_config=self.specdecode_config,
trust_remote_code=trust_remote_code,
)
self.executor.init()

Expand Down Expand Up @@ -198,7 +199,7 @@ def __init__(
def from_pretrained(cls,
pretrained_model_name_or_path: str,
engine_config: PytorchEngineConfig = None,
trust_remote_code: bool = True,
trust_remote_code: bool = False,
speculative_config: SpeculativeConfig = None,
**kwargs):
"""Lmdeploy python inference engine.
Expand Down
3 changes: 2 additions & 1 deletion lmdeploy/pytorch/engine/executor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def build_executor(
distributed_executor_backend: str = None,
dtype: str = 'auto',
specdecode_config: SpecDecodeConfig = None,
trust_remote_code: bool = False,
) -> ExecutorBase:
"""Build model agent executor."""
logger = get_logger('lmdeploy')
Expand All @@ -71,7 +72,7 @@ def build_executor(

model_config = ModelConfig.from_pretrained(
model_path,
trust_remote_code=True,
trust_remote_code=trust_remote_code,
dtype=dtype,
hf_overrides=misc_config.hf_overrides,
dist_config=dist_config,
Expand Down
30 changes: 23 additions & 7 deletions lmdeploy/serve/core/async_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def __init__(self,
backend_config: TurbomindEngineConfig | PytorchEngineConfig | None = None,
chat_template_config: ChatTemplateConfig | None = None,
max_log_len: int | None = None,
trust_remote_code: bool = False,
speculative_config: SpeculativeConfig | None = None,
**kwargs) -> None:
logger.info(f'input backend={backend}, backend_config={backend_config}')
Expand All @@ -118,21 +119,25 @@ def __init__(self,
if backend == 'turbomind' else PytorchEngineConfig())
self.model_name = model_name if model_name else model_path
self.chat_template = get_chat_template(model_path, chat_template_config)
self.tokenizer = Tokenizer(model_path)
self.tokenizer = Tokenizer(model_path, trust_remote_code=trust_remote_code)
self.prompt_processor = MultimodalProcessor(self.tokenizer, self.chat_template)
self.hf_gen_cfg = get_hf_gen_cfg(model_path)
self.arch, self.hf_cfg = get_model_arch(model_path)
self.hf_gen_cfg = get_hf_gen_cfg(model_path, trust_remote_code=trust_remote_code)
self.arch, self.hf_cfg = get_model_arch(model_path, trust_remote_code=trust_remote_code)
self.session_len = (_get_and_verify_max_len(self.hf_cfg, None)
if backend_config.session_len is None else backend_config.session_len)
backend_config.session_len = self.session_len
if speculative_config is not None and backend == 'turbomind':
logger.warning('speculative decoding is not supported by turbomind ')
# build backend engine
if backend == 'turbomind':
self.engine = self._build_turbomind(model_path=model_path, backend_config=backend_config, **kwargs)
self.engine = self._build_turbomind(model_path=model_path,
backend_config=backend_config,
trust_remote_code=trust_remote_code,
**kwargs)
elif backend == 'pytorch':
self.engine = self._build_pytorch(model_path=model_path,
backend_config=backend_config,
trust_remote_code=trust_remote_code,
speculative_config=speculative_config,
**kwargs)
else:
Expand Down Expand Up @@ -169,19 +174,30 @@ def __enter__(self):
def __exit__(self, exc_type, exc_value, traceback):
self.close()

def _build_turbomind(self, model_path: str, backend_config: TurbomindEngineConfig | None = None, **kwargs):
def _build_turbomind(self,
model_path: str,
backend_config: TurbomindEngineConfig | None = None,
trust_remote_code: bool = False,
**kwargs):
"""Inner build method for turbomind backend."""
from lmdeploy import turbomind as tm
return tm.TurboMind.from_pretrained(model_path, engine_config=backend_config, **kwargs)
return tm.TurboMind.from_pretrained(model_path,
engine_config=backend_config,
trust_remote_code=trust_remote_code,
**kwargs)

def _build_pytorch(self,
model_path: str,
backend_config: PytorchEngineConfig | None = None,
trust_remote_code: bool = False,
speculative_config: SpeculativeConfig | None = None,
**kwargs):
"""Inner build method for pytorch backend."""
from lmdeploy.pytorch.engine import Engine
return Engine.from_pretrained(model_path, engine_config=backend_config, speculative_config=speculative_config)
return Engine.from_pretrained(model_path,
engine_config=backend_config,
trust_remote_code=trust_remote_code,
speculative_config=speculative_config)

def _build_stat_loggers(self):
self.stat_loggers = []
Expand Down
13 changes: 11 additions & 2 deletions lmdeploy/serve/core/vl_async_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def __init__(self,
backend: Literal['turbomind', 'pytorch'] = 'turbomind',
backend_config: TurbomindEngineConfig | PytorchEngineConfig | None = None,
vision_config: VisionConfig | None = None,
trust_remote_code: bool = False,
**kwargs) -> None:
from lmdeploy.serve.processors import MultimodalProcessor
from lmdeploy.utils import try_import_deeplink
Expand All @@ -27,8 +28,16 @@ def __init__(self,
if backend_config and backend_config.enable_prefix_caching:
backend_config.enable_prefix_caching = False
logger.warning('Prefix caching is disabled since LMDeploy hasn\'t support in on VL models yet')
self.vl_encoder = ImageEncoder(model_path, backend, vision_config, backend_config=backend_config)
super().__init__(model_path, backend=backend, backend_config=backend_config, **kwargs)
self.vl_encoder = ImageEncoder(model_path,
backend,
vision_config,
backend_config=backend_config,
trust_remote_code=trust_remote_code)
super().__init__(model_path,
backend=backend,
backend_config=backend_config,
trust_remote_code=trust_remote_code,
**kwargs)
# Update prompt_processor to support multimodal processing
self.prompt_processor = MultimodalProcessor(self.tokenizer,
self.chat_template,
Expand Down
4 changes: 3 additions & 1 deletion lmdeploy/serve/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -1415,6 +1415,7 @@ def serve(model_path: str,
max_log_len: int | None = None,
disable_fastapi_docs: bool = False,
max_concurrent_requests: int | None = None,
trust_remote_code: bool = False,
reasoning_parser: str | None = None,
tool_call_parser: str | None = None,
allow_terminate_by_client: bool = False,
Expand Down Expand Up @@ -1487,7 +1488,7 @@ def serve(model_path: str,
http_or_https = 'https'

handle_torchrun()
_, pipeline_class = get_task(backend, model_path)
_, pipeline_class = get_task(backend, model_path, trust_remote_code=trust_remote_code)
if isinstance(backend_config, PytorchEngineConfig):
backend_config.enable_mp_engine = True
# router replay
Expand All @@ -1499,6 +1500,7 @@ def serve(model_path: str,
backend_config=backend_config,
chat_template_config=chat_template_config,
max_log_len=max_log_len,
trust_remote_code=trust_remote_code,
speculative_config=speculative_config,
**kwargs)
# set reasoning parser and tool parser
Expand Down
Loading
Loading