Your current environment

The following error occurs when setting --pipeline_parallel_size 2 while deploying a model inference service with the vLLM Docker image.
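For readability, here is the launch command with its abbreviated flags spelled out. This is a best-effort reconstruction from the args Namespace in the log below, where -p 2 appears to have resolved to pipeline_parallel_size=2 and --served-model to served_model_name=['qwen']; the long flag spellings are the standard vLLM CLI names and are assumed to work the same way in this build:

vllm serve /models/checkpoint-500 --served-model-name qwen --port 8000 --pipeline-parallel-size 2

The full log: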
root@76c63b1ab933:/workspace# vllm serve /models/checkpoint-500 --served-model qwen --port 8000 -p 2
INFO 12-26 02:09:12 api_server.py:643] vLLM API server version 0.1.dev3820+g27244b2.d20241225
INFO 12-26 02:09:12 api_server.py:644] args: Namespace(subparser='serve', model_tag='/models/checkpoint-500', config='', host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_auto_tool_choice=False, tool_call_parser=None, tool_parser_plugin='', model='/models/checkpoint-500', task='auto', tokenizer=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=False, allowed_local_media_path=None, download_dir=None, load_format='auto', config_format=<ConfigFormat.AUTO: 'auto'>, dtype='auto', kv_cache_dtype='auto', quantization_param_path=None, max_model_len=None, guided_decoding_backend='xgrammar', logits_processor_pattern=None, distributed_executor_backend=None, worker_use_ray=False, pipeline_parallel_size=2, tensor_parallel_size=1, max_parallel_loading_workers=None, ray_workers_use_nsight=False, block_size=16, enable_prefix_caching=None, disable_sliding_window=False, use_v2_block_manager=True, num_lookahead_slots=0, seed=0, swap_space=4, cpu_offload_gb=0, gpu_memory_utilization=0.9, num_gpu_blocks_override=None, max_num_batched_tokens=None, max_num_seqs=None, max_logprobs=20, disable_log_stats=False, quantization=None, rope_scaling=None, rope_theta=None, hf_overrides=None, enforce_eager=False, max_seq_len_to_capture=8192, disable_custom_all_reduce=False, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config=None, limit_mm_per_prompt=None, mm_processor_kwargs=None, mm_cache_preprocessor=False, enable_lora=False, enable_lora_bias=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', long_lora_scaling_factors=None, max_cpu_loras=None, fully_sharded_loras=False, enable_prompt_adapter=False, max_prompt_adapters=1, max_prompt_adapter_token=0, device='auto', num_scheduler_steps=1, multi_step_stream_outputs=True, scheduler_delay_factor=0.0, enable_chunked_prefill=None, speculative_model=None, speculative_model_quantization=None, num_speculative_tokens=None, speculative_disable_mqa_scorer=False, speculative_draft_tensor_parallel_size=None, speculative_max_model_len=None, speculative_disable_by_batch_size=None, ngram_prompt_lookup_max=None, ngram_prompt_lookup_min=None, spec_decoding_acceptance_method='rejection_sampler', typical_acceptance_sampler_posterior_threshold=None, typical_acceptance_sampler_posterior_alpha=None, disable_logprobs_during_spec_decoding=None, model_loader_extra_config=None, ignore_patterns=[], preemption_mode=None, served_model_name=['qwen'], qlora_adapter_name_or_path=None, otlp_traces_endpoint=None, collect_detailed_traces=None, disable_async_output_proc=False, scheduling_policy='fcfs', override_neuron_config=None, override_pooler_config=None, compilation_config=None, kv_transfer_config=None, worker_cls='auto', disable_log_requests=False, max_log_len=None, disable_fastapi_docs=False, enable_prompt_tokens_details=False, dispatch_function=<function serve at 0xfffe8754cb80>)
INFO 12-26 02:09:23 config.py:451] This model supports multiple tasks: {'reward', 'score', 'generate', 'classify', 'embed'}. Defaulting to 'generate'.
INFO 12-26 02:09:23 config.py:1122] Defaulting to use mp for distributed inference
WARNING 12-26 02:09:23 config.py:569] Async output processing can not be enabled with pipeline parallel
INFO 12-26 02:09:23 importing.py:15] Triton not installed or not compatible; certain GPU-related functions will not be available.
INFO 12-26 02:09:23 llm_engine.py:249] Initializing an LLM engine (v0.1.dev3820+g27244b2.d20241225) with config: model='/models/checkpoint-500', speculative_config=None, tokenizer='/models/checkpoint-500', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=2, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=npu, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=qwen, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=False, mm_cache_preprocessor=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"candidate_compile_sizes":[],"compile_sizes":[],"capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False,
WARNING 12-26 02:09:23 multiproc_worker_utils.py:312] Reducing Torch parallelism from 192 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.
INFO 12-26 02:09:25 selector.py:217] Cannot use _Backend.FLASH_ATTN backend on NPU.
INFO 12-26 02:09:25 selector.py:167] Using ASCEND backend.
INFO 12-26 02:09:31 importing.py:15] Triton not installed or not compatible; certain GPU-related functions will not be available.
(VllmWorkerProcess pid=9243) INFO 12-26 02:09:32 selector.py:217] Cannot use _Backend.FLASH_ATTN backend on NPU.
(VllmWorkerProcess pid=9243) INFO 12-26 02:09:32 selector.py:167] Using ASCEND backend.
(VllmWorkerProcess pid=9243) INFO 12-26 02:09:32 multiproc_worker_utils.py:222] Worker ready; awaiting tasks
INFO 12-26 02:09:49 model_runner.py:1092] Starting to load model /models/checkpoint-500...
(VllmWorkerProcess pid=9243) INFO 12-26 02:09:49 model_runner.py:1092] Starting to load model /models/checkpoint-500...
Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:01<00:00, 1.17s/it]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:01<00:00, 1.17s/it]
(VllmWorkerProcess pid=9243) INFO 12-26 02:10:01 model_runner.py:1097] Loading model weights took 2.9023 GB
INFO 12-26 02:10:01 model_runner.py:1097] Loading model weights took 2.9023 GB
[rank0]:[W1226 02:10:01.566394193 compiler_depend.ts:659] Warning: 0Failed to find function aclrtSynchronizeDeviceWithTimeout (function operator())
[rank1]:[W1226 02:10:01.568054145 compiler_depend.ts:659] Warning: 0Failed to find function aclrtSynchronizeDeviceWithTimeout (function operator())
/usr/local/python3.9/lib/python3.9/site-packages/vllm/worker/npu_model_runner.py:119: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
input_positions_tensor = torch.tensor(
(VllmWorkerProcess pid=9243) /usr/local/python3.9/lib/python3.9/site-packages/vllm/worker/npu_model_runner.py:119: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
(VllmWorkerProcess pid=9243) input_positions_tensor = torch.tensor(
(VllmWorkerProcess pid=9243) INFO 12-26 02:10:05 model_runner_base.py:120] Writing input of failed execution to /tmp/err_execute_model_input_20241226-021005.pkl...
(VllmWorkerProcess pid=9243) ('Warning: torch.save with "_use_new_zipfile_serialization = False" is not recommended for npu tensor, which may bring unexpected errors and hopefully set "_use_new_zipfile_serialization = True"', 'if it is necessary to use this, please convert the npu tensor to cpu tensor for saving')
(VllmWorkerProcess pid=9243) INFO 12-26 02:10:08 model_runner_base.py:149] Completed writing input of failed execution to /tmp/err_execute_model_input_20241226-021005.pkl.
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] Exception in worker VllmWorkerProcess while processing method determine_num_available_blocks.
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] Traceback (most recent call last):
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/worker/model_runner_base.py", line 116, in _wrapper
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] return func(*args, **kwargs)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/worker/model_runner.py", line 1683, in execute_model
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] hidden_or_intermediate_states = model_executable(
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] return self._call_impl(*args, **kwargs)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] return forward_call(*args, **kwargs)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/model_executor/models/qwen2_vl.py", line 1373, in forward
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] hidden_states = self.language_model.model(
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/compilation/decorators.py", line 168, in __call__
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] return self.forward(*args, **kwargs)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/model_executor/models/qwen2.py", line 340, in forward
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] hidden_states, residual = layer(
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] return self._call_impl(*args, **kwargs)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] return forward_call(*args, **kwargs)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/model_executor/models/qwen2.py", line 247, in forward
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] hidden_states = self.self_attn(
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] return self._call_impl(*args, **kwargs)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] return forward_call(*args, **kwargs)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/model_executor/models/qwen2.py", line 175, in forward
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] q, k = self.rotary_emb(positions, q, k)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] return self._call_impl(*args, **kwargs)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] return forward_call(*args, **kwargs)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/model_executor/layers/rotary_embedding.py", line 825, in forward
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] query = query.view(num_tokens, -1, self.head_size)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] RuntimeError: shape '[32773, -1, 128]' is invalid for input of size 50331648
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236]
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] The above exception was the direct cause of the following exception:
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236]
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] Traceback (most recent call last):
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/executor/multiproc_worker_utils.py", line 230, in _run_worker_process
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] output = executor(*args, **kwargs)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] return func(*args, **kwargs)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/worker/npu_worker.py", line 134, in determine_num_available_blocks
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] self.model_runner.profile_run()
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] return func(*args, **kwargs)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/worker/npu_model_runner.py", line 337, in profile_run
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] self.execute_model(model_input, kv_caches, intermediate_tensors)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] return func(*args, **kwargs)
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/worker/model_runner_base.py", line 152, in _wrapper
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] raise type(err)(
(VllmWorkerProcess pid=9243) ERROR 12-26 02:10:08 multiproc_worker_utils.py:236] RuntimeError: Error in model execution (input dumped to /tmp/err_execute_model_input_20241226-021005.pkl): shape '[32773, -1, 128]' is invalid for input of size 50331648
[rank0]: Traceback (most recent call last):
[rank0]: File "/usr/local/python3.9/bin/vllm", line 8, in <module>
[rank0]: sys.exit(main())
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/scripts.py", line 201, in main
[rank0]: args.dispatch_function(args)
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/scripts.py", line 42, in serve
[rank0]: uvloop.run(run_server(args))
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/uvloop/__init__.py", line 82, in run
[rank0]: return loop.run_until_complete(wrapper())
[rank0]: File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/uvloop/__init__.py", line 61, in wrapper
[rank0]: return await main
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/entrypoints/openai/api_server.py", line 667, in run_server
[rank0]: async with build_async_engine_client(args) as engine_client:
[rank0]: File "/usr/local/python3.9/lib/python3.9/contextlib.py", line 181, in __aenter__
[rank0]: return await self.gen.__anext__()
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/entrypoints/openai/api_server.py", line 117, in build_async_engine_client
[rank0]: async with build_async_engine_client_from_engine_args(
[rank0]: File "/usr/local/python3.9/lib/python3.9/contextlib.py", line 181, in __aenter__
[rank0]: return await self.gen.__anext__()
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/entrypoints/openai/api_server.py", line 152, in build_async_engine_client_from_engine_args
[rank0]: engine_client = await asyncio.get_running_loop().run_in_executor(
[rank0]: File "/usr/local/python3.9/lib/python3.9/concurrent/futures/thread.py", line 58, in run
[rank0]: result = self.fn(*self.args, **self.kwargs)
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/engine/async_llm_engine.py", line 719, in from_engine_args
[rank0]: engine = cls(
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/engine/async_llm_engine.py", line 594, in __init__
[rank0]: self.engine = self._engine_class(*args, **kwargs)
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/engine/async_llm_engine.py", line 267, in __init__
[rank0]: super().__init__(*args, **kwargs)
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/engine/llm_engine.py", line 291, in __init__
[rank0]: self._initialize_kv_caches()
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/engine/llm_engine.py", line 431, in _initialize_kv_caches
[rank0]: self.model_executor.determine_num_available_blocks())
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/executor/distributed_gpu_executor.py", line 39, in determine_num_available_blocks
[rank0]: num_blocks = self._run_workers("determine_num_available_blocks", )
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/executor/multiproc_gpu_executor.py", line 161, in _run_workers
[rank0]: ] + [output.get() for output in worker_outputs]
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/executor/multiproc_gpu_executor.py", line 161, in <listcomp>
[rank0]: ] + [output.get() for output in worker_outputs]
[rank0]: File "/usr/local/python3.9/lib/python3.9/site-packages/vllm/executor/multiproc_worker_utils.py", line 61, in get
[rank0]: raise self.result.exception
[rank0]: RuntimeError: Error in model execution (input dumped to /tmp/err_execute_model_input_20241226-021005.pkl): shape '[32773, -1, 128]' is invalid for input of size 50331648
[ERROR] 2024-12-26-02:10:23 (PID:9043, Device:0, RankID:-1) ERR99999 UNKNOWN applicaiton exception
ERROR 12-26 02:10:26 multiproc_worker_utils.py:123] Worker VllmWorkerProcess pid 9243 died, exit code: -15
INFO 12-26 02:10:26 multiproc_worker_utils.py:127] Killing local vLLM worker processes
Process ForkServerProcess-1:7:
Process ForkServerProcess-1:4:
Process ForkServerProcess-1:9:
Process ForkServerProcess-1:6:
Process ForkServerProcess-1:5:
Process ForkServerProcess-1:8:
Process ForkServerProcess-1:2:
Process ForkServerProcess-1:3:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
raise exp
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
raise exp
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
func(*args, **kwargs)
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
func(*args, **kwargs)
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
raise exp
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
key, func_name, detail = resource_proxy[TASK_QUEUE].get()
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
key, func_name, detail = resource_proxy[TASK_QUEUE].get()
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
func(*args, **kwargs)
File "", line 2, in get
Traceback (most recent call last):
File "", line 2, in get
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
key, func_name, detail = resource_proxy[TASK_QUEUE].get()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
kind, result = conn.recv()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
kind, result = conn.recv()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "", line 2, in get
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 250, in recv
buf = self._recv_bytes()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 250, in recv
buf = self._recv_bytes()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/python3.9/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
kind, result = conn.recv()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
raise exp
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 250, in recv
buf = self._recv_bytes()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 383, in _recv
raise EOFError
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 383, in _recv
raise EOFError
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
func(*args, **kwargs)
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/python3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
EOFError
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
raise exp
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
raise exp
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
key, func_name, detail = resource_proxy[TASK_QUEUE].get()
EOFError
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
raise exp
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 65, in wrapper
raise exp
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 383, in _recv
raise EOFError
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
func(*args, **kwargs)
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
func(*args, **kwargs)
File "", line 2, in get
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
func(*args, **kwargs)
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 62, in wrapper
func(*args, **kwargs)
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
key, func_name, detail = resource_proxy[TASK_QUEUE].get()
EOFError
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
key, func_name, detail = resource_proxy[TASK_QUEUE].get()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
kind, result = conn.recv()
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
key, func_name, detail = resource_proxy[TASK_QUEUE].get()
File "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/tbe/common/repository_manager/route.py", line 262, in task_distribute
key, func_name, detail = resource_proxy[TASK_QUEUE].get()
File "", line 2, in get
File "", line 2, in get
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 250, in recv
buf = self._recv_bytes()
File "", line 2, in get
File "", line 2, in get
File "/usr/local/python3.9/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
kind, result = conn.recv()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
kind, result = conn.recv()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/usr/local/python3.9/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
kind, result = conn.recv()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/managers.py", line 810, in _callmethod
kind, result = conn.recv()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 250, in recv
buf = self._recv_bytes()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 250, in recv
buf = self._recv_bytes()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 383, in _recv
raise EOFError
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 250, in recv
buf = self._recv_bytes()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 250, in recv
buf = self._recv_bytes()
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 383, in _recv
raise EOFError
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
EOFError
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 383, in _recv
raise EOFError
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 383, in _recv
raise EOFError
EOFError
File "/usr/local/python3.9/lib/python3.9/multiprocessing/connection.py", line 383, in _recv
raise EOFError
EOFError
EOFError
EOFError
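A quick sanity check on the failing view in rotary_embedding.py, as a sketch only (the 12 x 128 factorization is an assumption consistent with a Qwen2-VL-2B-style config, not something read from this checkpoint's config.json):

# The view fails because the token count and the query buffer disagree.
query_numel = 50331648              # size reported in the RuntimeError
head_size = 128                     # from the failing shape [32773, -1, 128]
num_heads = 12                      # assumed; gives an exact factorization
print(query_numel // (num_heads * head_size))  # 32768, i.e. max_seq_len
print(32773 - 32768)                           # 5 surplus position entries

So the query buffer covers exactly 32768 tokens (the max_seq_len used by the profiling run), while the positions tensor handed to rotary_emb has 32773 entries. Since the error only shows up with --pipeline_parallel_size 2, the NPU profile_run seems to build inconsistent positions/query inputs for this Qwen2-VL checkpoint once the model is split across pipeline stages.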
How would you like to use vllm
I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm.
Before submitting a new issue...
Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the documentation page, which can answer lots of frequently asked questions.
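One extra note for whoever triages this: the worker dumped the failing input to /tmp/err_execute_model_input_20241226-021005.pkl before raising. A minimal sketch for inspecting that dump, assuming it deserializes with torch.load (the torch.save warning in the log suggests that is how it was written) and that this vLLM build plus torch_npu are importable:

import torch

# Load the dumped model input on CPU so no NPU device is needed.
path = "/tmp/err_execute_model_input_20241226-021005.pkl"
dumped = torch.load(path, map_location="cpu")

# Printing the object should expose the input_tokens / input_positions
# tensors, where the 32773-vs-32768 mismatch can be confirmed directly.
print(type(dumped))
print(dumped)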