{"id":10289,"date":"2026-04-18T09:24:44","date_gmt":"2026-04-18T07:24:44","guid":{"rendered":"https:\/\/myoceane.fr\/?p=10289"},"modified":"2026-04-18T09:53:53","modified_gmt":"2026-04-18T07:53:53","slug":"ai-vllm-a100-gemma-4","status":"publish","type":"post","link":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/","title":{"rendered":"[AI] vLLM \u90e8\u7f72\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100"},"content":{"rendered":"<div id=\"fb-root\"><\/div>\n\n<p>\u8a18\u9304\u4e0d\u540c\u5730\u7aef LLM \u6a21\u578b\u5728 Nvidia A100 \u4e0a\u914d\u5408\u4e0d\u540c\u53c3\u6578\u7684\u6548\u679c\uff0c\u85c9\u6b64\u8b93\u81ea\u5df1\u7406\u89e3\u4e0d\u540c LLM Model \u8207\u914d\u5408\u786c\u9ad4\u7522\u751f\u7684\u5dee\u7570\u3002<\/p>\n\n\n<h3>Gemma-4-26B-A4B<\/h3>\n<p>\u90e8\u7f72\u6307\u4ee4<\/p>\n<pre class=\"lang:bash\">vllm serve google\/gemma-4-26B-A4B \\\n  --port 10000 \\\n  --max-model-len 131072 \\\n  --enforce-eager \\\n  --enable-chunked-prefill \\\n  --enable-auto-tool-choice \\\n  --reasoning-parser gemma4 \\\n  --tool-call-parser gemma4 \\\n  --chat-template \/home\/vllm\/examples\/tool_chat_template_gemma4.jinja \\\n  --gpu-memory-utilization 0.9<\/pre>\n<p>\u90e8\u7f72\u7cfb\u7d71\u8a0a\u606f\uff1a<br \/>Model loading took <strong>48.5 GiB<\/strong> memory and <strong>10.090812 seconds<\/strong><br \/>Available KV cache memory: <strong>21.5 GiB<\/strong><\/p>\n<pre class=\"lang:bash\">(.venv) root@run-cvvmqstioi240l5-0:~# vllm serve google\/gemma-4-26B-A4B   --port 10000   --max-model-len 131072   --enforce-eager   --enable-chunked-prefill   --enable-auto-tool-choice   --reasoning-parser gemma4   --tool-call-parser gemma4   --chat-template \/home\/vllm\/examples\/tool_chat_template_gemma4.jinja   --gpu-memory-utilization 0.9\n(APIServer pid=10793) INFO 04-17 14:03:07 [utils.py:299] \n(APIServer pid=10793) INFO 04-17 14:03:07 [utils.py:299]        \u2588     \u2588     \u2588\u2584   \u2584\u2588\n(APIServer pid=10793) INFO 04-17 14:03:07 
[utils.py:299]  \u2584\u2584 \u2584\u2588 \u2588     \u2588     \u2588 \u2580\u2584\u2580 \u2588  version 0.19.0\n(APIServer pid=10793) INFO 04-17 14:03:07 [utils.py:299]   \u2588\u2584\u2588\u2580 \u2588     \u2588     \u2588     \u2588  model   google\/gemma-4-26B-A4B\n(APIServer pid=10793) INFO 04-17 14:03:07 [utils.py:299]    \u2580\u2580  \u2580\u2580\u2580\u2580\u2580 \u2580\u2580\u2580\u2580\u2580 \u2580     \u2580\n(APIServer pid=10793) INFO 04-17 14:03:07 [utils.py:299] \n(APIServer pid=10793) INFO 04-17 14:03:07 [utils.py:233] non-default args: {'model_tag': 'google\/gemma-4-26B-A4B', 'chat_template': '\/home\/vllm\/examples\/tool_chat_template_gemma4.jinja', 'enable_auto_tool_choice': True, 'tool_call_parser': 'gemma4', 'port': 10000, 'model': 'google\/gemma-4-26B-A4B', 'max_model_len': 131072, 'enforce_eager': True, 'reasoning_parser': 'gemma4', 'enable_chunked_prefill': True}\n(APIServer pid=10793) INFO 04-17 14:03:08 [model.py:549] Resolved architecture: Gemma4ForConditionalGeneration\n(APIServer pid=10793) INFO 04-17 14:03:08 [model.py:1678] Using max model len 131072\n(APIServer pid=10793) INFO 04-17 14:03:08 [config.py:104] Gemma4 model has heterogeneous head dimensions (head_dim=256, global_head_dim=512). Forcing TRITON_ATTN backend to prevent mixed-backend numerical divergence.\n(APIServer pid=10793) INFO 04-17 14:03:08 [vllm.py:790] Asynchronous scheduling is enabled.\n(APIServer pid=10793) WARNING 04-17 14:03:08 [vllm.py:848] Enforce eager set, disabling torch.compile and CUDAGraphs. 
This is equivalent to setting -cc.mode=none -cc.cudagraph_mode=none\n(APIServer pid=10793) WARNING 04-17 14:03:08 [vllm.py:859] Inductor compilation was disabled by user settings, optimizations settings that are only active during inductor compilation will be ignored.\n(APIServer pid=10793) INFO 04-17 14:03:08 [vllm.py:1025] Cudagraph is disabled under eager mode\n(APIServer pid=10793) INFO 04-17 14:03:08 [compilation.py:290] Enabled custom fusions: norm_quant, act_quant\n(EngineCore pid=10853) INFO 04-17 14:03:19 [core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google\/gemma-4-26B-A4B', speculative_config=None, tokenizer='google\/gemma-4-26B-A4B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='gemma4', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google\/gemma-4-26B-A4B, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': &lt;CompilationMode.NONE: 0&gt;, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 
'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [2048], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': &lt;CUDAGraphMode.NONE: 0&gt;, 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': True, 'fuse_act_quant': True, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 0, 'dynamic_shapes_config': {'type': &lt;DynamicShapesType.BACKED: 'backed'&gt;, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}\n(EngineCore pid=10853) WARNING 04-17 14:03:19 [network_utils.py:36] The environment variable HOST_IP is deprecated and ignored, as it is often used by Docker and other software to interact with the container's network stack. 
Please use VLLM_HOST_IP instead to set the IP address for vLLM processes to communicate with each other.\n(EngineCore pid=10853) INFO 04-17 14:03:22 [parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp:\/\/10.240.1.237:39671 backend=nccl\n(EngineCore pid=10853) INFO 04-17 14:03:22 [parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N\/A\n(EngineCore pid=10853) INFO 04-17 14:03:23 [gpu_model_runner.py:4735] Starting to load model google\/gemma-4-26B-A4B...\n(EngineCore pid=10853) INFO 04-17 14:03:23 [vllm.py:790] Asynchronous scheduling is enabled.\n(EngineCore pid=10853) WARNING 04-17 14:03:23 [vllm.py:848] Enforce eager set, disabling torch.compile and CUDAGraphs. This is equivalent to setting -cc.mode=none -cc.cudagraph_mode=none\n(EngineCore pid=10853) WARNING 04-17 14:03:23 [vllm.py:859] Inductor compilation was disabled by user settings, optimizations settings that are only active during inductor compilation will be ignored.\n(EngineCore pid=10853) INFO 04-17 14:03:23 [vllm.py:1025] Cudagraph is disabled under eager mode\n(EngineCore pid=10853) INFO 04-17 14:03:23 [compilation.py:290] Enabled custom fusions: norm_quant, act_quant\n(EngineCore pid=10853) INFO 04-17 14:03:23 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.\n(EngineCore pid=10853) INFO 04-17 14:03:23 [unquantized.py:186] Using TRITON backend for Unquantized MoE\n(EngineCore pid=10853) INFO 04-17 14:03:23 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.\nLoading safetensors checkpoint shards:   0% Completed | 0\/2 [00:00&lt;?, ?it\/s]\nLoading safetensors checkpoint shards:  50% Completed | 1\/2 [00:06&lt;00:06,  6.77s\/it]\nLoading safetensors checkpoint shards: 100% Completed | 2\/2 [00:08&lt;00:00,  3.93s\/it]\nLoading safetensors checkpoint shards: 100% Completed | 2\/2 [00:08&lt;00:00,  4.36s\/it]\n(EngineCore pid=10853) \n(EngineCore pid=10853) INFO 04-17 
14:03:33 [default_loader.py:384] Loading weights took 8.77 seconds\n(EngineCore pid=10853) INFO 04-17 14:03:33 [gpu_model_runner.py:4820] Model loading took 48.5 GiB memory and 10.090812 seconds\n(EngineCore pid=10853) INFO 04-17 14:03:34 [gpu_model_runner.py:5753] Encoder cache will be initialized with a budget of 2496 tokens, and profiled with 1 video items of the maximum feature size.\n(EngineCore pid=10853) WARNING 04-17 14:03:50 [fused_moe.py:1090] Using default MoE config. Performance might be sub-optimal! Config file not found at \/home\/.venv\/lib\/python3.12\/site-packages\/vllm\/model_executor\/layers\/fused_moe\/configs\/E=128,N=704,device_name=NVIDIA_A100_80GB_PCIe.json\n(EngineCore pid=10853) INFO 04-17 14:03:51 [gpu_worker.py:436] Available KV cache memory: 21.5 GiB\n(EngineCore pid=10853) INFO 04-17 14:03:51 [kv_cache_utils.py:1319] GPU KV cache size: 93,936 tokens\n(EngineCore pid=10853) INFO 04-17 14:03:51 [kv_cache_utils.py:1324] Maximum concurrency for 131,072 tokens per request: 6.96x\n(EngineCore pid=10853) INFO 04-17 14:03:51 [core.py:283] init engine (profile, create kv cache, warmup model) took 17.49 seconds\n(EngineCore pid=10853) WARNING 04-17 14:03:51 [vllm.py:848] Enforce eager set, disabling torch.compile and CUDAGraphs. 
This is equivalent to setting -cc.mode=none -cc.cudagraph_mode=none\n(EngineCore pid=10853) WARNING 04-17 14:03:51 [vllm.py:859] Inductor compilation was disabled by user settings, optimizations settings that are only active during inductor compilation will be ignored.\n(EngineCore pid=10853) INFO 04-17 14:03:51 [vllm.py:1025] Cudagraph is disabled under eager mode\n(APIServer pid=10793) INFO 04-17 14:03:51 [api_server.py:590] Supported tasks: ['generate']\n(APIServer pid=10793) INFO 04-17 14:03:51 [parser_manager.py:202] \"auto\" tool choice has been enabled.\n(APIServer pid=10793) WARNING 04-17 14:03:51 [model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 1.0, 'top_k': 64, 'top_p': 0.95}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.\n(APIServer pid=10793) INFO 04-17 14:03:52 [hf.py:314] Detected the chat template content format to be 'openai'. You can set `--chat-template-content-format` to override this.\n(APIServer pid=10793) INFO 04-17 14:04:07 [base.py:231] Multi-modal warmup completed in 15.030s\n(APIServer pid=10793) INFO 04-17 14:04:07 [api_server.py:594] Starting vLLM server on http:\/\/0.0.0.0:10000\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:37] Available routes are:\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/openapi.json, Methods: GET, HEAD\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/docs, Methods: GET, HEAD\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/docs\/oauth2-redirect, Methods: GET, HEAD\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/redoc, Methods: GET, HEAD\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/tokenize, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/detokenize, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/load, 
Methods: GET\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/version, Methods: GET\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/health, Methods: GET\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/metrics, Methods: GET\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/models, Methods: GET\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/ping, Methods: GET\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/ping, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/invocations, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/chat\/completions, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/chat\/completions\/batch, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/responses, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/responses\/{response_id}, Methods: GET\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/responses\/{response_id}\/cancel, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/completions, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/messages, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/messages\/count_tokens, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/inference\/v1\/generate, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/scale_elastic_ep, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/is_scaling_elastic_ep, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/chat\/completions\/render, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] 
Route: \/v1\/completions\/render, Methods: POST\n(APIServer pid=10793) INFO:     Started server process [10793]\n(APIServer pid=10793) INFO:     Waiting for application startup.\n(APIServer pid=10793) INFO:     Application startup complete.\n(APIServer pid=10793) INFO 04-17 14:04:18 [loggers.py:259] Engine 000: Avg prompt throughput: 0.8 tokens\/s, Avg generation throughput: 1.9 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:04:28 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.6 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.3%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:04:38 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.4%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:04:48 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.4 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.6%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:04:58 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.8%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:05:08 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.2 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.0%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:05:18 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.4 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.0%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:05:28 [loggers.py:259] Engine 000: 
Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.0%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:05:38 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.3 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:05:48 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.6 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:05:58 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.7 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:06:08 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.6 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:06:18 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:06:28 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.4 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:06:38 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.2%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:06:48 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.8 
tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.2%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:06:58 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.6 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.2%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:07:08 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.2%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:07:18 [loggers.py:259] Engine 000: Avg prompt throughput: 2.1 tokens\/s, Avg generation throughput: 17.3 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:07:28 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.2%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:07:38 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.2 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.4%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:07:48 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.3 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.6%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:07:58 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.3 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.8%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:08:08 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.4 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.0%, 
Prefix cache hit rate: 0.0%\n(EngineCore pid=10853) INFO 04-17 14:08:10 [core.py:1210] Shutdown initiated (timeout=0)\n(EngineCore pid=10853) INFO 04-17 14:08:10 [core.py:1215] Aborting 1 requests\n(EngineCore pid=10853) INFO 04-17 14:08:10 [core.py:1233] Shutdown complete<\/pre>\n\n\n\n<h3>Gemma-4-31B<\/h3>\n<p>\u90e8\u7f72\u6307\u4ee4\uff1a<\/p>\n<pre class=\"lang:bash\">vllm serve google\/gemma-4-31B-it \\\n  --port 10000 \\\n  --max-model-len 131072 \\\n  --enable-chunked-prefill \\\n  --enable-auto-tool-choice \\\n  --reasoning-parser gemma4 \\\n  --tool-call-parser gemma4 \\\n  --chat-template \/home\/vllm\/examples\/tool_chat_template_gemma4.jinja \\\n  --gpu-memory-utilization 0.95<\/pre>\n<p>\u4ee5\u4e0b\u7cfb\u7d71\u8a0a\u606f\u986f\u793a\uff1a<br \/>Model loading took <strong>58.9 GiB<\/strong> memory and <strong>265.606047 seconds<\/strong>\uff08\u5176\u4e2d 254.38 \u79d2\u70ba\u9996\u6b21\u4e0b\u8f09\u6b0a\u91cd\uff09<br \/>Available KV cache memory: <strong>15.18 GiB<\/strong><\/p>\n<pre class=\"lang:bash\">(APIServer pid=12007) INFO 04-17 14:22:59 [utils.py:299] \n(APIServer pid=12007) INFO 04-17 14:22:59 [utils.py:299]        \u2588     \u2588     \u2588\u2584   \u2584\u2588\n(APIServer pid=12007) INFO 04-17 14:22:59 [utils.py:299]  \u2584\u2584 \u2584\u2588 \u2588     \u2588     \u2588 \u2580\u2584\u2580 \u2588  version 0.19.0\n(APIServer pid=12007) INFO 04-17 14:22:59 [utils.py:299]   \u2588\u2584\u2588\u2580 \u2588     \u2588     \u2588     \u2588  model   google\/gemma-4-31B-it\n(APIServer pid=12007) INFO 04-17 14:22:59 [utils.py:299]    \u2580\u2580  \u2580\u2580\u2580\u2580\u2580 \u2580\u2580\u2580\u2580\u2580 \u2580     \u2580\n(APIServer pid=12007) INFO 04-17 14:22:59 [utils.py:299] \n(APIServer pid=12007) INFO 04-17 14:22:59 [utils.py:233] non-default args: {'model_tag': 'google\/gemma-4-31B-it', 'chat_template': '\/home\/vllm\/examples\/tool_chat_template_gemma4.jinja', 'enable_auto_tool_choice': True, 'tool_call_parser': 'gemma4', 'port': 10000, 'model': 'google\/gemma-4-31B-it', 
'max_model_len': 131072, 'reasoning_parser': 'gemma4', 'gpu_memory_utilization': 0.95, 'enable_chunked_prefill': True}\n(APIServer pid=12007) INFO 04-17 14:23:00 [model.py:549] Resolved architecture: Gemma4ForConditionalGeneration\n(APIServer pid=12007) INFO 04-17 14:23:00 [model.py:1678] Using max model len 131072\n(APIServer pid=12007) INFO 04-17 14:23:00 [config.py:104] Gemma4 model has heterogeneous head dimensions (head_dim=256, global_head_dim=512). Forcing TRITON_ATTN backend to prevent mixed-backend numerical divergence.\n(APIServer pid=12007) INFO 04-17 14:23:00 [vllm.py:790] Asynchronous scheduling is enabled.\ntokenizer_config.json: 2.10kB [00:00, 13.9MB\/s]\ntokenizer.json: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 32.2M\/32.2M [00:00&lt;00:00, 38.8MB\/s]\nchat_template.jinja: 16.4kB [00:00, 60.6MB\/s]\ngeneration_config.json: 
100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 208\/208 [00:00&lt;00:00, 1.26MB\/s]\n(EngineCore pid=12103) INFO 04-17 14:23:12 [core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google\/gemma-4-31B-it', speculative_config=None, tokenizer='google\/gemma-4-31B-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='gemma4', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, 
kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google\/gemma-4-31B-it, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': &lt;CompilationMode.VLLM_COMPILE: 3&gt;, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [2048], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': &lt;CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)&gt;, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': 
False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': &lt;DynamicShapesType.BACKED: 'backed'&gt;, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}\n(EngineCore pid=12103) WARNING 04-17 14:23:12 [network_utils.py:36] The environment variable HOST_IP is deprecated and ignored, as it is often used by Docker and other software to interact with the container's network stack. Please use VLLM_HOST_IP instead to set the IP address for vLLM processes to communicate with each other.\n(EngineCore pid=12103) INFO 04-17 14:23:15 [parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp:\/\/10.240.1.237:44807 backend=nccl\n(EngineCore pid=12103) INFO 04-17 14:23:15 [parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N\/A, EPLB rank N\/A\n(EngineCore pid=12103) INFO 04-17 14:23:16 [gpu_model_runner.py:4735] Starting to load model google\/gemma-4-31B-it...\n(EngineCore pid=12103) INFO 04-17 14:23:16 [vllm.py:790] Asynchronous scheduling is enabled.\n(EngineCore pid=12103) INFO 04-17 14:23:16 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.\n(EngineCore pid=12103) INFO 04-17 14:23:16 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.\nmodel.safetensors.index.json: 120kB [00:00, 172MB\/s]\n(EngineCore pid=12103) INFO 04-17 14:27:31 [weight_utils.py:581] Time spent downloading weights for google\/gemma-4-31B-it: 254.377387 seconds\nLoading safetensors checkpoint shards:   0% Completed | 0\/2 [00:00&lt;?, ?it\/s]\nLoading safetensors checkpoint shards:  50% Completed | 1\/2 [00:06&lt;00:06,  6.03s\/it]\nLoading safetensors checkpoint shards: 100% Completed | 2\/2 [00:09&lt;00:00,  4.42s\/it]\nLoading safetensors checkpoint shards: 100% Completed | 2\/2 
[00:09&lt;00:00,  4.66s\/it]\n(EngineCore pid=12103) \n(EngineCore pid=12103) INFO 04-17 14:27:41 [default_loader.py:384] Loading weights took 9.81 seconds\n(EngineCore pid=12103) INFO 04-17 14:27:42 [gpu_model_runner.py:4820] Model loading took 58.9 GiB memory and 265.606047 seconds\n(EngineCore pid=12103) INFO 04-17 14:27:42 [gpu_model_runner.py:5753] Encoder cache will be initialized with a budget of 2496 tokens, and profiled with 1 video items of the maximum feature size.\n(EngineCore pid=12103) INFO 04-17 14:28:12 [backends.py:1051] Using cache directory: \/root\/.cache\/vllm\/torch_compile_cache\/7adb633e8c\/rank_0_0\/backbone for vLLM's torch.compile\n(EngineCore pid=12103) INFO 04-17 14:28:12 [backends.py:1111] Dynamo bytecode transform time: 13.96 s\n(EngineCore pid=12103) INFO 04-17 14:28:20 [backends.py:372] Cache the graph of compile range (1, 2048) for later use\n(EngineCore pid=12103) INFO 04-17 14:28:37 [backends.py:390] Compiling a graph for compile range (1, 2048) takes 24.11 s\n(EngineCore pid=12103) INFO 04-17 14:28:41 [decorators.py:640] saved AOT compiled function to \/root\/.cache\/vllm\/torch_compile_cache\/torch_aot_compile\/d173ae923688602925c8eea81edef979ba093c96f1721ef4b4e7b1eff5b17f9b\/rank_0_0\/model\n(EngineCore pid=12103) INFO 04-17 14:28:41 [monitor.py:48] torch.compile took 43.47 s in total\n(EngineCore pid=12103) INFO 04-17 14:28:43 [monitor.py:76] Initial profiling\/warmup run took 1.02 s\n(EngineCore pid=12103) INFO 04-17 14:28:43 [kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512\n(EngineCore pid=12103) INFO 04-17 14:28:43 [gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=35 (largest=256)\n(EngineCore pid=12103) INFO 04-17 14:28:47 [gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.86 GiB total\n(EngineCore pid=12103) INFO 04-17 14:28:48 [gpu_worker.py:436] Available KV cache memory: 15.18 GiB\n(EngineCore pid=12103) INFO 04-17 14:28:48 
[gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9608 to maintain the same effective KV cache size.\n(EngineCore pid=12103) INFO 04-17 14:28:48 [kv_cache_utils.py:1319] GPU KV cache size: 16,576 tokens\n(EngineCore pid=12103) INFO 04-17 14:28:48 [kv_cache_utils.py:1324] Maximum concurrency for 131,072 tokens per request: 1.23x\nCapturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 51\/51 [00:06&lt;00:00,  7.97it\/s]\nCapturing CUDA graphs (decode, FULL): 
100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 35\/35 [00:06&lt;00:00,  5.79it\/s]\n(EngineCore pid=12103) INFO 04-17 14:29:01 [gpu_model_runner.py:6046] Graph capturing finished in 13 secs, took 0.84 GiB\n(EngineCore pid=12103) INFO 04-17 14:29:01 [gpu_worker.py:597] CUDA graph pool memory: 0.84 GiB (actual), 0.86 GiB (estimated), difference: 0.02 GiB (2.6%).\n(EngineCore pid=12103) INFO 04-17 14:29:01 [core.py:283] init engine (profile, create kv cache, warmup model) took 79.33 seconds\n(APIServer pid=12007) INFO 04-17 14:29:02 [api_server.py:590] Supported tasks: ['generate']\n(APIServer pid=12007) INFO 04-17 14:29:02 [parser_manager.py:202] \"auto\" tool choice has been enabled.\n(APIServer pid=12007) WARNING 04-17 14:29:02 [model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 1.0, 'top_k': 64, 'top_p': 0.95}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.\n(APIServer pid=12007) INFO 04-17 14:29:02 [hf.py:314] Detected the chat template content format to be 'openai'. 
You can set `--chat-template-content-format` to override this.\n(APIServer pid=12007) INFO 04-17 14:29:17 [base.py:231] Multi-modal warmup completed in 15.017s\n(APIServer pid=12007) INFO 04-17 14:29:18 [api_server.py:594] Starting vLLM server on http:\/\/0.0.0.0:10000\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:37] Available routes are:\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/openapi.json, Methods: GET, HEAD\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/docs, Methods: GET, HEAD\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/docs\/oauth2-redirect, Methods: GET, HEAD\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/redoc, Methods: GET, HEAD\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/tokenize, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/detokenize, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/load, Methods: GET\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/version, Methods: GET\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/health, Methods: GET\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/metrics, Methods: GET\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/models, Methods: GET\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/ping, Methods: GET\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/ping, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/invocations, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/chat\/completions, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/chat\/completions\/batch, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/responses, Methods: POST\n(APIServer 
pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/responses\/{response_id}, Methods: GET\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/responses\/{response_id}\/cancel, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/completions, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/messages, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/messages\/count_tokens, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/inference\/v1\/generate, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/scale_elastic_ep, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/is_scaling_elastic_ep, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/chat\/completions\/render, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/completions\/render, Methods: POST\n(APIServer pid=12007) INFO:     Started server process [12007]\n(APIServer pid=12007) INFO:     Waiting for application startup.\n(APIServer pid=12007) INFO:     Application startup complete.\n(APIServer pid=12007) INFO 04-17 14:29:38 [loggers.py:259] Engine 000: Avg prompt throughput: 2.1 tokens\/s, Avg generation throughput: 9.6 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.7%, Prefix cache hit rate: 0.0%\n(APIServer pid=12007) INFO 04-17 14:29:48 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 23.2 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.9%, Prefix cache hit rate: 0.0%\n(APIServer pid=12007) INFO 04-17 14:29:58 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 22.9 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 3.3%, Prefix cache hit rate: 0.0%\n(APIServer 
pid=12007) INFO 04-17 14:30:08 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 22.9 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 4.5%, Prefix cache hit rate: 0.0%\n(APIServer pid=12007) INFO:     127.0.0.1:42834 - \"POST \/v1\/chat\/completions HTTP\/1.1\" 200 OK\n(APIServer pid=12007) INFO 04-17 14:30:18 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%\n(APIServer pid=12007) INFO 04-17 14:30:28 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 0.0 tokens\/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%<\/pre>","protected":false},"excerpt":{"rendered":"<p>\u8a18\u9304\u4e0d\u540c\u5730\u7aef LLM \u6a21\u578b\u5728 Nvidia A100 \u4e0a\u914d\u5408\u4e0d\u540c\u53c3\u6578\u7684\u6548\u679c\uff0c\u85c9\u6b64\u8b93\u81ea\u5df1\u7406\u89e3\u4e0d\u540c LLM Model \u8207\u914d\u5408\u786c\u9ad4\u7522\u751f\u7684\u5dee\u7570\u3002<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[9,1761,14],"tags":[2038,2039],"class_list":["post-10289","post","type-post","status-publish","format-standard","hentry","category-bigdata-ml","category-gpu","category-it-technology","tag-a100","tag-gemma-4"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v24.6 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>[AI] vLLM \u90e8\u7f72\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100 - \u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a M-Y-Oceane<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" 
href=\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/\" \/>\n<meta property=\"og:locale\" content=\"en_US\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"[AI] vLLM \u90e8\u7f72\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100 - \u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a M-Y-Oceane\" \/>\n<meta property=\"og:description\" content=\"\u8a18\u9304\u4e0d\u540c\u5730\u7aef LLM \u6a21\u578b\u5728 Nvidia A100 \u4e0a\u914d\u5408\u4e0d\u540c\u53c3\u6578\u7684\u6548\u679c\uff0c\u85c9\u6b64\u8b93\u81ea\u5df1\u7406\u89e3\u4e0d\u540c LLM Model \u8207\u914d\u5408\u786c\u9ad4\u7522\u751f\u7684\u5dee\u7570\u3002\" \/>\n<meta property=\"og:url\" content=\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/\" \/>\n<meta property=\"og:site_name\" content=\"\u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a M-Y-Oceane\" \/>\n<meta property=\"article:published_time\" content=\"2026-04-18T07:24:44+00:00\" \/>\n<meta property=\"article:modified_time\" content=\"2026-04-18T07:53:53+00:00\" \/>\n<meta name=\"author\" content=\"\u6ab8\u6aac\u7238\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"Written by\" \/>\n\t<meta name=\"twitter:data1\" content=\"\u6ab8\u6aac\u7238\" \/>\n\t<meta name=\"twitter:label2\" content=\"Est. 
reading time\" \/>\n\t<meta name=\"twitter:data2\" content=\"1 minute\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"Article\",\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#article\",\"isPartOf\":{\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/\"},\"author\":{\"name\":\"\u6ab8\u6aac\u7238\",\"@id\":\"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b\"},\"headline\":\"[AI] vLLM \u90e8\u7f72\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100\",\"datePublished\":\"2026-04-18T07:24:44+00:00\",\"dateModified\":\"2026-04-18T07:53:53+00:00\",\"mainEntityOfPage\":{\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/\"},\"wordCount\":39,\"commentCount\":0,\"publisher\":{\"@id\":\"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b\"},\"keywords\":[\"A100\",\"Gemma-4\"],\"articleSection\":[\"Big Data &amp; Machine Learning\",\"GPU\",\"IT Technology\"],\"inLanguage\":\"en-US\",\"potentialAction\":[{\"@type\":\"CommentAction\",\"name\":\"Comment\",\"target\":[\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#respond\"]}]},{\"@type\":\"WebPage\",\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/\",\"url\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/\",\"name\":\"[AI] vLLM \u90e8\u7f72\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100 - \u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a 
M-Y-Oceane\",\"isPartOf\":{\"@id\":\"https:\/\/myoceane.fr\/#website\"},\"datePublished\":\"2026-04-18T07:24:44+00:00\",\"dateModified\":\"2026-04-18T07:53:53+00:00\",\"breadcrumb\":{\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#breadcrumb\"},\"inLanguage\":\"en-US\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"Home\",\"item\":\"https:\/\/myoceane.fr\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"[AI] vLLM \u90e8\u7f72\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/myoceane.fr\/#website\",\"url\":\"https:\/\/myoceane.fr\/\",\"name\":\"M-Y-Oceane \u60f3\u65b9\u6d89\u6cd5\u3002\u91cf\u74f6\u5916\u7684\u5929\u7a7a\",\"description\":\"\u60f3\u65b9\u6d89\u6cd5, France, Taiwan, Health, Information 
Technology\",\"publisher\":{\"@id\":\"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b\"},\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/myoceane.fr\/?s={search_term_string}\"},\"query-input\":{\"@type\":\"PropertyValueSpecification\",\"valueRequired\":true,\"valueName\":\"search_term_string\"}}],\"inLanguage\":\"en-US\"},{\"@type\":[\"Person\",\"Organization\"],\"@id\":\"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b\",\"name\":\"\u6ab8\u6aac\u7238\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"https:\/\/myoceane.fr\/#\/schema\/person\/image\/\",\"url\":\"https:\/\/secure.gravatar.com\/avatar\/6cc678684664f8ad45a8d56a6630b183?s=96&d=mm&r=g\",\"contentUrl\":\"https:\/\/secure.gravatar.com\/avatar\/6cc678684664f8ad45a8d56a6630b183?s=96&d=mm&r=g\",\"caption\":\"\u6ab8\u6aac\u7238\"},\"logo\":{\"@id\":\"https:\/\/myoceane.fr\/#\/schema\/person\/image\/\"},\"url\":\"https:\/\/myoceane.fr\/index.php\/author\/johnny5584767gmail-com\/\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. 
-->","yoast_head_json":{"title":"[AI] vLLM \u90e8\u7f72\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100 - \u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a M-Y-Oceane","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/","og_locale":"en_US","og_type":"article","og_title":"[AI] vLLM \u90e8\u7f72\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100 - \u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a M-Y-Oceane","og_description":"\u8a18\u9304\u4e0d\u540c\u5730\u7aef LLM \u6a21\u578b\u5728 Nvidia A100 \u4e0a\u914d\u5408\u4e0d\u540c\u53c3\u6578\u7684\u6548\u679c\uff0c\u85c9\u6b64\u8b93\u81ea\u5df1\u7406\u89e3\u4e0d\u540c LLM Model \u8207\u914d\u5408\u786c\u9ad4\u7522\u751f\u7684\u5dee\u7570\u3002","og_url":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/","og_site_name":"\u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a M-Y-Oceane","article_published_time":"2026-04-18T07:24:44+00:00","article_modified_time":"2026-04-18T07:53:53+00:00","author":"\u6ab8\u6aac\u7238","twitter_card":"summary_large_image","twitter_misc":{"Written by":"\u6ab8\u6aac\u7238","Est. 
reading time":"1 minute"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"Article","@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#article","isPartOf":{"@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/"},"author":{"name":"\u6ab8\u6aac\u7238","@id":"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b"},"headline":"[AI] vLLM \u90e8\u7f72\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100","datePublished":"2026-04-18T07:24:44+00:00","dateModified":"2026-04-18T07:53:53+00:00","mainEntityOfPage":{"@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/"},"wordCount":39,"commentCount":0,"publisher":{"@id":"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b"},"keywords":["A100","Gemma-4"],"articleSection":["Big Data &amp; Machine Learning","GPU","IT Technology"],"inLanguage":"en-US","potentialAction":[{"@type":"CommentAction","name":"Comment","target":["https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#respond"]}]},{"@type":"WebPage","@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/","url":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/","name":"[AI] vLLM \u90e8\u7f72\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100 - \u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a M-Y-Oceane","isPartOf":{"@id":"https:\/\/myoceane.fr\/#website"},"datePublished":"2026-04-18T07:24:44+00:00","dateModified":"2026-04-18T07:53:53+00:00","breadcrumb":{"@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#breadcrumb"},"inLanguage":"en-US","potentialAction":[{"@type":"ReadAction","target":["https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/"]}]},{"@type":"BreadcrumbList","@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"Home","item":"https:\/\/myoceane.fr\/"},{"@type":"ListItem","position":2,"name":"[AI] vLLM \u90e8\u7f72\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 
A100"}]},{"@type":"WebSite","@id":"https:\/\/myoceane.fr\/#website","url":"https:\/\/myoceane.fr\/","name":"M-Y-Oceane \u60f3\u65b9\u6d89\u6cd5\u3002\u91cf\u74f6\u5916\u7684\u5929\u7a7a","description":"\u60f3\u65b9\u6d89\u6cd5, France, Taiwan, Health, Information Technology","publisher":{"@id":"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b"},"potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/myoceane.fr\/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"en-US"},{"@type":["Person","Organization"],"@id":"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b","name":"\u6ab8\u6aac\u7238","image":{"@type":"ImageObject","inLanguage":"en-US","@id":"https:\/\/myoceane.fr\/#\/schema\/person\/image\/","url":"https:\/\/secure.gravatar.com\/avatar\/6cc678684664f8ad45a8d56a6630b183?s=96&d=mm&r=g","contentUrl":"https:\/\/secure.gravatar.com\/avatar\/6cc678684664f8ad45a8d56a6630b183?s=96&d=mm&r=g","caption":"\u6ab8\u6aac\u7238"},"logo":{"@id":"https:\/\/myoceane.fr\/#\/schema\/person\/image\/"},"url":"https:\/\/myoceane.fr\/index.php\/author\/johnny5584767gmail-com\/"}]}},"amp_enabled":false,"_links":{"self":[{"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/posts\/10289","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/comments?post=10289"}],"version-history":[{"count":11,"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/posts\/10289\/revisions"}],"predecessor-version":[{"id":10303,"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/w
p\/v2\/posts\/10289\/revisions\/10303"}],"wp:attachment":[{"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/media?parent=10289"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/categories?post=10289"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/tags?post=10289"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}