{"id":10289,"date":"2026-04-18T09:24:44","date_gmt":"2026-04-18T07:24:44","guid":{"rendered":"https:\/\/myoceane.fr\/?p=10289"},"modified":"2026-04-19T07:11:21","modified_gmt":"2026-04-19T05:11:21","slug":"ai-vllm-a100-gemma-4","status":"publish","type":"post","link":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/","title":{"rendered":"[AI] vLLM \u90e8\u7f72\u5730\u7aef\u6a21\u578b\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100"},"content":{"rendered":"<div id=\"fb-root\"><\/div>\n\n<p style=\"text-align: justify;\">2025 \u5e74\u516b\u6708\u91cb\u51fa\u7684 GPT-OSS-20B \u662f\u666e\u904d\u5730\u7aef\u63a1\u7528\u7684\u6a21\u578b\uff0c\u4e00\u91cb\u51fa\u7684\u6642\u5019\u5c31\u6709 128K \u7684 max-model-len\uff0c2026 \u5e74\u56db\u6708 Google Gemma-4 \u4e0a\u4e0b\u6587\u5927\u5c0f\u66f4\u53ef\u4ee5\u5230\u5169\u500d\uff0c\u672c\u7bc7\u56e0\u70ba\u5de5\u4f5c\u9700\u8981\uff0c\u9700\u8981\u5617\u8a66\u4f7f\u7528 Gemma-4 \u6a21\u578b\uff0c\u56e0\u6b64\u9806\u4fbf\u8a18\u9304\u4e0d\u540c\u5730\u7aef (GPT-OSS-20B, Gemma-4) LLM \u6a21\u578b\u5728 Nvidia A100 \u4e0a\u914d\u5408\u4e0d\u540c\u53c3\u6578\u7684\u7cfb\u7d71\u8a0a\u606f\uff0c\u85c9\u6b64\u8b93\u81ea\u5df1\u7406\u89e3\u4e0d\u540c LLM Model \u8207\u914d\u5408 A100 \u7522\u751f\u7684\u5dee\u7570\uff0c\u63d0\u4f9b\u4e00\u500b\u4f4e\u6210\u672c\u7684\u89e3\u6c7a\u65b9\u6848\u3002<\/p>\n\n\n\n<h3><a href=\"https:\/\/huggingface.co\/openai\/gpt-oss-20b\">GPT-OSS-20B<\/a> \u6a21\u578b\u662f\u6211\u5011\u7684\u53c3\u8003\u4f9d\u64da\uff1a<\/h3>\n<p>\u90e8\u7f72\u6307\u4ee4\u53c3\u8003 vLLM <a href=\"https:\/\/docs.vllm.ai\/projects\/recipes\/en\/latest\/OpenAI\/GPT-OSS.html\">\u5b98\u65b9\u9023\u7d50<\/a>\uff1a<\/p>\n<pre class=\"lang:bash\">vllm serve openai\/gpt-oss-20b --port 10000 --max-model-len 131072  --gpu-memory-utilization 0.65 --trust-remote-code<\/pre>\n<p class=\"p1\">\u7cfb\u7d71\u8a0a\u606f\u900f\u9732\u5e7e\u500b\u91cd\u8981\u8a0a\u606f\uff1a<br \/>Model loading took <strong>13.72 GiB<\/strong> 
memory and <strong>62.336108 seconds<\/strong><br \/>Available KV cache memory: <strong>36.56 GiB<\/strong><br \/>quantization=mxfp4\uff0c\u9019\u500b\u9810\u8a2d\u503c\u662f\u70ba\u4ec0\u9ebc 20B \u7684\u6a21\u578b\u53ea\u9700\u8981 13.72 GiB \u7684\u5167\u5b58\u7684\u95dc\u9375\u3002<\/p>\n<pre class=\"lang:bash\">(APIServer pid=6539) INFO 04-18 20:22:57 [utils.py:299] \n(APIServer pid=6539) INFO 04-18 20:22:57 [utils.py:299]        \u2588     \u2588     \u2588\u2584   \u2584\u2588\n(APIServer pid=6539) INFO 04-18 20:22:57 [utils.py:299]  \u2584\u2584 \u2584\u2588 \u2588     \u2588     \u2588 \u2580\u2584\u2580 \u2588  version 0.19.0\n(APIServer pid=6539) INFO 04-18 20:22:57 [utils.py:299]   \u2588\u2584\u2588\u2580 \u2588     \u2588     \u2588     \u2588  model   openai\/gpt-oss-20b\n(APIServer pid=6539) INFO 04-18 20:22:57 [utils.py:299]    \u2580\u2580  \u2580\u2580\u2580\u2580\u2580 \u2580\u2580\u2580\u2580\u2580 \u2580     \u2580\n(APIServer pid=6539) INFO 04-18 20:22:57 [utils.py:299] \n(APIServer pid=6539) INFO 04-18 20:22:57 [utils.py:233] non-default args: {'model_tag': 'openai\/gpt-oss-20b', 'port': 10000, 'model': 'openai\/gpt-oss-20b', 'trust_remote_code': True, 'max_model_len': 131072, 'gpu_memory_utilization': 0.65}\nconfig.json: 1.81kB [00:00, 5.73MB\/s]\n(APIServer pid=6539) INFO 04-18 20:23:04 [model.py:549] Resolved architecture: GptOssForCausalLM\nmodel.safetensors.index.json: 36.4kB [00:00, 80.0MB\/s]\nParse safetensors files: 
100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3\/3 [00:00&lt;00:00, 12.56it\/s]\n(APIServer pid=6539) INFO 04-18 20:23:04 [model.py:1678] Using max model len 131072\n(APIServer pid=6539) INFO 04-18 20:23:04 [config.py:131] Overriding max cuda graph capture size to 1024 for performance.\n(APIServer pid=6539) INFO 04-18 20:23:04 [vllm.py:790] Asynchronous scheduling is enabled.\ntokenizer_config.json: 4.20kB [00:00, 13.5MB\/s]\ntokenizer.json: 
100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 27.9M\/27.9M [00:00&lt;00:00, 33.1MB\/s]\nspecial_tokens_map.json: 
100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 98.0\/98.0 [00:00&lt;00:00, 660kB\/s]\nchat_template.jinja: 16.7kB [00:00, 60.7MB\/s]\ngeneration_config.json: 
100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 177\/177 [00:00&lt;00:00, 952kB\/s]\n(EngineCore pid=6686) INFO 04-18 20:23:14 [core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='openai\/gpt-oss-20b', speculative_config=None, tokenizer='openai\/gpt-oss-20b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=mxfp4, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, 
disable_additional_properties=False, reasoning_parser='openai_gptoss', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=openai\/gpt-oss-20b, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': &lt;CompilationMode.VLLM_COMPILE: 3&gt;, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [2048], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': &lt;CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)&gt;, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 
240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 1024, 'dynamic_shapes_config': {'type': &lt;DynamicShapesType.BACKED: 'backed'&gt;, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}\n(EngineCore pid=6686) WARNING 04-18 20:23:14 [network_utils.py:36] The environment variable HOST_IP is deprecated and ignored, as it is often used by Docker and other software to interact with the container's network stack. 
Please use VLLM_HOST_IP instead to set the IP address for vLLM processes to communicate with each other.\n(EngineCore pid=6686) INFO 04-18 20:23:14 [parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp:\/\/10.240.1.138:45759 backend=nccl\n(EngineCore pid=6686) INFO 04-18 20:23:14 [parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N\/A\n(EngineCore pid=6686) INFO 04-18 20:23:15 [gpu_model_runner.py:4735] Starting to load model openai\/gpt-oss-20b...\n(EngineCore pid=6686) INFO 04-18 20:23:16 [cuda.py:334] Using TRITON_ATTN attention backend out of potential backends: ['TRITON_ATTN'].\n(EngineCore pid=6686) INFO 04-18 20:23:16 [mxfp4.py:352] Using 'MARLIN' Mxfp4 MoE backend.\n(EngineCore pid=6686) INFO 04-18 20:24:15 [weight_utils.py:581] Time spent downloading weights for openai\/gpt-oss-20b: 58.546043 seconds\nLoading safetensors checkpoint shards:   0% Completed | 0\/3 [00:00&lt;?, ?it\/s]\nLoading safetensors checkpoint shards:  33% Completed | 1\/3 [00:00&lt;00:01,  1.71it\/s]\nLoading safetensors checkpoint shards:  67% Completed | 2\/3 [00:01&lt;00:00,  1.50it\/s]\nLoading safetensors checkpoint shards: 100% Completed | 3\/3 [00:01&lt;00:00,  1.55it\/s]\nLoading safetensors checkpoint shards: 100% Completed | 3\/3 [00:01&lt;00:00,  1.56it\/s]\n(EngineCore pid=6686) \n(EngineCore pid=6686) INFO 04-18 20:24:17 [default_loader.py:384] Loading weights took 2.08 seconds\n(EngineCore pid=6686) INFO 04-18 20:24:17 [mxfp4.py:836] Using MoEPrepareAndFinalizeNoDPEPModular\n(EngineCore pid=6686) INFO 04-18 20:24:18 [gpu_model_runner.py:4820] Model loading took 13.72 GiB memory and 62.336108 seconds\n(EngineCore pid=6686) INFO 04-18 20:24:23 [backends.py:1051] Using cache directory: \/root\/.cache\/vllm\/torch_compile_cache\/53309fb7e3\/rank_0_0\/backbone for vLLM's torch.compile\n(EngineCore pid=6686) INFO 04-18 20:24:23 [backends.py:1111] Dynamo bytecode 
transform time: 4.82 s\n(EngineCore pid=6686) INFO 04-18 20:24:26 [backends.py:372] Cache the graph of compile range (1, 2048) for later use\n(EngineCore pid=6686) INFO 04-18 20:24:30 [backends.py:390] Compiling a graph for compile range (1, 2048) takes 6.34 s\n(EngineCore pid=6686) INFO 04-18 20:24:31 [decorators.py:640] saved AOT compiled function to \/root\/.cache\/vllm\/torch_compile_cache\/torch_aot_compile\/a7c56a287d6bc73f13d9858ad8e60b103c9a539350d9acfe4f98c0a15ebd82e3\/rank_0_0\/model\n(EngineCore pid=6686) INFO 04-18 20:24:31 [monitor.py:48] torch.compile took 12.46 s in total\n(EngineCore pid=6686) INFO 04-18 20:24:31 [monitor.py:76] Initial profiling\/warmup run took 0.23 s\n(EngineCore pid=6686) INFO 04-18 20:24:37 [kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=1024\n(EngineCore pid=6686) INFO 04-18 20:24:38 [gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=83 (largest=1024), FULL=35 (largest=256)\n(EngineCore pid=6686) INFO 04-18 20:24:40 [gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.42 GiB total\n(EngineCore pid=6686) INFO 04-18 20:24:40 [gpu_worker.py:436] Available KV cache memory: 36.56 GiB\n(EngineCore pid=6686) INFO 04-18 20:24:40 [gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. 
To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.6500 to 0.6553 to maintain the same effective KV cache size.\n(EngineCore pid=6686) INFO 04-18 20:24:40 [kv_cache_utils.py:1319] GPU KV cache size: 798,624 tokens\n(EngineCore pid=6686) INFO 04-18 20:24:40 [kv_cache_utils.py:1324] Maximum concurrency for 131,072 tokens per request: 11.99x\nCapturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 83\/83 [00:05&lt;00:00, 16.07it\/s]\nCapturing CUDA graphs (decode, FULL): 
100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 35\/35 [00:04&lt;00:00,  8.20it\/s]\n(EngineCore pid=6686) INFO 04-18 20:24:50 [gpu_model_runner.py:6046] Graph capturing finished in 10 secs, took 0.69 GiB\n(EngineCore pid=6686) INFO 04-18 20:24:50 [gpu_worker.py:597] CUDA graph pool memory: 0.69 GiB (actual), 0.42 GiB (estimated), difference: 0.26 GiB (38.4%).\n(EngineCore pid=6686) INFO 04-18 20:24:50 [core.py:283] init engine (profile, create kv cache, warmup model) took 32.50 seconds\n(EngineCore pid=6686) INFO 04-18 20:24:53 [vllm.py:790] Asynchronous scheduling is enabled.\n(APIServer pid=6539) INFO 04-18 20:24:53 [api_server.py:590] Supported tasks: ['generate']\n(APIServer pid=6539) WARNING 04-18 20:24:54 [serving.py:233] For gpt-oss, we ignore --enable-auto-tool-choice and always enable tool use.\n(APIServer pid=6539) INFO 04-18 20:24:58 [hf.py:314] Detected the chat template content format to be 'string'. 
You can set `--chat-template-content-format` to override this.\n(APIServer pid=6539) INFO 04-18 20:24:58 [api_server.py:594] Starting vLLM server on http:\/\/0.0.0.0:10000\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:37] Available routes are:\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/openapi.json, Methods: GET, HEAD\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/docs, Methods: GET, HEAD\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/docs\/oauth2-redirect, Methods: GET, HEAD\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/redoc, Methods: GET, HEAD\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/tokenize, Methods: POST\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/detokenize, Methods: POST\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/load, Methods: GET\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/version, Methods: GET\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/health, Methods: GET\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/metrics, Methods: GET\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/v1\/models, Methods: GET\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/ping, Methods: GET\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/ping, Methods: POST\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/invocations, Methods: POST\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/v1\/chat\/completions, Methods: POST\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/v1\/chat\/completions\/batch, Methods: POST\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/v1\/responses, Methods: POST\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/v1\/responses\/{response_id}, Methods: GET\n(APIServer 
pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/v1\/responses\/{response_id}\/cancel, Methods: POST\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/v1\/completions, Methods: POST\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/v1\/messages, Methods: POST\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/v1\/messages\/count_tokens, Methods: POST\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/inference\/v1\/generate, Methods: POST\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/scale_elastic_ep, Methods: POST\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/is_scaling_elastic_ep, Methods: POST\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/v1\/chat\/completions\/render, Methods: POST\n(APIServer pid=6539) INFO 04-18 20:24:58 [launcher.py:46] Route: \/v1\/completions\/render, Methods: POST\n(APIServer pid=6539) INFO:     Started server process [6539]\n(APIServer pid=6539) INFO:     Waiting for application startup.\n(APIServer pid=6539) INFO:     Application startup complete.\n(APIServer pid=6539) INFO 04-18 20:25:48 [loggers.py:259] Engine 000: Avg prompt throughput: 8.0 tokens\/s, Avg generation throughput: 111.7 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=6539) INFO:     127.0.0.1:39600 - \"POST \/v1\/responses HTTP\/1.1\" 200 OK\n(APIServer pid=6539) INFO 04-18 20:25:58 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 164.7 tokens\/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%\n(APIServer pid=6539) INFO 04-18 20:26:08 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 0.0 tokens\/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%<\/pre>\n<p>\u00a0<\/p>\n\n\n<h3><a 
href=\"https:\/\/huggingface.co\/google\/gemma-4-26B-A4B\/tree\/main\">Gemma-4-26B-A4B<\/a><\/h3>\n<p>\u90e8\u7f72\u6307\u4ee4\u53c3\u8003 <a href=\"https:\/\/docs.vllm.ai\/projects\/recipes\/en\/latest\/Google\/Gemma4.html\">vLLM \u5b98\u65b9\u7db2\u7ad9<\/a>\uff1a<\/p>\n<pre class=\"lang:bash\">vllm serve google\/gemma-4-26B-A4B \\\n  --port 10000 \\\n  --max-model-len 131072 \\\n  --enforce-eager \\\n  --enable-chunked-prefill \\\n  --enable-auto-tool-choice \\\n  --reasoning-parser gemma4 \\\n  --tool-call-parser gemma4 \\\n  --chat-template \/home\/vllm\/examples\/tool_chat_template_gemma4.jinja \\\n  --gpu-memory-utilization 0.9<\/pre>\n<p>\u90e8\u7f72\u7cfb\u7d71\u8a0a\u606f\uff1a<br \/>Model loading took <strong>48.5 GiB<\/strong> memory and <strong>10.090812 seconds<\/strong><br \/>Available KV cache memory: <strong>21.5 GiB<\/strong><\/p>\n<pre class=\"lang:bash\">(.venv) root@run-cvvmqstioi240l5-0:~# vllm serve google\/gemma-4-26B-A4B   --port 10000   --max-model-len 131072   --enforce-eager   --enable-chunked-prefill   --enable-auto-tool-choice   --reasoning-parser gemma4   --tool-call-parser gemma4   --chat-template \/home\/vllm\/examples\/tool_chat_template_gemma4.jinja   --gpu-memory-utilization 0.9\n(APIServer pid=10793) INFO 04-17 14:03:07 [utils.py:299] \n(APIServer pid=10793) INFO 04-17 14:03:07 [utils.py:299]        \u2588     \u2588     \u2588\u2584   \u2584\u2588\n(APIServer pid=10793) INFO 04-17 14:03:07 [utils.py:299]  \u2584\u2584 \u2584\u2588 \u2588     \u2588     \u2588 \u2580\u2584\u2580 \u2588  version 0.19.0\n(APIServer pid=10793) INFO 04-17 14:03:07 [utils.py:299]   \u2588\u2584\u2588\u2580 \u2588     \u2588     \u2588     \u2588  model   google\/gemma-4-26B-A4B\n(APIServer pid=10793) INFO 04-17 14:03:07 [utils.py:299]    \u2580\u2580  \u2580\u2580\u2580\u2580\u2580 \u2580\u2580\u2580\u2580\u2580 \u2580     \u2580\n(APIServer pid=10793) INFO 04-17 14:03:07 [utils.py:299] \n(APIServer pid=10793) INFO 04-17 14:03:07 [utils.py:233] non-default args: {'model_tag': 'google\/gemma-4-26B-A4B', 
'chat_template': '\/home\/vllm\/examples\/tool_chat_template_gemma4.jinja', 'enable_auto_tool_choice': True, 'tool_call_parser': 'gemma4', 'port': 10000, 'model': 'google\/gemma-4-26B-A4B', 'max_model_len': 131072, 'enforce_eager': True, 'reasoning_parser': 'gemma4', 'enable_chunked_prefill': True}\n(APIServer pid=10793) INFO 04-17 14:03:08 [model.py:549] Resolved architecture: Gemma4ForConditionalGeneration\n(APIServer pid=10793) INFO 04-17 14:03:08 [model.py:1678] Using max model len 131072\n(APIServer pid=10793) INFO 04-17 14:03:08 [config.py:104] Gemma4 model has heterogeneous head dimensions (head_dim=256, global_head_dim=512). Forcing TRITON_ATTN backend to prevent mixed-backend numerical divergence.\n(APIServer pid=10793) INFO 04-17 14:03:08 [vllm.py:790] Asynchronous scheduling is enabled.\n(APIServer pid=10793) WARNING 04-17 14:03:08 [vllm.py:848] Enforce eager set, disabling torch.compile and CUDAGraphs. This is equivalent to setting -cc.mode=none -cc.cudagraph_mode=none\n(APIServer pid=10793) WARNING 04-17 14:03:08 [vllm.py:859] Inductor compilation was disabled by user settings, optimizations settings that are only active during inductor compilation will be ignored.\n(APIServer pid=10793) INFO 04-17 14:03:08 [vllm.py:1025] Cudagraph is disabled under eager mode\n(APIServer pid=10793) INFO 04-17 14:03:08 [compilation.py:290] Enabled custom fusions: norm_quant, act_quant\n(EngineCore pid=10853) INFO 04-17 14:03:19 [core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google\/gemma-4-26B-A4B', speculative_config=None, tokenizer='google\/gemma-4-26B-A4B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, 
enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='gemma4', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google\/gemma-4-26B-A4B, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': &lt;CompilationMode.NONE: 0&gt;, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [2048], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': &lt;CUDAGraphMode.NONE: 0&gt;, 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': True, 'fuse_act_quant': True, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 0, 'dynamic_shapes_config': {'type': &lt;DynamicShapesType.BACKED: 'backed'&gt;, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': 
None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}\n(EngineCore pid=10853) WARNING 04-17 14:03:19 [network_utils.py:36] The environment variable HOST_IP is deprecated and ignored, as it is often used by Docker and other software to interact with the container's network stack. Please use VLLM_HOST_IP instead to set the IP address for vLLM processes to communicate with each other.\n(EngineCore pid=10853) INFO 04-17 14:03:22 [parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp:\/\/10.240.1.237:39671 backend=nccl\n(EngineCore pid=10853) INFO 04-17 14:03:22 [parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N\/A\n(EngineCore pid=10853) INFO 04-17 14:03:23 [gpu_model_runner.py:4735] Starting to load model google\/gemma-4-26B-A4B...\n(EngineCore pid=10853) INFO 04-17 14:03:23 [vllm.py:790] Asynchronous scheduling is enabled.\n(EngineCore pid=10853) WARNING 04-17 14:03:23 [vllm.py:848] Enforce eager set, disabling torch.compile and CUDAGraphs. 
This is equivalent to setting -cc.mode=none -cc.cudagraph_mode=none\n(EngineCore pid=10853) WARNING 04-17 14:03:23 [vllm.py:859] Inductor compilation was disabled by user settings, optimizations settings that are only active during inductor compilation will be ignored.\n(EngineCore pid=10853) INFO 04-17 14:03:23 [vllm.py:1025] Cudagraph is disabled under eager mode\n(EngineCore pid=10853) INFO 04-17 14:03:23 [compilation.py:290] Enabled custom fusions: norm_quant, act_quant\n(EngineCore pid=10853) INFO 04-17 14:03:23 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.\n(EngineCore pid=10853) INFO 04-17 14:03:23 [unquantized.py:186] Using TRITON backend for Unquantized MoE\n(EngineCore pid=10853) INFO 04-17 14:03:23 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.\nLoading safetensors checkpoint shards:   0% Completed | 0\/2 [00:00&lt;?, ?it\/s]\nLoading safetensors checkpoint shards:  50% Completed | 1\/2 [00:06&lt;00:06,  6.77s\/it]\nLoading safetensors checkpoint shards: 100% Completed | 2\/2 [00:08&lt;00:00,  3.93s\/it]\nLoading safetensors checkpoint shards: 100% Completed | 2\/2 [00:08&lt;00:00,  4.36s\/it]\n(EngineCore pid=10853) \n(EngineCore pid=10853) INFO 04-17 14:03:33 [default_loader.py:384] Loading weights took 8.77 seconds\n(EngineCore pid=10853) INFO 04-17 14:03:33 [gpu_model_runner.py:4820] Model loading took 48.5 GiB memory and 10.090812 seconds\n(EngineCore pid=10853) INFO 04-17 14:03:34 [gpu_model_runner.py:5753] Encoder cache will be initialized with a budget of 2496 tokens, and profiled with 1 video items of the maximum feature size.\n(EngineCore pid=10853) WARNING 04-17 14:03:50 [fused_moe.py:1090] Using default MoE config. Performance might be sub-optimal! 
Config file not found at \/home\/.venv\/lib\/python3.12\/site-packages\/vllm\/model_executor\/layers\/fused_moe\/configs\/E=128,N=704,device_name=NVIDIA_A100_80GB_PCIe.json\n(EngineCore pid=10853) INFO 04-17 14:03:51 [gpu_worker.py:436] Available KV cache memory: 21.5 GiB\n(EngineCore pid=10853) INFO 04-17 14:03:51 [kv_cache_utils.py:1319] GPU KV cache size: 93,936 tokens\n(EngineCore pid=10853) INFO 04-17 14:03:51 [kv_cache_utils.py:1324] Maximum concurrency for 131,072 tokens per request: 6.96x\n(EngineCore pid=10853) INFO 04-17 14:03:51 [core.py:283] init engine (profile, create kv cache, warmup model) took 17.49 seconds\n(EngineCore pid=10853) WARNING 04-17 14:03:51 [vllm.py:848] Enforce eager set, disabling torch.compile and CUDAGraphs. This is equivalent to setting -cc.mode=none -cc.cudagraph_mode=none\n(EngineCore pid=10853) WARNING 04-17 14:03:51 [vllm.py:859] Inductor compilation was disabled by user settings, optimizations settings that are only active during inductor compilation will be ignored.\n(EngineCore pid=10853) INFO 04-17 14:03:51 [vllm.py:1025] Cudagraph is disabled under eager mode\n(APIServer pid=10793) INFO 04-17 14:03:51 [api_server.py:590] Supported tasks: ['generate']\n(APIServer pid=10793) INFO 04-17 14:03:51 [parser_manager.py:202] \"auto\" tool choice has been enabled.\n(APIServer pid=10793) WARNING 04-17 14:03:51 [model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 1.0, 'top_k': 64, 'top_p': 0.95}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.\n(APIServer pid=10793) INFO 04-17 14:03:52 [hf.py:314] Detected the chat template content format to be 'openai'. 
You can set `--chat-template-content-format` to override this.\n(APIServer pid=10793) INFO 04-17 14:04:07 [base.py:231] Multi-modal warmup completed in 15.030s\n(APIServer pid=10793) INFO 04-17 14:04:07 [api_server.py:594] Starting vLLM server on http:\/\/0.0.0.0:10000\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:37] Available routes are:\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/openapi.json, Methods: GET, HEAD\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/docs, Methods: GET, HEAD\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/docs\/oauth2-redirect, Methods: GET, HEAD\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/redoc, Methods: GET, HEAD\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/tokenize, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/detokenize, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/load, Methods: GET\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/version, Methods: GET\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/health, Methods: GET\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/metrics, Methods: GET\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/models, Methods: GET\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/ping, Methods: GET\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/ping, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/invocations, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/chat\/completions, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/chat\/completions\/batch, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/responses, Methods: POST\n(APIServer 
pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/responses\/{response_id}, Methods: GET\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/responses\/{response_id}\/cancel, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/completions, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/messages, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/messages\/count_tokens, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/inference\/v1\/generate, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/scale_elastic_ep, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/is_scaling_elastic_ep, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/chat\/completions\/render, Methods: POST\n(APIServer pid=10793) INFO 04-17 14:04:07 [launcher.py:46] Route: \/v1\/completions\/render, Methods: POST\n(APIServer pid=10793) INFO:     Started server process [10793]\n(APIServer pid=10793) INFO:     Waiting for application startup.\n(APIServer pid=10793) INFO:     Application startup complete.\n(APIServer pid=10793) INFO 04-17 14:04:18 [loggers.py:259] Engine 000: Avg prompt throughput: 0.8 tokens\/s, Avg generation throughput: 1.9 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:04:28 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.6 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.3%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:04:38 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.4%, Prefix cache hit rate: 0.0%\n(APIServer 
pid=10793) INFO 04-17 14:04:48 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.4 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.6%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:04:58 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.8%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:05:08 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.2 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.0%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:05:18 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.4 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.0%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:05:28 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.0%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:05:38 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.3 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:05:48 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.6 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:05:58 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.7 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:06:08 [loggers.py:259] Engine 000: Avg prompt 
throughput: 0.0 tokens\/s, Avg generation throughput: 18.6 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:06:18 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:06:28 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.4 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:06:38 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.2%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:06:48 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.8 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.2%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:06:58 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.6 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.2%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:07:08 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.2%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:07:18 [loggers.py:259] Engine 000: Avg prompt throughput: 2.1 tokens\/s, Avg generation throughput: 17.3 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:07:28 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, 
Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.2%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:07:38 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.2 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.4%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:07:48 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.3 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.6%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:07:58 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.3 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.8%, Prefix cache hit rate: 0.0%\n(APIServer pid=10793) INFO 04-17 14:08:08 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.4 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.0%, Prefix cache hit rate: 0.0%\n(EngineCore pid=10853) INFO 04-17 14:08:10 [core.py:1210] Shutdown initiated (timeout=0)\n(EngineCore pid=10853) INFO 04-17 14:08:10 [core.py:1215] Aborting 1 requests\n(EngineCore pid=10853) INFO 04-17 14:08:10 [core.py:1233] Shutdown complete<\/pre>\n\n\n<p style=\"text-align: justify;\">\u53ef\u4ee5\u770b\u5230\u90e8\u7f72 Gemma-4-26B-A4B \u5df2\u7d93\u4f54\u64da\u4e86\u5927\u90e8\u5206\u7684\u8a18\u61b6\u9ad4\u8cc7\u6e90\uff0c\u975e\u5e38\u8017\u8cbb\u8cc7\u6e90\uff0c\u800c\u4e14\u7522\u751f tokens \u7684\u901f\u5ea6\u4e5f\u4e0d\u9ad8\uff0c\u6bcf\u79d2\u5927\u7d04 20 tokens\uff0c\u6b64\u6642\u53ef\u4ee5\u8003\u616e\u4f7f\u7528 quantization fp8\uff0c\u5982\u4e0b\u5716\u6240\u793a\uff0c\u53ef\u4ee5\u964d\u4f4e\u4e00\u534a\u7684\u8a18\u61b6\u9ad4\u4f7f\u7528\u91cf\uff0c\u4e26\u4e14\u5c07 --max-model-len \u8a2d\u5230 256K \u53ef\u4ee5\u5145\u5206\u767c\u63ee Gemma \u9577\u4e0a\u4e0b\u6587\u7684\u512a\u52e2\u3002<\/p>\n<pre class=\"lang:bash\">vllm 
serve google\/gemma-4-26B-A4B \\\n  --port 10000 \\\n  --max-model-len 256K \\\n  --quantization fp8 \\\n  --enable-auto-tool-choice \\\n  --reasoning-parser gemma4 \\\n  --tool-call-parser gemma4 \\\n  --chat-template \/root\/vllm\/examples\/tool_chat_template_gemma4.jinja \\\n  --gpu-memory-utilization 0.65<\/pre>\n<p>\u89c0\u5bdf\u7cfb\u7d71 Log\uff1a<br \/>Model loading took <strong>25.7 GiB<\/strong> memory and <strong>15.197657 seconds<\/strong><br \/>Available KV cache memory: <strong>24.45 GiB<\/strong><\/p>\n<pre class=\"lang:bash\">(APIServer pid=6048) INFO 04-18 20:14:28 [utils.py:299] \n(APIServer pid=6048) INFO 04-18 20:14:28 [utils.py:299]        \u2588     \u2588     \u2588\u2584   \u2584\u2588\n(APIServer pid=6048) INFO 04-18 20:14:28 [utils.py:299]  \u2584\u2584 \u2584\u2588 \u2588     \u2588     \u2588 \u2580\u2584\u2580 \u2588  version 0.19.0\n(APIServer pid=6048) INFO 04-18 20:14:28 [utils.py:299]   \u2588\u2584\u2588\u2580 \u2588     \u2588     \u2588     \u2588  model   google\/gemma-4-26B-A4B\n(APIServer pid=6048) INFO 04-18 20:14:28 [utils.py:299]    \u2580\u2580  \u2580\u2580\u2580\u2580\u2580 \u2580\u2580\u2580\u2580\u2580 \u2580     \u2580\n(APIServer pid=6048) INFO 04-18 20:14:28 [utils.py:299] \n(APIServer pid=6048) INFO 04-18 20:14:28 [utils.py:233] non-default args: {'model_tag': 'google\/gemma-4-26B-A4B', 'chat_template': '\/root\/vllm\/examples\/tool_chat_template_gemma4.jinja', 'enable_auto_tool_choice': True, 'tool_call_parser': 'gemma4', 'port': 10000, 'model': 'google\/gemma-4-26B-A4B', 'max_model_len': 262144, 'quantization': 'fp8', 'reasoning_parser': 'gemma4', 'gpu_memory_utilization': 0.65}\n(APIServer pid=6048) INFO 04-18 20:14:29 [model.py:549] Resolved architecture: Gemma4ForConditionalGeneration\n(APIServer pid=6048) INFO 04-18 20:14:29 [model.py:1678] Using max model len 262144\n(APIServer pid=6048) INFO 04-18 20:14:29 [config.py:104] Gemma4 model has heterogeneous head dimensions (head_dim=256, 
global_head_dim=512). Forcing TRITON_ATTN backend to prevent mixed-backend numerical divergence.\n(APIServer pid=6048) INFO 04-18 20:14:29 [vllm.py:790] Asynchronous scheduling is enabled.\n(EngineCore pid=6116) INFO 04-18 20:14:40 [core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google\/gemma-4-26B-A4B', speculative_config=None, tokenizer='google\/gemma-4-26B-A4B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=262144, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=fp8, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='gemma4', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google\/gemma-4-26B-A4B, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': &lt;CompilationMode.VLLM_COMPILE: 3&gt;, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 
'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [2048], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': &lt;CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)&gt;, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': &lt;DynamicShapesType.BACKED: 'backed'&gt;, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}\n(EngineCore pid=6116) WARNING 04-18 20:14:40 [network_utils.py:36] The environment variable HOST_IP is deprecated and ignored, as it is often used by Docker and other software to interact with the container's network stack. 
Please use VLLM_HOST_IP instead to set the IP address for vLLM processes to communicate with each other.\n(EngineCore pid=6116) INFO 04-18 20:14:43 [parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp:\/\/10.240.1.138:52415 backend=nccl\n(EngineCore pid=6116) INFO 04-18 20:14:43 [parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N\/A\n(EngineCore pid=6116) INFO 04-18 20:14:43 [gpu_model_runner.py:4735] Starting to load model google\/gemma-4-26B-A4B...\n(EngineCore pid=6116) INFO 04-18 20:14:44 [vllm.py:790] Asynchronous scheduling is enabled.\n(EngineCore pid=6116) INFO 04-18 20:14:44 [__init__.py:261] Selected MarlinFP8ScaledMMLinearKernel for Fp8OnlineLinearMethod\n(EngineCore pid=6116) INFO 04-18 20:14:44 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.\n(EngineCore pid=6116) INFO 04-18 20:14:44 [fp8.py:396] Using MARLIN Fp8 MoE backend out of potential backends: ['AITER', 'FLASHINFER_TRTLLM', 'FLASHINFER_CUTLASS', 'DEEPGEMM', 'TRITON', 'MARLIN', 'BATCHED_DEEPGEMM', 'BATCHED_TRITON', 'XPU'].\n(EngineCore pid=6116) INFO 04-18 20:14:44 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.\nLoading safetensors checkpoint shards:   0% Completed | 0\/2 [00:00&lt;?, ?it\/s]\n(EngineCore pid=6116) WARNING 04-18 20:14:46 [marlin_utils_fp8.py:216] Your GPU does not have native support for FP8 computation but FP8 quantization is being used. Weight-only FP8 compression will be used leveraging the Marlin kernel. 
This may degrade performance for compute-heavy workloads.\n(EngineCore pid=6116) INFO 04-18 20:14:46 [fp8.py:560] Using MoEPrepareAndFinalizeNoDPEPModular\nLoading safetensors checkpoint shards:  50% Completed | 1\/2 [00:08&lt;00:08,  8.47s\/it]\nLoading safetensors checkpoint shards: 100% Completed | 2\/2 [00:13&lt;00:00,  6.36s\/it]\nLoading safetensors checkpoint shards: 100% Completed | 2\/2 [00:13&lt;00:00,  6.68s\/it]\n(EngineCore pid=6116) \n(EngineCore pid=6116) INFO 04-18 20:14:59 [default_loader.py:384] Loading weights took 13.51 seconds\n(EngineCore pid=6116) INFO 04-18 20:14:59 [gpu_model_runner.py:4820] Model loading took 25.7 GiB memory and 15.197657 seconds\n(EngineCore pid=6116) INFO 04-18 20:15:00 [gpu_model_runner.py:5753] Encoder cache will be initialized with a budget of 2496 tokens, and profiled with 1 video items of the maximum feature size.\n(EngineCore pid=6116) INFO 04-18 20:15:22 [backends.py:1051] Using cache directory: \/root\/.cache\/vllm\/torch_compile_cache\/ed28a1082a\/rank_0_0\/backbone for vLLM's torch.compile\n(EngineCore pid=6116) INFO 04-18 20:15:22 [backends.py:1111] Dynamo bytecode transform time: 3.74 s\n(EngineCore pid=6116) INFO 04-18 20:15:24 [backends.py:285] Directly load the compiled graph(s) for compile range (1, 2048) from the cache, took 1.461 s\n(EngineCore pid=6116) INFO 04-18 20:15:24 [decorators.py:303] Directly load AOT compilation from path \/root\/.cache\/vllm\/torch_compile_cache\/torch_aot_compile\/cf7df09e7e9c2bd5fb3ccbf0eeff6a2bc7dbbc8d6ea39a535b47ecd6869e8076\/rank_0_0\/model\n(EngineCore pid=6116) INFO 04-18 20:15:24 [monitor.py:48] torch.compile took 5.71 s in total\n(EngineCore pid=6116) INFO 04-18 20:15:24 [monitor.py:76] Initial profiling\/warmup run took 0.67 s\n(EngineCore pid=6116) INFO 04-18 20:15:25 [kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512\n(EngineCore pid=6116) INFO 04-18 20:15:25 [gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 
(largest=512), FULL=35 (largest=256)\n(EngineCore pid=6116) INFO 04-18 20:15:26 [gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.11 GiB total\n(EngineCore pid=6116) INFO 04-18 20:15:27 [gpu_worker.py:436] Available KV cache memory: 24.45 GiB\n(EngineCore pid=6116) INFO 04-18 20:15:27 [gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.6500 to 0.6640 to maintain the same effective KV cache size.\n(EngineCore pid=6116) INFO 04-18 20:15:27 [kv_cache_utils.py:1319] GPU KV cache size: 106,800 tokens\n(EngineCore pid=6116) INFO 04-18 20:15:27 [kv_cache_utils.py:1324] Maximum concurrency for 262,144 tokens per request: 4.37x\nCapturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 51\/51 [00:04&lt;00:00, 10.39it\/s]\nCapturing 
CUDA graphs (decode, FULL): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 35\/35 [00:03&lt;00:00,  9.96it\/s]\n(EngineCore pid=6116) INFO 04-18 20:15:36 [gpu_model_runner.py:6046] Graph capturing finished in 9 secs, took 0.95 GiB\n(EngineCore pid=6116) INFO 04-18 20:15:36 [gpu_worker.py:597] CUDA graph pool memory: 0.95 GiB (actual), 1.11 GiB (estimated), difference: 0.16 GiB (16.6%).\n(EngineCore pid=6116) INFO 04-18 20:15:36 [core.py:283] init engine (profile, create kv cache, warmup model) took 36.33 seconds\n(APIServer pid=6048) INFO 04-18 20:15:36 [api_server.py:590] Supported tasks: ['generate']\n(APIServer pid=6048) INFO 04-18 20:15:36 [parser_manager.py:202] \"auto\" tool choice has been enabled.\n(APIServer pid=6048) WARNING 04-18 20:15:36 [model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 1.0, 'top_k': 64, 'top_p': 0.95}`. 
If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.\n(APIServer pid=6048) INFO 04-18 20:15:37 [hf.py:314] Detected the chat template content format to be 'openai'. You can set `--chat-template-content-format` to override this.\n(APIServer pid=6048) INFO 04-18 20:15:52 [base.py:231] Multi-modal warmup completed in 15.028s\n(APIServer pid=6048) INFO 04-18 20:15:52 [api_server.py:594] Starting vLLM server on http:\/\/0.0.0.0:10000\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:37] Available routes are:\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/openapi.json, Methods: GET, HEAD\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/docs, Methods: GET, HEAD\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/docs\/oauth2-redirect, Methods: GET, HEAD\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/redoc, Methods: GET, HEAD\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/tokenize, Methods: POST\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/detokenize, Methods: POST\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/load, Methods: GET\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/version, Methods: GET\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/health, Methods: GET\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/metrics, Methods: GET\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/v1\/models, Methods: GET\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/ping, Methods: GET\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/ping, Methods: POST\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/invocations, Methods: POST\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/v1\/chat\/completions, Methods: POST\n(APIServer pid=6048) INFO 04-18 20:15:52 
[launcher.py:46] Route: \/v1\/chat\/completions\/batch, Methods: POST\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/v1\/responses, Methods: POST\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/v1\/responses\/{response_id}, Methods: GET\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/v1\/responses\/{response_id}\/cancel, Methods: POST\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/v1\/completions, Methods: POST\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/v1\/messages, Methods: POST\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/v1\/messages\/count_tokens, Methods: POST\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/inference\/v1\/generate, Methods: POST\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/scale_elastic_ep, Methods: POST\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/is_scaling_elastic_ep, Methods: POST\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/v1\/chat\/completions\/render, Methods: POST\n(APIServer pid=6048) INFO 04-18 20:15:52 [launcher.py:46] Route: \/v1\/completions\/render, Methods: POST\n(APIServer pid=6048) INFO:     Started server process [6048]\n(APIServer pid=6048) INFO:     Waiting for application startup.\n(APIServer pid=6048) INFO:     Application startup complete.\n(APIServer pid=6048) INFO 04-18 20:17:42 [loggers.py:259] Engine 000: Avg prompt throughput: 2.5 tokens\/s, Avg generation throughput: 14.9 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.2%, Prefix cache hit rate: 0.0%\n(APIServer pid=6048) INFO:     127.0.0.1:50130 - \"POST \/v1\/chat\/completions HTTP\/1.1\" 200 OK\n(APIServer pid=6048) INFO 04-18 20:17:52 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 80.6 tokens\/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 
0.0%\n(APIServer pid=6048) INFO 04-18 20:18:02 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 0.0 tokens\/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%\n^C(EngineCore pid=6116) INFO 04-18 20:19:20 [core.py:1210] Shutdown initiated (timeout=0)\n(EngineCore pid=6116) INFO 04-18 20:19:20 [core.py:1233] Shutdown complete\n(APIServer pid=6048) INFO:     Shutting down<\/pre>\n<p>\u5099\u8a3b\uff1atokens \u7522\u751f\u901f\u7387\u6709\u6240\u63d0\u5347\u3002<\/p>\n\n\n<h3><a href=\"https:\/\/huggingface.co\/google\/gemma-4-31B\">Gemma-4-31B<\/a><\/h3>\n<p>\u90e8\u7f72\u6307\u4ee4\uff1a<\/p>\n<pre class=\"lang:bash\">vllm serve google\/gemma-4-31b-it \\\n  --port 10000 \\\n  --max-model-len 131072 \\\n  --enable-auto-tool-choice \\\n  --reasoning-parser gemma4 \\\n  --tool-call-parser gemma4 \\\n  --chat-template \/home\/vllm\/examples\/tool_chat_template_gemma4.jinja \\\n  --gpu-memory-utilization 0.95<\/pre>\n<p>\u4ee5\u4e0b\u7cfb\u7d71\u8a0a\u606f\u986f\u793a\uff1a<br \/>Model loading took <strong>58.9 GiB<\/strong> memory and <strong>265.606047 seconds<\/strong><br \/>Available KV cache memory: <strong>15.18 GiB<\/strong><\/p>\n<pre class=\"lang:bash\">(APIServer pid=12007) INFO 04-17 14:22:59 [utils.py:299] \n(APIServer pid=12007) INFO 04-17 14:22:59 [utils.py:299]        \u2588     \u2588     \u2588\u2584   \u2584\u2588\n(APIServer pid=12007) INFO 04-17 14:22:59 [utils.py:299]  \u2584\u2584 \u2584\u2588 \u2588     \u2588     \u2588 \u2580\u2584\u2580 \u2588  version 0.19.0\n(APIServer pid=12007) INFO 04-17 14:22:59 [utils.py:299]   \u2588\u2584\u2588\u2580 \u2588     \u2588     \u2588     \u2588  model   google\/gemma-4-31B-it\n(APIServer pid=12007) INFO 04-17 14:22:59 [utils.py:299]    \u2580\u2580  \u2580\u2580\u2580\u2580\u2580 \u2580\u2580\u2580\u2580\u2580 \u2580     \u2580\n(APIServer pid=12007) INFO 04-17 14:22:59 [utils.py:299] \n(APIServer pid=12007) INFO 04-17 
14:22:59 [utils.py:233] non-default args: {'model_tag': 'google\/gemma-4-31B-it', 'chat_template': '\/home\/vllm\/examples\/tool_chat_template_gemma4.jinja', 'enable_auto_tool_choice': True, 'tool_call_parser': 'gemma4', 'port': 10000, 'model': 'google\/gemma-4-31B-it', 'max_model_len': 131072, 'reasoning_parser': 'gemma4', 'gpu_memory_utilization': 0.95, 'enable_chunked_prefill': True}\n(APIServer pid=12007) INFO 04-17 14:23:00 [model.py:549] Resolved architecture: Gemma4ForConditionalGeneration\n(APIServer pid=12007) INFO 04-17 14:23:00 [model.py:1678] Using max model len 131072\n(APIServer pid=12007) INFO 04-17 14:23:00 [config.py:104] Gemma4 model has heterogeneous head dimensions (head_dim=256, global_head_dim=512). Forcing TRITON_ATTN backend to prevent mixed-backend numerical divergence.\n(APIServer pid=12007) INFO 04-17 14:23:00 [vllm.py:790] Asynchronous scheduling is enabled.\ntokenizer_config.json: 2.10kB [00:00, 13.9MB\/s]\ntokenizer.json: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 32.2M\/32.2M [00:00&lt;00:00, 38.8MB\/s]\nchat_template.jinja: 16.4kB [00:00, 
60.6MB\/s]\ngeneration_config.json: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 208\/208 [00:00&lt;00:00, 1.26MB\/s]\n(EngineCore pid=12103) INFO 04-17 14:23:12 [core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google\/gemma-4-31B-it', speculative_config=None, tokenizer='google\/gemma-4-31B-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='gemma4', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, 
collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google\/gemma-4-31B-it, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': &lt;CompilationMode.VLLM_COMPILE: 3&gt;, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [2048], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': &lt;CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)&gt;, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': 
{'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': &lt;DynamicShapesType.BACKED: 'backed'&gt;, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}\n(EngineCore pid=12103) WARNING 04-17 14:23:12 [network_utils.py:36] The environment variable HOST_IP is deprecated and ignored, as it is often used by Docker and other software to interact with the container's network stack. Please use VLLM_HOST_IP instead to set the IP address for vLLM processes to communicate with each other.\n(EngineCore pid=12103) INFO 04-17 14:23:15 [parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp:\/\/10.240.1.237:44807 backend=nccl\n(EngineCore pid=12103) INFO 04-17 14:23:15 [parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N\/A, EPLB rank N\/A\n(EngineCore pid=12103) INFO 04-17 14:23:16 [gpu_model_runner.py:4735] Starting to load model google\/gemma-4-31B-it...\n(EngineCore pid=12103) INFO 04-17 14:23:16 [vllm.py:790] Asynchronous scheduling is enabled.\n(EngineCore pid=12103) INFO 04-17 14:23:16 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.\n(EngineCore pid=12103) INFO 04-17 14:23:16 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.\nmodel.safetensors.index.json: 120kB [00:00, 172MB\/s]\n(EngineCore pid=12103) INFO 04-17 14:27:31 [weight_utils.py:581] Time spent downloading weights for google\/gemma-4-31B-it: 254.377387 seconds\nLoading safetensors checkpoint shards:   0% Completed | 0\/2 [00:00&lt;?, ?it\/s]\nLoading safetensors checkpoint shards:  50% Completed | 1\/2 [00:06&lt;00:06,  6.03s\/it]\nLoading safetensors checkpoint shards: 100% Completed | 2\/2 [00:09&lt;00:00,  4.42s\/it]\nLoading safetensors 
checkpoint shards: 100% Completed | 2\/2 [00:09&lt;00:00,  4.66s\/it]\n(EngineCore pid=12103) \n(EngineCore pid=12103) INFO 04-17 14:27:41 [default_loader.py:384] Loading weights took 9.81 seconds\n(EngineCore pid=12103) INFO 04-17 14:27:42 [gpu_model_runner.py:4820] Model loading took 58.9 GiB memory and 265.606047 seconds\n(EngineCore pid=12103) INFO 04-17 14:27:42 [gpu_model_runner.py:5753] Encoder cache will be initialized with a budget of 2496 tokens, and profiled with 1 video items of the maximum feature size.\n(EngineCore pid=12103) INFO 04-17 14:28:12 [backends.py:1051] Using cache directory: \/root\/.cache\/vllm\/torch_compile_cache\/7adb633e8c\/rank_0_0\/backbone for vLLM's torch.compile\n(EngineCore pid=12103) INFO 04-17 14:28:12 [backends.py:1111] Dynamo bytecode transform time: 13.96 s\n(EngineCore pid=12103) INFO 04-17 14:28:20 [backends.py:372] Cache the graph of compile range (1, 2048) for later use\n(EngineCore pid=12103) INFO 04-17 14:28:37 [backends.py:390] Compiling a graph for compile range (1, 2048) takes 24.11 s\n(EngineCore pid=12103) INFO 04-17 14:28:41 [decorators.py:640] saved AOT compiled function to \/root\/.cache\/vllm\/torch_compile_cache\/torch_aot_compile\/d173ae923688602925c8eea81edef979ba093c96f1721ef4b4e7b1eff5b17f9b\/rank_0_0\/model\n(EngineCore pid=12103) INFO 04-17 14:28:41 [monitor.py:48] torch.compile took 43.47 s in total\n(EngineCore pid=12103) INFO 04-17 14:28:43 [monitor.py:76] Initial profiling\/warmup run took 1.02 s\n(EngineCore pid=12103) INFO 04-17 14:28:43 [kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512\n(EngineCore pid=12103) INFO 04-17 14:28:43 [gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=35 (largest=256)\n(EngineCore pid=12103) INFO 04-17 14:28:47 [gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.86 GiB total\n(EngineCore pid=12103) INFO 04-17 14:28:48 [gpu_worker.py:436] Available KV cache memory: 15.18 GiB\n(EngineCore 
pid=12103) INFO 04-17 14:28:48 [gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9608 to maintain the same effective KV cache size.\n(EngineCore pid=12103) INFO 04-17 14:28:48 [kv_cache_utils.py:1319] GPU KV cache size: 16,576 tokens\n(EngineCore pid=12103) INFO 04-17 14:28:48 [kv_cache_utils.py:1324] Maximum concurrency for 131,072 tokens per request: 1.23x\nCapturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 51\/51 [00:06&lt;00:00,  7.97it\/s]\nCapturing CUDA graphs (decode, FULL): 
100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 35\/35 [00:06&lt;00:00,  5.79it\/s]\n(EngineCore pid=12103) INFO 04-17 14:29:01 [gpu_model_runner.py:6046] Graph capturing finished in 13 secs, took 0.84 GiB\n(EngineCore pid=12103) INFO 04-17 14:29:01 [gpu_worker.py:597] CUDA graph pool memory: 0.84 GiB (actual), 0.86 GiB (estimated), difference: 0.02 GiB (2.6%).\n(EngineCore pid=12103) INFO 04-17 14:29:01 [core.py:283] init engine (profile, create kv cache, warmup model) took 79.33 seconds\n(APIServer pid=12007) INFO 04-17 14:29:02 [api_server.py:590] Supported tasks: ['generate']\n(APIServer pid=12007) INFO 04-17 14:29:02 [parser_manager.py:202] \"auto\" tool choice has been enabled.\n(APIServer pid=12007) WARNING 04-17 14:29:02 [model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 1.0, 'top_k': 64, 'top_p': 0.95}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.\n(APIServer pid=12007) INFO 04-17 14:29:02 [hf.py:314] Detected the chat template content format to be 'openai'. 
You can set `--chat-template-content-format` to override this.\n(APIServer pid=12007) INFO 04-17 14:29:17 [base.py:231] Multi-modal warmup completed in 15.017s\n(APIServer pid=12007) INFO 04-17 14:29:18 [api_server.py:594] Starting vLLM server on http:\/\/0.0.0.0:10000\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:37] Available routes are:\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/openapi.json, Methods: GET, HEAD\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/docs, Methods: GET, HEAD\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/docs\/oauth2-redirect, Methods: GET, HEAD\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/redoc, Methods: GET, HEAD\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/tokenize, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/detokenize, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/load, Methods: GET\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/version, Methods: GET\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/health, Methods: GET\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/metrics, Methods: GET\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/models, Methods: GET\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/ping, Methods: GET\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/ping, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/invocations, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/chat\/completions, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/chat\/completions\/batch, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/responses, Methods: POST\n(APIServer 
pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/responses\/{response_id}, Methods: GET\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/responses\/{response_id}\/cancel, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/completions, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/messages, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/messages\/count_tokens, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/inference\/v1\/generate, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/scale_elastic_ep, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/is_scaling_elastic_ep, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/chat\/completions\/render, Methods: POST\n(APIServer pid=12007) INFO 04-17 14:29:18 [launcher.py:46] Route: \/v1\/completions\/render, Methods: POST\n(APIServer pid=12007) INFO:     Started server process [12007]\n(APIServer pid=12007) INFO:     Waiting for application startup.\n(APIServer pid=12007) INFO:     Application startup complete.\n(APIServer pid=12007) INFO 04-17 14:29:38 [loggers.py:259] Engine 000: Avg prompt throughput: 2.1 tokens\/s, Avg generation throughput: 9.6 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.7%, Prefix cache hit rate: 0.0%\n(APIServer pid=12007) INFO 04-17 14:29:48 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 23.2 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.9%, Prefix cache hit rate: 0.0%\n(APIServer pid=12007) INFO 04-17 14:29:58 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 22.9 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 3.3%, Prefix cache hit rate: 0.0%\n(APIServer 
pid=12007) INFO 04-17 14:30:08 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 22.9 tokens\/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 4.5%, Prefix cache hit rate: 0.0%\n(APIServer pid=12007) INFO:     127.0.0.1:42834 - \"POST \/v1\/chat\/completions HTTP\/1.1\" 200 OK\n(APIServer pid=12007) INFO 04-17 14:30:18 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 18.5 tokens\/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%\n(APIServer pid=12007) INFO 04-17 14:30:28 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens\/s, Avg generation throughput: 0.0 tokens\/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%<\/pre>\n\n\n<p><\/p>\n","protected":false},"excerpt":{"rendered":"<p>2025 \u5e74\u516b\u6708\u91cb\u51fa\u7684 GPT-OSS-20B \u662f\u666e\u904d\u5730\u7aef\u63a1\u7528\u7684\u6a21\u578b\uff0c\u4e00\u91cb\u51fa\u7684\u6642\u5019\u5c31\u6709 128K \u7684 max-model-len\uff0c2026 \u5e74\u56db\u6708 Google Gemma-4 \u4e0a\u4e0b\u6587\u5927\u5c0f\u66f4\u53ef\u4ee5\u5230\u5169\u500d\uff0c\u672c\u7bc7\u56e0\u70ba\u5de5\u4f5c\u9700\u8981\uff0c\u9700\u8981\u5617\u8a66\u4f7f\u7528 Gemma-4 \u6a21\u578b\uff0c\u56e0\u6b64\u9806\u4fbf\u8a18\u9304\u4e0d\u540c\u5730\u7aef (GPT-OSS-20B, Gemma-4) LLM \u6a21\u578b\u5728 Nvidia A100 \u4e0a\u914d\u5408\u4e0d\u540c\u53c3\u6578\u7684\u7cfb\u7d71\u8a0a\u606f\uff0c\u85c9\u6b64\u8b93\u81ea\u5df1\u7406\u89e3\u4e0d\u540c LLM Model \u8207\u914d\u5408 A100 
\u7522\u751f\u7684\u5dee\u7570\uff0c\u63d0\u4f9b\u4e00\u500b\u4f4e\u6210\u672c\u7684\u89e3\u6c7a\u65b9\u6848\u3002<\/p>\n","protected":false},"author":1,"featured_media":10329,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[9,1761,14],"tags":[2038,2039,2040,1980],"class_list":["post-10289","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-bigdata-ml","category-gpu","category-it-technology","tag-a100","tag-gemma-4","tag-gpt-oss-20b","tag-vllm"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v24.6 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>[AI] vLLM \u90e8\u7f72\u5730\u7aef\u6a21\u578b\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100 - \u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a M-Y-Oceane<\/title>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/\" \/>\n<meta property=\"og:locale\" content=\"en_US\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"[AI] vLLM \u90e8\u7f72\u5730\u7aef\u6a21\u578b\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100 - \u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a M-Y-Oceane\" \/>\n<meta property=\"og:description\" content=\"2025 \u5e74\u516b\u6708\u91cb\u51fa\u7684 GPT-OSS-20B \u662f\u666e\u904d\u5730\u7aef\u63a1\u7528\u7684\u6a21\u578b\uff0c\u4e00\u91cb\u51fa\u7684\u6642\u5019\u5c31\u6709 128K \u7684 max-model-len\uff0c2026 \u5e74\u56db\u6708 Google Gemma-4 \u4e0a\u4e0b\u6587\u5927\u5c0f\u66f4\u53ef\u4ee5\u5230\u5169\u500d\uff0c\u672c\u7bc7\u56e0\u70ba\u5de5\u4f5c\u9700\u8981\uff0c\u9700\u8981\u5617\u8a66\u4f7f\u7528 Gemma-4 \u6a21\u578b\uff0c\u56e0\u6b64\u9806\u4fbf\u8a18\u9304\u4e0d\u540c\u5730\u7aef (GPT-OSS-20B, Gemma-4) LLM \u6a21\u578b\u5728 Nvidia 
A100 \u4e0a\u914d\u5408\u4e0d\u540c\u53c3\u6578\u7684\u7cfb\u7d71\u8a0a\u606f\uff0c\u85c9\u6b64\u8b93\u81ea\u5df1\u7406\u89e3\u4e0d\u540c LLM Model \u8207\u914d\u5408 A100 \u7522\u751f\u7684\u5dee\u7570\uff0c\u63d0\u4f9b\u4e00\u500b\u4f4e\u6210\u672c\u7684\u89e3\u6c7a\u65b9\u6848\u3002\" \/>\n<meta property=\"og:url\" content=\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/\" \/>\n<meta property=\"og:site_name\" content=\"\u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a M-Y-Oceane\" \/>\n<meta property=\"article:published_time\" content=\"2026-04-18T07:24:44+00:00\" \/>\n<meta property=\"article:modified_time\" content=\"2026-04-19T05:11:21+00:00\" \/>\n<meta property=\"og:image\" content=\"https:\/\/myoceane.fr\/wp-content\/uploads\/2026\/04\/vllm.png\" \/>\n\t<meta property=\"og:image:width\" content=\"1200\" \/>\n\t<meta property=\"og:image:height\" content=\"900\" \/>\n\t<meta property=\"og:image:type\" content=\"image\/png\" \/>\n<meta name=\"author\" content=\"\u6ab8\u6aac\u7238\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"Written by\" \/>\n\t<meta name=\"twitter:data1\" content=\"\u6ab8\u6aac\u7238\" \/>\n\t<meta name=\"twitter:label2\" content=\"Est. 
reading time\" \/>\n\t<meta name=\"twitter:data2\" content=\"1 minute\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"Article\",\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#article\",\"isPartOf\":{\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/\"},\"author\":{\"name\":\"\u6ab8\u6aac\u7238\",\"@id\":\"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b\"},\"headline\":\"[AI] vLLM \u90e8\u7f72\u5730\u7aef\u6a21\u578b\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100\",\"datePublished\":\"2026-04-18T07:24:44+00:00\",\"dateModified\":\"2026-04-19T05:11:21+00:00\",\"mainEntityOfPage\":{\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/\"},\"wordCount\":95,\"commentCount\":0,\"publisher\":{\"@id\":\"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b\"},\"image\":{\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#primaryimage\"},\"thumbnailUrl\":\"https:\/\/myoceane.fr\/wp-content\/uploads\/2026\/04\/vllm.png\",\"keywords\":[\"A100\",\"Gemma-4\",\"GPT-OSS-20B\",\"vLLM\"],\"articleSection\":[\"Big Data &amp; Machine Learning\",\"GPU\",\"IT Technology\"],\"inLanguage\":\"en-US\",\"potentialAction\":[{\"@type\":\"CommentAction\",\"name\":\"Comment\",\"target\":[\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#respond\"]}]},{\"@type\":\"WebPage\",\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/\",\"url\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/\",\"name\":\"[AI] vLLM \u90e8\u7f72\u5730\u7aef\u6a21\u578b\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100 - \u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a 
M-Y-Oceane\",\"isPartOf\":{\"@id\":\"https:\/\/myoceane.fr\/#website\"},\"primaryImageOfPage\":{\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#primaryimage\"},\"image\":{\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#primaryimage\"},\"thumbnailUrl\":\"https:\/\/myoceane.fr\/wp-content\/uploads\/2026\/04\/vllm.png\",\"datePublished\":\"2026-04-18T07:24:44+00:00\",\"dateModified\":\"2026-04-19T05:11:21+00:00\",\"breadcrumb\":{\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#breadcrumb\"},\"inLanguage\":\"en-US\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/\"]}]},{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#primaryimage\",\"url\":\"https:\/\/myoceane.fr\/wp-content\/uploads\/2026\/04\/vllm.png\",\"contentUrl\":\"https:\/\/myoceane.fr\/wp-content\/uploads\/2026\/04\/vllm.png\",\"width\":1200,\"height\":900},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"Home\",\"item\":\"https:\/\/myoceane.fr\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"[AI] vLLM \u90e8\u7f72\u5730\u7aef\u6a21\u578b\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/myoceane.fr\/#website\",\"url\":\"https:\/\/myoceane.fr\/\",\"name\":\"M-Y-Oceane \u60f3\u65b9\u6d89\u6cd5\u3002\u91cf\u74f6\u5916\u7684\u5929\u7a7a\",\"description\":\"\u60f3\u65b9\u6d89\u6cd5, France, Taiwan, Health, Information 
Technology\",\"publisher\":{\"@id\":\"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b\"},\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/myoceane.fr\/?s={search_term_string}\"},\"query-input\":{\"@type\":\"PropertyValueSpecification\",\"valueRequired\":true,\"valueName\":\"search_term_string\"}}],\"inLanguage\":\"en-US\"},{\"@type\":[\"Person\",\"Organization\"],\"@id\":\"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b\",\"name\":\"\u6ab8\u6aac\u7238\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"https:\/\/myoceane.fr\/#\/schema\/person\/image\/\",\"url\":\"https:\/\/secure.gravatar.com\/avatar\/6cc678684664f8ad45a8d56a6630b183?s=96&d=mm&r=g\",\"contentUrl\":\"https:\/\/secure.gravatar.com\/avatar\/6cc678684664f8ad45a8d56a6630b183?s=96&d=mm&r=g\",\"caption\":\"\u6ab8\u6aac\u7238\"},\"logo\":{\"@id\":\"https:\/\/myoceane.fr\/#\/schema\/person\/image\/\"},\"url\":\"https:\/\/myoceane.fr\/index.php\/author\/johnny5584767gmail-com\/\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. 
-->","yoast_head_json":{"title":"[AI] vLLM \u90e8\u7f72\u5730\u7aef\u6a21\u578b\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100 - \u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a M-Y-Oceane","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/","og_locale":"en_US","og_type":"article","og_title":"[AI] vLLM \u90e8\u7f72\u5730\u7aef\u6a21\u578b\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100 - \u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a M-Y-Oceane","og_description":"2025 \u5e74\u516b\u6708\u91cb\u51fa\u7684 GPT-OSS-20B \u662f\u666e\u904d\u5730\u7aef\u63a1\u7528\u7684\u6a21\u578b\uff0c\u4e00\u91cb\u51fa\u7684\u6642\u5019\u5c31\u6709 128K \u7684 max-model-len\uff0c2026 \u5e74\u56db\u6708 Google Gemma-4 \u4e0a\u4e0b\u6587\u5927\u5c0f\u66f4\u53ef\u4ee5\u5230\u5169\u500d\uff0c\u672c\u7bc7\u56e0\u70ba\u5de5\u4f5c\u9700\u8981\uff0c\u9700\u8981\u5617\u8a66\u4f7f\u7528 Gemma-4 \u6a21\u578b\uff0c\u56e0\u6b64\u9806\u4fbf\u8a18\u9304\u4e0d\u540c\u5730\u7aef (GPT-OSS-20B, Gemma-4) LLM \u6a21\u578b\u5728 Nvidia A100 \u4e0a\u914d\u5408\u4e0d\u540c\u53c3\u6578\u7684\u7cfb\u7d71\u8a0a\u606f\uff0c\u85c9\u6b64\u8b93\u81ea\u5df1\u7406\u89e3\u4e0d\u540c LLM Model \u8207\u914d\u5408 A100 \u7522\u751f\u7684\u5dee\u7570\uff0c\u63d0\u4f9b\u4e00\u500b\u4f4e\u6210\u672c\u7684\u89e3\u6c7a\u65b9\u6848\u3002","og_url":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/","og_site_name":"\u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a M-Y-Oceane","article_published_time":"2026-04-18T07:24:44+00:00","article_modified_time":"2026-04-19T05:11:21+00:00","og_image":[{"width":1200,"height":900,"url":"https:\/\/myoceane.fr\/wp-content\/uploads\/2026\/04\/vllm.png","type":"image\/png"}],"author":"\u6ab8\u6aac\u7238","twitter_card":"summary_large_image","twitter_misc":{"Written 
by":"\u6ab8\u6aac\u7238","Est. reading time":"1 minute"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"Article","@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#article","isPartOf":{"@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/"},"author":{"name":"\u6ab8\u6aac\u7238","@id":"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b"},"headline":"[AI] vLLM \u90e8\u7f72\u5730\u7aef\u6a21\u578b\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100","datePublished":"2026-04-18T07:24:44+00:00","dateModified":"2026-04-19T05:11:21+00:00","mainEntityOfPage":{"@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/"},"wordCount":95,"commentCount":0,"publisher":{"@id":"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b"},"image":{"@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#primaryimage"},"thumbnailUrl":"https:\/\/myoceane.fr\/wp-content\/uploads\/2026\/04\/vllm.png","keywords":["A100","Gemma-4","GPT-OSS-20B","vLLM"],"articleSection":["Big Data &amp; Machine Learning","GPU","IT Technology"],"inLanguage":"en-US","potentialAction":[{"@type":"CommentAction","name":"Comment","target":["https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#respond"]}]},{"@type":"WebPage","@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/","url":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/","name":"[AI] vLLM \u90e8\u7f72\u5730\u7aef\u6a21\u578b\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100 - \u60f3\u65b9\u6d89\u6cd5 - \u91cf\u74f6\u5916\u7684\u5929\u7a7a 
M-Y-Oceane","isPartOf":{"@id":"https:\/\/myoceane.fr\/#website"},"primaryImageOfPage":{"@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#primaryimage"},"image":{"@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#primaryimage"},"thumbnailUrl":"https:\/\/myoceane.fr\/wp-content\/uploads\/2026\/04\/vllm.png","datePublished":"2026-04-18T07:24:44+00:00","dateModified":"2026-04-19T05:11:21+00:00","breadcrumb":{"@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#breadcrumb"},"inLanguage":"en-US","potentialAction":[{"@type":"ReadAction","target":["https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/"]}]},{"@type":"ImageObject","inLanguage":"en-US","@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#primaryimage","url":"https:\/\/myoceane.fr\/wp-content\/uploads\/2026\/04\/vllm.png","contentUrl":"https:\/\/myoceane.fr\/wp-content\/uploads\/2026\/04\/vllm.png","width":1200,"height":900},{"@type":"BreadcrumbList","@id":"https:\/\/myoceane.fr\/index.php\/ai-vllm-a100-gemma-4\/#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"Home","item":"https:\/\/myoceane.fr\/"},{"@type":"ListItem","position":2,"name":"[AI] vLLM \u90e8\u7f72\u5730\u7aef\u6a21\u578b\u7cfb\u7d71\u8a0a\u606f\u8a18\u9304 A100"}]},{"@type":"WebSite","@id":"https:\/\/myoceane.fr\/#website","url":"https:\/\/myoceane.fr\/","name":"M-Y-Oceane \u60f3\u65b9\u6d89\u6cd5\u3002\u91cf\u74f6\u5916\u7684\u5929\u7a7a","description":"\u60f3\u65b9\u6d89\u6cd5, France, Taiwan, Health, Information 
Technology","publisher":{"@id":"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b"},"potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/myoceane.fr\/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"en-US"},{"@type":["Person","Organization"],"@id":"https:\/\/myoceane.fr\/#\/schema\/person\/4a4552fb8c27693083d465e12db7658b","name":"\u6ab8\u6aac\u7238","image":{"@type":"ImageObject","inLanguage":"en-US","@id":"https:\/\/myoceane.fr\/#\/schema\/person\/image\/","url":"https:\/\/secure.gravatar.com\/avatar\/6cc678684664f8ad45a8d56a6630b183?s=96&d=mm&r=g","contentUrl":"https:\/\/secure.gravatar.com\/avatar\/6cc678684664f8ad45a8d56a6630b183?s=96&d=mm&r=g","caption":"\u6ab8\u6aac\u7238"},"logo":{"@id":"https:\/\/myoceane.fr\/#\/schema\/person\/image\/"},"url":"https:\/\/myoceane.fr\/index.php\/author\/johnny5584767gmail-com\/"}]}},"amp_enabled":false,"_links":{"self":[{"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/posts\/10289","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/comments?post=10289"}],"version-history":[{"count":32,"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/posts\/10289\/revisions"}],"predecessor-version":[{"id":10334,"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/posts\/10289\/revisions\/10334"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/media\/10329"}],"wp:attachment":[{"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/media?parent=10289"}],"wp:term":
[{"taxonomy":"category","embeddable":true,"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/categories?post=10289"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/myoceane.fr\/index.php\/wp-json\/wp\/v2\/tags?post=10289"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}