qwen3-235b w8a8
2.1.T10.B060版本
2k输入/2k输出,TTFT卡3s、TPOT卡100ms时的数据:
TTFT 779ms,TPOT 109.9ms

2.1.RC1.B092版本
PD分离,1P1D:P实例双机(2×8卡,共16卡),D实例4机(4×8卡,共32卡)
user_config.json:
{
"version": "v1.0",
"deploy_config": {
"p_instances_num": 1,
"d_instances_num": 1,
"single_p_instance_pod_num": 2,
"single_d_instance_pod_num": 4,
"p_pod_npu_num": 8,
"d_pod_npu_num": 8,
"p_instances_scale_num": 0,
"d_instances_scale_num": 0,
"model_id": "",
"prefill_distribute_enable": 0,
"decode_distribute_enable": 1,
"image_name": "b092_qwen3:v0",
"job_id": "mindie",
"hardware_type": "800I_A2",
"mindie_env_path": "./conf/mindie_env.json",
"mindie_host_log_path": "/root/log/ascend_log",
"mindie_container_log_path": "/root/mindie",
"weight_mount_path": "/home/weight/Qwen3-235B-A22B-W8A8",
"coordinator_backup_cfg": {
"function_enable": false
},
"controller_backup_cfg": {
"function_sw": false
},
"deploy_mount_path": {
"ms_controller_mount": {
"$user_define_host_path": "$user_define_container_path"
},
"ms_coordinator_mount": {
"$user_define_host_path": "$user_define_container_path"
},
"prefill_server_mount": {
"$user_define_host_path1": "$user_define_container_path",
"$user_define_host_path2": "$user_define_container_path"
},
"decode_server_mount": {
"$user_define_host_path1": "$user_define_container_path",
"$user_define_host_path2": "$user_define_container_path"
}
},
"tls_config": {
"tls_enable": false,
"kmc_ksf_master": "./security/master/tools/pmt/master/ksfa",
"kmc_ksf_standby": "./security/standby/tools/pmt/standby/ksfb",
"infer_tls_items": {
"ca_cert": "./security/infer/security/certs/ca.pem",
"tls_cert": "./security/infer/security/certs/cert.pem",
"tls_key": "./security/infer/security/keys/cert.key.pem",
"tls_passwd": "./security/infer/security/pass/key_pwd.txt",
"tls_crl": ""
},
"management_tls_items": {
"ca_cert": "./security/management/security/certs/management_ca.pem",
"tls_cert": "./security/management/security/certs/cert.pem",
"tls_key": "./security/management/security/keys/cert.key.pem",
"tls_passwd": "./security/management/security/pass/key_pwd.txt",
"tls_crl": ""
},
"cluster_tls_enable": false,
"cluster_tls_items": {
"ca_cert": "./security/clusterd/security/certs/ca.pem",
"tls_cert": "./security/clusterd/security/certs/cert.pem",
"tls_key": "./security/clusterd/security/keys/cert.key.pem",
"tls_passwd": "./security/clusterd/security/pass/key_pwd.txt",
"tls_crl": ""
},
"etcd_server_tls_enable": false,
"etcd_server_tls_items": {
"ca_cert": "./security/etcd_server/security/certs/ca.pem",
"tls_cert": "./security/etcd_server/security/certs/cert.pem",
"tls_key": "./security/etcd_server/security/keys/cert.key.pem",
"tls_passwd": "./security/etcd_server/security/pass/key_pwd.txt",
"kmc_ksf_master": "./security/etcd_server/tools/pmt/master/ksfa",
"kmc_ksf_standby": "./security/etcd_server/tools/pmt/standby/ksfb",
"tls_crl": ""
}
}
},
"mindie_ms_controller_config": {
"deploy_mode": "pd_separate",
"digs_prefill_slo": 1000,
"digs_decode_slo": 50,
"multi_node_infer_config": {
"multi_node_infer_enable": true
}
},
"mindie_ms_coordinator_config": {
"http_config": {
"predict_ip": "127.0.0.1",
"predict_port": "1025",
"manage_ip": "127.0.0.1",
"manage_port": "1026",
"server_thread_num": 10,
"client_thread_num": 10,
"http_timeout_seconds": 360,
"keep_alive_seconds": 180
},
"request_limit": {
"single_node_max_requests": 4096,
"max_requests": 10000
},
"exception_config": {
"first_token_timeout": 3600,
"infer_timeout": 3600
}
},
"mindie_server_prefill_config": {
"ServerConfig": {
"maxLinkNum": 4096,
"inferMode": "dmi",
"tokenTimeout": 3600,
"e2eTimeout": 3600,
"distDPServerEnabled": false
},
"BackendConfig": {
"npuDeviceIds": [
[
0,
1,
2,
3,
4,
5,
6,
7
]
],
"tokenizerProcessNumber": 1,
"multiNodesInferEnabled": true,
"ModelDeployConfig": {
"maxSeqLen": 8192,
"maxInputTokenLen": 6144,
"ModelConfig": [
{
"modelInstanceType": "Standard",
"modelName": "qwen3",
"modelWeightPath": "/home/weight/Qwen3-235B-A22B-W8A8",
"worldSize": 8,
"cpuMemSize": 5,
"npuMemSize": -1,
"backendType": "atb",
"trustRemoteCode": false,
"dp": 16,
"cp": 1,
"tp": 1,
"sp": 1,
"moe_ep": 4,
"pp": 1,
"moe_tp": 4,
"kv_link_timeout": 1080,
"modelCutPolicy": "custom",
"models": {
"deepseekv2": {
"kv_cache_options": {
"enable_nz": true
},
"ep_level": 1,
"enable_init_routing_cutoff": false,
"topk_scaling_factor": 1.0
}
}
}
]
},
"ScheduleConfig": {
"maxPrefillBatchSize": 16,
"maxPrefillTokens": 6144
}
}
},
"mindie_server_decode_config": {
"ServerConfig": {
"maxLinkNum": 256,
"fullTextEnabled": false,
"inferMode": "dmi",
"tokenTimeout": 3600,
"e2eTimeout": 3600,
"distDPServerEnabled": true
},
"BackendConfig": {
"npuDeviceIds": [
[
0
]
],
"tokenizerProcessNumber": 1,
"multiNodesInferEnabled": false,
"ModelDeployConfig": {
"maxSeqLen": 8192,
"maxInputTokenLen": 6144,
"truncation": false,
"ModelConfig": [
{
"modelInstanceType": "Standard",
"modelName": "qwen3",
"modelWeightPath": "/home/weight/Qwen3-235B-A22B-W8A8",
"worldSize": 1,
"cpuMemSize": 5,
"npuMemSize": -1,
"backendType": "atb",
"trustRemoteCode": false,
"dp": 32,
"cp": 1,
"tp": 1,
"sp": 1,
"moe_ep": 32,
"pp": 1,
"moe_tp": 1,
"kv_trans_timeout": 10,
"kv_link_timeout": 1080,
"modelCutPolicy": "custom",
"models": {
"deepseekv2": {
"kv_cache_options": {
"enable_nz": true
}
}
}
}
]
},
"ScheduleConfig": {
"distributedEnable": true,
"maxPrefillBatchSize": 2,
"maxPrefillTokens": 6144,
"maxBatchSize": 64,
"maxIterTimes": 2048,
"maxQueueDelayMicroseconds": 5000
}
}
}
}
mindie_env.json:
修改点:
1. 在 mindie_server_decode_env 中增加两行:
"HCCL_INTRA_PCIE_ENABLE": 1,
"HCCL_INTRA_ROCE_ENABLE": 0
2. 删除 "ATB_LLM_ENABLE_AUTO_TRANSPOSE": 0 这一行
修改后的完整配置如下:
{
"version": "1.0.0",
"mindie_common_env": {
"CANN_INSTALL_PATH": "/usr/local/Ascend",
"MIES_INSTALL_PATH": "/usr/local/Ascend/mindie/latest/mindie-service",
"MINDIE_LLM_HOME_PATH": "/usr/local/Ascend/mindie/latest/mindie-llm",
"TASK_QUEUE_ENABLE": 1,
"HCCL_BUFFSIZE": 120,
"MINDIE_LOG_TO_FILE": 1,
"MINDIE_LOG_TO_STDOUT": 1,
"MINDIE_LOG_LEVEL": "INFO",
"ASCEND_SLOG_PRINT_TO_STDOUT": 0,
"ASCEND_GLOBAL_LOG_LEVEL": 3,
"ASCEND_GLOBAL_EVENT_ENABLE": 1,
"ATB_LOG_TO_FILE": 0,
"ASDOPS_LOG_TO_STDOUT": 0,
"ASDOPS_LOG_LEVEL": "ERROR",
"MINDIE_LLM_CONTINUOUS_BATCHING": 1,
"MINDIE_LLM_RECOMPUTE_THRESHOLD": 0.5,
"DIST_PD_DISAGGREGATION": 1,
"HCCL_RDMA_RETRY_CNT": 7,
"HCCL_RDMA_TIMEOUT": 18,
"HCCL_EXEC_TIMEOUT": 60
},
"mindie_ms_controller_env": {
},
"mindie_ms_coordinator_env": {
},
"mindie_server_prefill_env": {
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"OMP_NUM_THREADS": 8,
"HCCL_CONNECT_TIMEOUT": 7200,
"HCCL_ENTRY_LOG_ENABLE": 1,
"HCCL_ALGO": "level0:NA;level1:pipeline",
"DP_MOVE_UP_ENABLE": 1,
"MINDIE_ASYNC_SCHEDULING_ENABLE": 0,
"HCCL_OP_EXPANSION_MODE": "AIV",
"NPU_MEMORY_FRACTION": 0.92
},
"mindie_server_decode_env": {
"MINDIE_ENABLE_DP_DISTRIBUTED": 1,
"DP_PARTITION_UP_ENABLE": 1,
"DP_MOVE_UP_ENABLE": 1,
"HCCL_BUFFSIZE": 512,
"ATB_LAYER_INTERNAL_TENSOR_REUSE": 1,
"ATB_CONVERT_NCHW_TO_ND": 1,
"ATB_CONTEXT_WORKSPACE_SIZE": 0,
"ATB_LAUNCH_KERNEL_WITH_TILING": 1,
"ATB_LLM_HCCL_ENABLE": 1,
"TASK_QUEUE_ENABLE": 1,
"INF_NAN_MODE_ENABLE": 0,
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"OMP_NUM_THREADS": 8,
"NPU_MEMORY_FRACTION": 0.92,
"HCCL_ENTRY_LOG_ENABLE": 1,
"HCCL_CONNECT_TIMEOUT": 7200,
"MINDIE_ASYNC_SCHEDULING_ENABLE": 1,
"HCCL_INTRA_PCIE_ENABLE": 1,
"HCCL_INTRA_ROCE_ENABLE": 0
}
}
不设置rr(request rate,请求速率),先拉大并发将TPOT打满:

设置rr(request rate)尝试降低TTFT,发现Median TTFT已经超过要求的3s:

设置输出长度为2,测试系统QPS,实测QPS为7:

单并发测试TTFT最小值,发现也略超3s,因此高并发下TTFT肯定会超过3s:

按推荐的rr(request rate)对齐测试,TTFT接近5s:

参数优化

boot.sh:

测试结果:
