Expected an HTTP 200 status code and a response. Instead, received an error and a timeout.
(lightllm) azureuser@trainer1:/workspace/lightllm$ ./run.sh
Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead.
Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead.
[W ProcessGroupGloo.cpp:695] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())
[W ProcessGroupGloo.cpp:695] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())
[W ProcessGroupGloo.cpp:695] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())
[W ProcessGroupGloo.cpp:695] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())
INFO: Started server process [32679]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit)
Task exception was never retrieved
future: <Task finished name='Task-8' coro=<RouterManager.loop_for_fwd() done, defined at /workspace/lightllm/lightllm/server/router/manager.py:88> exception=at 38:4:
def _rotary_kernel(
Q, Cos, Sin,
stride_qbs, stride_qh, stride_qd,
stride_cosbs, stride_cosd,
stride_sinbs, stride_sind,
max_total_len,
H, # N_CTX 代表要计算的上下文长度
BLOCK_HEAD: tl.constexpr,
BLOCK_SEQ: tl.constexpr,
BLOCK_DMODEL: tl.constexpr,
):
cur_head_index = tl.program_id(0)
cur_seq_index = tl.program_id(1)
cur_head_range = cur_head_index * BLOCK_HEAD + tl.arange(0, BLOCK_HEAD)
cur_seq_range = cur_seq_index * BLOCK_SEQ + tl.arange(0, BLOCK_SEQ)
dim_range0 = tl.arange(0, BLOCK_DMODEL // 2)
dim_range1 = tl.arange(BLOCK_DMODEL // 2, BLOCK_DMODEL)
off_q0 = cur_seq_range[:, None, None] * stride_qbs + cur_head_range[None, :, None] * stride_qh + dim_range0[None, None, :] * stride_qd
off_q1 = cur_seq_range[:, None, None] * stride_qbs + cur_head_range[None, :, None] * stride_qh + dim_range1[None, None, :] * stride_qd
off_dimcos_sin = cur_seq_range[:, None, None] * stride_cosbs + dim_range0[None, None, :] * stride_cosd
q0 = tl.load(Q + off_q0, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H), other=0.0)
q1 = tl.load(Q + off_q1, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H), other=0.0)
cos = tl.load(Cos + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0)
sin = tl.load(Sin + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0)
out0 = q0 * cos - q1 * sin
out1 = q0 * sin + q1 * cos
tl.store(Q + off_q0, out0, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H))
tl.store(Q + off_q1, out1, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H))
return
^
========= Remote Traceback (1) =========
Traceback (most recent call last):
File "<string>", line 21, in _rotary_kernel
KeyError: ('2-.-0-.-0-83ca8b715a9dc5f32dc1110973485f64-d6252949da17ceb5f3a278a70250af13-1af5134066c618146d2cd009138944a0-bde58180cc67fc4675629069557a5d0a-3498c340fd4b6ee7805fd54b882a04f5-e1f133f98d04093da2078dfc51c36b72-b26258bf01f839199e39d64851821f26-d7c06e3b46e708006c15224aac7a1378-f585402118c8a136948ce0a49cfe122c', (torch.float16, torch.float16, torch.float16, 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32'), (4, 32, 128), (True, True, True, (True, False), (True, False), (False, True), (True, False), (False, True), (True, False), (False, True), (False, False), (True, False)))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 937, in build_triton_ir
generator.visit(fn.parse())
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 855, in visit
return super().visit(node)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/ast.py", line 418, in visit
return visitor(node)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 183, in visit_Module
ast.NodeVisitor.generic_visit(self, node)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/ast.py", line 426, in generic_visit
self.visit(item)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 855, in visit
return super().visit(node)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/ast.py", line 418, in visit
return visitor(node)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 263, in visit_FunctionDef
fn.reset_type(self.prototype.to_ir(self.builder))
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/language/core.py", line 298, in to_ir
ret_types = [ret_type.to_ir(builder) for ret_type in self.ret_types]
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/language/core.py", line 298, in <listcomp>
ret_types = [ret_type.to_ir(builder) for ret_type in self.ret_types]
AttributeError: 'NoneType' object has no attribute 'to_ir'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/rpyc-5.3.1-py3.10.egg/rpyc/core/protocol.py", line 359, in _dispatch_request
res = self._HANDLERS[handler](self, *args)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/rpyc-5.3.1-py3.10.egg/rpyc/core/protocol.py", line 837, in _handle_call
return obj(*args, **dict(kwargs))
File "/workspace/lightllm/lightllm/utils/infer_utils.py", line 49, in inner_func
result = func(*args, **kwargs)
File "/workspace/lightllm/lightllm/server/router/model_infer/model_rpc.py", line 92, in exposed_prefill_batch
return self.forward(batch_id, is_prefill=True)
File "/workspace/lightllm/lightllm/server/router/model_infer/model_rpc.py", line 143, in forward
logits = self.model.forward(**kwargs)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/workspace/lightllm/lightllm/common/basemodel/basemodel.py", line 125, in forward
return self._prefill(batch_size, total_token_num, max_len_in_batch, input_ids, b_loc, b_start_loc, b_seq_len)
File "/workspace/lightllm/lightllm/common/basemodel/basemodel.py", line 149, in _prefill
predict_logics = self._context_forward(input_ids, infer_state)
File "/workspace/lightllm/lightllm/common/basemodel/basemodel.py", line 189, in _context_forward
input_embs = self.layers_infer[i].context_forward(input_embs, infer_state, self.trans_layers_weight[i])
File "/workspace/lightllm/lightllm/common/basemodel/layer_infer/template/transformer_layer_infer_template.py", line 129, in context_forward
self._context_attention(input_embdings,
File "/workspace/lightllm/lightllm/utils/infer_utils.py", line 21, in time_func
ans = func(*args, **kwargs)
File "/workspace/lightllm/lightllm/common/basemodel/layer_infer/template/transformer_layer_infer_template.py", line 81, in _context_attention
q = self._get_qkv(input1, cache_k, cache_v, infer_state, layer_weight)
File "/workspace/lightllm/lightllm/models/llama/layer_infer/transformer_layer_infer.py", line 43, in _get_qkv
rotary_emb_fwd(q.view(-1, self.tp_q_head_num_, self.head_dim_), infer_state.position_cos, infer_state.position_sin)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/workspace/lightllm/lightllm/models/llama/triton_kernel/rotary_emb.py", line 62, in rotary_emb_fwd
_rotary_kernel[grid](
File "<string>", line 41, in _rotary_kernel
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 1620, in compile
next_module = compile(module)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 1549, in <lambda>
lambda src: ast_to_ttir(src, signature, configs[0], constants)),
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 962, in ast_to_ttir
mod, _ = build_triton_ir(fn, signature, specialization, constants)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 942, in build_triton_ir
raise CompilationError(fn.src, node) from e
triton.compiler.CompilationError: at 38:4:
def _rotary_kernel(
Q, Cos, Sin,
stride_qbs, stride_qh, stride_qd,
stride_cosbs, stride_cosd,
stride_sinbs, stride_sind,
max_total_len,
H, # N_CTX 代表要计算的上下文长度
BLOCK_HEAD: tl.constexpr,
BLOCK_SEQ: tl.constexpr,
BLOCK_DMODEL: tl.constexpr,
):
cur_head_index = tl.program_id(0)
cur_seq_index = tl.program_id(1)
cur_head_range = cur_head_index * BLOCK_HEAD + tl.arange(0, BLOCK_HEAD)
cur_seq_range = cur_seq_index * BLOCK_SEQ + tl.arange(0, BLOCK_SEQ)
dim_range0 = tl.arange(0, BLOCK_DMODEL // 2)
dim_range1 = tl.arange(BLOCK_DMODEL // 2, BLOCK_DMODEL)
off_q0 = cur_seq_range[:, None, None] * stride_qbs + cur_head_range[None, :, None] * stride_qh + dim_range0[None, None, :] * stride_qd
off_q1 = cur_seq_range[:, None, None] * stride_qbs + cur_head_range[None, :, None] * stride_qh + dim_range1[None, None, :] * stride_qd
off_dimcos_sin = cur_seq_range[:, None, None] * stride_cosbs + dim_range0[None, None, :] * stride_cosd
q0 = tl.load(Q + off_q0, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H), other=0.0)
q1 = tl.load(Q + off_q1, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H), other=0.0)
cos = tl.load(Cos + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0)
sin = tl.load(Sin + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0)
out0 = q0 * cos - q1 * sin
out1 = q0 * sin + q1 * cos
tl.store(Q + off_q0, out0, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H))
tl.store(Q + off_q1, out1, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H))
return
^
>
Traceback (most recent call last):
File "/workspace/lightllm/lightllm/server/router/manager.py", line 91, in loop_for_fwd
await self._step()
File "/workspace/lightllm/lightllm/server/router/manager.py", line 112, in _step
await self._prefill_batch(self.running_batch)
File "/workspace/lightllm/lightllm/server/router/manager.py", line 149, in _prefill_batch
ans = await asyncio.gather(*rets)
File "/workspace/lightllm/lightllm/server/router/model_infer/model_rpc.py", line 218, in prefill_batch
return await ans
File "/workspace/lightllm/lightllm/server/router/model_infer/model_rpc.py", line 180, in _func
return ans.value
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/rpyc-5.3.1-py3.10.egg/rpyc/core/async_.py", line 108, in value
raise self._obj
rpyc.core.vinegar/triton.compiler._get_exception_class.<locals>.Derived: at 38:4:
def _rotary_kernel(
Q, Cos, Sin,
stride_qbs, stride_qh, stride_qd,
stride_cosbs, stride_cosd,
stride_sinbs, stride_sind,
max_total_len,
H, # N_CTX 代表要计算的上下文长度
BLOCK_HEAD: tl.constexpr,
BLOCK_SEQ: tl.constexpr,
BLOCK_DMODEL: tl.constexpr,
):
cur_head_index = tl.program_id(0)
cur_seq_index = tl.program_id(1)
cur_head_range = cur_head_index * BLOCK_HEAD + tl.arange(0, BLOCK_HEAD)
cur_seq_range = cur_seq_index * BLOCK_SEQ + tl.arange(0, BLOCK_SEQ)
dim_range0 = tl.arange(0, BLOCK_DMODEL // 2)
dim_range1 = tl.arange(BLOCK_DMODEL // 2, BLOCK_DMODEL)
off_q0 = cur_seq_range[:, None, None] * stride_qbs + cur_head_range[None, :, None] * stride_qh + dim_range0[None, None, :] * stride_qd
off_q1 = cur_seq_range[:, None, None] * stride_qbs + cur_head_range[None, :, None] * stride_qh + dim_range1[None, None, :] * stride_qd
off_dimcos_sin = cur_seq_range[:, None, None] * stride_cosbs + dim_range0[None, None, :] * stride_cosd
q0 = tl.load(Q + off_q0, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H), other=0.0)
q1 = tl.load(Q + off_q1, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H), other=0.0)
cos = tl.load(Cos + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0)
sin = tl.load(Sin + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0)
out0 = q0 * cos - q1 * sin
out1 = q0 * sin + q1 * cos
tl.store(Q + off_q0, out0, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H))
tl.store(Q + off_q1, out1, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H))
return
^
========= Remote Traceback (1) =========
Traceback (most recent call last):
File "<string>", line 21, in _rotary_kernel
KeyError: ('2-.-0-.-0-83ca8b715a9dc5f32dc1110973485f64-d6252949da17ceb5f3a278a70250af13-1af5134066c618146d2cd009138944a0-bde58180cc67fc4675629069557a5d0a-3498c340fd4b6ee7805fd54b882a04f5-e1f133f98d04093da2078dfc51c36b72-b26258bf01f839199e39d64851821f26-d7c06e3b46e708006c15224aac7a1378-f585402118c8a136948ce0a49cfe122c', (torch.float16, torch.float16, torch.float16, 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32'), (4, 32, 128), (True, True, True, (True, False), (True, False), (False, True), (True, False), (False, True), (True, False), (False, True), (False, False), (True, False)))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 937, in build_triton_ir
generator.visit(fn.parse())
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 855, in visit
return super().visit(node)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/ast.py", line 418, in visit
return visitor(node)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 183, in visit_Module
ast.NodeVisitor.generic_visit(self, node)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/ast.py", line 426, in generic_visit
self.visit(item)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 855, in visit
return super().visit(node)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/ast.py", line 418, in visit
return visitor(node)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 263, in visit_FunctionDef
fn.reset_type(self.prototype.to_ir(self.builder))
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/language/core.py", line 298, in to_ir
ret_types = [ret_type.to_ir(builder) for ret_type in self.ret_types]
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/language/core.py", line 298, in <listcomp>
ret_types = [ret_type.to_ir(builder) for ret_type in self.ret_types]
AttributeError: 'NoneType' object has no attribute 'to_ir'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/rpyc-5.3.1-py3.10.egg/rpyc/core/protocol.py", line 359, in _dispatch_request
res = self._HANDLERS[handler](self, *args)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/rpyc-5.3.1-py3.10.egg/rpyc/core/protocol.py", line 837, in _handle_call
return obj(*args, **dict(kwargs))
File "/workspace/lightllm/lightllm/utils/infer_utils.py", line 49, in inner_func
result = func(*args, **kwargs)
File "/workspace/lightllm/lightllm/server/router/model_infer/model_rpc.py", line 92, in exposed_prefill_batch
return self.forward(batch_id, is_prefill=True)
File "/workspace/lightllm/lightllm/server/router/model_infer/model_rpc.py", line 143, in forward
logits = self.model.forward(**kwargs)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/workspace/lightllm/lightllm/common/basemodel/basemodel.py", line 125, in forward
return self._prefill(batch_size, total_token_num, max_len_in_batch, input_ids, b_loc, b_start_loc, b_seq_len)
File "/workspace/lightllm/lightllm/common/basemodel/basemodel.py", line 149, in _prefill
predict_logics = self._context_forward(input_ids, infer_state)
File "/workspace/lightllm/lightllm/common/basemodel/basemodel.py", line 189, in _context_forward
input_embs = self.layers_infer[i].context_forward(input_embs, infer_state, self.trans_layers_weight[i])
File "/workspace/lightllm/lightllm/common/basemodel/layer_infer/template/transformer_layer_infer_template.py", line 129, in context_forward
self._context_attention(input_embdings,
File "/workspace/lightllm/lightllm/utils/infer_utils.py", line 21, in time_func
ans = func(*args, **kwargs)
File "/workspace/lightllm/lightllm/common/basemodel/layer_infer/template/transformer_layer_infer_template.py", line 81, in _context_attention
q = self._get_qkv(input1, cache_k, cache_v, infer_state, layer_weight)
File "/workspace/lightllm/lightllm/models/llama/layer_infer/transformer_layer_infer.py", line 43, in _get_qkv
rotary_emb_fwd(q.view(-1, self.tp_q_head_num_, self.head_dim_), infer_state.position_cos, infer_state.position_sin)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/workspace/lightllm/lightllm/models/llama/triton_kernel/rotary_emb.py", line 62, in rotary_emb_fwd
_rotary_kernel[grid](
File "<string>", line 41, in _rotary_kernel
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 1620, in compile
next_module = compile(module)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 1549, in <lambda>
lambda src: ast_to_ttir(src, signature, configs[0], constants)),
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 962, in ast_to_ttir
mod, _ = build_triton_ir(fn, signature, specialization, constants)
File "/workspace/miniconda3/envs/lightllm/lib/python3.10/site-packages/triton/compiler.py", line 942, in build_triton_ir
raise CompilationError(fn.src, node) from e
triton.compiler.CompilationError: at 38:4:
def _rotary_kernel(
Q, Cos, Sin,
stride_qbs, stride_qh, stride_qd,
stride_cosbs, stride_cosd,
stride_sinbs, stride_sind,
max_total_len,
H, # N_CTX 代表要计算的上下文长度
BLOCK_HEAD: tl.constexpr,
BLOCK_SEQ: tl.constexpr,
BLOCK_DMODEL: tl.constexpr,
):
cur_head_index = tl.program_id(0)
cur_seq_index = tl.program_id(1)
cur_head_range = cur_head_index * BLOCK_HEAD + tl.arange(0, BLOCK_HEAD)
cur_seq_range = cur_seq_index * BLOCK_SEQ + tl.arange(0, BLOCK_SEQ)
dim_range0 = tl.arange(0, BLOCK_DMODEL // 2)
dim_range1 = tl.arange(BLOCK_DMODEL // 2, BLOCK_DMODEL)
off_q0 = cur_seq_range[:, None, None] * stride_qbs + cur_head_range[None, :, None] * stride_qh + dim_range0[None, None, :] * stride_qd
off_q1 = cur_seq_range[:, None, None] * stride_qbs + cur_head_range[None, :, None] * stride_qh + dim_range1[None, None, :] * stride_qd
off_dimcos_sin = cur_seq_range[:, None, None] * stride_cosbs + dim_range0[None, None, :] * stride_cosd
q0 = tl.load(Q + off_q0, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H), other=0.0)
q1 = tl.load(Q + off_q1, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H), other=0.0)
cos = tl.load(Cos + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0)
sin = tl.load(Sin + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0)
out0 = q0 * cos - q1 * sin
out1 = q0 * sin + q1 * cos
tl.store(Q + off_q0, out0, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H))
tl.store(Q + off_q1, out1, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < H))
return
^
Not using a container. Using a clean conda environment.