Merge upstream nov11 #59

Merged: 117 commits, Nov 9, 2023

Commits
898db76
[API] Add GenerationConfig (#1024)
davidpissarra Oct 8, 2023
ad3a6b9
Fix two bugs in kv-cache backtrack loop (#856)
shenberg Oct 8, 2023
6e40c21
[Build] Added --pdb flag to build.py, drop into pdb on error (#1017)
Lunderberg Oct 8, 2023
bae37b3
[Android] Use `AlertDialog` instead of `Toast` (#1039)
cyx-6 Oct 8, 2023
b44f679
Add doc for ChatConfig, ConvConfig, GenerationConfig, BuildArgs (#1040)
CharlieFRuan Oct 9, 2023
3a9849a
[Android] Add Llama2 q4f16_0 (#1041)
spectrometerHBH Oct 9, 2023
bed9e60
[Docs] Model prebuilts tracking page revamp (#1000)
CharlieFRuan Oct 9, 2023
c02fdaf
Update compile_models.rst (#1038)
yongjer Oct 9, 2023
85001ed
Support for the Stable LM 3B model (#1008)
jeethu Oct 9, 2023
a032d40
[Docs] Iterate model prebuilts docs (#1043)
CharlieFRuan Oct 9, 2023
a58605f
Update README.md
junrushao Oct 9, 2023
bdd9d9b
[CPP] Separate common utils out from llm_chat.cc (#1044)
MasterJH5574 Oct 9, 2023
20131fb
Update README.md (#1045)
junrushao Oct 9, 2023
1e6fb11
add verbose stats to mlc-chat REST API (#1049)
denise-k Oct 11, 2023
b9179cf
[Transform] Apply split_rotary optimization on prefill (#1033)
Lunderberg Oct 12, 2023
98ebd28
[Docs] Add `mlc.ai/package` to `DEPENDENCY INSTALLATION` group (#1055)
LeshengJin Oct 12, 2023
bfaa5b9
Revert "[Transform] Apply split_rotary optimization on prefill (#1033…
MasterJH5574 Oct 12, 2023
ca8c11b
[BugFix] Set the right `max_sequence_length` for both Llama-1 and Lla…
sunggg Oct 13, 2023
edab9b5
[Doc] Use -U instead of --force-reinstall (#1062)
junrushao Oct 13, 2023
d854105
[Model] Initial batching support for Llama (#1048)
MasterJH5574 Oct 14, 2023
c2b8cbc
Fix Stable LM 3B build (#1061)
jeethu Oct 14, 2023
481cd92
[Core] Remove duplication in MODEL.get_model calls (#1054)
Lunderberg Oct 14, 2023
8184431
[ParamManager] Cleanup creation of quantization IRModule (#1053)
Lunderberg Oct 14, 2023
9010d48
Minor typo fix (#1064)
jeethu Oct 15, 2023
b0bfc88
Add links to Python API Reference (#1068)
junrushao Oct 15, 2023
204860b
[Fix] ChatModule incorrect temperature buffer shape (#1070)
MasterJH5574 Oct 15, 2023
d202077
[ParamManager] Added progress bar for get_item/set_item (#1063)
Lunderberg Oct 16, 2023
9872c48
[Python] Extract common device str parse function in ChatModule (#1074)
MasterJH5574 Oct 16, 2023
3aefd9f
[Bugfix] Compilation Error in q4f32_1 (#1078)
junrushao Oct 17, 2023
2625945
Establish `mlc_chat.compiler` (#1082)
junrushao Oct 19, 2023
56a8004
Update README.md for Multi-GPU (#1090)
junrushao Oct 19, 2023
b0373d1
Support lib_path override in C++. Improvements on docs and error mess…
rickzx Oct 19, 2023
830656f
StreamIterator (#1057)
varshith15 Oct 19, 2023
9bf5723
Update `benchmark.py` according to #1086 (#1091)
junrushao Oct 19, 2023
62d0c03
Disable Disco for q4f16_ft and q8f16_ft quantization (#1094)
LeshengJin Oct 20, 2023
cf39bf6
[Format] Apply isort and black for `python/` (#1097)
junrushao Oct 20, 2023
e9b85ce
More formatting (#1099)
junrushao Oct 21, 2023
03c641a
Enable Python Linter (#1098)
junrushao Oct 21, 2023
46d11e6
Add Basic Pylint and Mypy Tooling (#1100)
junrushao Oct 21, 2023
6159cc4
[CI] Add clang-format (#1103)
junrushao Oct 22, 2023
16dd2ae
[Slim-LM] Smart path finding for config and weight (#1088)
LeshengJin Oct 23, 2023
f57c9c9
[Transform] Provide IRModule transform for rewrite_attention (#1052)
Lunderberg Oct 23, 2023
e5927ce
[ParamManager] Use BundleModelParams for transform_dequantize (#1056)
Lunderberg Oct 23, 2023
7ae8c6d
[Slim-LM] Introduce HFLoad for loading Pytorch and SafeTensor weights…
LeshengJin Oct 23, 2023
5a7dcd8
[WINDOWS] reduce noise in windows build (#1115)
tqchen Oct 24, 2023
61179a0
Add CLI commands for compilation (#1109)
junrushao Oct 24, 2023
8ce7793
Auto updated submodule references
Oct 24, 2023
488017d
fix mismatched argument name (#1117)
Sing-Li Oct 24, 2023
206103b
[Docs] Add doc for max and mean gen len, shift factor; and buildArgs …
CharlieFRuan Oct 24, 2023
2aa6809
Revert "[ParamManager] Use BundleModelParams for transform_dequantize…
junrushao Oct 24, 2023
9cb8e8e
Remove inaccurate warning message (#1121)
junrushao Oct 24, 2023
9166edb
[REST] OpenAI compatible Rest API (#1107)
Kartik14 Oct 24, 2023
a4279e3
Add --opt flag parsing to CLI (#1123)
junrushao Oct 25, 2023
973f9fc
[ParamManager][Redo] Use BundleModelParams for transform_dequantize (…
Lunderberg Oct 25, 2023
24f795e
added details to windows installation (#1133)
goutham2688 Oct 27, 2023
2c492e5
Grammatical and Typographical improvements (#1139)
tmsagarofficial Oct 28, 2023
2ec0cc8
Minor enhancements to `ChatModule` (#1132)
YuchenJin Oct 28, 2023
27ac5ac
Updating tvm install docs (#1143)
David-Sharma Oct 29, 2023
2b6d832
Make the help info consistent with program name (#1137)
fennecJ Oct 29, 2023
878ae84
Support parameter packing (#1146)
junrushao Oct 29, 2023
c0c3a8d
[Slim-LM] Enable Group Quant (#1129)
zxybazh Oct 29, 2023
2193767
Enable Mypy and Pylint in mlc_chat Python Package (#1149)
junrushao Oct 29, 2023
0a25374
Migrate Compiler Passes (#1150)
junrushao Oct 30, 2023
1a79a53
Compile Model Preset without External `config.json` (#1151)
junrushao Oct 30, 2023
ba67835
Update attention layer (#1153)
junrushao Oct 30, 2023
fee2cb5
Add batched Llama model definition using vLLM paged attention (#1134)
masahi Oct 30, 2023
ece97b1
[Transform][Redo] Apply split_rotary optimization on prefill (#1125)
Lunderberg Oct 30, 2023
b190578
Apply rewrite for normal attention and MQA (#1138)
Lunderberg Oct 30, 2023
8ca0176
[Rest] Fix emoji handling in Rest API. (#1142)
YuchenJin Oct 30, 2023
3cf5605
[Utility] Check for isinstance(exc, Exception) before entering pdb (#…
Lunderberg Oct 30, 2023
0a9d6c7
[Utils] Remove conversion to numpy array in utils.save_params (#1083)
Lunderberg Oct 30, 2023
425a2cb
[Fix][REST] Use lowered-cased "app" (#1159)
junrushao Oct 30, 2023
9076d01
[Rest] Document emoji handling (#1160)
YuchenJin Oct 31, 2023
b5bfa5b
Enable group quant transform with nn.Module (#1154)
cyx-6 Oct 31, 2023
8438b27
Misc Cleanups of Compilation Pipeline (#1165)
junrushao Oct 31, 2023
02d1e57
Support CUDA Multi-Arch Compilation (#1166)
junrushao Oct 31, 2023
e0cd3f6
[Bugfix] Cannot find global function `mlc.llm_chat_create` (#1167)
junrushao Oct 31, 2023
f5b2e88
Fix RWKV Support (#1136)
BBuf Nov 1, 2023
200653a
Auto updated submodule references
Nov 1, 2023
9831135
Fix Android app Permission denied error on Android 10 (#1175)
anibohara2000 Nov 1, 2023
1757777
[SLM] Fix group quantization (#1172)
cyx-6 Nov 1, 2023
2ca7d15
[Fix] TIR block name of dequantization (#1177)
junrushao Nov 2, 2023
53060af
[SLM][AutoLLM] Enable Command Line Weight Conversion (#1170)
zxybazh Nov 2, 2023
2dc8183
[Fix][SLM] Update q4f16 quantization with the new mutator name rule (…
LeshengJin Nov 3, 2023
6ae02dd
[Model Support][SWA] Add support for sliding window attention for Mis…
CharlieFRuan Nov 3, 2023
4716704
Add Python API for Weight Conversion (#1182)
junrushao Nov 4, 2023
9d20575
Merge `llama_config.CONFIG` into `MODEL_PRESETS` (#1188)
junrushao Nov 4, 2023
5d1dc34
Merge llama_config.py into llama_model.py (#1189)
junrushao Nov 4, 2023
4832c2f
Add CodeLlama as part of model presets (#1190)
junrushao Nov 4, 2023
78424f0
[Docs] Clarify zstd installation on Windows (#1191)
junrushao Nov 4, 2023
5d63f7e
[Docs] Clarify zstd installation on Windows (#1196)
junrushao Nov 4, 2023
3417505
Support overriding `--max-sequence-length` in command line (#1197)
junrushao Nov 5, 2023
0e08845
[RestAPI] Added docs (#1193)
anibohara2000 Nov 5, 2023
145a984
[API] ```llm-vscode``` extension support (#1198)
davidpissarra Nov 5, 2023
3413d17
[Fix] Use `fabs` as floating point abs function in C++ (#1202)
junrushao Nov 5, 2023
7ccb51a
Integrating MLC runtime with the new compilation workflow (#1203)
junrushao Nov 6, 2023
65478c8
[Fix] Remove Redundant Warnings (#1204)
junrushao Nov 6, 2023
01d4339
Try fix macOS build with picojson (#1206)
junrushao Nov 6, 2023
51d6f9c
Try fix macOS build with picojson again (#1207)
junrushao Nov 6, 2023
a7f1183
Auto updated submodule references
Nov 6, 2023
e2c99a8
[Fix] Keep update-to-date with upstream API change (#1209)
junrushao Nov 6, 2023
e00220c
Detect `mtriple` via LLVM (#1211)
junrushao Nov 6, 2023
9869ca6
Fix Python3.8 compatibility breakage (#1210)
Lunderberg Nov 6, 2023
4042626
[Slim-LM] Enable loading from AWQ pre-quantized weight. (#1114)
LeshengJin Nov 6, 2023
be1c18b
[Bugfix] Fix Cannot import name '_LIB' from 'mlc_chat.base' (#1214)
CharlieFRuan Nov 7, 2023
1015aae
[SLM] Support `q3f16_1` and `q4f32_1` (#1215)
cyx-6 Nov 8, 2023
1a6fadd
Make the Compilation Working E2E (#1218)
junrushao Nov 8, 2023
616ca42
[Mistral][SWA] Add sliding window to metadata (#1217)
CharlieFRuan Nov 8, 2023
e52f449
Support for `chatml` format conversation (for TinyLlama-1.1B-Chat-v0.…
acalatrava Nov 8, 2023
fbe75e3
Add Rust Support for MLC-LLM (#1213)
YuchenJin Nov 8, 2023
beca2ab
[Bugfix] Remove dependency on openai_api in chat module (#1222)
CharlieFRuan Nov 8, 2023
9ee5705
Bake in RAM Usage in the Generated DSO (#1224)
junrushao Nov 8, 2023
069181c
[Fix] ChatModule python messages and offset types (#1220)
YuchenJin Nov 8, 2023
f1bc951
[Fix] Variable Upperbound Should be Injected before Build Pipeline (#…
junrushao Nov 8, 2023
834811f
[MultiGPU] Support pre-sharded model weights (#1096)
Lunderberg Nov 9, 2023
d41ad34
Merge remote-tracking branch 'mlc-ai/main' into merge-upstream-nov11
masahi Nov 9, 2023
b022dc2
fix
masahi Nov 9, 2023
Files changed
22 changes: 22 additions & 0 deletions cpp/conv_templates.cc
@@ -7,6 +7,27 @@ namespace mlc {
namespace llm {
namespace {

Conversation ChatML() {
Conversation conv;
conv.name = "chatml";
conv.roles = {"<|im_start|>user", "<|im_start|>assistant"};
conv.system =
("<|im_start|>system A conversation between a user and an LLM-based AI assistant. The "
"assistant gives helpful and honest answers.<|im_end|> ");
conv.messages = {};
conv.offset = 0;
conv.separator_style = SeparatorStyle::kSepRoleMsg;
conv.seps = {"<|im_end|>", "<|im_end|>"};
conv.role_msg_sep = "\n";
conv.role_empty_sep = "\n";
// TODO(mlc-team): add eos to mlc-chat-config
// and remove eos from stop token setting.
conv.stop_tokens = {2};
conv.stop_str = "<|im_end|>";
conv.add_bos = true;
return conv;
}

Conversation LlamaDefault() {
Conversation conv;
conv.name = "llama_default";
@@ -583,6 +604,7 @@ using ConvFactory = Conversation (*)();

Conversation Conversation::FromTemplate(const std::string& name) {
static std::unordered_map<std::string, ConvFactory> factory = {
{"chatml", ChatML},
{"llama_default", LlamaDefault},
{"llama-2", Llama2},
{"mistral_default", MistralDefault},
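For orientation (not part of the diff), a rough Python sketch of the prompt string the new `chatml` template assembles for a single user turn, based only on the fields set in `ChatML()` above; the actual concatenation lives in the C++ runtime, so whitespace details may differ:

```python
# Sketch only: mirrors the ChatML() fields above, not the runtime's code path.
system = ("<|im_start|>system A conversation between a user and an LLM-based AI "
          "assistant. The assistant gives helpful and honest answers.<|im_end|> ")
roles = ("<|im_start|>user", "<|im_start|>assistant")
sep = "<|im_end|>"       # seps[0] and seps[1] are identical here
role_msg_sep = "\n"      # separator placed between a role tag and its message

def chatml_prompt(user_msg: str) -> str:
    prompt = system
    prompt += roles[0] + role_msg_sep + user_msg + sep
    prompt += roles[1] + role_msg_sep  # generation continues until <|im_end|> (stop_str)
    return prompt

print(chatml_prompt("Hello, how are you?"))
```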
15 changes: 8 additions & 7 deletions cpp/llm_chat.cc
@@ -409,10 +409,16 @@ class LLMChat {
CHECK(!config.count("max_window_size"))
<< "Cannot specify both sliding_window and max_window_size.";
this->sliding_window_ = config["sliding_window"].get<int64_t>();
CHECK(this->sliding_window_ > 0) << "Sliding window size needs to be positive";
CHECK(config.count("sliding_window_chunk_size"))
<< "Need to specify chunk size if using sliding window attention.";
}
if (config.count("sliding_window_chunk_size")) {
CHECK(config["sliding_window_chunk_size"].is<int64_t>());
this->sliding_window_chunk_size_ = config["sliding_window_chunk_size"].get<int64_t>();
CHECK(this->sliding_window_chunk_size_ > 0)
<< "Sliding window chunk size needs to be positive";
CHECK(config.count("sliding_window")) << "Need to specify sliding window size.";
}
if (config.count("model_name")) {
CHECK(config["model_name"].is<std::string>());
@@ -828,13 +834,8 @@ class LLMChat {
NDArray logits_on_device;
if (this->sliding_window_ != -1) {
// Use chunking if we use sliding window attention (see Mistral paper figure 3).
int64_t sliding_window_chunk_size = this->sliding_window_chunk_size_;
if (this->sliding_window_chunk_size_ == -1) {
// One chunk if chunk size not specified
sliding_window_chunk_size = token_len;
}
for (int64_t begin = 0; begin < token_len; begin += sliding_window_chunk_size) {
int64_t end = std::min(token_len, begin + sliding_window_chunk_size);
for (int64_t begin = 0; begin < token_len; begin += this->sliding_window_chunk_size_) {
int64_t end = std::min(token_len, begin + this->sliding_window_chunk_size_);
std::vector<int32_t> chunk =
std::vector<int32_t>(prompt_tokens.begin() + begin, prompt_tokens.begin() + end);
new_seq_len += static_cast<int64_t>(chunk.size());
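With the config checks added above, `sliding_window_chunk_size_` is guaranteed to be set whenever sliding window attention is used, so the prefill loop no longer needs a fallback chunk size. A minimal, runtime-independent Python sketch of the chunking behaviour:

```python
from typing import Iterator, List

def chunked(prompt_tokens: List[int], sliding_window_chunk_size: int) -> Iterator[List[int]]:
    """Yield consecutive chunks of at most `sliding_window_chunk_size` tokens,
    mirroring the C++ prefill loop above."""
    token_len = len(prompt_tokens)
    for begin in range(0, token_len, sliding_window_chunk_size):
        end = min(token_len, begin + sliding_window_chunk_size)
        yield prompt_tokens[begin:end]

# A 7-token prompt with chunk size 3 is prefilled as [0, 1, 2], [3, 4, 5], [6].
print(list(chunked(list(range(7)), 3)))
```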
2 changes: 1 addition & 1 deletion mlc_llm/core.py
@@ -849,7 +849,7 @@ def build_model_from_args(args: argparse.Namespace):

mod = mod_transform_before_build(mod, param_manager, args, model_config)
if args.num_shards > 1:
# We requires a "create_sharding_info" function for all
# We require a "create_sharding_info" function for all
# multi-GPU models, even if they are using pre-sharded
# weights. When using pre-sharded weights, the list of
# initialization-time transforms to apply is empty.
5 changes: 5 additions & 0 deletions mlc_llm/relax_model/mistral.py
@@ -949,6 +949,9 @@ def get_model(args, hf_config):
sliding_window_chunk_size=args.sliding_window_chunk_size,
)

assert config.sliding_window != -1
assert config.sliding_window_chunk_size != -1

param_manager = ParamManager()
bb = relax.BlockBuilder()

@@ -962,6 +965,8 @@
max_window_size=config.max_sequence_length,
stop_tokens=[2],
add_prefix_space=False,
sliding_window=config.sliding_window,
sliding_window_chunk_size=config.sliding_window_chunk_size,
)

mod = bb.get()
50 changes: 29 additions & 21 deletions python/mlc_chat/chat_module.py
@@ -8,13 +8,15 @@
import warnings
from dataclasses import asdict, dataclass, fields
from enum import Enum
from typing import List, Optional, Tuple, Union
from typing import TYPE_CHECKING, List, Optional, Tuple, Union

import tvm
from tvm.runtime import disco # pylint: disable=unused-import

from .base import _LIB # pylint: disable=unused-import
from .interface.openai_api import ChatMessage
from . import base # pylint: disable=unused-import

if TYPE_CHECKING:
from .interface.openai_api import ChatMessage

# pylint: disable=line-too-long
_PYTHON_GET_STARTED_TUTORIAL_URL = "https://github.com/mlc-ai/notebooks/blob/main/mlc-llm/tutorial_chat_module_getting_started.ipynb"
@@ -41,10 +43,10 @@ class ConvConfig:  # pylint: disable=too-many-instance-attributes
roles : Optional[List[str]]
An array that describes the role names of the user and the model. These
names are specific to the model being used.
messages : Optional[List[str]]
messages : Optional[List[List[str]]]
The chat history represented as an array of string pairs in the following
format: ``[[role_0, msg_0], [role_1, msg_1], ...]``.
offset : Optional[str]
offset : Optional[int]
The offset used to begin the chat from the chat history. When offset
is not ``0``, ``messages[0:offset-1]`` will be encoded.
separator_style : Optional[int]
@@ -69,7 +71,7 @@ class ConvConfig:  # pylint: disable=too-many-instance-attributes
system: Optional[str] = None
roles: Optional[List[str]] = None
messages: Optional[List[List[str]]] = None
offset: Optional[str] = None
offset: Optional[int] = None
separator_style: Optional[int] = None
seps: Optional[List[str]] = None
role_msg_sep: Optional[str] = None
@@ -787,7 +789,7 @@ def __init__(

def generate(
self,
prompt: Union[str, List[ChatMessage]],
prompt: Union[str, List["ChatMessage"]],
generation_config: Optional[GenerationConfig] = None,
progress_callback=None,
) -> Union[str, List[str]]:
@@ -797,14 +799,18 @@ def generate(

Parameters
----------
prompt : Union[str, List[ChatMessage]]
prompt: Union[str, List[ChatMessage]]
The user input prompt, i.e. a question to ask the chat module.
It can also be the whole conversation history (list of messages with role and content)
eg: ```[
ChatMessage(role="user", content="Hello, how are you?"),
ChatMessage(role="assistant", content="I'm fine, thank you. How about you?"),
ChatMessage(role="user", content="I'm good too."),
]```
eg:

.. code::

[
ChatMessage(role="user", content="Hello, how are you?"),
ChatMessage(role="assistant", content="I'm fine, thank you. How about you?"),
ChatMessage(role="user", content="I'm good too."),
]
generation_config: Optional[GenerationConfig]
The generation config object to override the ChatConfig generation settings.
progress_callback: object
@@ -841,8 +847,6 @@ def generate(
if (generation_config is not None) and (generation_config.n is not None):
num_return_sequences = generation_config.n
return_str = False
else:
num_return_sequences = 1

for _ in range(num_return_sequences):
self.reset_chat()
@@ -1001,7 +1005,7 @@ def _unload(self):

def _prefill(
self,
input: Union[str, List[ChatMessage]], # pylint: disable=redefined-builtin
input: Union[str, List["ChatMessage"]], # pylint: disable=redefined-builtin
decode_next_token: bool = True,
place_in_prompt: PlaceInPrompt = PlaceInPrompt.All,
generation_config: Optional[GenerationConfig] = None,
@@ -1014,11 +1018,15 @@ def _prefill(
input : Union[str, List[ChatMessage]]
The user input prompt, i.e. a question to ask the chat module.
It can also be the whole conversation history (list of messages with role and content)
eg: ```[
ChatMessage(role="user", content="Hello, how are you?"),
ChatMessage(role="assistant", content="I'm fine, thank you. How about you?"),
ChatMessage(role="user", content="I'm good too."),
]```
eg:

.. code::

[
ChatMessage(role="user", content="Hello, how are you?"),
ChatMessage(role="assistant", content="I'm fine, thank you. How about you?"),
ChatMessage(role="user", content="I'm good too."),
]
decode_next_token : bool
Whether to decode the next token after prefilling.
place_in_prompt: PlaceInPrompt
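A hedged usage sketch for the `Union[str, List[ChatMessage]]` prompt documented above; the model id is a placeholder and the example assumes prebuilt weights and a model library are already available locally:

```python
from mlc_chat import ChatModule
from mlc_chat.interface.openai_api import ChatMessage

cm = ChatModule(model="Llama-2-7b-chat-hf-q4f16_1")  # placeholder model id

# `generate` accepts either a plain string or a whole conversation history.
history = [
    ChatMessage(role="user", content="Hello, how are you?"),
    ChatMessage(role="assistant", content="I'm fine, thank you. How about you?"),
    ChatMessage(role="user", content="I'm good too."),
]
print(cm.generate(prompt=history))
```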
62 changes: 35 additions & 27 deletions python/mlc_chat/compiler/compile.py
@@ -52,39 +52,46 @@ def _attach_auxiliary_methods(
mod: IRModule,
named_params: List[Tuple[str, nn.Parameter]],
args: CompileArgs,
model_config,
) -> None:
def _metadata():
metadata = {
"quantization": args.quantization.name,
"model_type": args.model.name,
"params": [
{
"name": name,
"shape": list(param.shape),
"dtype": param.dtype,
}
for name, param in named_params
],
}
def _get_memory_usage():
return {str(k): int(v) for k, v in mod.attrs["mlc_llm.memory_usage"].items()}

def _get_param_info():
return [
{
"name": name,
"shape": list(param.shape),
"dtype": param.dtype,
}
for name, param in named_params
]

def _emit_metadata(metadata):
bb = relax.BlockBuilder() # pylint: disable=invalid-name
with bb.function("main", params=[]):
bb.emit_func_output(relax.StringImm(json.dumps(metadata)))
return bb.get()["main"]

def _attach_variable_bounds():
for g_var, func in mod.functions_items():
if isinstance(func, relax.Function):
mod[g_var] = func.with_attr(
"tir_var_upper_bound",
{
"seq_len": model_config.max_sequence_length,
"total_seq_len": model_config.max_sequence_length,
},
)
mod["_metadata"] = _emit_metadata(
metadata={
"quantization": args.quantization.name,
"model_type": args.model.name,
"memory_usage": _get_memory_usage(),
"params": _get_param_info(),
}
)


mod["_metadata"] = _metadata()
_attach_variable_bounds()
def _attach_variable_bounds(mod, model_config):
for g_var, func in mod.functions_items():
if isinstance(func, relax.Function):
mod[g_var] = func.with_attr(
"tir_var_upper_bound",
{
"seq_len": model_config.max_sequence_length,
"total_seq_len": model_config.max_sequence_length,
},
)


def _compile(args: CompileArgs):
@@ -96,10 +103,11 @@ def _compile(args: CompileArgs):
mod, named_params = model.export_tvm(
spec=model.get_default_spec(), # type: ignore
)
_attach_auxiliary_methods(mod, named_params, args, model_config)
logger.info("Running optimizations using TVM Unity")
_attach_variable_bounds(mod, model_config)
with args.target:
mod = relax.get_pipeline("mlc_llm")(mod)
_attach_auxiliary_methods(mod, named_params, args)
logger.info("Generating code using TVM Unity")
args.build_func(mod, args)
logger.info("Generated: %s", bold(str(args.output)))
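With `EstimateMemoryUsage` in the pipeline (see the new pass below), the generated `_metadata` function now also reports per-function memory usage. An illustrative shape of the emitted JSON, with invented values:

```python
import json

# Values are made up for illustration; "memory_usage" maps each Relax function
# name to its statically planned allocation size in bytes.
example_metadata = {
    "quantization": "q4f16_1",
    "model_type": "llama",
    "memory_usage": {"prefill": 269_000_000, "decode": 12_000_000},
    "params": [
        {"name": "model.embed_tokens.weight", "shape": [32000, 4096], "dtype": "uint32"},
    ],
}
print(json.dumps(example_metadata, indent=2))
```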
77 changes: 77 additions & 0 deletions python/mlc_chat/compiler/compiler_pass/estimate_memory_usage.py
@@ -0,0 +1,77 @@
"""Memory usage estimation analysis function for Relax functions."""
from typing import Dict

import tvm
from tvm import relax
from tvm.ir import IRModule, Op
from tvm.relax.expr_functor import PyExprVisitor, visitor


@tvm.transform.module_pass(opt_level=0, name="EstimateMemoryUsage")
class EstimateMemoryUsage: # pylint: disable=too-few-public-methods
"""A pass that attaches the memory usage information as an IRModule attribute.

This pass relies on static analysis on each TVM Relax function in the specific IRModule.
It simply accumulates all memory allocation calls in a function, and does not consider
more dynamic runtime features like control flow ("if") or function calls.
"""

def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassContext) -> IRModule:
"""Entry point of the pass."""
lowered_mod = tvm.transform.Sequential(
[
relax.transform.RewriteDataflowReshape(),
relax.transform.ToNonDataflow(),
relax.transform.RemovePurityChecking(),
relax.transform.CallTIRRewrite(),
relax.transform.StaticPlanBlockMemory(),
],
name="relax.lower",
)(mod)
usage = _MemoryEstimator().run(lowered_mod)
return mod.with_attr("mlc_llm.memory_usage", usage)


@visitor
class _MemoryEstimator(PyExprVisitor):
"""The IR visitor which estimates the memory usage of each Relax function."""

def __init__(self) -> None:
self.planned_alloc_mem = 0
self.planned_mem_num = 0
self._op_alloc_tensor = Op.get("relax.builtin.alloc_tensor")
self._op_alloc_storage = Op.get("relax.memory.alloc_storage")

def run(self, mod: IRModule) -> Dict[str, int]:
"""Entry point of the visitor."""
result: Dict[str, int] = {}
for global_var, func in mod.functions_items():
if isinstance(func, relax.Function):
self.planned_alloc_mem = 0
self.planned_mem_num = 0
self.visit_expr(func)
result[global_var.name_hint] = self.planned_alloc_mem
return result

def visit_call_(self, call: relax.Call) -> None: # pylint: disable=arguments-renamed
if call.op == self._op_alloc_tensor:
self._builtin_tensor_alloc(shape=call.args[0], dtype_str=call.args[1].value)
elif call.op == self._op_alloc_storage:
self._storage_alloc(size=call.args[0])
super().visit_call_(call)

def _builtin_tensor_alloc(self, shape: relax.Expr, dtype_str: str) -> None:
assert isinstance(shape, relax.ShapeExpr)
size = 1
for dim_len in shape.values:
if not isinstance(dim_len, tvm.tir.IntImm):
return
size *= dim_len.value
dtype = tvm.DataType(dtype_str)
self.planned_mem_num += 1
self.planned_alloc_mem += size * ((dtype.bits + 7) // 8) * dtype.lanes

def _storage_alloc(self, size: relax.Expr) -> None:
assert isinstance(size, relax.ShapeExpr)
self.planned_mem_num += 1
self.planned_alloc_mem += size.values[0].value
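A quick worked example of the allocation-size formula used in `_builtin_tensor_alloc`:

```python
# A [1, 4096] float16 tensor: 4096 elements * ((16 + 7) // 8) bytes * 1 lane = 8192 bytes.
bits, lanes = 16, 1
num_elements = 1 * 4096
print(num_elements * ((bits + 7) // 8) * lanes)  # 8192
```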
2 changes: 2 additions & 0 deletions python/mlc_chat/compiler/compiler_pass/pipeline.py
@@ -7,6 +7,7 @@
from tvm.relax import register_pipeline # pylint: disable=no-name-in-module

from .clean_up_tir_attrs import CleanUpTIRAttrs
from .estimate_memory_usage import EstimateMemoryUsage
from .fuse_dequantize_matmul_ewise import FuseDequantizeMatmulEwise
from .fuse_dequantize_take import FuseDequantizeTake
from .fuse_dequantize_transpose import FuseDequantizeTranspose
@@ -64,6 +65,7 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I
_LogProgress("Running memory optimizations"),
LiftTIRGlobalBufferAlloc(),
tvm.tir.transform.ForceNarrowIndexToInt32(),
EstimateMemoryUsage(),
]
)
mod = seq(mod._move()) # pylint: disable=protected-access