使用 microTVM Autotuning#

原作者:

本教程解释如何使用 C 运行时自动调优模型。

import os
import json
import numpy as np
import pathlib

import tvm
from tvm.relay.backend import Runtime

use_physical_hw = bool(os.getenv("TVM_MICRO_USE_HW"))

定义模型#

首先,在 Relay 中定义要在设备上执行的模型。然后从 Relay 模型创建 IRModule,并用随机数填充参数。

data_shape = (1, 3, 10, 10)
weight_shape = (6, 3, 5, 5)

data = tvm.relay.var("data", tvm.relay.TensorType(data_shape, "float32"))
weight = tvm.relay.var("weight", tvm.relay.TensorType(weight_shape, "float32"))

y = tvm.relay.nn.conv2d(
    data,
    weight,
    padding=(2, 2),
    kernel_size=(5, 5),
    kernel_layout="OIHW",
    out_dtype="float32",
)
f = tvm.relay.Function([data, weight], y)

relay_mod = tvm.IRModule.from_expr(f)
relay_mod = tvm.relay.transform.InferType()(relay_mod)

weight_sample = np.random.rand(
    weight_shape[0], weight_shape[1], weight_shape[2], weight_shape[3]
).astype("float32")
params = {"weight": weight_sample}

定义目标#

现在我们定义描述执行环境的 TVM 目标。这看起来与其他 microTVM 教程中的目标定义非常相似。与此同时,选择 C 运行时来代码生成我们的模型。

在物理硬件上运行时,选择描述该硬件的 target 和 board。本教程中可以从 PLATFORM 列表中选择多个硬件目标。在运行本教程时,您可以通过传递 –platform 参数来选择平台。

RUNTIME = Runtime("crt", {"system-lib": True})
TARGET = tvm.target.target.micro("host")

# Compiling for physical hardware
# --------------------------------------------------------------------------
#  When running on physical hardware, choose a TARGET and a BOARD that describe the hardware. The
#  STM32L4R5ZI Nucleo target and board is chosen in the example below.
if use_physical_hw:
    boards_file = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) / "boards.json"
    with open(boards_file) as f:
        boards = json.load(f)

    BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi")
    TARGET = tvm.target.target.micro(boards[BOARD]["model"])

提取优化任务#

不是所有的算子在上面打印的 Relay 程序可以调谐。有些非常简单,只定义了单个实现;其他任务作为调优任务没有意义。使用 extract_from_program,可以生成可调任务列表。

因为任务提取涉及到运行编译器,所以首先需要配置编译器的 transformation passes;将在稍后的自动调优期间应用相同的配置。

pass_context = tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True})
with pass_context:
    tasks = tvm.autotvm.task.extract_from_program(relay_mod["main"], {}, TARGET)
assert len(tasks) > 0

配置 microTVM#

在进行自动调优之前,需要定义模块加载器,并将其传递给 tvm.autotvm.LocalBuilder。然后创建 tvm.autotvm.LocalRunner,并使用构建器和运行器为自动调谐器生成多个度量值。

在本教程中,可以选择使用 x86 主机作为示例,或者使用来自 Zephyr RTOS 的不同目标。如果您选择 pass --platform=host 到本教程,它将使用 x86。您可以从 PLATFORM 列表中选择其他选项。

import tvm.micro
module_loader = tvm.micro.AutoTvmModuleLoader(
    template_project_dir=pathlib.Path(tvm.micro.get_microtvm_template_projects("crt")),
    project_options={"verbose": False},
)
builder = tvm.autotvm.LocalBuilder(
    n_parallel=1,
    build_kwargs={"build_option": {"tir.disable_vectorize": True}},
    do_fork=True,
    build_func=tvm.micro.autotvm_build_func,
    runtime=RUNTIME,
)
runner = tvm.autotvm.LocalRunner(number=1, repeat=1, timeout=100, module_loader=module_loader)

measure_option = tvm.autotvm.measure_option(builder=builder, runner=runner)

# Compiling for physical hardware
if use_physical_hw:
    module_loader = tvm.micro.AutoTvmModuleLoader(
        template_project_dir=pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")),
        project_options={
            "zephyr_board": BOARD,
            "west_cmd": "west",
            "verbose": False,
            "project_type": "host_driven",
        },
    )
    builder = tvm.autotvm.LocalBuilder(
        n_parallel=1,
        build_kwargs={"build_option": {"tir.disable_vectorize": True}},
        do_fork=False,
        build_func=tvm.micro.autotvm_build_func,
        runtime=RUNTIME,
    )
    runner = tvm.autotvm.LocalRunner(number=1, repeat=1, timeout=100, module_loader=module_loader)

    measure_option = tvm.autotvm.measure_option(builder=builder, runner=runner)

运行 Autotuning#

现在可以在 microTVM 设备上分别对每个提取任务进行自动调优。

autotune_log_file = pathlib.Path("build/microtvm_autotune.log.txt")
if os.path.exists(autotune_log_file):
    os.remove(autotune_log_file)

num_trials = 10
for task in tasks:
    tuner = tvm.autotvm.tuner.GATuner(task)
    tuner.tune(
        n_trial=num_trials,
        measure_option=measure_option,
        callbacks=[
            tvm.autotvm.callback.log_to_file(str(autotune_log_file)),
            tvm.autotvm.callback.progress_bar(num_trials, si_prefix="M"),
        ],
        si_prefix="M",
    )

为未调优的程序计时#

为了进行比较,让我们在不施加任何自动调优调度的情况下编译和运行 graph。TVM 将为每个算子随机选择调优的实现,它的性能应该不如调优后的算子。

with pass_context:
    lowered = tvm.relay.build(relay_mod, target=TARGET, runtime=RUNTIME, params=params)

temp_dir = tvm.contrib.utils.tempdir()
project = tvm.micro.generate_project(
    str(tvm.micro.get_microtvm_template_projects("crt")),
    lowered,
    temp_dir / "project",
    {"verbose": False},
)

# Compiling for physical hardware
if use_physical_hw:
    temp_dir = tvm.contrib.utils.tempdir()
    project = tvm.micro.generate_project(
        str(tvm.micro.get_microtvm_template_projects("zephyr")),
        lowered,
        temp_dir / "project",
        {
            "zephyr_board": BOARD,
            "west_cmd": "west",
            "verbose": False,
            "project_type": "host_driven",
        },
    )

project.build()
project.flash()
with tvm.micro.Session(project.transport()) as session:
    debug_module = tvm.micro.create_local_debug_executor(
        lowered.get_graph_json(), session.get_system_lib(), session.device
    )
    debug_module.set_input(**lowered.get_params())
    print("########## Build without Autotuning ##########")
    debug_module.run()
    del debug_module

为调优后的程序计时#

一旦自动调优完成,您可以使用调试运行时对整个程序的执行进行计时:

with tvm.autotvm.apply_history_best(str(autotune_log_file)):
    with pass_context:
        lowered_tuned = tvm.relay.build(relay_mod, target=TARGET, runtime=RUNTIME, params=params)

temp_dir = tvm.contrib.utils.tempdir()
project = tvm.micro.generate_project(
    str(tvm.micro.get_microtvm_template_projects("crt")),
    lowered_tuned,
    temp_dir / "project",
    {"verbose": False},
)

# Compiling for physical hardware
if use_physical_hw:
    temp_dir = tvm.contrib.utils.tempdir()
    project = tvm.micro.generate_project(
        str(tvm.micro.get_microtvm_template_projects("zephyr")),
        lowered_tuned,
        temp_dir / "project",
        {
            "zephyr_board": BOARD,
            "west_cmd": "west",
            "verbose": False,
            "project_type": "host_driven",
        },
    )

project.build()
project.flash()
with tvm.micro.Session(project.transport()) as session:
    debug_module = tvm.micro.create_local_debug_executor(
        lowered_tuned.get_graph_json(), session.get_system_lib(), session.device
    )
    debug_module.set_input(**lowered_tuned.get_params())
    print("########## Build with Autotuning ##########")
    debug_module.run()
    del debug_module