Problem description
I am implementing ViT-based bimodal emotion detection using video and audio. After training I get a .pth checkpoint, export it to ONNX, and simplify it with onnx-simplifier, but converting the simplified ONNX model to a kmodel fails. Code is attached below.
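For context, the ONNX export followed the standard torch.onnx.export pattern for a two-input model. A minimal sketch (the input names, dummy shapes, and opset here are reconstructed assumptions, not copied from my actual export script):

import torch

model.eval()  # 'model' stands in for the trained fusion network
dummy_img = torch.randn(1, 3, 224, 224)
dummy_aud = torch.randn(1, 1, 128, 128)
torch.onnx.export(
    model,
    (dummy_img, dummy_aud),            # positional inputs: image first, audio second
    "fusion_model.onnx",
    input_names=["image", "audio"],    # hypothetical names
    output_names=["logits"],
    opset_version=13,
)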
export_kmodel.py code
import os
import nncase
import numpy as np
import torch
import onnx
from onnxsim import simplify  # the simplifier
from torch.utils.data import DataLoader
from dataset import RavdessDataset

# ================= Basic configuration =================
RAW_ONNX_FILE = "fusion_model.onnx"      # originally exported model
SIM_ONNX_FILE = "fusion_model_sim.onnx"  # simplified model (generated below)
KMODEL_FILE = "fusion_model.kmodel"      # final output
DATASET_DIR = r"D:\Learning\Python_learning\dataset\P-Ravdess"
CALIB_COUNT = 50
# ========================================================

def simplify_model(input_path, output_path):
    """Optimize the model structure with onnx-simplifier."""
    print("[*] Running ONNX Simplifier...")
    print(f"    Input: {input_path}")
    # 1. Load the model
    model = onnx.load(input_path)
    # 2. Simplify; check_n=3 validates the simplified model against the
    #    original on 3 sets of random inputs
    model_sim, check = simplify(model, check_n=3)
    if not check:
        print("❌ Warning: ONNX Sim validation failed! The simplified model may have lost accuracy.")
        # Abort here rather than continuing with a possibly broken model
        raise RuntimeError("ONNX Simplifier validation failed.")
    # 3. Save the simplified model
    onnx.save(model_sim, output_path)
    print(f"✅ Simplification succeeded! Saved to: {output_path}")
    return True

def get_calib_data():
    print("[*] Loading calibration data...")
    dataset = RavdessDataset(DATASET_DIR)
    loader = DataLoader(dataset, batch_size=1, shuffle=True)
    calib_data = []
    for i, (img, aud, _) in enumerate(loader):
        if i >= CALIB_COUNT:
            break
        img_np = img.numpy().astype(np.float32)
        aud_np = aud.numpy().astype(np.float32)
        # Default assumption: the list order matches the model's input order
        calib_data.append([img_np, aud_np])
    return calib_data

def main():
    print("------------------------------------------------")
    print("   NNCase Standard Converter (with ONNX-Sim)")
    print("------------------------------------------------")
    if not os.path.exists(RAW_ONNX_FILE):
        print(f"❌ Error: cannot find the original file {RAW_ONNX_FILE}")
        return

    # ==========================================
    # Step 1: run ONNX Simplifier
    # ==========================================
    try:
        simplify_model(RAW_ONNX_FILE, SIM_ONNX_FILE)
    except Exception as e:
        print(f"❌ Model simplification failed: {e}")
        return

    # ==========================================
    # Step 2: NNCase compilation pipeline
    # ==========================================
    # 1. Compile options
    compile_options = nncase.CompileOptions()
    compile_options.target = "k230"
    compile_options.input_type = "float32"
    compile_options.input_layout = "NCHW"
    compile_options.dump_ir = False
    compiler = nncase.Compiler(compile_options)

    # 2. Import the **simplified** model
    print(f"[*] Importing simplified model: {SIM_ONNX_FILE}")
    with open(SIM_ONNX_FILE, 'rb') as f:
        compiler.import_onnx(f.read(), nncase.ImportOptions())

    # 3. Quantization configuration
    print("[*] Configuring quantization (PTQ)...")
    ptq_options = nncase.PTQTensorOptions()
    ptq_options.w_quant_type = "int8"
    ptq_options.i_quant_type = "int8"
    ptq_options.calibrate_method = "Kld"

    # 4. Calibration data
    calib_data = get_calib_data()
    ptq_options.set_tensor_data(calib_data)
    ptq_options.samples_count = len(calib_data)
    compiler.use_ptq(ptq_options)

    # 5. Compile
    print("[*] Compiling...")
    try:
        compiler.compile()
        kmodel = compiler.gencode_tobytes()
        with open(KMODEL_FILE, 'wb') as f:
            f.write(kmodel)
        print(f"\n✅ Conversion succeeded! File saved: {KMODEL_FILE}")
    except Exception as e:
        print(f"\n❌ Compilation error: {e}")

if __name__ == "__main__":
    main()
The model takes two inputs, an image and an audio spectrogram, with shapes:
Image: [1, 3, 224, 224]
Audio: [1, 1, 128, 128]
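One way to double-check the input order actually recorded in the graph (rather than the order I assume) is to print the graph inputs with the onnx package:

import onnx

m = onnx.load("fusion_model_sim.onnx")
for inp in m.graph.input:
    dims = [d.dim_value for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)  # prints each declared input name and shape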
The conversion nevertheless fails with a shape mismatch at runtime; terminal output:
warn: Nncase.Hosting.PluginLoader[0]
NNCASE_PLUGIN_PATH is not set.
------------------------------------------------
NNCase Standard Converter (with ONNX-Sim)
------------------------------------------------
[*] Running ONNX Simplifier...
    Input: fusion_model.onnx
Checking 0/3...
Checking 1/3...
Checking 2/3...
✅ Simplification succeeded! Saved to: fusion_model_sim.onnx
[*] Importing simplified model: fusion_model_sim.onnx
[*] Configuring quantization (PTQ)...
[*] Loading calibration data...
[train] Dataset loaded: 1440 samples
[*] Compiling...
Where it fails:
Unhandled exception. System.AggregateException: One or more errors occurred. (Feed Value Is Invalid, need f32[1,1,128,128] but get f32[1,3,224,224]!)
---> System.InvalidOperationException: Feed Value Is Invalid, need f32[1,1,128,128] but get f32[1,3,224,224]!
at Nncase.Quantization.CalibrationEvaluator.<>c__DisplayClass11_0.<Visit>b__0()
at Nncase.Quantization.CalibrationEvaluator.VisitLeaf(ENode enode, Func`1 valueGetter)
at Nncase.Quantization.CalibrationEvaluator.Visit(ENode enode, Var var)
at Nncase.Quantization.CalibrationEvaluator.Visit(ENode enode)
at Nncase.Quantization.CalibrationEvaluator.Visit(EClass eclass)
at Nncase.Quantization.CalibrationEvaluator.Visit(ENode enode, Func`2 valueGetter)
dataset.py code
import os
import random
import glob
from PIL import Image
import torch
from torch.utils.data import Dataset
import torchaudio
import torchvision.transforms as transforms
import numpy as np

class RavdessDataset(Dataset):
    def __init__(self, root_dir, phase='train', target_sample_rate=16000, target_len=128):
        """
        root_dir: dataset root directory
        phase: 'train' or 'val'
        target_sample_rate: target sample rate (default 16000)
        target_len: length of the audio time axis (default 128)
        """
        self.root_dir = root_dir
        self.phase = phase
        self.audio_root = os.path.join(root_dir, 'Audio')
        self.visual_root = os.path.join(root_dir, 'Visual')
        self.target_sample_rate = target_sample_rate
        self.target_len = target_len

        # Verify the expected directory layout exists
        if not os.path.exists(self.audio_root) or not os.path.exists(self.visual_root):
            raise ValueError(f"Dataset path error! Please check {self.audio_root} and {self.visual_root}")

        self.classes = sorted(os.listdir(self.audio_root))
        self.class_to_idx = {cls_name: i for i, cls_name in enumerate(self.classes)}
        self.samples = []

        # Collect samples
        for cls_name in self.classes:
            cls_folder = os.path.join(self.audio_root, cls_name)
            if not os.path.isdir(cls_folder):
                continue
            audio_files = glob.glob(os.path.join(cls_folder, "*.wav"))
            for audio_path in audio_files:
                # Pair audio with its frame folder via the file name
                filename = os.path.basename(audio_path)
                video_name_no_ext = filename.replace(".wav", "")
                visual_folder = os.path.join(self.visual_root, cls_name, video_name_no_ext)
                # Only keep samples where both audio and frames exist
                if os.path.exists(visual_folder) and len(glob.glob(os.path.join(visual_folder, "*.jpg"))) > 0:
                    self.samples.append((audio_path, visual_folder, self.class_to_idx[cls_name]))

        print(f"[{phase}] Dataset loaded: {len(self.samples)} samples")

        # === Visual preprocessing ===
        self.visual_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            # ImageNet normalization
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        # === Audio feature extractor ===
        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=self.target_sample_rate,
            n_mels=128,  # frequency axis (height)
            n_fft=1024,
            hop_length=512
        )
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        audio_path, visual_folder, label = self.samples[idx]

        # -----------------------------------------------------------
        # 1. Visual processing -> [3, 224, 224]
        # -----------------------------------------------------------
        frames = glob.glob(os.path.join(visual_folder, "*.jpg"))
        # Fallback: if there are no frames, use a black image
        if len(frames) == 0:
            image = torch.zeros((3, 224, 224), dtype=torch.float32)
        else:
            # Random frame during training, middle frame otherwise
            if self.phase == 'train':
                selected_frame = random.choice(frames)
            else:
                selected_frame = frames[len(frames) // 2]
            try:
                img_pil = Image.open(selected_frame).convert('RGB')
                image = self.visual_transform(img_pil)
            except Exception as e:
                print(f"Image read error {selected_frame}: {e}")
                image = torch.zeros((3, 224, 224), dtype=torch.float32)

        # -----------------------------------------------------------
        # 2. Audio processing -> [1, 128, 128]
        # -----------------------------------------------------------
        try:
            waveform, sr = torchaudio.load(audio_path)
            # 🔥 Fix 1: resample to the target sample rate
            if sr != self.target_sample_rate:
                resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=self.target_sample_rate)
                waveform = resampler(waveform)
            # 🔥 Fix 2: mix down to mono; stereo [2, Time] -> [1, Time]
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            # Mel spectrogram -> [1, 128, Time]
            spec = self.mel_spectrogram(waveform)
            spec = self.amplitude_to_db(spec)
            # 🔥 Fix 3: pad or truncate to a fixed time length
            current_len = spec.shape[2]
            if current_len < self.target_len:
                # Zero-pad
                padding = self.target_len - current_len
                spec = torch.nn.functional.pad(spec, (0, padding))
            else:
                # Truncate
                spec = spec[:, :, :self.target_len]
            # Ensure 3 dims [1, 128, 128]; MelSpectrogram normally keeps
            # the channel dim, but just in case:
            if spec.dim() == 2:
                spec = spec.unsqueeze(0)
        except Exception as e:
            print(f"Audio processing error {audio_path}: {e}")
            spec = torch.zeros((1, 128, self.target_len), dtype=torch.float32)

        return image, spec, label

# Simple self-test
if __name__ == "__main__":
    # Change to your actual path
    root = r"D:\Learning\Python_learning\dataset\P-Ravdess"
    ds = RavdessDataset(root)
    img, aud, lbl = ds[0]
    print("\n✅ Dataset self-check passed!")
    print(f"   Image Shape: {img.shape} (Expect: [3, 224, 224])")
    print(f"   Audio Shape: {aud.shape} (Expect: [1, 128, 128])")
    # Verify mono audio
    if aud.shape[0] != 1:
        print("❌ Warning: audio is not mono!")
I then tried reversing the per-sample order, feeding audio first and image second (i.e. calib_data.append([aud_np, img_np])), but it still fails, just with the shapes swapped:
need f32[1,3,224,224] but get f32[1,1,128,128]!
I'm stuck and can't tell where the problem is. Does nncase impose a strict convention on calibration data for multi-input models?
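For completeness, the only other layout I can think of is input-major grouping: one list per graph input holding all samples, instead of one [img, aud] pair per sample. A sketch of what I mean; I don't know whether set_tensor_data in nncase 2.8 actually expects this:

def get_calib_data_input_major():
    # Hypothetical alternative layout: group calibration tensors by
    # input rather than by sample. Whether nncase wants this is
    # exactly what I am unsure about.
    dataset = RavdessDataset(DATASET_DIR)
    loader = DataLoader(dataset, batch_size=1, shuffle=True)
    imgs, auds = [], []
    for i, (img, aud, _) in enumerate(loader):
        if i >= CALIB_COUNT:
            break
        imgs.append(img.numpy().astype(np.float32))
        auds.append(aud.numpy().astype(np.float32))
    # one inner list per model input, each with CALIB_COUNT tensors
    return [imgs, auds]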
Hardware board
Yahboom K230 vision recognition module
Software version
nncase_v2.8.0_onnx_v1.14.0_onnx-simplifier_v0.4.33
Other information
The model itself is verified to work and its accuracy is high; I am only stuck at the kmodel conversion step. I previously converted a single-modality image-input model successfully in this same environment, so the toolchain setup should not be the problem.