Confidence of yolo11.kmodel deployed on the K230D is only 0.05


Problem Description


I trained yolo11.pt on the official 4-class printed-digit recognition dataset, exported it in PyCharm with model.export(format="onnx") to get yolo11.onnx, and then converted that to yolo11.kmodel with the to_kmodel script from test_yolo11.zip. The inference code I run is the official 4-class printed-digit video inference demo. The highest confidence it prints is only 0.05. If I change compile_options.input_range=[0,1] in to_kmodel.py to [0,255], the confidence stays above 1 the whole time; even pointing the camera at the ceiling gives 1, and although boxes are drawn, none of them lands on a real target. Testing the model with test_det_onnx and test_det_kmodel both gives empty results, and I am out of ideas. How can I solve this? (Screenshots attached: "confidence at 1" and "extremely low confidence".)
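
For context, here is a minimal numpy sketch of what the compiled-in preprocessing does to a uint8 input, based on my reading of nncase's preprocess options (dequantize to input_range, then normalize with mean/std); the helper name preprocess_sketch is made up for illustration:

import numpy as np

def preprocess_sketch(x_uint8, input_range, mean, std):
    # x_uint8: CHW uint8 image; mean/std have one value per channel
    lo, hi = input_range
    x = x_uint8.astype(np.float32) / 255.0 * (hi - lo) + lo  # dequantize to input_range
    m = np.array(mean, dtype=np.float32).reshape(-1, 1, 1)
    s = np.array(std, dtype=np.float32).reshape(-1, 1, 1)
    return (x - m) / s  # then normalize per channel

# With input_range=[0,1], mean=[0,0,0], std=[1,1,1], pixel value 255 maps to 1.0,
# matching the /255 normalization Ultralytics applies at training time.
# With input_range=[0,255] the same pixel maps to 255.0, so every input is about
# 255x larger than anything the network saw in training and the detections are
# meaningless, even though boxes still come out.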

Steps to Reproduce


import os
import argparse
import numpy as np
from PIL import Image
import onnxsim
import onnx
import nncase
import shutil
import math

def parse_model_input_output(model_file,input_shape):
    onnx_model = onnx.load(model_file)
    input_all = [node.name for node in onnx_model.graph.input]
    input_initializer = [node.name for node in onnx_model.graph.initializer]
    input_names = list(set(input_all) - set(input_initializer))
    input_tensors = [
        node for node in onnx_model.graph.input if node.name in input_names]

    # input
    inputs = []
    for _, e in enumerate(input_tensors):
        onnx_type = e.type.tensor_type
        input_dict = {}
        input_dict['name'] = e.name
        input_dict['dtype'] = onnx.helper.tensor_dtype_to_np_dtype(onnx_type.elem_type)
        input_dict['shape'] = [(i.dim_value if i.dim_value != 0 else d) for i, d in zip(
            onnx_type.shape.dim, input_shape)]
        inputs.append(input_dict)

    return onnx_model, inputs


def onnx_simplify(model_file, dump_dir,input_shape):
    onnx_model, inputs = parse_model_input_output(model_file,input_shape)
    onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
    input_shapes = {}
    for input in inputs:
        input_shapes[input['name']] = input['shape']

    onnx_model, check = onnxsim.simplify(onnx_model, overwrite_input_shapes=input_shapes)
    assert check, "Simplified ONNX model could not be validated"

    model_file = os.path.join(dump_dir, 'simplified.onnx')
    onnx.save_model(onnx_model, model_file)
    return model_file


def read_model_file(model_file):
    with open(model_file, 'rb') as f:
        model_content = f.read()
    return model_content

def generate_data_random(shape, batch):
    data = []
    for i in range(batch):
        data.append([np.random.randint(0, 256, shape).astype(np.uint8)])
    return data


def generate_data(shape, batch, calib_dir):
    img_paths = [os.path.join(calib_dir, p) for p in os.listdir(calib_dir)]
    data = []
    for i in range(batch):
        assert i < len(img_paths), "calibration images not enough."
        img_data = Image.open(img_paths[i]).convert('RGB')
        img_data = img_data.resize((shape[3], shape[2]), Image.BILINEAR)
        img_data = np.asarray(img_data, dtype=np.uint8)
        img_data = np.transpose(img_data, (2, 0, 1))
        data.append([img_data[np.newaxis, ...]])
    return np.array(data)

def main():
    parser = argparse.ArgumentParser(prog="nncase")
    parser.add_argument("--target", default="k230",type=str, help='target to run,k230/cpu')
    parser.add_argument("--model",default="best.onnx",type=str, help='model file')
    parser.add_argument("--dataset", type=str, default="test",help='calibration_dataset')
    parser.add_argument("--input_width", type=int, default=640, help='input_width')
    parser.add_argument("--input_height", type=int, default=640, help='input_height')
    parser.add_argument("--ptq_option", type=int, default=0, help='ptq_option:0,1,2,3,4,5')

    args = parser.parse_args()

    # Round the input dimensions up to multiples of 32
    input_width = int(math.ceil(args.input_width / 32.0)) * 32
    input_height = int(math.ceil(args.input_height / 32.0)) * 32

    # Model input shape; the dimension order must match input_layout
    input_shape=[1,3,input_height,input_width]

    dump_dir = 'tmp'
    if not os.path.exists(dump_dir):
        os.makedirs(dump_dir)

    # onnx simplify
    model_file = onnx_simplify(args.model, dump_dir,input_shape)

    # compile_options
    compile_options = nncase.CompileOptions()
    compile_options.target = args.target
    # preprocess
    # Whether to fold preprocessing into the model
    compile_options.preprocess = True
    compile_options.swapRB = False
    # Shape of the input image
    compile_options.input_shape = input_shape
    # Model input type: 'uint8' or 'float32'
    compile_options.input_type = 'uint8'

    # If the input is 'uint8', the value range after dequantization
    compile_options.input_range = [0, 1]

    # Per-channel mean/std values for preprocessing
    compile_options.mean = [0, 0, 0]
    compile_options.std = [1, 1, 1]

    # Input layout; for ONNX the default 'NCHW' is fine
    compile_options.input_layout = "NCHW"
    # compile_options.output_layout = "NCHW"

    # compile_options.dump_ir = True
    # compile_options.dump_asm = True
    # compile_options.dump_dir = dump_dir
    compile_options.quant_type = 'uint8'

    # compiler
    compiler = nncase.Compiler(compile_options)

    # import
    model_content = read_model_file(model_file)
    import_options = nncase.ImportOptions()
    compiler.import_onnx(model_content, import_options)

    # ptq_options
    ptq_options = nncase.PTQTensorOptions()
    ptq_options.samples_count = 20

    if args.ptq_option == 0:
        ptq_options.calibrate_method = 'NoClip'
        ptq_options.w_quant_type = 'uint8'
        ptq_options.quant_type = 'uint8'
    elif args.ptq_option == 1:
        ptq_options.calibrate_method = 'NoClip'
        ptq_options.w_quant_type = 'int16'
        ptq_options.quant_type = 'uint8'
    elif args.ptq_option == 2:
        ptq_options.calibrate_method = 'NoClip'
        ptq_options.w_quant_type = 'uint8'
        ptq_options.quant_type = 'int16'
    elif args.ptq_option == 3:
        ptq_options.calibrate_method = 'Kld'
        ptq_options.w_quant_type = 'uint8'
        ptq_options.quant_type = 'uint8'
    elif args.ptq_option == 4:
        ptq_options.calibrate_method = 'Kld'
        ptq_options.w_quant_type = 'int16'
        ptq_options.quant_type = 'uint8'
    elif args.ptq_option == 5:
        ptq_options.calibrate_method = 'Kld'
        ptq_options.w_quant_type = 'uint8'
        ptq_options.quant_type = 'int16'
    else:
        pass


    #ptq_options.set_tensor_data(generate_data_random(input_shape, ptq_options.samples_count))
    ptq_options.set_tensor_data(generate_data(input_shape, ptq_options.samples_count, args.dataset))
    compiler.use_ptq(ptq_options)

    # compile
    compiler.compile()

    # kmodel
    kmodel = compiler.gencode_tobytes()
    base,ext=os.path.splitext(args.model)
    kmodel_name=base+".kmodel"
    with open(kmodel_name, 'wb') as f:
        f.write(kmodel)

    if os.path.exists("./tmp"):
        shutil.rmtree("./tmp")
    if os.path.exists("./gmodel_dump_dir"):
        shutil.rmtree("./gmodel_dump_dir")

if __name__ == '__main__':
    main()
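
For reference, an example invocation of the script above (argument names taken from its argparse definitions; the model path and calibration directory are placeholders to adjust). Note that --input_width/--input_height must match the model_input_size used by the inference script below, and the calibration images should come from the actual deployment scene:

python to_kmodel.py --target k230 --model yolo11.onnx --dataset test --input_width 640 --input_height 480 --ptq_option 0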

Hardware Board


ALIENTEK (正点原子) K230D

Software Version


CanMV_K230D_ATK_DNK230D_micropython_v1.4-0-g6cce59c_nncase_v2.9.0.img.gz

Other Information


import os,sys
from media.sensor import *
from media.display import *
from media.media import *
import nncase_runtime as nn
import ulab.numpy as np
import time,image,random,gc
from libs.Utils import *

#----------------------------- Other helper methods ---------------------------------------
# Non-maximum suppression (NMS) for multi-object detection
def nms(boxes,scores,thresh):
    """Pure Python NMS baseline."""
    x1,y1,x2,y2 = boxes[:, 0],boxes[:, 1],boxes[:, 2],boxes[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = np.argsort(scores,axis = 0)[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        new_x1,new_y1,new_x2,new_y2,new_areas = [],[],[],[],[]
        for order_i in order:
            new_x1.append(x1[order_i])
            new_x2.append(x2[order_i])
            new_y1.append(y1[order_i])
            new_y2.append(y2[order_i])
            new_areas.append(areas[order_i])
        new_x1 = np.array(new_x1)
        new_x2 = np.array(new_x2)
        new_y1 = np.array(new_y1)
        new_y2 = np.array(new_y2)
        xx1 = np.maximum(x1[i], new_x1)
        yy1 = np.maximum(y1[i], new_y1)
        xx2 = np.minimum(x2[i], new_x2)
        yy2 = np.minimum(y2[i], new_y2)
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        new_areas = np.array(new_areas)
        ovr = inter / (areas[i] + new_areas - inter)
        new_order = []
        for ovr_i,ind in enumerate(ovr):
            if ind < thresh:
                new_order.append(order[ovr_i])
        order = np.array(new_order,dtype=np.uint16)  # uint16: uint8 would overflow with more than 255 boxes
    return keep
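
# Quick sanity check with made-up numbers: for
#   boxes  = np.array([[0,0,10,10],[1,1,11,11],[20,20,30,30]])
#   scores = np.array([0.9,0.8,0.7])
# nms(boxes, scores, 0.3) keeps indices [0, 2]: the second box overlaps the
# first with IoU ~= 0.70 > 0.3 and is suppressed; the third does not overlap.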

# Compute the letterbox scale ratio and the top/bottom/left/right padding sizes
def letterbox_pad_param(input_size,output_size):
    ratio_w = output_size[0] / input_size[0]  # width scale ratio
    ratio_h = output_size[1] / input_size[1]  # height scale ratio
    ratio = min(ratio_w, ratio_h)  # take the smaller ratio
    new_w = int(ratio * input_size[0])  # new width
    new_h = int(ratio * input_size[1])  # new height
    dw = (output_size[0] - new_w) / 2  # width difference
    dh = (output_size[1] - new_h) / 2  # height difference
    top = int(round(0))
    bottom = int(round(dh * 2 + 0.1))
    left = int(round(0))
    right = int(round(dw * 2 - 0.1))
    return top, bottom, left, right,ratio
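
# Worked example with this post's numbers: input 1280x960 and output 640x480
# give ratio_w = ratio_h = 0.5, so ratio = 0.5, new_w = 640, new_h = 480 and
# dw = dh = 0 -- the aspect ratios match, so no padding is actually added.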


#----------------------------- Sensor/Display initialization -------------------------------

# Display resolution
DISPLAY_WIDTH = ALIGN_UP(640, 16)
DISPLAY_HEIGHT = 480

# Resolution of the frame used for AI inference
AI_RGB888P_WIDTH = ALIGN_UP(1280, 16)
AI_RGB888P_HEIGHT = 960

sensor = Sensor(id=2)
sensor.reset()
# Horizontal mirror and vertical flip; orientation differs between boards, adjust these two settings to make the image upright
#sensor.set_hmirror(False)
#sensor.set_vflip(False)

# Configure multi-channel sensor output; each channel can use its own format and resolution, up to three channels in total, see the Sensor API docs
# Channel 0 goes straight to the display (VO), format YUV420
sensor.set_framesize(width = DISPLAY_WIDTH, height = DISPLAY_HEIGHT,chn=CAM_CHN_ID_0)
sensor.set_pixformat(Sensor.YUV420SP,chn=CAM_CHN_ID_0)
# Channel 1 goes to the AI pipeline, format RGB888P
sensor.set_framesize(width = AI_RGB888P_WIDTH , height = AI_RGB888P_HEIGHT, chn=CAM_CHN_ID_1)
sensor.set_pixformat(Sensor.RGBP888, chn=CAM_CHN_ID_1)

# Bind the channel-0 camera image to the screen, so that slow AI inference on the other channel cannot stall the display and cause stutter
sensor_bind_info = sensor.bind_info(x = 0, y = 0, chn = CAM_CHN_ID_0)
Display.bind_layer(**sensor_bind_info, layer = Display.LAYER_VIDEO1)

# Initialize the OSD image: a transparent frame at display resolution used to draw the AI inference results
osd_img = image.Image(DISPLAY_WIDTH, DISPLAY_HEIGHT, image.ARGB8888)

## Use LT9611 display output, default 1920x1080
#Display.init(Display.LT9611,width=DISPLAY_WIDTH,height=DISPLAY_HEIGHT,osd_num=1, to_ide = True)
# For an ST7701 LCD panel, default 800*480; 640*480 and others are also supported, see the Display module API docs
Display.init(Display.ST7701, width=DISPLAY_WIDTH,height=DISPLAY_HEIGHT,osd_num=1, to_ide=True)

# Limit the frame rate of the bound channel so the producer does not outrun the display
sensor._set_chn_fps(chn = CAM_CHN_ID_0, fps = Display.fps())


#----------------------------- AI model initialization -------------------------------
# Path to the kmodel
kmodel_path="/data/best(17).kmodel"
# Class labels
labels = ["0","1","2","3"]
# Model input resolution
model_input_size=[640,480]
# Other parameters: confidence/NMS thresholds, maximum number of boxes, etc.
confidence_threshold = 0.7
nms_threshold = 0.3
max_boxes_num = 10
# Box colors, one per class
colors=get_colors(len(labels))

# Initialize ai2d preprocessing and configure padding+resize: the input resolution is the camera frame resolution and the output resolution is what the model requires, implementing the image->preprocess->model pipeline
ai2d=nn.ai2d()
# Configure the input/output data types and formats of the ai2d module
ai2d.set_dtype(nn.ai2d_format.NCHW_FMT, nn.ai2d_format.NCHW_FMT, np.uint8, np.uint8)
# Set padding parameters: top/bottom/left/right sizes and the per-channel padding values
top,bottom,left,right,ratio=letterbox_pad_param([AI_RGB888P_WIDTH,AI_RGB888P_HEIGHT],model_input_size)
ai2d.set_pad_param(True,[0,0,0,0,top,bottom,left,right], 0, [128,128,128])
# Set resize parameters and the interpolation method
ai2d.set_resize_param(True,nn.interp_method.tf_bilinear, nn.interp_mode.half_pixel)
# Set the ai2d input/output shapes and build the builder instance
ai2d_builder = ai2d.build([1,3,AI_RGB888P_HEIGHT,AI_RGB888P_WIDTH], [1,3,model_input_size[1],model_input_size[0]])
# Initialize a tensor shared between the ai2d output and the kpu input, since the ai2d output is normally fed straight to the kpu
input_init_data = np.ones((1,3,model_input_size[1],model_input_size[0]),dtype=np.uint8)
kpu_input_tensor = nn.from_numpy(input_init_data)


# Create the kpu instance
kpu=nn.kpu()
# Load the kmodel
kpu.load_kmodel(kmodel_path)

# Initialize media
MediaManager.init()
# Start the sensor
sensor.run()
# FPS counter
fps = time.clock()
while True:
    fps.tick()
    #------------------------ Grab a frame from the camera ----------------------------------
    # Dump one RGB888P-format Image from camera channel 1
    img=sensor.snapshot(chn=CAM_CHN_ID_1)
    # Convert to an ulab.numpy.ndarray, CHW layout
    img_np=img.to_numpy_ref()
    # Create an nncase_runtime tensor to feed ai2d for preprocessing
    ai2d_input_tensor=nn.from_numpy(img_np)
    #------------------------ Pre-inference preprocessing ----------------------------------------
    # Run the preprocessing
    ai2d_builder.run(ai2d_input_tensor, kpu_input_tensor)
    #------------------------ Run model inference on the kpu --------------------------------------
    # Set the kpu's input 0 to the preprocessed tensor; if there are several inputs, set each in turn
    kpu.set_input_tensor(0,kpu_input_tensor)
    # Execute inference on the kpu
    kpu.run()
    #------------------------ Fetch the inference outputs ----------------------------------------
    # Get the output tensors and convert them to ulab.numpy.ndarray for post-processing
    results=[]
    for i in range(kpu.outputs_size()):
        output_i_tensor = kpu.get_output_tensor(i)
        result_i = output_i_tensor.to_numpy()
        results.append(result_i)
        del output_i_tensor
    #------------------------ Post-process the inference outputs ----------------------------------------
    # The YOLO11 detection model (same head layout as YOLOv8) has a single output: results[0] has shape [1,box_dim,box_num]; results[0][0] is [box_dim,box_num], transposed to [box_num,box_dim] so each box can be processed in turn
    output_data=results[0][0].transpose()
    # The first four values of each box are the center coordinates plus width and height
    boxes_ori = output_data[:,0:4]
    # The remaining values are the per-class scores; argmax gives the highest-scoring class index and its score
    class_ori = output_data[:,4:]
    class_res=np.argmax(class_ori,axis=-1)
    scores_ = np.max(class_ori,axis=-1)
    # Filter boxes by the confidence threshold (anything below it is dropped) and convert to x1,y1,x2,y2 top-left/bottom-right corner coordinates; note the scale conversion from model input coordinates (model_input_size) back to original frame coordinates (AI_RGB888P_WIDTH, AI_RGB888P_HEIGHT)
    boxes,inds,scores=[],[],[]
    for i in range(len(boxes_ori)):
        if scores_[i]>confidence_threshold:
            x,y,w,h=boxes_ori[i][0],boxes_ori[i][1],boxes_ori[i][2],boxes_ori[i][3]
            x1 = int((x - 0.5 * w)/ratio)
            y1 = int((y - 0.5 * h)/ratio)
            x2 = int((x + 0.5 * w)/ratio)
            y2 = int((y + 0.5 * h)/ratio)
            boxes.append([x1,y1,x2,y2])
            inds.append(class_res[i])
            scores.append(scores_[i])
    osd_img.clear()
    # If any boxes survive the confidence filtering, run NMS and draw the results
    if len(boxes)!=0:
        # Convert the lists to ulab.numpy.ndarray for easier handling
        boxes = np.array(boxes)
        scores = np.array(scores)
        inds = np.array(inds)
        # NMS removes redundant overlapping boxes; keep is the list of surviving indices
        keep = nms(boxes,scores,nms_threshold)
        dets = np.concatenate((boxes, scores.reshape((len(boxes),1)), inds.reshape((len(boxes),1))), axis=1)
        # Assemble the final detection results
        det_res = []
        for keep_i in keep:
            det_res.append(dets[keep_i])
        det_res = np.array(det_res)
        # Keep only the first max_boxes_num boxes to avoid drawing too many detections
        det_res = det_res[:max_boxes_num, :]
        #------------------------ Draw the detection boxes ----------------------------------------
        osd_img.clear()
        # For each box, convert original frame coordinates (AI_RGB888P_WIDTH, AI_RGB888P_HEIGHT) to display coordinates (DISPLAY_WIDTH, DISPLAY_HEIGHT)
        for det in det_res:
            x_1, y_1, x_2, y_2 = map(lambda pos: int(round(pos, 0)), det[:4])
            draw_x= int(x_1 * DISPLAY_WIDTH // AI_RGB888P_WIDTH)
            draw_y= int(y_1 * DISPLAY_HEIGHT // AI_RGB888P_HEIGHT)
            draw_w = int((x_2 - x_1) * DISPLAY_WIDTH // AI_RGB888P_WIDTH)
            draw_h = int((y_2 - y_1) * DISPLAY_HEIGHT // AI_RGB888P_HEIGHT)
            osd_img.draw_rectangle(draw_x,draw_y, draw_w, draw_h, color=colors[int(det[5])],thickness=2)
            osd_img.draw_string_advanced(draw_x, max(0,draw_y-50), 24, "class: "+labels[int(det[5])] + " score: {0:.3f}".format(det[4]), color=colors[int(det[5])])
    #------------------------ Show the detection results on the display ----------------------------------------
    Display.show_image(osd_img)
    print("det fps:",fps.fps())
    gc.collect()

# Release resources after the loop exits
del ai2d
del kpu
sensor.stop()
Display.deinit()
time.sleep_ms(50)
MediaManager.deinit()
nn.shrink_memory_pool()

1 Answer

Why did you change the conversion parameters? The parameters as given are already adapted for YOLO11.
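
One way to convince yourself that the exported ONNX expects [0,1] inputs is to run it on the PC with onnxruntime (a sketch; the input name "images" is the usual Ultralytics default and may differ, and the 1x3x640x640 shape should match your export):

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("yolo11.onnx")
name = sess.get_inputs()[0].name              # usually "images" for Ultralytics exports
x = np.random.rand(1, 3, 640, 640).astype(np.float32)  # values already in [0,1]
out = sess.run(None, {name: x})[0]
print(out.shape)  # [1, 4+num_classes, num_boxes]; the class scores stay within [0,1]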