Contents

1. Installing TensorRT
   1.1 CUDA/cuDNN and creating a virtual environment
   1.2 Installing the TensorRT version that matches your CUDA version
2. Model conversion
   2.1 pth to onnx
   2.2 onnx to engine
3. TensorRT deployment
   TensorRT inference (Python API)
   TensorRT inference (C++ API)
   Potential issues

1. Installing TensorRT

1.1 CUDA/cuDNN and creating a virtual environment

CUDA download: https://developer.nvidia.com/cuda-toolkit-archive
cuDNN download: https://docs.nvidia.com/deeplearning/cudnn/latest/installation/windows.html

1.2 Installing the TensorRT version that matches your CUDA version

TensorRT download: https://developer.nvidia.com/tensorrt/download
See also the TensorRT installation guide. After downloading, extract the archive and add its lib folder to your environment variables (PATH).

2. Model conversion

2.1 pth to onnx

Install the onnx Python module with pip install onnx, then export the PyTorch model:

```python
import torch

# model (the network being exported) and x (a dummy input tensor) are assumed to be defined
input_name = 'input'
output_name = 'output'
torch.onnx.export(model,                        # model being run
                  x,                            # model input
                  "model.onnx",                 # where to save the model (can be a file or file-like object)
                  opset_version=11,             # the ONNX version to export the model to
                  input_names=[input_name],     # the model's input names
                  output_names=[output_name],   # the model's output names
                  dynamic_axes={input_name: {0: 'batch_size', 2: 'in_width', 3: 'in_height'},
                                output_name: {0: 'batch_size', 2: 'out_width', 3: 'out_height'}})
```
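Before handing the exported file to TensorRT, it can be worth checking that the ONNX graph itself is well formed. A minimal sketch, assuming the file was saved as model.onnx as above:

```python
import onnx

# Load the exported file and run ONNX's structural checks; this raises if the graph is invalid
onnx_model = onnx.load("model.onnx")
onnx.checker.check_model(onnx_model)
# Optionally print a human-readable summary of the graph
print(onnx.helper.printable_graph(onnx_model.graph))
```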
2.2 onnx to engine

Note: TensorRT's ONNX parser is built against specific ONNX/opset versions; if the version your PyTorch export produces does not match, the model conversion may fail.

Method 1: the command-line tool, i.e. the trtexec executable in TensorRT's bin folder.

```bash
trtexec.exe --onnx=model.onnx --saveEngine=model.engine --workspace=6000
```

```bash
# Build an engine with a static batch size
./trtexec --onnx=<onnx_file> \            # the ONNX model file
    --explicitBatch \                     # build the engine with an explicit batch size (default = implicit)
    --saveEngine=<engine_file> \          # output engine
    --workspace=<size_in_MB> \            # workspace size in MB (default = 16 MB)
    --fp16                                # enable fp16 precision in addition to fp32 (default = disabled)

# Build an engine with a dynamic batch size
./trtexec --onnx=<onnx_file> \            # the ONNX model file
    --minShapes=input:<min_NCHW> \        # minimum input dimensions (NCHW)
    --optShapes=input:<opt_NCHW> \        # optimal input dimensions; the same value as maxShapes works fine
    --maxShapes=input:<max_NCHW> \        # maximum input dimensions
    --workspace=<size_in_MB> \            # workspace size in MB (default = 16 MB)
    --saveEngine=<engine_file> \          # output engine
    --fp16                                # enable fp16 precision in addition to fp32 (default = disabled)

# Small images can afford a larger batch size, e.g. 8x3x416x416
/home/zxl/TensorRT-7.2.3.4/bin/trtexec --onnx=yolov4_-1_3_416_416_dynamic.onnx \
    --minShapes=input:1x3x416x416 \
    --optShapes=input:8x3x416x416 \
    --maxShapes=input:8x3x416x416 \
    --workspace=4096 \
    --saveEngine=yolov4_-1_3_416_416_dynamic_b8_fp16.engine \
    --fp16

# Changed to 4x3x608x608 because memory ran out
/home/zxl/TensorRT-7.2.3.4/bin/trtexec --onnx=yolov4_-1_3_608_608_dynamic.onnx \
    --minShapes=input:1x3x608x608 \
    --optShapes=input:4x3x608x608 \
    --maxShapes=input:4x3x608x608 \
    --workspace=4096 \
    --saveEngine=yolov4_-1_3_608_608_dynamic_b4_fp16.engine \
    --fp16
```

You can also run trtexec.exe --help to see what each command-line parameter means; the most commonly used options from its output are:

```text
D:\Work\cuda_gpu\sdk\TensorRT-8.5.1.7\bin>trtexec.exe --help
&&&& RUNNING TensorRT.trtexec [TensorRT v8501] # trtexec.exe --help
=== Model Options ===
  --uff=<file>                UFF model
  --onnx=<file>               ONNX model
  --model=<file>              Caffe model (default = no model, random weights used)
=== Build Options ===
  --minShapes=spec            Build with dynamic shapes using a profile with the min shapes provided
  --optShapes=spec            Build with dynamic shapes using a profile with the opt shapes provided
  --maxShapes=spec            Build with dynamic shapes using a profile with the max shapes provided
  --workspace=N               Set workspace size in MiB
  --memPoolSize=poolspec      Specify the size constraints of the designated memory pool(s) in MiB
  --fp16                      Enable fp16 precision, in addition to fp32 (default = disabled)
  --int8                      Enable int8 precision, in addition to fp32 (default = disabled)
  --best                      Enable all precisions to achieve the best performance (default = disabled)
  --calib=<file>              Read INT8 calibration cache file
  --saveEngine=<file>         Save the serialized engine
  --loadEngine=<file>         Load a serialized engine
=== Inference Options ===
  --batch=N                   Set batch size for implicit batch engines (default = 1)
  --shapes=spec               Set input shapes for dynamic shapes inference inputs
  --loadInputs=spec           Load input values from files (default = generate random inputs)
  --iterations=N              Run at least N inference iterations (default = 10)
  --warmUp=N                  Run for N milliseconds to warm up before measuring performance (default = 200)
  --duration=N                Run performance measurements for at least N seconds wallclock time (default = 3)
  --streams=N                 Instantiate N engines to use concurrently (default = 1)
  --useCudaGraph              Use CUDA graph to capture engine execution and then launch inference (default = disabled)
=== Reporting Options ===
  --verbose                   Use verbose logging (default = false)
  --avgRuns=N                 Report performance measurements averaged over N consecutive iterations (default = 10)
  --exportOutput=<file>       Write the output tensors to a json file (default = disabled)
  --exportProfile=<file>      Write the profile information per layer in a json file (default = disabled)
=== System Options ===
  --device=N                  Select cuda device N (default = 0)
  --useDLACore=N              Select DLA core N for layers that support DLA (default = none)
  --plugins                   Plugin library (.so) to load (can be specified multiple times)
=== Help ===
  --help, -h                  Print this message
  ...
```
Method 2: convert to a TensorRT engine with the TensorRT API.

```python
import tensorrt as trt

def generate_engine(onnx_path, engine_path):
    # 1. Create the TensorRT logger
    logger = trt.Logger(trt.Logger.WARNING)
    # Initialize plugins
    trt.init_libnvinfer_plugins(logger, namespace="")
    # 2. Create a builder from the logger
    builder = trt.Builder(logger)
    # 3. Create a builder config, which tells TensorRT how to optimize the model
    config = builder.create_builder_config()
    # Set the workspace memory limit
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 20)  # 1 MiB
    # Set the precision
    config.set_flag(trt.BuilderFlag.FP16)
    # INT8 would additionally require calibration
    # 4. Create a network. EXPLICIT_BATCH: the batch dimension is explicit (dynamic)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    # Create the ONNX parser
    parser = trt.OnnxParser(network, logger)
    # Parse the ONNX model and populate the network
    success = parser.parse_from_file(onnx_path)
    # Handle parsing errors
    for idx in range(parser.num_errors):
        print(parser.get_error(idx))
    if not success:
        pass  # error handling code here
    # 5. Serialize the engine, i.e. produce the trt engine model
    serialized_engine = builder.build_serialized_network(network, config)
    # Save the serialized engine for later use. The engine is not portable:
    # it depends on the TensorRT version and the GPU type
    with open(engine_path, "wb") as f:
        f.write(serialized_engine)
    # 6. Deserializing the engine is done through the runtime API,
    # i.e. loading the engine model for inference:
    # runtime = trt.Runtime(logger)
    # engine = runtime.deserialize_cuda_engine(serialized_engine)
    # with open("sample.engine", "rb") as f:
    #     serialized_engine = f.read()
```
After completing the steps above, you obtain a model file converted to TensorRT format (model.trt). This file can then be used for TensorRT inference and deployment.
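As a quick usage sketch of the function above (the file names are placeholders):

```python
if __name__ == "__main__":
    # Placeholder paths; point these at your own ONNX model and desired engine output
    generate_engine("model.onnx", "model.trt")
```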
3. TensorRT Deployment
TensorRT deployment can be done through either the Python API or the C++ API.
TensorRT inference (Python API)
Once the TensorRT environment is installed, you can try converting pretrained weights and wrapping them for deployment by running the following code:
```python
import numpy as np
import torch
import tensorrt as trt
from collections import OrderedDict, namedtuple

def infer(img_data, engine_path):
    # 1. Logger
    logger = trt.Logger(trt.Logger.INFO)
    # 2. Runtime: load the serialized TensorRT engine
    runtime = trt.Runtime(logger)
    trt.init_libnvinfer_plugins(logger, '')  # initialize TensorRT plugins
    with open(engine_path, "rb") as f:
        serialized_engine = f.read()
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    # 3. Bind inputs and outputs
    device = torch.device('cuda:0')  # the binding buffers live on the GPU
    bindings = OrderedDict()
    Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
    fp16 = False
    for index in range(engine.num_bindings):
        name = engine.get_binding_name(index)
        dtype = trt.nptype(engine.get_binding_dtype(index))
        shape = tuple(engine.get_binding_shape(index))
        data = torch.from_numpy(np.empty(shape, dtype=np.dtype(dtype))).to(device)
        # Tensor.data_ptr() is the address of the tensor's first element, as an int
        bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr()))
        if engine.binding_is_input(index) and dtype == np.float16:
            fp16 = True
    # Record the device pointer of every binding
    binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
    # 4. Bind the input data and run inference; results are written into the output bindings
    context = engine.create_execution_context()
    binding_addrs['images'] = int(img_data.data_ptr())
    context.execute_v2(list(binding_addrs.values()))
    # 5. Fetch the results (names match those set when exporting the ONNX model)
    nums = bindings['num'].data[0]
    boxes = bindings['boxes'].data[0]
    scores = bindings['scores'].data[0]
    classes = bindings['classes'].data[0]
    return nums, boxes, scores, classes
```
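A rough usage sketch for the function above. The image path, input size and preprocessing are placeholders, and the binding names ('images', 'num', 'boxes', 'scores', 'classes') follow the example, so adjust them to your own model; if the engine's input binding is fp16, the tensor would also need a .half() cast.

```python
import cv2
import numpy as np
import torch

# Hypothetical preprocessing: load an image, resize to the network input size,
# convert BGR -> RGB and HWC -> CHW, scale to [0, 1], add a batch dim, move to GPU
img = cv2.imread("test.jpg")                      # placeholder image path
img = cv2.resize(img, (640, 640))                 # placeholder input size
img = img[:, :, ::-1].transpose(2, 0, 1)
img = np.ascontiguousarray(img, dtype=np.float32) / 255.0
img_data = torch.from_numpy(img).unsqueeze(0).cuda()

nums, boxes, scores, classes = infer(img_data, "model.trt")
print(nums, boxes.shape, scores.shape, classes.shape)
```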
TensorRT inference (C++ API)
In the project's Properties, go to Linker > Input and add the following to Additional Dependencies:
```
cudnn.lib
cublas.lib
cudart.lib
nvinfer.lib
nvparsers.lib
nvonnxparser.lib
nvinfer_plugin.lib
opencv_world460d.lib
```
The full inference code:
```cpp
#include <iostream>
#include <fstream>
#include <memory>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime_api.h>
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "logger.h"

using sample::gLogError;
using sample::gLogInfo;
using namespace nvinfer1;

// The logger controls which log messages get printed
// TRTLogger inherits from nvinfer1::ILogger
class TRTLogger : public nvinfer1::ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        // Suppress INFO-level messages
        if (severity != Severity::kINFO)
            std::cout << msg << std::endl;
    }
} gLogger;

int ReadEngineData(char* enginePath, char*& engineData)
{
    // Read the engine file
    std::ifstream engineFile(enginePath, std::ios::binary);
    if (engineFile.fail())
    {
        std::cerr << "Failed to open file!" << std::endl;
        return -1;
    }
    engineFile.seekg(0, std::ifstream::end);
    auto fsize = engineFile.tellg();
    engineFile.seekg(0, std::ifstream::beg);
    if (nullptr == engineData)
    {
        engineData = new char[fsize];
    }
    engineFile.read(engineData, fsize);
    engineFile.close();
    return fsize;
}

size_t getMemorySize(nvinfer1::Dims32 input_dims, int typeSize)
{
    size_t psize = input_dims.d[0] * input_dims.d[1] * input_dims.d[2] * input_dims.d[3] * typeSize;
    return psize;
}
bool inferDemo(float* input_buffer, int* tensorSize)
{
    int batchsize = tensorSize[0];
    int channel = tensorSize[1];
    int width = tensorSize[2];
    int height = tensorSize[3];
    size_t dataSize = width * height * channel * batchsize;
    // Read the engine file
    char enginePath[] = "net_model.engine";
    char* engineData = nullptr;
    int fsize = ReadEngineData(enginePath, engineData);
    printf("fsize=%d\n", fsize);
    // Create the runtime & load the engine
    // TRTLogger glogger;  // could be used instead of sample::gLogger.getTRTLogger()
    std::unique_ptr<nvinfer1::IRuntime> runtime{ nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger()) };
    std::unique_ptr<nvinfer1::ICudaEngine> mEngine(runtime->deserializeCudaEngine(engineData, fsize, nullptr));
    assert(mEngine.get() != nullptr);
    // Create the execution context
    std::unique_ptr<nvinfer1::IExecutionContext> context(mEngine->createExecutionContext());
    const char* name0 = mEngine->getBindingName(0);
    const char* name1 = mEngine->getBindingName(1);
    const char* name2 = mEngine->getBindingName(2);
    const char* name3 = mEngine->getBindingName(3);
    printf("name0=%s\nname1=%s\nname2=%s\nname3=%s\n", name0, name1, name2, name3);
    // Get the input size
    auto input_idx = mEngine->getBindingIndex("input");
    if (input_idx == -1)
    {
        return false;
    }
    assert(mEngine->getBindingDataType(input_idx) == nvinfer1::DataType::kFLOAT);
    auto input_dims = context->getBindingDimensions(input_idx);
    context->setBindingDimensions(input_idx, input_dims);
    auto input_size = getMemorySize(input_dims, sizeof(float_t));
    // Get the output sizes; every output needs its own buffer
    auto output1_idx = mEngine->getBindingIndex("output1");
    if (output1_idx == -1)
    {
        return false;
    }
    assert(mEngine->getBindingDataType(output1_idx) == nvinfer1::DataType::kFLOAT);
    auto output1_dims = context->getBindingDimensions(output1_idx);
    auto output1_size = getMemorySize(output1_dims, sizeof(float_t));
    auto output2_idx = mEngine->getBindingIndex("output2");
    if (output2_idx == -1)
    {
        return false;
    }
    assert(mEngine->getBindingDataType(output2_idx) == nvinfer1::DataType::kFLOAT);
    auto output2_dims = context->getBindingDimensions(output2_idx);
    auto output2_size = getMemorySize(output2_dims, sizeof(float_t));
    auto output3_idx = mEngine->getBindingIndex("output3");
    if (output3_idx == -1)
    {
        return false;
    }
    assert(mEngine->getBindingDataType(output3_idx) == nvinfer1::DataType::kFLOAT);
    auto output3_dims = context->getBindingDimensions(output3_idx);
    auto output3_size = getMemorySize(output3_dims, sizeof(float_t));
    // Prepare for inference: allocate CUDA memory
    void* input_mem{ nullptr };
    if (cudaMalloc(&input_mem, input_size) != cudaSuccess)
    {
        gLogError << "ERROR: input cuda memory allocation failed, size = " << input_size << " bytes" << std::endl;
        return false;
    }
    void* output1_mem{ nullptr };
    if (cudaMalloc(&output1_mem, output1_size) != cudaSuccess)
    {
        gLogError << "ERROR: output cuda memory allocation failed, size = " << output1_size << " bytes" << std::endl;
        return false;
    }
    void* output2_mem{ nullptr };
    if (cudaMalloc(&output2_mem, output2_size) != cudaSuccess)
    {
        gLogError << "ERROR: output cuda memory allocation failed, size = " << output2_size << " bytes" << std::endl;
        return false;
    }
    void* output3_mem{ nullptr };
    if (cudaMalloc(&output3_mem, output3_size) != cudaSuccess)
    {
        gLogError << "ERROR: output cuda memory allocation failed, size = " << output3_size << " bytes" << std::endl;
        return false;
    }
    // Copy the input data to the device (cudaMemcpyHostToDevice: host -> device, i.e. RAM -> GPU memory)
    cudaMemcpy(input_mem, input_buffer, input_size, cudaMemcpyHostToDevice);
    // Bind input and output memory and pass them to inference together
    void* bindings[4];
    bindings[input_idx] = input_mem;
    bindings[output1_idx] = output1_mem;
    bindings[output2_idx] = output2_mem;
    bindings[output3_idx] = output3_mem;
    // Run inference
    bool status = context->executeV2(bindings);
    if (!status)
    {
        gLogError << "ERROR: inference failed" << std::endl;
        cudaFree(input_mem);
        cudaFree(output1_mem);
        cudaFree(output2_mem);
        cudaFree(output3_mem);
        return 0;
    }
    // Fetch the results
    float* output3_buffer = new float[dataSize];
    cudaMemcpy(output3_buffer, output3_mem, output3_size, cudaMemcpyDeviceToHost);
    // Free CUDA memory
    cudaFree(input_mem);
    cudaFree(output1_mem);
    cudaFree(output2_mem);
    cudaFree(output3_mem);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        gLogError << "ERROR: failed to free CUDA memory: " << cudaGetErrorString(err) << std::endl;
        return false;
    }
    // save the results
    delete[] output3_buffer;
    output3_buffer = nullptr;
    return true;
}

int main()
{
    int batchsize = 1;
    int channel = 3;
    int width = 256;
    int height = 256;
    size_t dataSize = width * height * channel * batchsize;
    int tensorSize[4] = { batchsize, channel, width, height };
    float* input_buffer = new float[dataSize];
    for (int i = 0; i < dataSize; i++)
        input_buffer[i] = 0.1f;
    inferDemo(input_buffer, tensorSize);
    delete[] input_buffer;
    input_buffer = nullptr;
    system("pause");
    return 0;
}
```
Potential issues

- cuDNN errors during TRT conversion: install the matching cuDNN version, and also copy the DLLs from TensorRT's lib folder into CUDA's bin folder.
- Missing zlibwapi.dll during TRT conversion: "Could not locate zlibwapi.dll. Please make sure it is in your library path!" Solutions found online include downloading it from the NVIDIA website (though it seems to have been taken down since 2023) or building it from the zlib source: https://github.com/madler/zlib. I solved it by searching my machine for an existing zlibwapi.dll (found under the installed PyTorch path, DingTalk and Origin) and copying it into CUDA's bin folder.
- Size errors during TensorRT inference: if the model was exported with dynamic sizes, you need to set the input size yourself, as sketched below.
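A minimal sketch of what "setting the size yourself" looks like with the Python API; the engine path and the 1x3x416x416 shape are placeholders, and binding 0 is assumed to be the input:

```python
import tensorrt as trt

logger = trt.Logger(trt.Logger.INFO)
with open("model.trt", "rb") as f:                 # placeholder engine path
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())

context = engine.create_execution_context()
# For an engine built with dynamic (-1) dimensions, a concrete input shape must be
# chosen before allocating buffers and running inference
context.set_binding_shape(0, (1, 3, 416, 416))     # binding 0 assumed to be the input
assert context.all_binding_shapes_specified
# context.get_binding_shape(i) now returns concrete shapes for the outputs as well
```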