
Commit 1f40b92

srkreddy1238 and echuraev authored Jan 24, 2023
[TOOL][NATIVE] Android native application for deploy and run (#13791)
This application serves as a reference for verifying and integrating TVM compiled models on Android targets natively, independent of any RPC setup. tvmc is used for compiling, tuning, and for a test run before deployment. This PR also covers:

* Enabling clml for the tvmc compilation tool.
* Graph runtime API "get_output_info" to return the output tensor specification, similar to "get_input_info".
* Adding and enabling the 3rdparty dependency "cnpy" to deal with npz files.

Co-authored-by: Egor Churaev <[email protected]>
1 parent 1d89071 commit 1f40b92

File tree

19 files changed: +1248 −3 lines
 

‎.gitmodules

Lines changed: 3 additions & 0 deletions

@@ -19,3 +19,6 @@
 [submodule "3rdparty/OpenCL-Headers"]
 	path = 3rdparty/OpenCL-Headers
 	url = https://github.com/KhronosGroup/OpenCL-Headers.git
+[submodule "3rdparty/cnpy"]
+	path = 3rdparty/cnpy
+	url = https://github.com/rogersce/cnpy.git

‎3rdparty/cnpy

Submodule cnpy added at 4e8810b

‎CMakeLists.txt

Lines changed: 4 additions & 0 deletions

@@ -593,6 +593,10 @@ if(USE_CPP_RPC)
   add_subdirectory("apps/cpp_rpc")
 endif()
 
+if(USE_CPP_RTVM)
+  add_subdirectory("apps/cpp_rtvm")
+endif()
+
 if(USE_IOS_RPC)
   add_subdirectory("apps/ios_rpc")
 endif()

‎LICENSE

Lines changed: 1 addition & 0 deletions

@@ -234,6 +234,7 @@ MIT License
 3rdparty/libcrc
 3rdparty/cma
 3rdparty/compiler-rt/builtin_fp16.h
+3rdparty/cnpy
 
 The Unlicense
 -------------

‎apps/cpp_rtvm/CMakeLists.txt

Lines changed: 98 additions & 0 deletions

cmake_policy(SET CMP0069 NEW) # suppress cmake warning about IPO

set(RTVM_SOURCES
  main.cc
  tvm_runner.cc
  ../../3rdparty/cnpy/cnpy.cpp
)
set(TVM_RUNNER_SOURCES
  tvm_runner.cc
  ../../3rdparty/cnpy/cnpy.cpp
)

set(RTVM_LINKER_LIBS "")

if(WIN32)
  list(APPEND RTVM_SOURCES win32_process.cc)
  list(APPEND TVM_RUNNER_SOURCES win32_process.cc)
endif()

# Set output to same directory as the other TVM libs
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
add_executable(rtvm ${RTVM_SOURCES})
add_library(tvm_runner_objs OBJECT ${TVM_RUNNER_SOURCES})
add_library(tvm_runner SHARED $<TARGET_OBJECTS:tvm_runner_objs>)

include(CheckIPOSupported)
check_ipo_supported(RESULT result OUTPUT output)
if(result)
  set_property(TARGET rtvm PROPERTY INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE)
endif()

if(WIN32)
  target_compile_definitions(rtvm PUBLIC -DNOMINMAX)
endif()

if (OS)
  if (OS STREQUAL "Linux")
    set_property(TARGET rtvm PROPERTY LINK_FLAGS -lpthread)
    set_property(TARGET tvm_runner PROPERTY LINK_FLAGS -lpthread)
  endif()
endif()

if(USE_OPENCL)
  if (ANDROID_ABI)
    if(DEFINED ENV{ANDROID_NDK_MAJOR})
      if($ENV{ANDROID_NDK_MAJOR} VERSION_LESS "23")
        set_property(TARGET rtvm PROPERTY LINK_FLAGS -fuse-ld=gold)
        set_property(TARGET tvm_runner PROPERTY LINK_FLAGS -fuse-ld=gold)
      endif()
    endif()
  endif()
endif()

target_include_directories(
  rtvm
  PUBLIC "../../include"
  PUBLIC "../../3rdparty/cnpy"
  PUBLIC DLPACK_PATH
  PUBLIC DMLC_PATH
)

if (BUILD_FOR_ANDROID AND USE_HEXAGON)
  get_hexagon_sdk_property("${USE_HEXAGON_SDK}" "${USE_HEXAGON_ARCH}"
    DSPRPC_LIB DSPRPC_LIB_DIRS
  )
  if(DSPRPC_LIB_DIRS)
    link_directories(${DSPRPC_LIB_DIRS})
  else()
    message(WARNING "Could not locate some Hexagon SDK components")
  endif()
  list(APPEND RTVM_LINKER_LIBS cdsprpc log)
endif()

if(USE_ETHOSN)
  if (ETHOSN_RUNTIME_LIBRARY)
    list(APPEND RTVM_LINKER_LIBS ${ETHOSN_RUNTIME_LIBRARY})
  else()
    message(WARNING "Could not locate Arm(R) Ethos(TM)-N runtime library components")
  endif()
endif()

if(BUILD_STATIC_RUNTIME)
  list(APPEND RTVM_LINKER_LIBS -Wl,--whole-archive tvm_runtime -Wl,--no-whole-archive z)
else()
  list(APPEND RTVM_LINKER_LIBS tvm_runtime z)
endif()

target_link_libraries(rtvm ${RTVM_LINKER_LIBS})

# Build tvm_runner as an exportable lib
target_include_directories(
  tvm_runner_objs
  PUBLIC "../../include"
  PUBLIC "../../3rdparty/cnpy"
  PUBLIC DLPACK_PATH
  PUBLIC DMLC_PATH
)
target_link_libraries(tvm_runner ${RTVM_LINKER_LIBS})

‎apps/cpp_rtvm/README.md

Lines changed: 354 additions & 0 deletions

<!--- Licensed to the Apache Software Foundation (ASF) under one -->
<!--- or more contributor license agreements. See the NOTICE file -->
<!--- distributed with this work for additional information -->
<!--- regarding copyright ownership. The ASF licenses this file -->
<!--- to you under the Apache License, Version 2.0 (the -->
<!--- "License"); you may not use this file except in compliance -->
<!--- with the License. You may obtain a copy of the License at -->

<!--- http://www.apache.org/licenses/LICENSE-2.0 -->

<!--- Unless required by applicable law or agreed to in writing, -->
<!--- software distributed under the License is distributed on an -->
<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
<!--- KIND, either express or implied. See the License for the -->
<!--- specific language governing permissions and limitations -->
<!--- under the License. -->


# Native Inference Application for CPP Native

The native inference tool ```rtvm``` helps in deploying TVM compiled models from a standalone C++ environment.
The overall process starts from getting a model from a framework and goes all the way up to running it on a target device using the `rtvm` tool.

### Models

Models can be obtained from well known frameworks like TensorFlow, PyTorch, TFLite, ONNX, etc.
scripts/download_models.py is a reference script that prepares the sample network ```resnet50``` from the Keras framework.

```bash
python3 scripts/download_models.py
```

### Auto Tuning

The auto tuning process tunes the various operators in the given model for the respective target. Auto tuning for remote devices uses ```tvm_rpc```, so we need to set up the RPC environment before we invoke tuning.
Please refer to the section [RPC setup](#rpc-setup) below for the same.

Auto tuning is necessary to obtain the best performing kernels. We can skip this step if a tuning log is already available or if the tuning cache is available from tophub (implicit in the TVM compilation process).
The message below indicates that some kernels are not optimized for the selected target; in this case we can proceed with tuning to get the best performance.

```One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.```

With the environment below from [RPC setup](#rpc-setup)

``` bash
tvm tracker running on ```TVM_TRACKER_HOST```
tracker port being ```TVM_TRACKER_PORT```
rpc device access key being ```TVM_RPC_KEY```
the model to be tuned being ```./model_data/keras-resnet50/resnet50.h5```
```

the command below generates the tuning cache to the file ```./model_data/keras-resnet50/keras-resnet50.log```

```bash
python3 -m tvm.driver.tvmc tune --target="opencl" --target-host="llvm -mtriple=aarch64-linux-gnu" \
./model_data/keras-resnet50/resnet50.h5 -o ./model_data/keras-resnet50/keras-resnet50.log \
--early-stopping 0 --repeat 30 --rpc-key ${TVM_RPC_KEY} --rpc-tracker ${TVM_TRACKER_HOST}:${TVM_TRACKER_PORT} --trials 1024 \
--tuning-records ./model_data/keras-resnet50/keras-resnet50-records.log --tuner xgb
```

where
```bash
--target="opencl" refers to the opencl device on the Android device
--target-host="llvm -mtriple=aarch64-linux-gnu" refers to target_host being an ARM64 CPU
Options --early-stopping, --repeat, --trials, --tuner are AutoTVM specific options.
```
Please refer to the AutoTVM documentation for more details [here](https://tvm.apache.org/docs/how_to/tune_with_autotvm/index.html?highlight=autotvm).

### Compile the model

The compilation step generates the TVM compiler output artifacts, which need to be taken to the target device for deployment.
These artifacts form a compressed archive with the kernel shared lib, a json graph description and the params binary.

The command below generates the same:

```bash
python3 -m tvm.driver.tvmc compile --cross-compiler ${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang \
--target="opencl, llvm" --target-llvm-mtriple aarch64-linux-gnu -o keras-resnet50.tar ./model_data/keras-resnet50/resnet50.h5
```

where
```
--cross-compiler : Indicates the cross compiler path for kernel library generation
--target="opencl, llvm" indicates the target and host devices
```
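
Before moving on, we can sanity-check the generated archive on the host. A quick sketch (the member names listed are an assumption based on the artifact layout the ```rtvm``` tool expects, described under Deployment Run below):

```bash
# List the artifacts inside the tvmc output archive:
# kernel shared lib, graph json and params binary (names assumed)
tar -tf keras-resnet50.tar
# mod.so
# mod.json
# mod.params
```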

### Test Run via RPC

At this stage we can verify the generated compiler output for execution correctness over the RPC interface.
The command below runs the compiled output on the remote target device,

with

``` bash
tvm tracker running on ```TVM_TRACKER_HOST```
tracker port being ```TVM_TRACKER_PORT```
rpc device access key being ```TVM_RPC_KEY```
compilation output being keras-resnet50.tar
```

```bash
python3 -m tvm.driver.tvmc run --device="cl" keras-resnet50.tar --rpc-key ${TVM_RPC_KEY} --rpc-tracker ${TVM_TRACKER_HOST}:${TVM_TRACKER_PORT} --print-time
```

This feeds random inputs to the model and validates the execution correctness of the compiled output.

The ```tvmc``` tool has various options to input custom data, profile the model and benchmark the execution.


### Deployment Run

Now we can verify the deployment run of the compiled model using the ```rtvm``` tool on the target device, without any RPC or host based execution.

We need to extract the tar archive on the target device. We can copy the extracted contents of ```keras-resnet50.tar``` under the Android temp folder at ```/data/local/tmp/keras-resnet50/```.

Also copy the cross compiled tool ```rtvm``` and ```libtvm_runtime.so``` to ```/data/local/tmp/```, as sketched below.
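
As an illustration, assuming ```adb``` access to the device from the host, the copy steps could look like:

```bash
# Extract the compiled archive on the host and push the contents to the device
mkdir -p keras-resnet50 && tar -xf keras-resnet50.tar -C keras-resnet50
adb push keras-resnet50 /data/local/tmp/keras-resnet50

# Push the cross compiled tool and the runtime library
adb push rtvm /data/local/tmp/
adb push libtvm_runtime.so /data/local/tmp/
```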

```rtvm``` usage can be queried as below:
```bash
Android:/data/local/tmp $ LD_LIBRARY_PATH=./ ./rtvm
Command line usage
--model        - The folder containing tvm artifacts (mod.so, mod.params, mod.json)
--device       - The target device to use {llvm, opencl, cpu, cuda, metal, rocm, vpi, oneapi}
--input        - Numpy file for the model input (optional; random input is used if not given)
--output       - Numpy file name to dump the model output as numpy
--dump-meta    - Dump model meta information

  Example
  ./rtvm --model=keras-resnet50 --device="opencl" --dump-meta
  ./rtvm --model=keras-resnet50 --device="opencl" --input input.npz --output=output.npz
```

```rtvm``` can run the model with no inputs (just a dry run without any valid inputs) and also with a specific input supplied as a numpy npz format file.

We can create an npz dump for all inputs by saving the dict object as shown below.

With ```keras-resnet50``` having one input ```input_1``` with shape ```[1, 224, 224, 3]``` and dtype ```float32```:

```
# Random initialization
input1 = np.random.uniform(low=-1, high=1, size=(1, 224, 224, 3)).astype("float32")
dataset = {"input_1": input1}
np.savez("input.npz", **dataset)
```

Copy ```input.npz``` to the target device as well, as ```/data/local/tmp/input.npz```.


Now, on the Android shell we can do a dry run as well as a run with a specific input, as shown below.
```bash
# Query meta data information
Android:/data/local/tmp/ $ LD_LIBRARY_PATH=./ ./rtvm --model=keras-resnet50 --device=opencl --dump-meta
. . . . . .
Meta Information:keras-resnet50
    Number of Inputs:183
    Number of Outputs:1
    Input MetaInfo:
        Input:input_1
            DType:float32
            Shape:[1, 224, 224, 3]
    Output MetaInfo:
        Output:tvmgen_default_fused_nn_softmax
            DType:float32
            Shape:[1, 1000]
. . . . . .

# Dry run without any inputs
Android:/data/local/tmp/ $ LD_LIBRARY_PATH=./ ./rtvm --model=keras-resnet50 --device=opencl
Model         = keras-resnet50
Device        = opencl
Input         =
Output        =
Dump Metadata = False
TVMRunner Constructor:keras-resnet50 Devices:opencl
TVMRunner Load:keras-resnet50
TVMRunner::GetMetaInfo
Executing dry run ...
Set Random Input for :input_1
TVMRunner::GetInputMemSize:input_1
Random Input Size:602112 bytes
TVMRunner::SetInput (Raw)
TVMRunner::Run
Get Output for :tvmgen_default_fused_nn_softmax
TVMRunner::GetOutputMemSize:tvmgen_default_fused_nn_softmax
TVMRunner::GetOutput (Raw)
Output Size:4000 bytes


# Run with input and dump output as npz file
Android:/data/local/tmp/ $ LD_LIBRARY_PATH=./ ./rtvm --model=keras-resnet50 --device=opencl --input=input.npz --output=output.npz
Model         = keras-resnet50
Device        = opencl
Input         = input.npz
Output        = output.npz
Dump Metadata = False
TVMRunner Constructor:keras-resnet50 Devices:opencl
TVMRunner Load:keras-resnet50
TVMRunner::GetMetaInfo
Executing with Input:input.npz Output:output.npz
TVMRunner::SetInput (Numpy):input.npz
Set Numpy Input for :input_1
TVMRunner::Run
TVMRunner::GetOutput (Numpy):output.npz
Get Output for :tvmgen_default_fused_nn_softmax
Output Size:4000 bytes
```

output.npz contains the model outputs. Below is a quick look at its contents.
```bash
tvm-host:~$ unzip -l output.npz
Archive:  output.npz
  Length      Date    Time    Name
---------  ---------- -----   ----
     4080  1980-00-00 00:00   tvmgen_default_fused_nn_softmax.npy
---------                     -------
     4080                     1 file
```
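
Back on the host, the dumped outputs can be loaded with numpy for any post-processing. A small sketch (the output tensor name matches the meta information dumped above):

```python
import numpy as np

# Keys in the npz dump are the model output names
outputs = np.load("output.npz")
softmax = outputs["tvmgen_default_fused_nn_softmax"]  # shape (1, 1000), dtype float32
print("Top-1 class index:", int(softmax.argmax()))
```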

Building ```cpp_rtvm``` also produces ```libtvm_runner.so```, a simplified interface that rtvm uses internally for loading and executing tvm compiled models from C/C++ environments.
```tvm_runner.h``` describes this interface definition. Alternatively, advanced users can use TVM's [c_native_api](https://github.com/apache/tvm/blob/main/include/tvm/runtime/c_runtime_api.h) interface for more access to TVM features.
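
For reference, a minimal (hypothetical) C++ client of this interface could look like the sketch below, using only the methods declared in ```tvm_runner.h```:

```cpp
#include "tvm_runner.h"

int main() {
  // Load the compiled artifacts from ./keras-resnet50 and target the OpenCL device
  tvm::runtime::TVMRunner runner("keras-resnet50", "opencl");
  runner.Load();
  runner.GetMetaInfo();            // query input/output shapes and dtypes
  runner.SetInput("input.npz");    // set all model inputs from an npz file
  runner.Run();                    // one inference cycle
  runner.GetOutput("output.npz");  // dump all model outputs as an npz file
  return 0;
}
```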

# RPC Setup

For Android devices we require cross compilation of tvm_rpc (and of libtvm_runtime.so, which is a dependency) for the remote device.
The RPC setup involves running a tracker on the host device and running tvm_rpc on the target device.

### Tracker

The command below runs the tracker on the host over port ```9100```.

```bash
python3 -m tvm.exec.rpc_tracker --host 127.0.0.1 --port 9100
```

### RPC on Target

With ```abcd1234ef``` being the adb device id, and with tvm_rpc (and libtvm_runtime.so) pushed to the target device at ```/data/local/tmp/tvm_rpc/```:

```bash
export ANDROID_SERIAL=abcd1234ef
# The settings below reroute tcp connections on the device to the host via the adb interface
adb reverse tcp:9100 tcp:9100
adb forward tcp:5000 tcp:5000
# Run tvm_rpc on the device
env adb shell "cd /data/local/tmp/tvm_rpc; killall -9 tvm_rpc; \
LD_LIBRARY_PATH=/data/local/tmp/tvm_rpc/ ./tvm_rpc server --host=0.0.0.0 --port=5000 --port-end=5010 --tracker=127.0.0.1:9100 --key=android"
```

Now we have the RPC setup ready, with ```TVM_TRACKER_HOST=127.0.0.1```, ```TVM_TRACKER_PORT=9100``` and ```TVM_RPC_KEY=android```.

We can also check the connected and available devices on the tracker, as shown below.

```bash
python3 -m tvm.exec.query_rpc_tracker --port ${TVM_TRACKER_PORT}
Tracker address 127.0.0.1:9100

Server List
------------------------------
server-address           key
------------------------------
127.0.0.1:5000    server:android
------------------------------

Queue Status
-------------------------------
key       total  free  pending
-------------------------------
android   1      1     0
-------------------------------
```


# Target Specific Configuration

The sections below describe the device/target specific settings to be used with the ```tvmc``` tool.

### Adreno GPU

Adreno GPU has a docker definition that helps to ease the development environment setup.

We can build the docker image by using the commands below from the TVM repo.

```bash
./docker/build.sh ci_adreno
docker tag tvm.ci_adreno ci_adreno
```

The command below builds the host and target RPC components for Adreno and drops into an interactive shell.

```bash
./tests/scripts/ci.py adreno -i
```

Also, one can build with Adreno OpenCLML SDK support:

```bash
export ADRENO_OPENCL=<Path to OpenCLML SDK>
./tests/scripts/ci.py adreno -i
```

The commands above produce
```build-adreno```, which is the host build, and
```build-adreno-target```, which contains the cross compiled tvm_rpc and libtvm_runtime.so.


The options below are to be used for Adreno GPU while working with tvmc.

* Tuning

```
--target="opencl -device=adreno"
--target-host="llvm -mtriple=aarch64-linux-gnu"
```

* Compilation

```
--cross-compiler ${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang
--target="opencl, llvm"
--target-opencl-device adreno
--target-llvm-mtriple aarch64-linux-gnu
```

While enabling CLML, we just need to specify the additional target option below for compilation:
```--target="opencl, clml, llvm"```

* Running

```--device="cl"```


For example, with a model from Keras at ```./model_data/keras-resnet50/resnet50.h5```:

```bash
# Tuning
python3 -m tvm.driver.tvmc tune --desired-layout NCHW --target="opencl -device=adreno" --target-host="llvm -mtriple=aarch64-linux-gnu" \
./model_data/keras-resnet50/resnet50.h5 -o ./model_data/keras-resnet50/keras-resnet50.log --early-stopping 0 --repeat 30 \
--rpc-key ${TVM_RPC_KEY} --rpc-tracker ${TVM_TRACKER_HOST}:${TVM_TRACKER_PORT} --trials 1024 --tuning-records ./model_data/keras-resnet50/keras-resnet50-records.log --tuner xgb

# Tuning produces the tuning log ./model_data/keras-resnet50/keras-resnet50.log


# Compilation
python3 -m tvm.driver.tvmc compile --cross-compiler ${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang \
--desired-layout NCHW --target="opencl, llvm" --target-opencl-device adreno --target-llvm-mtriple aarch64-linux-gnu \
./model_data/keras-resnet50/resnet50.h5 -o keras-resnet50.tar

# Compilation produces the target artifacts keras-resnet50.tar

# Run on adreno device via RPC
python3 -m tvm.driver.tvmc run --device="cl" keras-resnet50.tar --rpc-key ${TVM_RPC_KEY} --rpc-tracker ${TVM_TRACKER_HOST}:${TVM_TRACKER_PORT} --print-time
```

‎apps/cpp_rtvm/main.cc

Lines changed: 264 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file main.cc
 * \brief Native runtime utility for TVM compiled models.
 */
#include <csignal>
#include <cstdio>
#include <cstdlib>
#if defined(__linux__) || defined(__ANDROID__)
#include <unistd.h>
#endif
#include <dmlc/logging.h>

#include <cstring>
#include <iostream>
#include <sstream>
#include <vector>

#include "../../src/support/socket.h"
#include "../../src/support/utils.h"
#include "tvm_runner.h"

#if defined(_WIN32)
#include "win32_process.h"
#endif

using namespace std;
using namespace tvm::runtime;
using namespace tvm::support;

static const string kUsage =
    "Command line usage\n"
    "--model        - The folder containing tvm artifacts (mod.so, mod.params, mod.json)\n"
    "--device       - The target device to use {llvm, opencl, cpu, cuda, metal, rocm, vpi, "
    "oneapi}\n"
    "--input        - Numpy file for the model input (optional; random input is used if not given)\n"
    "--output       - Numpy file name to dump the model output as numpy\n"
    "--dump-meta    - Dump model meta information\n"
    "\n"
    "  Example\n"
    "  ./rtvm --model=keras-resnet50 --device=\"opencl\" --dump-meta\n"
    "  ./rtvm --model=keras-resnet50 --device=\"opencl\" --input input.npz --output=output.npz\n"
    "\n";

/*!
 * \brief Tool Arguments.
 * \arg model The tvm artifact to load & run
 * \arg device The target device to use {llvm, cl, ...etc.}
 * \arg input Numpy file for the model input
 * \arg output Numpy file name to dump the model output as numpy
 */
struct ToolArgs {
  string model;
  string device;
  string input;
  string output;
  bool dump_meta = false;
};

/*!
 * \brief PrintArgs print the contents of ToolArgs
 * \param args ToolArgs structure
 */
void PrintArgs(const ToolArgs& args) {
  LOG(INFO) << "Model         = " << args.model;
  LOG(INFO) << "Device        = " << args.device;
  LOG(INFO) << "Input         = " << args.input;
  LOG(INFO) << "Output        = " << args.output;
  LOG(INFO) << "Dump Metadata = " << ((args.dump_meta) ? ("True") : ("False"));
}

#if defined(__linux__) || defined(__ANDROID__)
/*!
 * \brief CtrlCHandler, exits if Ctrl+C is pressed
 * \param s signal
 */
void CtrlCHandler(int s) {
  LOG(INFO) << "\nUser pressed Ctrl+C, Exiting";
  exit(1);
}

/*!
 * \brief HandleCtrlC Register for handling Ctrl+C event.
 */
void HandleCtrlC() {
  // Ctrl+C handler
  struct sigaction sigIntHandler;
  sigIntHandler.sa_handler = CtrlCHandler;
  sigemptyset(&sigIntHandler.sa_mask);
  sigIntHandler.sa_flags = 0;
  sigaction(SIGINT, &sigIntHandler, nullptr);
}
#endif

/*!
 * \brief GetCmdOption Parse and find the command option.
 * \param argc arg counter
 * \param argv arg values
 * \param option command line option to search for.
 * \param key whether the option itself is a key
 * \return value corresponding to option.
 */
string GetCmdOption(int argc, char* argv[], string option, bool key = false) {
  string cmd;
  for (int i = 1; i < argc; ++i) {
    string arg = argv[i];
    if (arg.find(option) == 0) {
      if (key) {
        cmd = argv[i];
        return cmd;
      }
      // We assume "=" is the end of option.
      ICHECK_EQ(*option.rbegin(), '=');
      cmd = arg.substr(arg.find('=') + 1);
      return cmd;
    }
  }
  return cmd;
}

/*!
 * \brief ParseCmdArgs parses the command line arguments.
 * \param argc arg counter
 * \param argv arg values
 * \param args the output structure which holds the parsed values
 */
void ParseCmdArgs(int argc, char* argv[], struct ToolArgs& args) {
  const string model = GetCmdOption(argc, argv, "--model=");
  if (!model.empty()) {
    args.model = model;
  } else {
    LOG(INFO) << kUsage;
    exit(0);
  }

  const string device = GetCmdOption(argc, argv, "--device=");
  if (!device.empty()) {
    args.device = device;
  } else {
    LOG(INFO) << kUsage;
    exit(0);
  }

  const string input = GetCmdOption(argc, argv, "--input=");
  if (!input.empty()) {
    args.input = input;
  }

  const string output = GetCmdOption(argc, argv, "--output=");
  if (!output.empty()) {
    args.output = output;
  }

  const string pmeta = GetCmdOption(argc, argv, "--dump-meta", true);
  if (!pmeta.empty()) {
    args.dump_meta = true;
  }
}

/*!
 * \brief Loads and executes the model on the given target.
 * \param args tool arguments
 * \return result of the operation.
 */
int ExecuteModel(ToolArgs& args) {
#if defined(__linux__) || defined(__ANDROID__)
  // Ctrl+C handler
  HandleCtrlC();
#endif

  // Initialize TVM Runner
  TVMRunner runner = TVMRunner(args.model, args.device);

  // Load the model
  runner.Load();

  // Query model meta information
  TVMMetaInfo mInfo = runner.GetMetaInfo();

  // Print meta information
  if (args.dump_meta) runner.PrintMetaInfo();

  if (args.input.empty() || args.output.empty()) {
    LOG(INFO) << "Executing dry run ... ";
    // Set random input for all inputs
    for (auto& elem : mInfo.input_info) {
      LOG(INFO) << "Set Random Input for :" << elem.first;
      auto shape = elem.second.first;
      size_t ssize = runner.GetInputMemSize(elem.first);
      char* data = (char*)malloc(ssize);
      LOG(INFO) << "Random Input Size:" << ssize << " bytes";
      runner.SetInput(elem.first, data);
      free(data);
    }

    // Run the model
    runner.Run();

    // Get output and dump a few values
    for (auto& elem : mInfo.output_info) {
      LOG(INFO) << "Get Output for :" << elem.first;
      auto shape = elem.second.first;
      size_t ssize = runner.GetOutputMemSize(elem.first);
      char* data = (char*)malloc(ssize);
      runner.GetOutput(elem.first, data);
      LOG(INFO) << "Output Size:" << ssize << " bytes";
      free(data);
    }
  } else {
    LOG(INFO) << "Executing with Input:" << args.input << " Output:" << args.output;

    // Set input from numpy input
    runner.SetInput(args.input);

    // Run the model
    runner.Run();

    // Get output as numpy dump
    runner.GetOutput(args.output);
  }

  return 0;
}

/*!
 * \brief main The main function.
 * \param argc arg counter
 * \param argv arg values
 * \return result of operation.
 */
int main(int argc, char* argv[]) {
  if (argc <= 1) {
    LOG(INFO) << kUsage;
    return 0;
  }

  ToolArgs args;
  ParseCmdArgs(argc, argv, args);
  PrintArgs(args);

  if (ExecuteModel(args)) {
    PrintArgs(args);
    LOG(INFO) << kUsage;
    return -1;
  }
  return 0;
}
apps/cpp_rtvm/scripts/download_models.py

Lines changed: 36 additions & 0 deletions

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import logging
import os

LOG = logging.getLogger(__name__)

tmp_dir = "./model_data/"
dload_models = []

# Keras : Resnet50
try:
    from tensorflow.keras.applications.resnet50 import ResNet50

    # Make sure the destination folder exists before saving the model
    os.makedirs(tmp_dir + "keras-resnet50", exist_ok=True)
    model_file_name = "{}/{}".format(tmp_dir + "keras-resnet50", "resnet50.h5")
    model = ResNet50(include_top=True, weights="imagenet", input_shape=(224, 224, 3), classes=1000)
    model.save(model_file_name)
    dload_models.append(model_file_name)
except ImportError:
    LOG.warning("Keras is not installed, skipping Keras models")


print("Models:", dload_models)

‎apps/cpp_rtvm/tvm_runner.cc

Lines changed: 320 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file tvm_runner.cc
 * \brief TVM model runner implementation.
 */

#include "tvm_runner.h"

#include <cnpy.h>

#include <fstream>
#include <streambuf>
#include <string>

namespace tvm {
namespace runtime {

/*!
 * \brief Get the TVM device id corresponding to device string.
 * \param device the target device in string format.
 * \return dl_device corresponding to the device string.
 */
int GetTVMDevice(std::string device) {
  if (!device.compare("cpu")) {
    return static_cast<int>(kDLCPU);
  } else if (!device.compare("llvm")) {
    return static_cast<int>(kDLCPU);
  } else if (!device.compare("cuda")) {
    return static_cast<int>(kDLCUDA);
  } else if (!device.compare("opencl")) {
    return static_cast<int>(kDLOpenCL);
  } else if (!device.compare("vulkan")) {
    return static_cast<int>(kDLVulkan);
  } else if (!device.compare("metal")) {
    return static_cast<int>(kDLMetal);
  } else if (!device.compare("vpi")) {
    return static_cast<int>(kDLVPI);
  } else if (!device.compare("rocm")) {
    return static_cast<int>(kDLROCM);
  } else if (!device.compare("oneapi")) {
    return static_cast<int>(kDLOneAPI);
  } else {
    LOG(FATAL) << "TVMRunner : Unsupported device :" << device;
  }
}

/*!
 * \brief Constructor for TVMRunner.
 * \param path where the tvm compiler artifacts are present.
 * \param device the target device where we need to load the compiled model.
 */
TVMRunner::TVMRunner(std::string path, std::string device) : r_model_path(path), r_device(device) {
  LOG(INFO) << "TVMRunner Constructor:" << r_model_path << " Devices:" << r_device;
}

/*!
 * \brief Load and set up the TVM graph runtime for the given model.
 * \return 0 on success else error code.
 */
int TVMRunner::Load(void) {
  LOG(INFO) << "TVMRunner Load:" << r_model_path;
  // Load the lib file
  r_mod_handle = Module::LoadFromFile((r_model_path + "/mod.so").c_str(), "so");

  // Read model json file
  std::ifstream json_reader((r_model_path + "/mod.json").c_str());
  CHECK(!json_reader.fail()) << "Failed to open json file:" << (r_model_path + "/mod.json").c_str();
  std::string json_str((std::istreambuf_iterator<char>(json_reader)),
                       std::istreambuf_iterator<char>());
  json_reader.close();

  // Get ref to graph executor
  auto f_handle = tvm::runtime::Registry::Get("tvm.graph_executor.create");

  // Create graph runtime
  r_graph_handle = (*f_handle)(json_str, r_mod_handle, GetTVMDevice(r_device), 0);

  // Read params binary file
  std::ifstream params_reader((r_model_path + "/mod.params").c_str(), std::ios::binary);
  CHECK(!params_reader.fail()) << "Failed to open params file:"
                               << (r_model_path + "/mod.params").c_str();
  const std::string params_str((std::istreambuf_iterator<char>(params_reader)),
                               std::istreambuf_iterator<char>());
  params_reader.close();
  TVMByteArray params_arr;
  params_arr.data = params_str.c_str();
  params_arr.size = params_str.length();

  // Load parameters
  r_graph_handle.GetFunction("load_params")(params_arr);

  return 0;
}

/*!
 * \brief Calculate the memory size for the NDArray.
 * \param narr the NDArray object.
 * \return size of the memory in bytes.
 */
inline size_t GetMemSize(NDArray& narr) {
  size_t size = 1;
  for (tvm_index_t i = 0; i < narr->ndim; ++i) {
    size *= static_cast<size_t>(narr->shape[i]);
  }
  size *= (narr->dtype.bits * narr->dtype.lanes + 7) / 8;
  return size;
}

/*!
 * \brief Get the input alloc mem size.
 * \param input_id The input id to query the mem size.
 * \return The memory size.
 */
size_t TVMRunner::GetInputMemSize(std::string input_id) {
  LOG(INFO) << "TVMRunner::GetInputMemSize:" << input_id;

  NDArray in_arr = r_graph_handle.GetFunction("get_input")(input_id);
  auto ssize = GetMemSize(in_arr);

  return ssize;
}

/*!
 * \brief Get the output alloc mem size.
 * \param output_id The output id to query the mem size.
 * \return The memory size.
 */
size_t TVMRunner::GetOutputMemSize(std::string output_id) {
  LOG(INFO) << "TVMRunner::GetOutputMemSize:" << output_id;

  NDArray out_arr = r_graph_handle.GetFunction("get_output")(output_id);
  auto ssize = GetMemSize(out_arr);

  return ssize;
}

/*!
 * \brief Set the model inputs from npz file.
 * \param inputfile the npz file from where we read input tensor data.
 * \return 0 on success else error code.
 */
int TVMRunner::SetInput(std::string inputfile) {
  LOG(INFO) << "TVMRunner::SetInput (Numpy):" << inputfile;
  cnpy::npz_t npz_input = cnpy::npz_load(inputfile);

  for (auto& elem : mInfo.input_info) {
    LOG(INFO) << "Set Numpy Input for :" << elem.first;
    NDArray in_arr = r_graph_handle.GetFunction("get_input")(elem.first);
    auto ssize = GetMemSize(in_arr);

    if (npz_input.find(elem.first) != npz_input.end()) {
      in_arr.CopyFromBytes(npz_input[elem.first].data<char>(), ssize);
    } else {
      LOG(WARNING) << "Couldn't find input " << elem.first << " in npz input file";
    }
  }

  return 0;
}

/*!
 * \brief Set the model input from the given binary buffer.
 * \param input_id input node name.
 * \param raw_input binary input buffer to copy over input NDArray.
 * \return 0 on success else error code.
 */
int TVMRunner::SetInput(std::string input_id, char* raw_input) {
  LOG(INFO) << "TVMRunner::SetInput (Raw)";
  NDArray in_arr = r_graph_handle.GetFunction("get_input")(input_id);
  auto ssize = GetMemSize(in_arr);
  in_arr.CopyFromBytes(raw_input, ssize);
  return 0;
}

/*!
 * \brief Get the model outputs and dump them to npz file.
 * \param outputfile the npz file to where we dump the output data.
 * \return 0 on success else error code.
 */
int TVMRunner::GetOutput(std::string outputfile) {
  LOG(INFO) << "TVMRunner::GetOutput (Numpy):" << outputfile;

  for (auto& elem : mInfo.output_info) {
    LOG(INFO) << "Get Output for :" << elem.first;
    NDArray out_arr = r_graph_handle.GetFunction("get_output")(elem.first);
    auto ssize = GetMemSize(out_arr);
    LOG(INFO) << "Output Size:" << ssize << " bytes";

    // GetMemSize already accounts for the dtype size, so ssize bytes is enough
    void* data = (void*)malloc(ssize);
    out_arr.CopyToBytes(data, ssize);
    std::vector<size_t> shape;

    for (int j = 0; j < out_arr->ndim; ++j) shape.push_back(out_arr->shape[j]);
    if (!elem.second.second.compare("float32")) {
      cnpy::npz_save<float>(outputfile, elem.first, (float*)data, shape, "a");
    } else if (!elem.second.second.compare("int8")) {
      cnpy::npz_save<int8_t>(outputfile, elem.first, (int8_t*)data, shape, "a");
    } else {
      LOG(WARNING) << "DType:" << elem.second.second << " is not supported for npz_save";
    }
    free(data);
  }

  return 0;
}

/*!
 * \brief Get output of the model as a binary buffer.
 * \param output_id output node name to read the data.
 * \param raw_output the buffer to copy the data to.
 * \return 0 on success else error code.
 */
int TVMRunner::GetOutput(std::string output_id, char* raw_output) {
  LOG(INFO) << "TVMRunner::GetOutput (Raw)";
  NDArray out_arr = r_graph_handle.GetFunction("get_output")(output_id);
  auto ssize = GetMemSize(out_arr);
  out_arr.CopyToBytes(raw_output, ssize);
  return 0;
}

/*!
 * \brief Call one cycle of execution for the model.
 * \return 0 on success else error code.
 */
int TVMRunner::Run(void) {
  LOG(INFO) << "TVMRunner::Run";

  r_graph_handle.GetFunction("run")();
  return 0;
}

/*!
 * \brief Query various metadata from the graph runtime.
 * \return the populated TVMMetaInfo structure.
 */
TVMMetaInfo TVMRunner::GetMetaInfo(void) {
  LOG(INFO) << "TVMRunner::GetMetaInfo";

  mInfo.n_inputs = r_graph_handle.GetFunction("get_num_inputs")();
  mInfo.n_outputs = r_graph_handle.GetFunction("get_num_outputs")();

  Map<String, ObjectRef> tvm_input_info = r_graph_handle.GetFunction("get_input_info")();
  auto shape_info = GetRef<Map<String, ObjectRef>>(tvm_input_info["shape"].as<MapNode>());
  auto dtype_info = GetRef<Map<String, ObjectRef>>(tvm_input_info["dtype"].as<MapNode>());
  for (const auto& kv : shape_info) {
    auto stuple = GetRef<ShapeTuple>(kv.second.as<ShapeTupleObj>());
    std::vector<int> vshape;
    vshape.assign(stuple.begin(), stuple.end());
    auto dtype = GetRef<String>(dtype_info[kv.first].as<StringObj>());
    std::pair<std::vector<int>, std::string> value = std::make_pair(vshape, dtype);
    mInfo.input_info.insert({kv.first, value});
  }

  tvm_input_info = r_graph_handle.GetFunction("get_output_info")();
  shape_info = GetRef<Map<String, ObjectRef>>(tvm_input_info["shape"].as<MapNode>());
  dtype_info = GetRef<Map<String, ObjectRef>>(tvm_input_info["dtype"].as<MapNode>());
  for (const auto& kv : shape_info) {
    auto stuple = GetRef<ShapeTuple>(kv.second.as<ShapeTupleObj>());
    std::vector<int> vshape;
    vshape.assign(stuple.begin(), stuple.end());
    auto dtype = GetRef<String>(dtype_info[kv.first].as<StringObj>());
    std::pair<std::vector<int>, std::string> value = std::make_pair(vshape, dtype);
    mInfo.output_info.insert({kv.first, value});
  }

  return mInfo;
}

/*!
 * \brief Print the meta information.
 */
void TVMRunner::PrintMetaInfo(void) {
  LOG(INFO) << "Meta Information:" << r_model_path;
  LOG(INFO) << "    Number of Inputs:" << mInfo.n_inputs;
  LOG(INFO) << "    Number of Outputs:" << mInfo.n_outputs;
  LOG(INFO) << "    Input MetaInfo:";
  for (auto& elem : mInfo.input_info) {
    std::ostringstream stream;
    stream << "[";
    copy(elem.second.first.begin(), elem.second.first.end() - 1,
         std::ostream_iterator<int>(stream, ", "));
    stream << elem.second.first.back() << "]";
    LOG(INFO) << "        Input:" << elem.first;
    LOG(INFO) << "            DType:" << elem.second.second;
    LOG(INFO) << "            Shape:" << stream.str();
  }
  LOG(INFO) << "    Output MetaInfo:";
  for (auto& elem : mInfo.output_info) {
    std::ostringstream stream;
    stream << "[";
    copy(elem.second.first.begin(), elem.second.first.end() - 1,
         std::ostream_iterator<int>(stream, ", "));
    stream << elem.second.first.back() << "]";
    LOG(INFO) << "        Output:" << elem.first;
    LOG(INFO) << "            DType:" << elem.second.second;
    LOG(INFO) << "            Shape:" << stream.str();
  }
}

}  // namespace runtime
}  // namespace tvm

‎apps/cpp_rtvm/tvm_runner.h

Lines changed: 93 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file tvm_runner.h
 * \brief TVM model runner.
 */
#ifndef TVM_APPS_CPP_RTVM_RUNNER_H_
#define TVM_APPS_CPP_RTVM_RUNNER_H_

#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

#include <map>
#include <string>
#include <utility>
#include <vector>

#include "tvm/runtime/c_runtime_api.h"

namespace tvm {
namespace runtime {

/*!
 * \brief Various meta information related to the compiled TVM model.
 */
typedef struct {
 public:
  int n_inputs;
  int n_outputs;
  std::map<std::string, std::pair<std::vector<int>, std::string>> input_info;
  std::map<std::string, std::pair<std::vector<int>, std::string>> output_info;
} TVMMetaInfo;

/*!
 * \brief Encapsulates TVM graph runtime functionality with a simplified API interface.
 */
class TVMRunner {
 public:
  /*! \brief Constructor */
  TVMRunner(std::string path, std::string device);

  /*! \brief Initializes the graph runtime with the compiled model */
  int Load(void);
  /*! \brief Executes one inference cycle */
  int Run(void);
  /*! \brief Sets the inputs from a given npz file */
  int SetInput(std::string);
  /*! \brief Sets an input from binary data */
  int SetInput(std::string, char*);
  /*! \brief Saves the model output into a given npz file */
  int GetOutput(std::string);
  /*! \brief Gets a model output in binary format */
  int GetOutput(std::string, char*);
  /*! \brief Gets the input mem size */
  size_t GetInputMemSize(std::string);
  /*! \brief Gets the output mem size */
  size_t GetOutputMemSize(std::string);
  /*! \brief Populates various meta information from the graph runtime */
  TVMMetaInfo GetMetaInfo(void);
  /*! \brief Prints all meta information */
  void PrintMetaInfo(void);

 private:
  /*! \brief Module handle for the shared object */
  Module r_mod_handle;
  /*! \brief Graph runtime module handle */
  Module r_graph_handle;
  /*! \brief The local model path from where we load the model */
  std::string r_model_path;
  /*! \brief The target device */
  std::string r_device;
  /*! \brief Holds meta information queried from the graph runtime */
  TVMMetaInfo mInfo;
};

}  // namespace runtime
}  // namespace tvm
#endif  // TVM_APPS_CPP_RTVM_RUNNER_H_

‎cmake/config.cmake

Lines changed: 3 additions & 0 deletions

@@ -110,6 +110,9 @@ set(USE_RPC ON)
 # Whether to build the C++ RPC server binary
 set(USE_CPP_RPC OFF)
 
+# Whether to build the C++ native runtime tool binary
+set(USE_CPP_RTVM OFF)
+
 # Whether to build the iOS RPC server application
 set(USE_IOS_RPC OFF)

‎cmake/modules/LibInfo.cmake

Lines changed: 1 addition & 0 deletions

@@ -60,6 +60,7 @@ function(add_lib_info src_file)
   TVM_INFO_USE_CMSISNN="${USE_CMSISNN}"
   TVM_INFO_USE_COREML="${USE_COREML}"
   TVM_INFO_USE_CPP_RPC="${USE_CPP_RPC}"
+  TVM_INFO_USE_CPP_RTVM="${USE_CPP_RTVM}"
   TVM_INFO_USE_CUBLAS="${USE_CUBLAS}"
   TVM_INFO_USE_CUDA="${USE_CUDA}"
   TVM_INFO_USE_CUDNN="${USE_CUDNN}"

‎python/tvm/driver/tvmc/composite_target.py

Lines changed: 5 additions & 0 deletions

@@ -28,6 +28,7 @@
 from tvm.relay.op.contrib.ethosu import partition_for_ethosu
 from tvm.relay.op.contrib.bnns import partition_for_bnns
 from tvm.relay.op.contrib.vitis_ai import partition_for_vitis_ai
+from tvm.relay.op.contrib.clml import partition_for_clml
 
 
 from tvm.driver.tvmc import TVMCException
@@ -71,6 +72,10 @@
         "config_key": "relay.ext.vitis_ai.options",
         "pass_pipeline": partition_for_vitis_ai,
     },
+    "clml": {
+        "config_key": None,
+        "pass_pipeline": partition_for_clml,
+    },
 }

‎python/tvm/relay/op/contrib/clml.py

Lines changed: 1 addition & 1 deletion

@@ -79,7 +79,7 @@ def transform_function(
         return RemoveDropout().visit(func)
 
 
-def partition_for_clml(mod, params=None):
+def partition_for_clml(mod, params=None, **opts):
     """Partition the graph greedily offloading supported
     operators to CLML Library.

‎src/auto_scheduler/search_task.cc

Lines changed: 8 additions & 0 deletions

@@ -102,6 +102,14 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target
     int max_vthread_extent = 1;
     return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block,
                           max_threads_per_block, max_vthread_extent, warp_size);
+  } else if (target->GetAttr<String>("device", "") == "adreno") {
+    int max_shared_memory_per_block = 32768;
+    int max_local_memory_per_block = 32768;
+    int max_threads_per_block = 256;
+    int warp_size = 1;
+    int max_vthread_extent = 1;
+    return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block,
+                          max_threads_per_block, max_vthread_extent, warp_size);
   } else {
     // add other opencl target
     auto dev = Device{static_cast<DLDeviceType>(device_type), 0};

‎src/runtime/graph_executor/graph_executor.cc

Lines changed: 43 additions & 1 deletion

@@ -138,6 +138,28 @@ std::tuple<GraphExecutor::ShapeInfo, GraphExecutor::DtypeInfo> GraphExecutor::Ge
   return std::make_tuple(shape_dict, dtype_dict);
 }
 
+/*!
+ * \brief Get the output info of Graph by parsing the output nodes.
+ * \return The shape and dtype tuple.
+ */
+std::tuple<GraphExecutor::ShapeInfo, GraphExecutor::DtypeInfo> GraphExecutor::GetOutputInfo()
+    const {
+  GraphExecutor::ShapeInfo shape_dict;
+  GraphExecutor::DtypeInfo dtype_dict;
+  for (auto out : outputs_) {
+    uint32_t nid = out.node_id;
+    CHECK_LE(nid, nodes_.size());
+    std::string name = nodes_[nid].name;
+    CHECK_LE(nid, attrs_.shape.size());
+    auto shape = attrs_.shape[nid];
+    shape_dict.Set(name, ShapeTuple(shape));
+    CHECK_LE(nid, attrs_.dltype.size());
+    auto dtype = attrs_.dltype[nid];
+    dtype_dict.Set(name, String(dtype));
+  }
+  return std::make_tuple(shape_dict, dtype_dict);
+}
+
 /*!
  * \brief Get the output index given the name of output.
  * \param name The name of the output.
@@ -606,7 +628,19 @@ PackedFunc GraphExecutor::GetFunction(const std::string& name,
       if (args.num_args == 2) {
         this->CopyOutputTo(args[0], args[1]);
       } else {
-        *rv = this->GetOutput(args[0]);
+        int out_idx = -1;
+        if (String::CanConvertFrom(args[0])) {
+          for (size_t i = 0; i < outputs_.size(); i++) {
+            std::string& name = nodes_[outputs_[i].node_id].name;
+            if (args[0].operator String() == name) {
+              out_idx = i;
+            }
+          }
+          CHECK(out_idx != -1) << "Invalid output node:" << args[0].operator String();
+        } else {
+          out_idx = args[0];
+        }
+        *rv = this->GetOutput(out_idx);
       }
     });
   } else if (name == "get_input") {
@@ -682,6 +716,14 @@ PackedFunc GraphExecutor::GetFunction(const std::string& name,
       input_info.Set("dtype", dtype_info);
       *rv = input_info;
     });
+  } else if (name == "get_output_info") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      auto [shape_info, dtype_info] = this->GetOutputInfo();
+      Map<String, ObjectRef> input_info;
+      input_info.Set("shape", shape_info);
+      input_info.Set("dtype", dtype_info);
+      *rv = input_info;
+    });
   } else {
     return PackedFunc();
   }

‎src/runtime/graph_executor/graph_executor.h

Lines changed: 6 additions & 0 deletions

@@ -117,6 +117,12 @@ class TVM_DLL GraphExecutor : public ModuleNode {
    */
   std::tuple<ShapeInfo, DtypeInfo> GetInputInfo() const;
 
+  /*!
+   * \brief Get the output info of Graph by parsing the output nodes.
+   * \return The shape and dtype tuple.
+   */
+  std::tuple<ShapeInfo, DtypeInfo> GetOutputInfo() const;
+
   /*!
    * \brief Get the output index given the name of output.
    * \param name The name of the output.

‎src/support/libinfo.cc

Lines changed: 5 additions & 0 deletions

@@ -203,6 +203,10 @@
 #define TVM_INFO_USE_CPP_RPC "NOT-FOUND"
 #endif
 
+#ifndef TVM_INFO_USE_CPP_RTVM
+#define TVM_INFO_USE_CPP_RTVM "NOT-FOUND"
+#endif
+
 #ifndef TVM_INFO_USE_TFLITE
 #define TVM_INFO_USE_TFLITE "NOT-FOUND"
 #endif
@@ -273,6 +277,7 @@ TVM_DLL Map<String, String> GetLibInfo() {
       {"USE_CMSISNN", TVM_INFO_USE_CMSISNN},
       {"USE_COREML", TVM_INFO_USE_COREML},
       {"USE_CPP_RPC", TVM_INFO_USE_CPP_RPC},
+      {"USE_CPP_RTVM", TVM_INFO_USE_CPP_RTVM},
       {"USE_CUBLAS", TVM_INFO_USE_CUBLAS},
       {"USE_CUDA", TVM_INFO_USE_CUDA},
       {"USE_CUDNN", TVM_INFO_USE_CUDNN},

‎tests/scripts/task_build_adreno_bins.sh

Lines changed: 2 additions & 1 deletion

@@ -37,6 +37,7 @@ echo set\(USE_OPENCL ON\) >> config.cmake
 fi
 echo set\(USE_RPC ON\) >> config.cmake
 echo set\(USE_CPP_RPC ON\) >> config.cmake
+echo set\(USE_CPP_RTVM ON\) >> config.cmake
 echo set\(USE_GRAPH_EXECUTOR ON\) >> config.cmake
 echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake
 echo set\(USE_KALLOC_ALIGNMENT 32\) >> config.cmake
@@ -56,4 +57,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_HOME}/build/cmake/android.toolchain.
 -DCMAKE_C_COMPILER="${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang" \
 -DMACHINE_NAME="aarch64-linux-gnu" ..
 
-make -j$(nproc) tvm_rpc
+make -j$(nproc) tvm_rpc rtvm
