diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
deleted file mode 100644
index feb0d5120..000000000
--- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
+++ /dev/null
@@ -1,87 +0,0 @@
-name: Bug (compilation)
-description: Something goes wrong when trying to compile llama.cpp.
-title: "Compile bug: "
-labels: ["bug-unconfirmed", "compilation"]
-body:
-  - type: markdown
-    attributes:
-      value: >
-        Thanks for taking the time to fill out this bug report!
-        This issue template is intended for bug reports where the compilation of llama.cpp fails.
-        Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
-        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
-        by clearing `~/.cache/ccache` (on Linux).
-  - type: textarea
-    id: commit
-    attributes:
-      label: Git commit
-      description: Which commit are you trying to compile?
-      placeholder: |
-        $git rev-parse HEAD
-        84a07a17b1b08cf2b9747c633a2372782848a27f
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: Operating systems
-      description: Which operating systems do you know to be affected?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: true
-  - type: dropdown
-    id: backends
-    attributes:
-        label: GGML backends
-        description: Which GGML backends do you know to be affected?
-        options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
-        multiple: true
-    validations:
-      required: true
-  - type: textarea
-    id: info
-    attributes:
-      label: Problem description & steps to reproduce
-      description: >
-        Please give us a summary of the problem and tell us how to reproduce it.
-        If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
-      placeholder: >
-        I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
-        Here are the exact commands that I used: ...
-    validations:
-      required: true
-  - type: textarea
-    id: first_bad_commit
-    attributes:
-      label: First Bad Commit
-      description: >
-        If the bug was not present on an earlier version: when did it start appearing?
-        If possible, please do a git bisect and identify the exact commit that introduced the bug.
-    validations:
-      required: false
-  - type: textarea
-    id: command
-    attributes:
-      label: Compile command
-      description: >
-        Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
-        This will be automatically formatted into code, so no need for backticks.
-      render: shell
-    validations:
-      required: true
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: >
-          Please copy and paste any relevant log output, including any generated text.
-          This will be automatically formatted into code, so no need for backticks.
-      render: shell
-    validations:
-      required: true
diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml
deleted file mode 100644
index c42a14ff8..000000000
--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ /dev/null
@@ -1,101 +0,0 @@
-name: Bug (model use)
-description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
-title: "Eval bug: "
-labels: ["bug-unconfirmed", "model evaluation"]
-body:
-  - type: markdown
-    attributes:
-      value: >
-        Thanks for taking the time to fill out this bug report!
-        This issue template is intended for bug reports where the model evaluation results
-        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
-        If you encountered the issue while using an external UI (e.g. ollama),
-        please reproduce your issue using one of the examples/binaries in this repository.
-        The `llama-cli` binary can be used for simple and reproducible model inference.
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: Operating systems
-      description: Which operating systems do you know to be affected?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: true
-  - type: dropdown
-    id: backends
-    attributes:
-        label: GGML backends
-        description: Which GGML backends do you know to be affected?
-        options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
-        multiple: true
-    validations:
-      required: true
-  - type: textarea
-    id: hardware
-    attributes:
-      label: Hardware
-      description: Which CPUs/GPUs are you using?
-      placeholder: >
-        e.g. Ryzen 5950X + 2x RTX 4090
-    validations:
-      required: true
-  - type: textarea
-    id: model
-    attributes:
-      label: Models
-      description: >
-        Which model(s) at which quantization were you using when encountering the bug?
-        If you downloaded a GGUF file off of Huggingface, please provide a link.
-      placeholder: >
-        e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
-    validations:
-      required: false
-  - type: textarea
-    id: info
-    attributes:
-      label: Problem description & steps to reproduce
-      description: >
-        Please give us a summary of the problem and tell us how to reproduce it.
-        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
-        that information would be very much appreciated by us.
-      placeholder: >
-        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
-        When I use -ngl 0 it works correctly.
-        Here are the exact commands that I used: ...
-    validations:
-      required: true
-  - type: textarea
-    id: first_bad_commit
-    attributes:
-      label: First Bad Commit
-      description: >
-        If the bug was not present on an earlier version: when did it start appearing?
-        If possible, please do a git bisect and identify the exact commit that introduced the bug.
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: >
-          Please copy and paste any relevant log output, including the command that you entered and any generated text.
-          This will be automatically formatted into code, so no need for backticks.
-      render: shell
-    validations:
-      required: true
diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
deleted file mode 100644
index 1904e31fd..000000000
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ /dev/null
@@ -1,91 +0,0 @@
-name: Bug (misc.)
-description: Something is not working the way it should (and it's not covered by any of the above cases).
-title: "Misc. bug: "
-labels: ["bug-unconfirmed"]
-body:
-  - type: markdown
-    attributes:
-      value: >
-        Thanks for taking the time to fill out this bug report!
-        This issue template is intended for miscellaneous bugs that don't fit into any other category.
-        If you encountered the issue while using an external UI (e.g. ollama),
-        please reproduce your issue using one of the examples/binaries in this repository.
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which version of our software is affected? (You can use `--version` to get a version string.)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: Operating systems
-      description: Which operating systems do you know to be affected?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: dropdown
-    id: module
-    attributes:
-      label: Which llama.cpp modules do you know to be affected?
-      multiple: true
-      options:
-        - Documentation/Github
-        - libllama (core library)
-        - llama-cli
-        - llama-server
-        - llama-bench
-        - llama-quantize
-        - Python/Bash scripts
-        - Test code
-        - Other (Please specify in the next section)
-    validations:
-      required: false
-  - type: textarea
-    id: command
-    attributes:
-      label: Command line
-      description: >
-        Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
-        This will be automatically formatted into code, so no need for backticks.
-      render: shell
-    validations:
-      required: false
-  - type: textarea
-    id: info
-    attributes:
-      label: Problem description & steps to reproduce
-      description: >
-        Please give us a summary of the problem and tell us how to reproduce it (if applicable).
-    validations:
-      required: true
-  - type: textarea
-    id: first_bad_commit
-    attributes:
-      label: First Bad Commit
-      description: >
-        If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
-        If possible, please do a git bisect and identify the exact commit that introduced the bug.
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: >
-          If applicable, please copy and paste any relevant log output, including any generated text.
-          This will be automatically formatted into code, so no need for backticks.
-      render: shell
-    validations:
-      required: false
diff --git a/.github/ISSUE_TEMPLATE/020-enhancement.yml b/.github/ISSUE_TEMPLATE/020-enhancement.yml
deleted file mode 100644
index cee1446f5..000000000
--- a/.github/ISSUE_TEMPLATE/020-enhancement.yml
+++ /dev/null
@@ -1,51 +0,0 @@
-name: Enhancement
-description: Used to request enhancements for llama.cpp.
-title: "Feature Request: "
-labels: ["enhancement"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas)
-
-  - type: checkboxes
-    id: prerequisites
-    attributes:
-      label: Prerequisites
-      description: Please confirm the following before submitting your enhancement request.
-      options:
-        - label: I am running the latest code. Mention the version if possible as well.
-          required: true
-        - label: I carefully followed the [README.md](https://github.com/ggml-org/llama.cpp/blob/master/README.md).
-          required: true
-        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
-          required: true
-        - label: I reviewed the [Discussions](https://github.com/ggml-org/llama.cpp/discussions), and have a new and useful enhancement to share.
-          required: true
-
-  - type: textarea
-    id: feature-description
-    attributes:
-      label: Feature Description
-      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
-      placeholder: Detailed description of the enhancement
-    validations:
-      required: true
-
-  - type: textarea
-    id: motivation
-    attributes:
-      label: Motivation
-      description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
-      placeholder: Explanation of why this feature is needed and its benefits
-    validations:
-      required: true
-
-  - type: textarea
-    id: possible-implementation
-    attributes:
-      label: Possible Implementation
-      description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
-      placeholder: Detailed description of potential implementation
-    validations:
-      required: false
diff --git a/.github/ISSUE_TEMPLATE/030-research.yml b/.github/ISSUE_TEMPLATE/030-research.yml
deleted file mode 100644
index e774550d5..000000000
--- a/.github/ISSUE_TEMPLATE/030-research.yml
+++ /dev/null
@@ -1,52 +0,0 @@
-name: Research
-description: Track new technical research area.
-title: "Research: "
-labels: ["research 🔬"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
-
-  - type: checkboxes
-    id: research-stage
-    attributes:
-      label: Research Stage
-      description: Track general state of this research ticket
-      options:
-        - label: Background Research (Let's try to avoid reinventing the wheel)
-        - label: Hypothesis Formed (How do you think this will work and it's effect?)
-        - label: Strategy / Implementation Forming
-        - label: Analysis of results
-        - label: Debrief / Documentation (So people in the future can learn from us)
-
-  - type: textarea
-    id: background
-    attributes:
-      label: Previous existing literature and research
-      description: Whats the current state of the art and whats the motivation for this research?
-
-  - type: textarea
-    id: hypothesis
-    attributes:
-      label: Hypothesis
-      description: How do you think this will work and it's effect?
-
-  - type: textarea
-    id: implementation
-    attributes:
-      label: Implementation
-      description: Got an approach? e.g. a PR ready to go?
-
-  - type: textarea
-    id: analysis
-    attributes:
-      label: Analysis
-      description: How does the proposed implementation behave?
-
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
diff --git a/.github/ISSUE_TEMPLATE/040-refactor.yml b/.github/ISSUE_TEMPLATE/040-refactor.yml
deleted file mode 100644
index 2fe94e26c..000000000
--- a/.github/ISSUE_TEMPLATE/040-refactor.yml
+++ /dev/null
@@ -1,28 +0,0 @@
-name: Refactor (Maintainers)
-description: Used to track refactoring opportunities.
-title: "Refactor: "
-labels: ["refactor"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
-        Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
-
-  - type: textarea
-    id: background-description
-    attributes:
-      label: Background Description
-      description: Please provide a detailed written description of the pain points you are trying to solve.
-      placeholder: Detailed description behind your motivation to request refactor
-    validations:
-      required: true
-
-  - type: textarea
-    id: possible-approaches
-    attributes:
-      label: Possible Refactor Approaches
-      description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
-      placeholder: Your idea of possible refactoring opportunity/approaches
-    validations:
-      required: false
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
deleted file mode 100644
index 0d246533c..000000000
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ /dev/null
@@ -1,11 +0,0 @@
-blank_issues_enabled: true
-contact_links:
-  - name: Got an idea?
-    url: https://github.com/ggml-org/llama.cpp/discussions/categories/ideas
-    about: Pop it there. It may then become an enhancement ticket.
-  - name: Got a question?
-    url: https://github.com/ggml-org/llama.cpp/discussions/categories/q-a
-    about: Ask a question there!
-  - name: Want to contribute?
-    url: https://github.com/ggml-org/llama.cpp/wiki/contribute
-    about: Head to the contribution guide page of the wiki for areas you can help with
diff --git a/.github/actions/get-tag-name/action.yml b/.github/actions/get-tag-name/action.yml
deleted file mode 100644
index 7ace23b2a..000000000
--- a/.github/actions/get-tag-name/action.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: "Determine tag name"
-description: "Determine the tag name to use for a release"
-outputs:
-  name:
-    description: "The name of the tag"
-    value: ${{ steps.tag.outputs.name }}
-
-runs:
-  using: "composite"
-  steps:
-    - name: Determine tag name
-      id: tag
-      shell: bash
-      run: |
-        BUILD_NUMBER="$(git rev-list --count HEAD)"
-        SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-        if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-          echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-        else
-          SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-          echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-        fi
diff --git a/.github/actions/windows-setup-cuda/action.yml b/.github/actions/windows-setup-cuda/action.yml
deleted file mode 100644
index 5575caeca..000000000
--- a/.github/actions/windows-setup-cuda/action.yml
+++ /dev/null
@@ -1,67 +0,0 @@
-name: "Windows - Setup CUDA Toolkit"
-description: "Setup CUDA Toolkit for Windows"
-inputs:
-  cuda_version:
-    description: "CUDA toolkit version"
-    required: true
-
-runs:
-  using: "composite"
-  steps:
-    - name: Install Cuda Toolkit 11.7
-      if: ${{ inputs.cuda_version == '11.7' }}
-      shell: pwsh
-      run: |
-          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
-          choco install unzip -y
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
-          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-          echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-
-    - name: Install Cuda Toolkit 12.4
-      if: ${{ inputs.cuda_version == '12.4' }}
-      shell: pwsh
-      run: |
-          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
-          choco install unzip -y
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
-          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-          echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
diff --git a/.github/actions/windows-setup-curl/action.yml b/.github/actions/windows-setup-curl/action.yml
deleted file mode 100644
index 446f799fa..000000000
--- a/.github/actions/windows-setup-curl/action.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-name: 'Windows - Setup CURL'
-description: 'Composite action, to be reused in other workflow'
-inputs:
-  curl_version:
-    description: 'CURL version'
-    required: false
-    default: '8.6.0_6'
-  architecture:
-    description: 'Architecture of the libcurl to download'
-    required: false
-    default: 'win64'
-outputs:
-  curl_path:
-    description: "Path to the downloaded libcurl"
-    value: ${{ steps.get_libcurl.outputs.curl_path }}
-
-runs:
-  using: "composite"
-  steps:
-    - name: libCURL
-      id: get_libcurl
-      shell: powershell
-      env:
-        CURL_VERSION: ${{ inputs.curl_version }}
-        ARCHITECTURE: ${{ inputs.architecture }}
-      run: |
-        curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
-        mkdir $env:RUNNER_TEMP/libcurl
-        tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
-        echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
deleted file mode 100644
index 3250e3279..000000000
--- a/.github/copilot-instructions.md
+++ /dev/null
@@ -1,262 +0,0 @@
-# Copilot Instructions for llama.cpp
-
-## Repository Overview
-
-llama.cpp is a large-scale C/C++ project for efficient LLM (Large Language Model) inference with minimal setup and dependencies. The project enables running language models on diverse hardware with state-of-the-art performance.
-
-**Key Facts:**
-- **Primary language**: C/C++ with Python utility scripts
-- **Size**: ~200k+ lines of code across 1000+ files
-- **Architecture**: Modular design with main library (`libllama`) and 40+ executable tools/examples
-- **Core dependency**: ggml tensor library (vendored in `ggml/` directory)
-- **Backends supported**: CPU (AVX/NEON optimized), CUDA, Metal, Vulkan, SYCL, ROCm, MUSA
-- **License**: MIT
-
-## Build Instructions
-
-### Prerequisites
-- CMake 3.14+ (primary build system)
-- C++17 compatible compiler (GCC 13.3+, Clang, MSVC)
-- Optional: ccache for faster compilation
-
-### Basic Build (CPU-only)
-**ALWAYS run these commands in sequence:**
-```bash
-cmake -B build
-cmake --build build --config Release -j $(nproc)
-```
-
-**Build time**: ~10 minutes on 4-core system with ccache enabled, ~25 minutes without ccache.
-
-**Important Notes:**
-- The Makefile is deprecated - always use CMake
-- ccache is automatically detected and used if available
-- Built binaries are placed in `build/bin/`
-- Parallel builds (`-j`) significantly reduce build time
-
-### Backend-Specific Builds
-For CUDA support:
-```bash
-cmake -B build -DGGML_CUDA=ON
-cmake --build build --config Release -j $(nproc)
-```
-
-For Metal (macOS):
-```bash
-cmake -B build -DGGML_METAL=ON
-cmake --build build --config Release -j $(nproc)
-```
-
-**Important Note**: While all backends can be built as long as the correct requirements for that backend are installed, you will not be able to run them without the correct hardware. The only backend that can be run for testing and validation is the CPU backend.
-
-### Debug Builds
-Single-config generators:
-```bash
-cmake -B build -DCMAKE_BUILD_TYPE=Debug
-cmake --build build
-```
-
-Multi-config generators:
-```bash
-cmake -B build -G "Xcode"
-cmake --build build --config Debug
-```
-
-### Common Build Issues
-- **Issue**: Network tests fail in isolated environments
-  **Solution**: Expected behavior - core functionality tests will still pass
-
-## Testing
-
-### Running Tests
-```bash
-ctest --test-dir build --output-on-failure -j $(nproc)
-```
-
-**Test suite**: 38 tests covering tokenizers, grammar parsing, sampling, backends, and integration
-**Expected failures**: 2-3 tests may fail if network access is unavailable (they download models)
-**Test time**: ~30 seconds for passing tests
-
-### Server Unit Tests
-Run server-specific unit tests after building the server:
-```bash
-# Build the server first
-cmake --build build --target llama-server
-
-# Navigate to server tests and run
-cd tools/server/tests
-source ../../../.venv/bin/activate
-./tests.sh
-```
-**Server test dependencies**: The `.venv` environment includes the required dependencies for server unit tests (pytest, aiohttp, etc.). Tests can be run individually or with various options as documented in `tools/server/tests/README.md`.
-
-### Test Categories
-- Tokenizer tests: Various model tokenizers (BERT, GPT-2, LLaMA, etc.)
-- Grammar tests: GBNF parsing and validation
-- Backend tests: Core ggml operations across different backends
-- Integration tests: End-to-end workflows
-
-### Manual Testing Commands
-```bash
-# Test basic inference
-./build/bin/llama-cli --version
-
-# Test model loading (requires model file)
-./build/bin/llama-cli -m path/to/model.gguf -p "Hello" -n 10
-```
-
-## Code Quality and Linting
-
-### C++ Code Formatting
-**ALWAYS format C++ code before committing:**
-```bash
-git clang-format
-```
-
-Configuration is in `.clang-format` with these key rules:
-- 4-space indentation
-- 120 column limit
-- Braces on same line for functions
-- Pointer alignment: `void * ptr` (middle)
-- Reference alignment: `int & ref` (middle)
-
-### Python Code
-**ALWAYS activate the Python environment in `.venv` and use tools from that environment:**
-```bash
-# Activate virtual environment
-source .venv/bin/activate
-```
-
-Configuration files:
-- `.flake8`: flake8 settings (max-line-length=125, excludes examples/tools)
-- `pyrightconfig.json`: pyright type checking configuration
-
-### Pre-commit Hooks
-Run before committing:
-```bash
-pre-commit run --all-files
-```
-
-## Continuous Integration
-
-### GitHub Actions Workflows
-Key workflows that run on every PR:
-- `.github/workflows/build.yml`: Multi-platform builds
-- `.github/workflows/server.yml`: Server functionality tests
-- `.github/workflows/python-lint.yml`: Python code quality
-- `.github/workflows/python-type-check.yml`: Python type checking
-
-### Local CI Validation
-**Run full CI locally before submitting PRs:**
-```bash
-mkdir tmp
-
-# CPU-only build
-bash ./ci/run.sh ./tmp/results ./tmp/mnt
-```
-
-**CI Runtime**: 30-60 minutes depending on backend configuration
-
-### Triggering CI
-Add `ggml-ci` to commit message to trigger heavy CI workloads on the custom CI infrastructure.
-
-## Project Layout and Architecture
-
-### Core Directories
-- **`src/`**: Main llama library implementation (`llama.cpp`, `llama-*.cpp`)
-- **`include/`**: Public API headers, primarily `include/llama.h`
-- **`ggml/`**: Core tensor library (submodule with custom GGML framework)
-- **`examples/`**: 30+ example applications and tools
-- **`tools/`**: Additional development and utility tools (server benchmarks, tests)
-- **`tests/`**: Comprehensive test suite with CTest integration
-- **`docs/`**: Detailed documentation (build guides, API docs, etc.)
-- **`scripts/`**: Utility scripts for CI, data processing, and automation
-- **`common/`**: Shared utility code used across examples
-
-### Key Files
-- **`CMakeLists.txt`**: Primary build configuration
-- **`include/llama.h`**: Main C API header (~2000 lines)
-- **`src/llama.cpp`**: Core library implementation (~8000 lines)
-- **`CONTRIBUTING.md`**: Coding guidelines and PR requirements
-- **`.clang-format`**: C++ formatting rules
-- **`.pre-commit-config.yaml`**: Git hook configuration
-
-### Built Executables (in `build/bin/`)
-Primary tools:
-- **`llama-cli`**: Main inference tool
-- **`llama-server`**: OpenAI-compatible HTTP server
-- **`llama-quantize`**: Model quantization utility
-- **`llama-perplexity`**: Model evaluation tool
-- **`llama-bench`**: Performance benchmarking
-- **`llama-convert-llama2c-to-ggml`**: Model conversion utilities
-
-### Configuration Files
-- **CMake**: `CMakeLists.txt`, `cmake/` directory
-- **Linting**: `.clang-format`, `.clang-tidy`, `.flake8`
-- **CI**: `.github/workflows/`, `ci/run.sh`
-- **Git**: `.gitignore` (includes build artifacts, models, cache)
-
-### Dependencies
-- **System**: OpenMP, libcurl (for model downloading)
-- **Optional**: CUDA SDK, Metal framework, Vulkan SDK, Intel oneAPI
-- **Bundled**: httplib, json (header-only libraries in vendored form)
-
-## Common Validation Steps
-
-### After Making Changes
-1. **Format code**: `git clang-format`
-2. **Build**: `cmake --build build --config Release`
-3. **Test**: `ctest --test-dir build --output-on-failure`
-4. **Server tests** (if modifying server): `cd tools/server/tests && source ../../../.venv/bin/activate && ./tests.sh`
-5. **Manual validation**: Test relevant tools in `build/bin/`
-
-### Performance Validation
-```bash
-# Benchmark inference performance
-./build/bin/llama-bench -m model.gguf
-
-# Evaluate model perplexity
-./build/bin/llama-perplexity -m model.gguf -f dataset.txt
-```
-
-### Backend Validation
-```bash
-# Test backend operations
-./build/bin/test-backend-ops
-```
-
-## Environment Setup
-
-### Required Tools
-- CMake 3.14+ (install via system package manager)
-- Modern C++ compiler with C++17 support
-- Git (for submodule management)
-- Python 3.9+ with virtual environment (`.venv` is provided)
-
-### Optional but Recommended
-- ccache: `apt install ccache` or `brew install ccache`
-- clang-format 15+: Usually included with LLVM/Clang installation
-- pre-commit: `pip install pre-commit`
-
-### Backend-Specific Requirements
-- **CUDA**: NVIDIA CUDA Toolkit 11.2+
-- **Metal**: Xcode command line tools (macOS only)
-- **Vulkan**: Vulkan SDK
-- **SYCL**: Intel oneAPI toolkit
-
-## Important Guidelines
-
-### Code Changes
-- **Minimal dependencies**: Avoid adding new external dependencies
-- **Cross-platform compatibility**: Test on Linux, macOS, Windows when possible
-- **Performance focus**: This is a performance-critical inference library
-- **API stability**: Changes to `include/llama.h` require careful consideration
-
-### Git Workflow
-- Always create feature branches from `master`
-- **Never** commit build artifacts (`build/`, `.ccache/`, `*.o`, `*.gguf`)
-- Use descriptive commit messages following project conventions
-
-### Trust These Instructions
-Only search for additional information if these instructions are incomplete or found to be incorrect. This document contains validated build and test procedures that work reliably across different environments.
-
diff --git a/.github/labeler.yml b/.github/labeler.yml
deleted file mode 100644
index c4da4ab4e..000000000
--- a/.github/labeler.yml
+++ /dev/null
@@ -1,99 +0,0 @@
-# https://github.com/actions/labeler
-Apple Metal:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-metal.h
-            - ggml/src/ggml-metal/**
-            - README-metal.md
-SYCL:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-sycl.h
-            - ggml/src/ggml-sycl/**
-            - docs/backend/SYCL.md
-            - examples/sycl/**
-Nvidia GPU:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-cuda.h
-            - ggml/src/ggml-cuda/**
-Vulkan:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-vulkan.h
-            - ggml/src/ggml-vulkan/**
-IBM zDNN:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-zdnn.h
-            - ggml/src/ggml-zdnn/**
-documentation:
-    - changed-files:
-        - any-glob-to-any-file:
-            - docs/**
-            - media/**
-testing:
-    - changed-files:
-        - any-glob-to-any-file:
-            - tests/**
-build:
-    - changed-files:
-        - any-glob-to-any-file:
-            - cmake/**
-            - CMakeLists.txt
-            - CMakePresets.json
-examples:
-    - changed-files:
-        - any-glob-to-any-file:
-            - examples/**
-            - tools/**
-devops:
-    - changed-files:
-        - any-glob-to-any-file:
-            - .devops/**
-            - .github/**
-            - ci/**
-python:
-    - changed-files:
-        - any-glob-to-any-file:
-            - "**/*.py"
-            - requirements/**
-            - gguf-py/**
-            - .flake8
-script:
-    - changed-files:
-        - any-glob-to-any-file:
-            - scripts/**
-android:
-    - changed-files:
-        - any-glob-to-any-file:
-            - examples/llama.android/**
-server:
-    - changed-files:
-        - any-glob-to-any-file:
-            - tools/server/**
-ggml:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/**
-nix:
-    - changed-files:
-        - any-glob-to-any-file:
-            - "**/*.nix"
-            - .github/workflows/nix-*.yml
-            - .devops/nix/nixpkgs-instances.nix
-embedding:
-    - changed-files:
-        - any-glob-to-any-file: examples/embedding/
-
-Ascend NPU:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-cann.h
-            - ggml/src/ggml-cann/**
-            - docs/backend/CANN.md
-OpenCL:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-opencl.h
-            - ggml/src/ggml-opencl/**
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
deleted file mode 100644
index d0bdd73c4..000000000
--- a/.github/pull_request_template.md
+++ /dev/null
@@ -1 +0,0 @@
-*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
diff --git a/.github/workflows/bench.yml.disabled b/.github/workflows/bench.yml.disabled
deleted file mode 100644
index f2d7e16e9..000000000
--- a/.github/workflows/bench.yml.disabled
+++ /dev/null
@@ -1,304 +0,0 @@
-# TODO: there have been some issues with the workflow, so disabling for now
-#       https://github.com/ggml-org/llama.cpp/issues/7893
-#
-# Benchmark
-name: Benchmark
-
-on:
-  workflow_dispatch:
-    inputs:
-      gpu-series:
-        description: 'Azure GPU series to run with'
-        required: true
-        type: choice
-        options:
-          - Standard_NC4as_T4_v3
-          - Standard_NC24ads_A100_v4
-          - Standard_NC80adis_H100_v5
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-      duration:
-        description: 'Duration of the bench'
-        type: string
-        default: 10m
-
-  push:
-    branches:
-      - master
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
-  pull_request_target:
-    types: [opened, synchronize, reopened]
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
-  schedule:
-    -  cron: '04 2 * * *'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
-  cancel-in-progress: true
-
-jobs:
-  bench-server-baseline:
-    runs-on: Standard_NC4as_T4_v3
-    env:
-      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
-      N_USERS: 8
-      DURATION: 10m
-
-    strategy:
-      matrix:
-        model: [phi-2]
-        ftype: [q4_0, q8_0, f16]
-        include:
-          - model: phi-2
-            ftype: q4_0
-            pr_comment_enabled: "true"
-
-    if: |
-      inputs.gpu-series == 'Standard_NC4as_T4_v3'
-      || github.event_name == 'pull_request_target'
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Install python env
-        id: pipenv
-        run: |
-          cd tools/server/bench
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r requirements.txt
-
-      - name: Prometheus
-        id: install_prometheus
-        run: |
-          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
-          tar xzf prometheus*.tar.gz --strip-components=1
-          ./prometheus --config.file=tools/server/bench/prometheus.yml &
-          while ! nc -z localhost 9090; do
-            sleep 0.1
-          done
-
-      - name: Set up Go
-        uses: actions/setup-go@v5
-        with:
-          go-version: '1.21'
-
-      - name: Install k6 and xk6-sse
-        id: k6_installation
-        run: |
-          cd tools/server/bench
-          go install go.k6.io/xk6/cmd/xk6@latest
-          xk6 build master \
-              --with github.com/phymbert/xk6-sse
-
-      - name: Build
-        id: cmake_build
-        run: |
-          set -eux
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DLLAMA_CUBLAS=ON \
-              -DCUDAToolkit_ROOT=/usr/local/cuda \
-              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
-              -DCMAKE_CUDA_ARCHITECTURES=75 \
-              -DLLAMA_FATAL_WARNINGS=OFF \
-              -DLLAMA_ALL_WARNINGS=OFF \
-              -DCMAKE_BUILD_TYPE=Release;
-          cmake --build build --config Release -j $(nproc) --target llama-server
-
-      - name: Download the dataset
-        id: download_dataset
-        run: |
-          cd tools/server/bench
-          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-      - name: Server bench
-        id: server_bench
-        env:
-            HEAD_REF: ${{ github.head_ref || github.ref_name }}
-        run: |
-          set -eux
-
-          cd tools/server/bench
-          source venv/bin/activate
-          python bench.py \
-              --runner-label ${{ env.RUNNER_LABEL }} \
-              --name ${{ github.job }} \
-              --branch $HEAD_REF \
-              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
-              --scenario script.js \
-              --duration ${{ github.event.inputs.duration || env.DURATION }} \
-              --hf-repo ggml-org/models	 \
-              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
-              --model-path-prefix /models \
-              --parallel ${{ env.N_USERS }} \
-              -ngl 33 \
-              --batch-size 2048 \
-              --ubatch-size	256 \
-              --ctx-size 16384 \
-              --n-prompts 1000 \
-              --max-prompt-tokens 1024 \
-              --max-tokens 2048
-
-          cat results.github.env >> $GITHUB_ENV
-
-          # Remove dataset as we do not want it in the artefact
-          rm ShareGPT_V3_unfiltered_cleaned_split.json
-
-      - uses: actions/upload-artifact@v4
-        with:
-          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          compression-level: 9
-          path: |
-            tools/server/bench/*.jpg
-            tools/server/bench/*.json
-            tools/server/bench/*.log
-
-      - name: Commit status
-        uses: Sibz/github-status-action@v1
-        with:
-          authToken: ${{secrets.GITHUB_TOKEN}}
-          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
-          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          description: |
-            ${{ env.BENCH_RESULTS }}
-          state: 'success'
-
-      - name: Upload benchmark images
-        uses: devicons/public-upload-to-imgur@v2.2.2
-        continue-on-error: true # Important as it looks unstable: 503
-        id: imgur_step
-        with:
-          client_id: ${{secrets.IMGUR_CLIENT_ID}}
-          path: |
-            tools/server/bench/prompt_tokens_seconds.jpg
-            tools/server/bench/predicted_tokens_seconds.jpg
-            tools/server/bench/kv_cache_usage_ratio.jpg
-            tools/server/bench/requests_processing.jpg
-
-      - name: Extract mermaid
-        id: set_mermaid
-        run: |
-          set -eux
-
-          cd tools/server/bench
-          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
-          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
-          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
-          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
-          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
-          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
-          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
-          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
-          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-      - name: Extract image url
-        id: extract_image_url
-        continue-on-error: true
-        run: |
-          set -eux
-
-          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
-          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
-          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
-          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
-
-      - name: Comment PR
-        uses: mshick/add-pr-comment@v2
-        id: comment_pr
-        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
-        with:
-          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          message: |
-            <p align="center">
-
-            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
-
-            </p>
-
-            <details>
-
-            <summary>Expand details for performance related PR only</summary>
-
-            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
-            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
-            - ${{ env.BENCH_GRAPH_XLABEL }}
-
-
-            <p align="center">
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
-
-            <details>
-
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PROMPT_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
-
-            <details>
-                <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PREDICTED_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            </p>
-
-            <details>
-
-            <summary>Details</summary>
-
-            <p align="center">
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
-
-            <details>
-                <summary>More</summary>
-
-            ```mermaid
-            ${{ env.KV_CACHE_USAGE_RATIO }}
-            ```
-
-            </details>
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
-
-            <details>
-                <summary>More</summary>
-
-            ```mermaid
-            ${{ env.REQUESTS_PROCESSING }}
-            ```
-
-            </details>
-
-            </p>
-            </details>
-            </details>
diff --git a/.github/workflows/build-cmake-pkg.yml b/.github/workflows/build-cmake-pkg.yml
deleted file mode 100644
index fee2ab96b..000000000
--- a/.github/workflows/build-cmake-pkg.yml
+++ /dev/null
@@ -1,51 +0,0 @@
-name: Build relocatable cmake package
-on:
-  workflow_dispatch:
-  workflow_call:
-
-jobs:
-  linux:
-    runs-on: ubuntu-24.04
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y build-essential tcl
-
-      - name: Build
-        run: |
-          PREFIX="$(pwd)"/inst
-          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
-                -DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
-                -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
-          cmake --build build --config Release
-          cmake --install build --prefix "$PREFIX" --config Release
-
-          export LLAMA_CONFIG="$PREFIX"/lib/cmake/llama/llama-config.cmake
-          tclsh <<'EOF'
-          set build(commit)  [string trim [exec git rev-parse --short HEAD]]
-          set build(number)  [string trim [exec git rev-list  --count HEAD]]
-          set build(version) "0.0.$build(number)"
-
-          set llamaconfig [read [open "$env(LLAMA_CONFIG)" r]]
-          set checks [list "set\\(LLAMA_VERSION     \\s+$build(version)\\)" \
-                           "set\\(LLAMA_BUILD_COMMIT\\s+$build(commit)\\)" \
-                           "set\\(LLAMA_BUILD_NUMBER\\s+$build(number)\\)"]
-
-          puts -nonewline "Checking llama-config.cmake version... "
-          foreach check $checks {
-              if {![regexp -expanded -- $check $llamaconfig]} {
-                  puts "\"$check\" failed!"
-                  exit 1
-              }
-          }
-          puts "success."
-          EOF
-
-          cd examples/simple-cmake-pkg
-          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX"/lib/cmake
-          cmake --build build
diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml
deleted file mode 100644
index 04ad187d3..000000000
--- a/.github/workflows/build-linux-cross.yml
+++ /dev/null
@@ -1,346 +0,0 @@
-name: Build on Linux using cross-compiler
-on:
-  workflow_dispatch:
-  workflow_call:
-
-jobs:
-  ubuntu-24-riscv64-cpu-cross:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Riscv
-        run: |
-          sudo dpkg --add-architecture riscv64
-
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-          EOF
-
-          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  gcc-14-riscv64-linux-gnu \
-                  g++-14-riscv64-linux-gnu
-
-      - name: Build
-        run: |
-          cmake -B build -DLLAMA_CURL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-                         -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-                         -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-          cmake --build build --config Release -j $(nproc)
-
-  # ubuntu-24-riscv64-vulkan-cross:
-  #   runs-on: ubuntu-24.04
-
-  #   steps:
-  #     - uses: actions/checkout@v4
-  #     - name: Setup Riscv
-  #       run: |
-  #         sudo dpkg --add-architecture riscv64
-
-  #         # Add arch-specific repositories for non-amd64 architectures
-  #         cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-  #         EOF
-
-  #         sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
-  #         sudo apt-get install -y --no-install-recommends \
-  #                 build-essential \
-  #                 glslc \
-  #                 gcc-14-riscv64-linux-gnu \
-  #                 g++-14-riscv64-linux-gnu \
-  #                 libvulkan-dev:riscv64
-
-  #     - name: Build
-  #       run: |
-  #         cmake -B build -DLLAMA_CURL=OFF \
-  #                        -DCMAKE_BUILD_TYPE=Release \
-  #                        -DGGML_VULKAN=ON \
-  #                        -DGGML_OPENMP=OFF \
-  #                        -DLLAMA_BUILD_EXAMPLES=ON \
-  #                        -DLLAMA_BUILD_TOOLS=ON \
-  #                        -DLLAMA_BUILD_TESTS=OFF \
-  #                        -DCMAKE_SYSTEM_NAME=Linux \
-  #                        -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-  #                        -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-  #                        -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-  #                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-  #                        -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-  #         cmake --build build --config Release -j $(nproc)
-
-  # ubuntu-24-arm64-vulkan-cross:
-  #   runs-on: ubuntu-24.04
-
-  #   steps:
-  #     - uses: actions/checkout@v4
-  #     - name: Setup Arm64
-  #       run: |
-  #         sudo dpkg --add-architecture arm64
-
-  #         # Add arch-specific repositories for non-amd64 architectures
-  #         cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
-  #         deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-  #         deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-  #         deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-  #         deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-  #         EOF
-
-  #         sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
-  #         sudo apt-get install -y --no-install-recommends \
-  #                 build-essential \
-  #                 glslc \
-  #                 crossbuild-essential-arm64 \
-  #                 libvulkan-dev:arm64
-
-  #     - name: Build
-  #       run: |
-  #         cmake -B build -DLLAMA_CURL=OFF \
-  #                        -DCMAKE_BUILD_TYPE=Release \
-  #                        -DGGML_VULKAN=ON \
-  #                        -DGGML_OPENMP=OFF \
-  #                        -DLLAMA_BUILD_EXAMPLES=ON \
-  #                        -DLLAMA_BUILD_TOOLS=ON \
-  #                        -DLLAMA_BUILD_TESTS=OFF \
-  #                        -DCMAKE_SYSTEM_NAME=Linux \
-  #                        -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
-  #                        -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
-  #                        -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
-  #                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-  #                        -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-  #         cmake --build build --config Release -j $(nproc)
-
-  ubuntu-24-ppc64el-cpu-cross:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup PowerPC64le
-        run: |
-          sudo dpkg --add-architecture ppc64el
-
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
-          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-          EOF
-
-          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  gcc-14-powerpc64le-linux-gnu \
-                  g++-14-powerpc64le-linux-gnu
-
-      - name: Build
-        run: |
-          cmake -B build -DLLAMA_CURL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
-                         -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
-                         -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-          cmake --build build --config Release -j $(nproc)
-
-  # ubuntu-24-ppc64el-vulkan-cross:
-  #   runs-on: ubuntu-24.04
-
-  #   steps:
-  #     - uses: actions/checkout@v4
-  #     - name: Setup PowerPC64le
-  #       run: |
-  #         sudo dpkg --add-architecture ppc64el
-
-  #         # Add arch-specific repositories for non-amd64 architectures
-  #         cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
-  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-  #         EOF
-
-  #         sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
-  #         sudo apt-get install -y --no-install-recommends \
-  #                 build-essential \
-  #                 glslc \
-  #                 gcc-14-powerpc64le-linux-gnu \
-  #                 g++-14-powerpc64le-linux-gnu \
-  #                 libvulkan-dev:ppc64el
-
-  #     - name: Build
-  #       run: |
-  #         cmake -B build -DLLAMA_CURL=OFF \
-  #                        -DCMAKE_BUILD_TYPE=Release \
-  #                        -DGGML_VULKAN=ON \
-  #                        -DGGML_OPENMP=OFF \
-  #                        -DLLAMA_BUILD_EXAMPLES=ON \
-  #                        -DLLAMA_BUILD_TOOLS=ON \
-  #                        -DLLAMA_BUILD_TESTS=OFF \
-  #                        -DCMAKE_SYSTEM_NAME=Linux \
-  #                        -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
-  #                        -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
-  #                        -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
-  #                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-  #                        -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-  #         cmake --build build --config Release -j $(nproc)
-
-  debian-13-loongarch64-cpu-cross:
-    runs-on: ubuntu-24.04
-    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
-
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup LoongArch
-        run: |
-          rm -f /etc/apt/sources.list.d/*
-          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
-          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
-          EOF
-          ( echo 'quiet "true";'; \
-            echo 'APT::Get::Assume-Yes "true";'; \
-            echo 'APT::Install-Recommends "false";'; \
-            echo 'Acquire::Check-Valid-Until "false";'; \
-            echo 'Acquire::Retries "5";'; \
-          ) > /etc/apt/apt.conf.d/99snapshot-repos
-
-          apt-get update
-          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
-          dpkg --add-architecture loong64
-
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
-          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
-          EOF
-
-          apt-get update || true    ;# Prevent failure due to missing URLs.
-
-          apt-get install -y --no-install-recommends \
-                  build-essential \
-                  gcc-14-loongarch64-linux-gnu \
-                  g++-14-loongarch64-linux-gnu
-
-      - name: Build
-        run: |
-          cmake -B build -DLLAMA_CURL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
-                         -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
-                         -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-          cmake --build build --config Release -j $(nproc)
-
-  debian-13-loongarch64-vulkan-cross:
-    runs-on: ubuntu-24.04
-    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
-
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup LoongArch
-        run: |
-          rm -f /etc/apt/sources.list.d/*
-          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
-          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
-          EOF
-          ( echo 'quiet "true";'; \
-            echo 'APT::Get::Assume-Yes "true";'; \
-            echo 'APT::Install-Recommends "false";'; \
-            echo 'Acquire::Check-Valid-Until "false";'; \
-            echo 'Acquire::Retries "5";'; \
-          ) > /etc/apt/apt.conf.d/99snapshot-repos
-
-          apt-get update
-          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
-          dpkg --add-architecture loong64
-
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
-          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
-          EOF
-
-          apt-get update || true    ;# Prevent failure due to missing URLs.
-
-          apt-get install -y --no-install-recommends \
-                  build-essential \
-                  glslc \
-                  gcc-14-loongarch64-linux-gnu \
-                  g++-14-loongarch64-linux-gnu \
-                  libvulkan-dev:loong64
-
-      - name: Build
-        run: |
-          cmake -B build -DLLAMA_CURL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_VULKAN=ON \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
-                         -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
-                         -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-          cmake --build build --config Release -j $(nproc)
diff --git a/.github/workflows/build-riscv-native.yml b/.github/workflows/build-riscv-native.yml
deleted file mode 100644
index 86dc0ff76..000000000
--- a/.github/workflows/build-riscv-native.yml
+++ /dev/null
@@ -1,60 +0,0 @@
-name: Build on RISCV Linux Machine by Cloud-V
-on:
-  pull_request:
-  workflow_dispatch:
-  workflow_call:
-
-jobs:
-  debian-13-riscv64-native: # Bianbu 2.2
-    runs-on: self-hosted
-
-    steps:
-      - name: Install prerequisites
-        run: |
-          sudo apt-get update || true
-          sudo apt-get install -y libatomic1
-      - uses: actions/checkout@v4
-      - name: Setup Riscv
-        run: |
-          sudo apt-get update || true
-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  gcc-14-riscv64-linux-gnu \
-                  g++-14-riscv64-linux-gnu \
-                  ccache \
-                  cmake
-
-      - name: Setup ccache
-        run: |
-          mkdir -p $HOME/.ccache
-          ccache -M 5G -d $HOME/.ccache
-          export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
-          export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
-          echo "$GITHUB_WORKSPACE"
-          echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
-          echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
-          echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
-          echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
-
-      - name: Build
-        run: |
-          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DCMAKE_SYSTEM_NAME=Linux \
-            -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-            -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-            -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-            -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-          cmake --build build --config Release -j $(nproc)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
deleted file mode 100644
index 43553ac13..000000000
--- a/.github/workflows/build.yml
+++ /dev/null
@@ -1,1209 +0,0 @@
-name: CI
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build.yml',
-      '.github/workflows/build-linux-cross.yml',
-      '.github/workflows/build-cmake-pkg.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.cuh',
-      '**/*.swift',
-      '**/*.m',
-      '**/*.metal',
-      '**/*.comp'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build.yml',
-      '.github/workflows/build-linux-cross.yml',
-      '.github/workflows/build-cmake-pkg.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.cuh',
-      '**/*.swift',
-      '**/*.m',
-      '**/*.metal',
-      '**/*.comp'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  macOS-latest-cmake-arm64:
-    runs-on: macos-14
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-arm64
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-          brew install curl
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=OFF \
-            -DGGML_METAL_SHADER_DEBUG=ON \
-            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L 'main|curl' --verbose --timeout 900
-
-  macOS-latest-cmake-x64:
-    runs-on: macos-13
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-x64
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-          brew install curl
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_METAL=OFF \
-            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  macOS-latest-cmake-arm64-webgpu:
-    runs-on: macos-14
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-arm64-webgpu
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-          brew install curl
-
-      - name: Dawn Dependency
-        id: dawn-depends
-        run: |
-          DAWN_VERSION="v1.0.0"
-          DAWN_OWNER="reeselevine"
-          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz"
-          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
-          curl -L -o artifact.tar.gz \
-            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
-          mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
-
-      - name: Build
-        id: cmake_build
-        run: |
-          export CMAKE_PREFIX_PATH=dawn
-          cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  ubuntu-cpu-cmake:
-    strategy:
-      matrix:
-        include:
-          - build: 'x64'
-            os: ubuntu-22.04
-          - build: 'arm64'
-            os: ubuntu-22.04-arm
-
-    runs-on: ${{ matrix.os }}
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ubuntu-cpu-cmake
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L 'main|curl' --verbose --timeout 900
-
-      - name: Test llama2c conversion
-        id: llama2c_test
-        run: |
-          cd build
-          echo "Fetch tokenizer"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
-          echo "Fetch llama2c model"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
-
-  ubuntu-latest-cmake-sanitizer:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-
-      - name: Build
-        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DGGML_OPENMP=OFF
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  ubuntu-latest-llguidance:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake .. \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_LLGUIDANCE=ON
-          cmake --build . --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  ubuntu-latest-cmake-rpc:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ubuntu-latest-cmake-rpc
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose
-
-  ubuntu-22-cmake-vulkan:
-    runs-on: ubuntu-22.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-vulkan
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DGGML_VULKAN=ON
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          export GGML_VK_VISIBLE_DEVICES=0
-          # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 4200
-
-  ubuntu-22-cmake-webgpu:
-    runs-on: ubuntu-22.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-webgpu
-          evict-old-files: 1d
-
-      - name: Vulkan SDK Dependencies
-        id: vulkan-depends
-        run: |
-          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
-
-      - name: Dawn Dependency
-        id: dawn-depends
-        run: |
-          sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
-          DAWN_VERSION="v1.0.0"
-          DAWN_OWNER="reeselevine"
-          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-ubuntu-latest-Release.tar.gz"
-          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
-          curl -L -o artifact.tar.gz \
-            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
-          mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
-
-      - name: Build
-        id: cmake_build
-        run: |
-          export Dawn_DIR=dawn/lib64/cmake/Dawn
-          cmake -B build -DGGML_WEBGPU=ON
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 3600
-
-  ubuntu-22-cmake-hip:
-    runs-on: ubuntu-22.04
-    container: rocm/dev-ubuntu-22.04:6.1.2
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-hip
-          evict-old-files: 1d
-
-      - name: Build with native CMake HIP support
-        id: cmake_build
-        run: |
-          cmake -B build -S . \
-            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-            -DGGML_HIP_ROCWMMA_FATTN=ON \
-            -DGGML_HIP=ON
-          cmake --build build --config Release -j $(nproc)
-
-  ubuntu-22-cmake-musa:
-    runs-on: ubuntu-22.04
-    container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          apt-get update
-          apt-get install -y build-essential git cmake libcurl4-openssl-dev
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-musa
-          evict-old-files: 1d
-
-      - name: Build with native CMake MUSA support
-        id: cmake_build
-        run: |
-          cmake -B build -S . \
-            -DGGML_MUSA=ON
-          cmake --build build --config Release -j $(nproc)
-
-  ubuntu-22-cmake-sycl:
-    runs-on: ubuntu-22.04
-
-    continue-on-error: true
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: add oneAPI to apt
-        shell: bash
-        run: |
-          cd /tmp
-          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
-
-      - name: install oneAPI dpcpp compiler
-        shell: bash
-        run: |
-          sudo apt update
-          sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
-
-      - name: install oneAPI MKL library
-        shell: bash
-        run: |
-          sudo apt install intel-oneapi-mkl-devel
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-sycl
-          evict-old-files: 1d
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx
-          cmake --build build --config Release -j $(nproc)
-
-  ubuntu-22-cmake-sycl-fp16:
-    runs-on: ubuntu-22.04
-
-    continue-on-error: true
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: add oneAPI to apt
-        shell: bash
-        run: |
-          cd /tmp
-          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
-
-      - name: install oneAPI dpcpp compiler
-        shell: bash
-        run: |
-          sudo apt update
-          sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
-
-      - name: install oneAPI MKL library
-        shell: bash
-        run: |
-          sudo apt install intel-oneapi-mkl-devel
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-sycl-fp16
-          evict-old-files: 1d
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DGGML_SYCL_F16=ON
-          cmake --build build --config Release -j $(nproc)
-
-  build-linux-cross:
-    uses: ./.github/workflows/build-linux-cross.yml
-
-  build-cmake-pkg:
-    uses: ./.github/workflows/build-cmake-pkg.yml
-
-  macOS-latest-cmake-ios:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-ios
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_COMMON=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-  macOS-latest-cmake-tvos:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-tvos
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_COMMON=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=tvOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-  macOS-latest-cmake-visionos:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_COMMON=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=visionOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-  macOS-latest-swift:
-    runs-on: macos-latest
-
-    strategy:
-      matrix:
-        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-swift
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build llama.cpp with CMake
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          ./build-xcframework.sh
-
-  windows-msys2:
-    runs-on: windows-2025
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - { sys: UCRT64,  env: ucrt-x86_64,  build: Release }
-          - { sys: CLANG64, env: clang-x86_64, build: Release }
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: windows-msys2
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Setup ${{ matrix.sys }}
-        uses: msys2/setup-msys2@v2
-        with:
-          update: true
-          msystem: ${{matrix.sys}}
-          install: >-
-            base-devel
-            git
-            mingw-w64-${{matrix.env}}-toolchain
-            mingw-w64-${{matrix.env}}-cmake
-            mingw-w64-${{matrix.env}}-openblas
-
-      - name: Build using CMake
-        shell: msys2 {0}
-        run: |
-            cmake -B build
-            cmake --build build --config ${{ matrix.build }} -j $(nproc)
-
-      - name: Clean after building using CMake
-        shell: msys2 {0}
-        run: |
-            rm -rf build
-
-      - name: Build using CMake w/ OpenBLAS
-        shell: msys2 {0}
-        run: |
-            cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-            cmake --build build --config ${{ matrix.build }} -j $(nproc)
-
-  windows-latest-cmake:
-    runs-on: windows-2025
-
-    env:
-      OPENBLAS_VERSION: 0.3.23
-      SDE_VERSION: 9.33.0-2024-01-07
-      VULKAN_VERSION: 1.4.313.2
-
-    strategy:
-      matrix:
-        include:
-          - build: 'cpu-x64 (static)'
-            arch: 'x64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
-          - build: 'openblas-x64'
-            arch: 'x64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
-          - build: 'vulkan-x64'
-            arch: 'x64'
-            defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
-          - build: 'llvm-arm64'
-            arch: 'arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
-          - build: 'llvm-arm64-opencl-adreno'
-            arch: 'arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: windows-latest-cmake-${{ matrix.build }}
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Download OpenBLAS
-        id: get_openblas
-        if: ${{ matrix.build == 'openblas-x64' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
-          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
-          mkdir $env:RUNNER_TEMP/openblas
-          tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
-          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
-          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
-          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
-          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
-
-      - name: Install Vulkan SDK
-        id: get_vulkan
-        if: ${{ matrix.build == 'vulkan-x64' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
-          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
-          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
-          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
-
-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
-      - name: Install OpenCL Headers and Libs
-        id: install_opencl
-        if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
-        run: |
-          git clone https://github.com/KhronosGroup/OpenCL-Headers
-          cd OpenCL-Headers
-          cmake -B build `
-            -DBUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build --target install
-          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
-          cd OpenCL-ICD-Loader
-          cmake -B build-arm64-release `
-            -A arm64 `
-            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build-arm64-release --target install --config release
-
-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-        with:
-          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
-
-      - name: Build
-        id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          cmake -S . -B build ${{ matrix.defines }} `
-            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
-          cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release
-
-      - name: Add libopenblas.dll
-        id: add_libopenblas_dll
-        if: ${{ matrix.build == 'openblas-x64' }}
-        run: |
-          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
-          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
-
-      - name: Test
-        id: cmake_test
-        if: ${{ matrix.arch == 'x64' }}
-        run: |
-          cd build
-          ctest -L main -C Release --verbose --timeout 900
-
-      # TODO: disabled for now, consider adding tests for all CPU variants instead
-      # - name: Test (Intel SDE)
-      #   id: cmake_test_sde
-      #   if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
-      #   run: |
-      #     curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
-      #     # for some weird reason windows tar doesn't like sde tar.xz
-      #     7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
-      #     7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
-      #     $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
-      #     cd build
-      #     $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
-      #     & $sde -future -- ctest -L main -C Release --verbose --timeout 900
-
-  ubuntu-latest-cmake-cuda:
-    runs-on: ubuntu-latest
-    container: nvidia/cuda:12.6.2-devel-ubuntu24.04
-
-    steps:
-        - name: Clone
-          id: checkout
-          uses: actions/checkout@v4
-
-        - name: Install dependencies
-          env:
-            DEBIAN_FRONTEND: noninteractive
-          run: |
-              apt update
-              apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev
-
-        - name: ccache
-          uses: ggml-org/ccache-action@v1.2.16
-          with:
-            key: ubuntu-latest-cmake-cuda
-            evict-old-files: 1d
-
-        - name: Build with CMake
-          run: |
-            cmake -S . -B build -G Ninja \
-              -DCMAKE_BUILD_TYPE=Release \
-              -DCMAKE_CUDA_ARCHITECTURES=89-real \
-              -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
-              -DLLAMA_FATAL_WARNINGS=ON \
-              -DGGML_NATIVE=OFF \
-              -DGGML_CUDA=ON
-            cmake --build build
-
-  windows-2022-cmake-cuda:
-    runs-on: windows-2022
-
-    strategy:
-      matrix:
-        cuda: ['12.4']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Install ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: windows-cuda-${{ matrix.cuda }}
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Install Cuda Toolkit
-        uses: ./.github/actions/windows-setup-cuda
-        with:
-          cuda_version: ${{ matrix.cuda }}
-
-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
-          cmake -S . -B build -G "Ninja Multi-Config" ^
-            -DLLAMA_BUILD_SERVER=ON ^
-            -DGGML_NATIVE=OFF ^
-            -DGGML_BACKEND_DL=ON ^
-            -DGGML_CPU_ALL_VARIANTS=ON ^
-            -DGGML_CUDA=ON ^
-            -DGGML_RPC=ON ^
-            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include"
-          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
-          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
-          cmake --build build --config Release
-
-  windows-latest-cmake-sycl:
-    runs-on: windows-2022
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: windows-latest-cmake-sycl
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Install
-        run:  |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-
-      - name: Build
-        id: cmake_build
-        run:  examples/sycl/win-build-sycl.bat
-
-  windows-latest-cmake-hip:
-    if: ${{ github.event.inputs.create_release != 'true' }}
-    runs-on: windows-2022
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Clone rocWMMA repository
-        id: clone_rocwmma
-        run: |
-          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
-
-      - name: Install
-        id: depends
-        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP SDK"
-          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
-          $proc.WaitForExit(600000)
-          write-host "Completed AMD HIP SDK installation"
-
-      - name: Verify ROCm
-        id: verify
-        run: |
-          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-
-      - name: Install ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ${{ github.job }}
-          evict-old-files: 1d
-
-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
-      - name: Build
-        id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . `
-            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
-            -DCMAKE_BUILD_TYPE=Release `
-            -DGGML_HIP=ON `
-            -DGGML_HIP_ROCWMMA_FATTN=ON `
-            -DGGML_RPC=ON `
-            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
-          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
-
-  ios-xcode-build:
-    runs-on: macos-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Setup Xcode
-        uses: maxim-lobanov/setup-xcode@v1
-        with:
-          xcode-version: latest-stable
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          ./build-xcframework.sh
-
-      - name: Build Xcode project
-        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
-
-  android-build:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: android-build
-          evict-old-files: 1d
-
-      - name: Set up JDK
-        uses: actions/setup-java@v3
-        with:
-          java-version: 17
-          distribution: zulu
-
-      - name: Setup Android SDK
-        uses: android-actions/setup-android@v3
-        with:
-          log-accepted-android-sdk-licenses: false
-
-      - name: Build
-        run: |
-          cd examples/llama.android
-          ./gradlew build --no-daemon
-
-  openEuler-latest-cmake-cann:
-    if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
-    defaults:
-      run:
-        shell: bash -el {0}
-    strategy:
-      matrix:
-        arch: [x86, aarch64]
-        cann:
-          - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
-        device:
-          - 'ascend910b3'
-        build:
-          - 'Release'
-    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    container: ascendai/cann:${{ matrix.cann }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        run: |
-          yum update -y
-          yum install -y git gcc gcc-c++ make cmake libcurl-devel
-
-      - name: Build
-        run: |
-          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-
-          cmake -S . -B build \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-              -DGGML_CANN=on \
-              -DSOC_TYPE=${{ matrix.device }}
-          cmake --build build -j $(nproc)
diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml
deleted file mode 100644
index 19e785474..000000000
--- a/.github/workflows/close-issue.yml
+++ /dev/null
@@ -1,28 +0,0 @@
-name: Close inactive issues
-on:
-  schedule:
-    - cron: "42 0 * * *"
-
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  issues: write
-
-jobs:
-  close-issues:
-    runs-on: ubuntu-latest
-    permissions:
-      issues: write
-      pull-requests: write
-    steps:
-      - uses: actions/stale@v5
-        with:
-          exempt-issue-labels: "refactoring,help wanted,good first issue,research,bug,roadmap"
-          days-before-issue-stale: 30
-          days-before-issue-close: 14
-          stale-issue-label: "stale"
-          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
-          days-before-pr-stale: -1
-          days-before-pr-close: -1
-          operations-per-run: 10000
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml
deleted file mode 100644
index 3645e3037..000000000
--- a/.github/workflows/copilot-setup-steps.yml
+++ /dev/null
@@ -1,57 +0,0 @@
-name: "Copilot Setup Steps"
-
-# Automatically run the setup steps when they are changed to allow for easy validation, and
-# allow manual testing through the repository's "Actions" tab
-on:
-  workflow_dispatch:
-  push:
-    paths:
-      - .github/workflows/copilot-setup-steps.yml
-  pull_request:
-    paths:
-      - .github/workflows/copilot-setup-steps.yml
-
-jobs:
-  # The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
-  copilot-setup-steps:
-    runs-on: ubuntu-latest
-
-    # Set the permissions to the lowest permissions possible needed for your steps.
-    # Copilot will be given its own token for its operations.
-    permissions:
-      # If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete.
-      contents: read
-
-    # You can define any steps you want, and they will run before the agent starts.
-    # If you do not check out your code, Copilot will do this for you.
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: copilot-setup-steps
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-          # Install git-clang-format script for formatting only changed code
-          wget -O /tmp/git-clang-format https://raw.githubusercontent.com/llvm/llvm-project/release/18.x/clang/tools/clang-format/git-clang-format
-          sudo cp /tmp/git-clang-format /usr/local/bin/git-clang-format
-          sudo chmod +x /usr/local/bin/git-clang-format
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Install Python dependencies
-        run: |
-          python3 -m venv .venv
-          .venv/bin/activate
-          pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
-          pip install flake8 pyright pre-commit
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
deleted file mode 100644
index 2067927be..000000000
--- a/.github/workflows/docker.yml
+++ /dev/null
@@ -1,178 +0,0 @@
-# This workflow uses actions that are not certified by GitHub.
-# They are provided by a third-party and are governed by
-# separate terms of service, privacy policy, and support
-# documentation.
-
-# GitHub recommends pinning actions to a commit SHA.
-# To get a newer version, you will need to update the SHA.
-# You can also reference a tag or branch, but the action may change without warning.
-
-name: Publish Docker image
-
-on:
-  workflow_dispatch: # allows manual triggering
-  schedule:
-    # Rebuild daily rather than on every push because it is expensive
-    - cron: '12 4 * * *'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  packages: write
-
-jobs:
-  push_to_registry:
-    name: Push Docker image to Docker Hub
-
-    runs-on: ubuntu-22.04
-    env:
-      COMMIT_SHA: ${{ github.sha }}
-    strategy:
-      fail-fast: false
-      matrix:
-        config:
-          # Multi-stage build
-          # Note: the arm64 images are failing, which prevents the amd64 images from being built
-          # https://github.com/ggml-org/llama.cpp/issues/11888
-          #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
-          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
-          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
-          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
-    steps:
-      - name: Check out the repo
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0 # preserve git history, so we can determine the build number
-
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-        with:
-          image: tonistiigi/binfmt:qemu-v7.0.0-28
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to Docker Hub
-        uses: docker/login-action@v2
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
-          REPO_NAME="${{ github.event.repository.name }}"
-
-          # determine tag name postfix (build number, commit hash)
-          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
-            TAG_POSTFIX="-b${BUILD_NUMBER}"
-          else
-            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
-            TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
-          fi
-          # list all tags possible
-          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
-              TYPE=""
-          else
-              TYPE="-${{ matrix.config.tag }}"
-          fi
-          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
-          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
-          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
-          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
-          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
-          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
-          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
-          echo "full_output_tags=$FULLTAGS"  # print out for debugging
-          echo "light_output_tags=$LIGHTTAGS"  # print out for debugging
-          echo "server_output_tags=$SERVERTAGS"  # print out for debugging
-        env:
-          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
-
-      - name: Free Disk Space (Ubuntu)
-        if: ${{ matrix.config.free_disk_space == true }}
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: false
-
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: true
-          swap-storage: true
-
-      - name: Build and push Full Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.full_output_tags }}
-          file: ${{ matrix.config.dockerfile }}
-          target: full
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
-
-      - name: Build and push Light Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.light_output_tags }}
-          file: ${{ matrix.config.dockerfile }}
-          target: light
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
-
-      - name: Build and push Server Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.server_output_tags }}
-          file: ${{ matrix.config.dockerfile }}
-          target: server
-          provenance: false
-          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml
deleted file mode 100644
index f02b7c219..000000000
--- a/.github/workflows/editorconfig.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: EditorConfig Checker
-
-on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      create_release:
-        description: 'Create new release'
-        required: true
-        type: boolean
-  push:
-    branches:
-      - master
-  pull_request:
-    branches:
-      - master
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  editorconfig:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: editorconfig-checker/action-editorconfig-checker@v2
-        with:
-          version: v3.0.3
-      - run: editorconfig-checker
diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml
deleted file mode 100644
index 3ca4d3058..000000000
--- a/.github/workflows/gguf-publish.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-# This workflow will upload a Python Package using Twine when a GGUF release is created
-# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
-
-# See `gguf-py/README.md` for how to make a release.
-
-# This workflow uses actions that are not certified by GitHub.
-# They are provided by a third-party and are governed by
-# separate terms of service, privacy policy, and support
-# documentation.
-
-name: Upload Python Package
-
-on:
-  workflow_dispatch:
-  push:
-    # Pattern matched against refs/tags
-    tags:
-      - 'gguf-v*'           # Push events to every version tag
-
-
-jobs:
-  deploy:
-
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v4
-    - name: Set up Python
-      uses: actions/setup-python@v5
-      with:
-        python-version: '3.9.x'
-    - name: Install dependencies
-      run: |
-        cd gguf-py
-        python -m pip install poetry
-        poetry install
-
-    - name: Build package
-      run: cd gguf-py && poetry build
-    - name: Publish package
-      uses: pypa/gh-action-pypi-publish@release/v1
-      with:
-        password: ${{ secrets.PYPI_API_TOKEN }}
-        packages-dir: gguf-py/dist
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
deleted file mode 100644
index 0b0f300aa..000000000
--- a/.github/workflows/labeler.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-name: "Pull Request Labeler"
-on:
-- pull_request_target
-
-jobs:
-  labeler:
-    permissions:
-      contents: read
-      pull-requests: write
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v4
-      with:
-        repository: "ggml-org/llama.cpp"
-    - uses: actions/labeler@v5
-      with:
-        configuration-path: '.github/labeler.yml'
diff --git a/.github/workflows/pre-tokenizer-hashes.yml b/.github/workflows/pre-tokenizer-hashes.yml
deleted file mode 100644
index dff998e23..000000000
--- a/.github/workflows/pre-tokenizer-hashes.yml
+++ /dev/null
@@ -1,45 +0,0 @@
-name: Check Pre-Tokenizer Hashes
-
-on:
-    push:
-        paths:
-            - 'convert_hf_to_gguf.py'
-            - 'convert_hf_to_gguf_update.py'
-    pull_request:
-        paths:
-            - 'convert_hf_to_gguf.py'
-            - 'convert_hf_to_gguf_update.py'
-
-jobs:
-    pre-tokenizer-hashes:
-        runs-on: ubuntu-latest
-
-        steps:
-        - name: Checkout repository
-          uses: actions/checkout@v4
-
-        - name: Set up Python
-          uses: actions/setup-python@v5
-          with:
-              python-version: '3.11'
-
-        - name: Install Python dependencies
-          run: |
-              python3 -m venv .venv
-              .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
-
-        - name: Update pre-tokenizer hashes
-          run: |
-              cp convert_hf_to_gguf.py /tmp
-              .venv/bin/python convert_hf_to_gguf_update.py --check-missing
-
-        - name: Check if committed pre-tokenizer hashes matches generated version
-          run: |
-              if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
-                  echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
-                  echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
-                  echo "Differences found:"
-                  diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
-                  exit 1
-              fi
-              echo "Model pre-tokenizer hashes are up to date."
diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml
deleted file mode 100644
index 46e80aecd..000000000
--- a/.github/workflows/python-check-requirements.yml
+++ /dev/null
@@ -1,33 +0,0 @@
-name: Python check requirements.txt
-
-on:
-  push:
-    paths:
-      - '.github/workflows/python-check-requirements.yml'
-      - 'scripts/check-requirements.sh'
-      - 'convert*.py'
-      - '**/requirements*.txt'
-  pull_request:
-    paths:
-      - '.github/workflows/python-check-requirements.yml'
-      - 'scripts/check-requirements.sh'
-      - 'convert*.py'
-      - '**/requirements*.txt'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  python-check-requirements:
-    runs-on: ubuntu-latest
-    name: check-requirements
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v4
-      - name: Set up Python environment
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Run check-requirements.sh script
-        run:  bash scripts/check-requirements.sh
diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml
deleted file mode 100644
index ddfdf73b8..000000000
--- a/.github/workflows/python-lint.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-name: flake8 Lint
-
-on:
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/python-lint.yml', '**/*.py']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/python-lint.yml', '**/*.py']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  flake8-lint:
-    runs-on: ubuntu-latest
-    name: Lint
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v4
-      - name: Set up Python environment
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: flake8 Lint
-        uses: py-actions/flake8@v2
-        with:
-            plugins: "flake8-no-print"
diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml
deleted file mode 100644
index 373bb6010..000000000
--- a/.github/workflows/python-type-check.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-name: Python Type-Check
-
-on:
-  push:
-    paths:
-      - '.github/workflows/python-type-check.yml'
-      - 'pyrightconfig.json'
-      - '**.py'
-      - '**/requirements*.txt'
-  pull_request:
-    paths:
-      - '.github/workflows/python-type-check.yml'
-      - 'pyrightconfig.json'
-      - '**.py'
-      - '**/requirements*.txt'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  python-type-check:
-    runs-on: ubuntu-latest
-    name: pyright type-check
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v4
-      - name: Set up Python environment
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Install Python dependencies
-        # TODO: use a venv
-        run: pip install -r requirements/requirements-all.txt
-      - name: Type-check with Pyright
-        uses: jakebailey/pyright-action@v2
-        with:
-          version: 1.1.382
-          level: warning
-          warnings: true
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
deleted file mode 100644
index 5367637e4..000000000
--- a/.github/workflows/release.yml
+++ /dev/null
@@ -1,760 +0,0 @@
-name: Release
-
-on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      create_release:
-        description: 'Create new release'
-        required: true
-        type: boolean
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
-
-jobs:
-  macOS-arm64:
-    runs-on: macos-14
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-arm64
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-          brew install curl
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build \
-            -DCMAKE_INSTALL_RPATH='@loader_path' \
-            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DGGML_RPC=ON \
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
-          name: llama-bin-macos-arm64.zip
-
-  macOS-x64:
-    runs-on: macos-13
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: macOS-latest-cmake-x64
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-          brew install curl
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build \
-            -DCMAKE_INSTALL_RPATH='@loader_path' \
-            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_METAL=OFF \
-            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
-          name: llama-bin-macos-x64.zip
-
-  ubuntu-22-cpu:
-    strategy:
-      matrix:
-        include:
-          - build: 'x64'
-            os: ubuntu-22.04
-          # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
-          # - build: 'arm64'
-          #   os: ubuntu-22.04-arm
-
-    runs-on: ${{ matrix.os }}
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ubuntu-cpu-cmake
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DCMAKE_INSTALL_RPATH='$ORIGIN' \
-            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-            -DGGML_BACKEND_DL=ON \
-            -DGGML_NATIVE=OFF \
-            -DGGML_CPU_ALL_VARIANTS=ON \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
-          name: llama-bin-ubuntu-${{ matrix.build }}.zip
-
-  ubuntu-22-vulkan:
-    runs-on: ubuntu-22.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ubuntu-22-cmake-vulkan
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DCMAKE_INSTALL_RPATH='$ORIGIN' \
-            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-            -DGGML_BACKEND_DL=ON \
-            -DGGML_NATIVE=OFF \
-            -DGGML_CPU_ALL_VARIANTS=ON \
-            -DGGML_VULKAN=ON \
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
-          name: llama-bin-ubuntu-vulkan-x64.zip
-
-  windows-cpu:
-    runs-on: windows-2025
-
-    strategy:
-      matrix:
-        include:
-          - arch: 'x64'
-          - arch: 'arm64'
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: windows-latest-cmake-cpu-${{ matrix.arch }}
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Install Ninja
-        run: |
-          choco install ninja
-
-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-        with:
-          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
-
-      - name: Build
-        shell: cmd
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
-          cmake -S . -B build -G "Ninja Multi-Config" ^
-            -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
-            -DGGML_NATIVE=OFF ^
-            -DGGML_BACKEND_DL=ON ^
-            -DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
-            -DGGML_OPENMP=ON ^
-            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
-          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
-          7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-bin-win-cpu-${{ matrix.arch }}.zip
-          name: llama-bin-win-cpu-${{ matrix.arch }}.zip
-
-  windows:
-    runs-on: windows-2025
-
-    env:
-      OPENBLAS_VERSION: 0.3.23
-      VULKAN_VERSION: 1.4.313.2
-
-    strategy:
-      matrix:
-        include:
-          - backend: 'vulkan'
-            arch: 'x64'
-            defines: '-DGGML_VULKAN=ON'
-            target: 'ggml-vulkan'
-          - backend: 'opencl-adreno'
-            arch: 'arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
-            target: 'ggml-opencl'
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Install Vulkan SDK
-        id: get_vulkan
-        if: ${{ matrix.backend == 'vulkan' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
-          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
-          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
-          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
-
-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
-      - name: Install OpenCL Headers and Libs
-        id: install_opencl
-        if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }}
-        run: |
-          git clone https://github.com/KhronosGroup/OpenCL-Headers
-          cd OpenCL-Headers
-          cmake -B build `
-            -DBUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build --target install
-          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
-          cd OpenCL-ICD-Loader
-          cmake -B build-arm64-release `
-            -A arm64 `
-            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build-arm64-release --target install --config release
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF
-          cmake --build build --config Release --target ${{ matrix.target }}
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
-          name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
-
-  windows-cuda:
-    runs-on: windows-2022
-
-    strategy:
-      matrix:
-        cuda: ['12.4']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Install ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: windows-cuda-${{ matrix.cuda }}
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Install Cuda Toolkit
-        uses: ./.github/actions/windows-setup-cuda
-        with:
-          cuda_version: ${{ matrix.cuda }}
-
-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
-        run: |
-          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
-          cmake -S . -B build -G "Ninja Multi-Config" ^
-            -DGGML_BACKEND_DL=ON ^
-            -DGGML_NATIVE=OFF ^
-            -DGGML_CPU=OFF ^
-            -DGGML_CUDA=ON ^
-            -DLLAMA_CURL=OFF
-          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
-          cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
-          name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
-
-      - name: Copy and pack Cuda runtime
-        run: |
-          echo "Cuda install location: ${{ env.CUDA_PATH }}"
-          $dst='.\build\bin\cudart\'
-          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-          7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*
-
-      - name: Upload Cuda runtime
-        uses: actions/upload-artifact@v4
-        with:
-          path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
-          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
-
-  windows-sycl:
-    runs-on: windows-2022
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: windows-latest-cmake-sycl
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Install
-        run:  |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
-        run: |
-          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
-          cmake -G "Ninja" -B build ^
-            -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
-            -DCMAKE_BUILD_TYPE=Release ^
-            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
-            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
-            -DLLAMA_CURL=OFF
-          cmake --build build --target ggml-sycl -j
-
-      - name: Build the release package
-        id: pack_artifacts
-        run: |
-          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
-
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
-
-          echo "cp oneAPI running time dll files to ./build/bin done"
-          7z a llama-bin-win-sycl-x64.zip ./build/bin/*
-
-      - name: Upload the release package
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-bin-win-sycl-x64.zip
-          name: llama-bin-win-sycl-x64.zip
-
-  windows-hip:
-    runs-on: windows-2022
-
-    strategy:
-      matrix:
-        include:
-          - name: "radeon"
-            gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Clone rocWMMA repository
-        id: clone_rocwmma
-        run: |
-          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: windows-latest-cmake-hip-${{ matrix.name }}-x64
-          evict-old-files: 1d
-
-      - name: Install
-        id: depends
-        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP SDK"
-          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
-          $proc.WaitForExit(600000)
-          write-host "Completed AMD HIP SDK installation"
-
-      - name: Verify ROCm
-        id: verify
-        run: |
-          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-
-      - name: Build
-        id: cmake_build
-        run: |
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . `
-            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
-            -DCMAKE_BUILD_TYPE=Release `
-            -DGGML_BACKEND_DL=ON `
-            -DGGML_NATIVE=OFF `
-            -DGGML_CPU=OFF `
-            -DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
-            -DGGML_HIP_ROCWMMA_FATTN=ON `
-            -DGGML_HIP=ON `
-            -DLLAMA_CURL=OFF
-          cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
-          md "build\bin\rocblas\library\"
-          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-bin-win-hip-${{ matrix.name }}-x64.zip
-          name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
-
-  ios-xcode-build:
-    runs-on: macos-15
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Setup Xcode
-        run: |
-          sudo xcode-select -s /Applications/Xcode_16.4.app
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_CURL=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          ./build-xcframework.sh
-
-      - name: Build Xcode project
-        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
-          name: llama-${{ steps.tag.outputs.name }}-xcframework
-
-  release:
-    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-
-    # Fine-grant permission
-    # https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-    permissions:
-        contents: write # for creating release
-
-    runs-on: ubuntu-latest
-
-    needs:
-      - windows
-      - windows-cpu
-      - windows-cuda
-      - windows-sycl
-      - windows-hip
-      - ubuntu-22-cpu
-      - ubuntu-22-vulkan
-      - macOS-arm64
-      - macOS-x64
-      - ios-xcode-build
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Download artifacts
-        id: download-artifact
-        uses: actions/download-artifact@v4
-        with:
-          path: ./artifact
-          merge-multiple: true
-
-      - name: Move artifacts
-        id: move_artifacts
-        run: |
-          mkdir -p release
-
-          echo "Adding CPU backend files to existing zips..."
-          for arch in x64 arm64; do
-            cpu_zip="artifact/llama-bin-win-cpu-${arch}.zip"
-            temp_dir=$(mktemp -d)
-            echo "Extracting CPU backend for $arch..."
-            unzip "$cpu_zip" -d "$temp_dir"
-
-            echo "Adding CPU files to $arch zips..."
-            for target_zip in artifact/llama-bin-win-*-${arch}.zip; do
-              if [[ "$target_zip" == "$cpu_zip" ]]; then
-                continue
-              fi
-              echo "Adding CPU backend to $(basename "$target_zip")"
-              realpath_target_zip=$(realpath "$target_zip")
-              (cd "$temp_dir" && zip -r "$realpath_target_zip" .)
-            done
-
-            rm -rf "$temp_dir"
-          done
-
-          echo "Renaming and moving zips to release..."
-          for zip_file in artifact/llama-bin-win-*.zip; do
-            base_name=$(basename "$zip_file" .zip)
-            zip_name="llama-${{ steps.tag.outputs.name }}-${base_name#llama-}.zip"
-            echo "Moving $zip_file to release/$zip_name"
-            mv "$zip_file" "release/$zip_name"
-          done
-
-          echo "Moving other artifacts..."
-          mv -v artifact/*.zip release
-
-      - name: Create release
-        id: create_release
-        uses: ggml-org/action-create-release@v1
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          tag_name: ${{ steps.tag.outputs.name }}
-
-      - name: Upload release
-        id: upload_release
-        uses: actions/github-script@v3
-        with:
-          github-token: ${{secrets.GITHUB_TOKEN}}
-          script: |
-            const path = require('path');
-            const fs = require('fs');
-            const release_id = '${{ steps.create_release.outputs.id }}';
-            for (let file of await fs.readdirSync('./release')) {
-              if (path.extname(file) === '.zip') {
-                console.log('uploadReleaseAsset', file);
-                await github.repos.uploadReleaseAsset({
-                  owner: context.repo.owner,
-                  repo: context.repo.repo,
-                  release_id: release_id,
-                  name: file,
-                  data: await fs.readFileSync(`./release/${file}`)
-                });
-              }
-            }
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
deleted file mode 100644
index f6da48857..000000000
--- a/.github/workflows/server.yml
+++ /dev/null
@@ -1,237 +0,0 @@
-# Server build and tests
-name: Server
-
-on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-      slow_tests:
-        description: 'Run slow tests'
-        required: true
-        type: boolean
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
-
-env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  server:
-    runs-on: ubuntu-latest
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
-        build_type: [RelWithDebInfo]
-        include:
-          - build_type: Release
-            sanitizer: ""
-      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
-
-    steps:
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install \
-            build-essential \
-            xxd \
-            git \
-            cmake \
-            curl \
-            wget \
-            language-pack-en \
-            libcurl4-openssl-dev
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r tools/server/tests/requirements.txt
-
-      # Setup nodejs (to be used for verifying bundled index.html)
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '22.11.0'
-
-      - name: WebUI - Install dependencies
-        id: webui_lint
-        run: |
-          cd tools/server/webui
-          npm ci
-
-      - name: WebUI - Check code format
-        id: webui_format
-        run: |
-          git config --global --add safe.directory $(realpath .)
-          cd tools/server/webui
-          git status
-
-          npm run format
-          git status
-          modified_files="$(git status -s)"
-          echo "Modified files: ${modified_files}"
-          if [ -n "${modified_files}" ]; then
-            echo "Files do not follow coding style. To fix: npm run format"
-            echo "${modified_files}"
-            exit 1
-          fi
-
-      - name: Verify bundled index.html
-        id: verify_server_index_html
-        run: |
-          git config --global --add safe.directory $(realpath .)
-          cd tools/server/webui
-          git status
-
-          npm run build
-          git status
-          modified_files="$(git status -s)"
-          echo "Modified files: ${modified_files}"
-          if [ -n "${modified_files}" ]; then
-            echo "Repository is dirty or server/webui is not built as expected"
-            echo "Hint: You may need to follow Web UI build guide in server/README.md"
-            echo "${modified_files}"
-            exit 1
-          fi
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-              -DGGML_OPENMP=OFF ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Build (sanitizers)
-        id: cmake_build_sanitizers
-        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
-        run: |
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Build (sanitizers)
-        id: cmake_build
-        if: ${{ matrix.sanitizer == '' }}
-        run: |
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ matrix.sanitizer == '' }}
-        env:
-          GITHUB_ACTIONS: "true"
-        run: |
-          cd tools/server/tests
-          ./tests.sh
-
-      - name: Tests (sanitizers)
-        id: server_integration_tests_sanitizers
-        if: ${{ matrix.sanitizer != '' }}
-        run: |
-          cd tools/server/tests
-          LLAMA_SANITIZE=1 ./tests.sh
-
-      - name: Slow tests
-        id: server_integration_tests_slow
-        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
-        run: |
-          cd tools/server/tests
-          SLOW_TESTS=1 ./tests.sh
-
-
-  server-windows:
-    runs-on: windows-2022
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
-      - name: Build
-        id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
-
-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r tools/server/tests/requirements.txt
-
-      - name: Copy Libcurl
-        id: prepare_libcurl
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
-        run: |
-          cd tools/server/tests
-          $env:PYTHONIOENCODING = ":replace"
-          pytest -v -x -m "not slow"
-
-      - name: Slow tests
-        id: server_integration_tests_slow
-        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
-        run: |
-          cd tools/server/tests
-          $env:SLOW_TESTS = "1"
-          pytest -v -x
diff --git a/.github/workflows/update-ops-docs.yml b/.github/workflows/update-ops-docs.yml
deleted file mode 100644
index c0218fa74..000000000
--- a/.github/workflows/update-ops-docs.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-name: Update Operations Documentation
-
-on:
-    push:
-        paths:
-            - 'docs/ops/**'
-            - 'scripts/create_ops_docs.py'
-    pull_request:
-        paths:
-            - 'docs/ops/**'
-            - 'scripts/create_ops_docs.py'
-
-jobs:
-    update-ops-docs:
-        runs-on: ubuntu-latest
-
-        steps:
-        - name: Checkout repository
-          uses: actions/checkout@v4
-
-        - name: Set up Python
-          uses: actions/setup-python@v5
-          with:
-              python-version: '3.x'
-
-        - name: Generate operations documentation to temporary file
-          run: |
-              mkdir -p /tmp/ops_check
-              ./scripts/create_ops_docs.py /tmp/ops_check/ops.md
-
-        - name: Check if docs/ops.md matches generated version
-          run: |
-              if ! diff -q docs/ops.md /tmp/ops_check/ops.md; then
-                  echo "Operations documentation (docs/ops.md) is not up to date with the backend CSV files."
-                  echo "To fix: run ./scripts/create_ops_docs.py and commit the updated docs/ops.md along with your changes"
-                  echo "Differences found:"
-                  diff docs/ops.md /tmp/ops_check/ops.md || true
-                  exit 1
-              fi
-              echo "Operations documentation is up to date."
diff --git a/.github/workflows/winget.yml b/.github/workflows/winget.yml
deleted file mode 100644
index 5c2861559..000000000
--- a/.github/workflows/winget.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-name: Update Winget Package
-
-on:
-  workflow_dispatch: # allows manual triggering
-  schedule:
-    - cron: '28 5 * * *' # Update every day at 5:28 UTC
-
-jobs:
-  update:
-    name: Update Winget Package
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Install cargo binstall
-        uses: cargo-bins/cargo-binstall@268643a6b5ea099f5718ee5cd3ff7dc89a5eb49b
-
-      - name: Install komac
-        run: |
-          cargo binstall komac@2.11.2 -y
-
-      - name: Find latest release
-        id: find_latest_release
-        uses: actions/github-script@v6
-        with:
-          script: |
-            const { data: releases } = await github.rest.repos.listReleases({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-            });
-            console.log("Latest release:", releases[0].tag_name);
-            return releases[0].tag_name;
-
-      - name: Update manifest
-        env:
-          VERSION: ${{ steps.find_latest_release.outputs.result }}
-        run: |
-          echo "Updating manifest..."
-          komac update --version ${{ env.VERSION }} \
-            --urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
-            --token ${{ secrets.WINGET_GITHUB_TOKEN }} \
-            --submit \
-            ggml.llamacpp
diff --git a/CMakePresets.json b/CMakePresets.json
index b5afeb3c0..77c654089 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -30,6 +30,8 @@
     { "name": "static",   "hidden": true, "cacheVariables": { "GGML_STATIC":      "ON" } },
     { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16":    "ON" } },
     { "name": "vulkan",   "hidden": true, "cacheVariables": { "GGML_VULKAN":      "ON" } },
+    { "name": "remoting_frontend",   "hidden": true, "cacheVariables": { "GGML_REMOTINGFRONTEND":      "ON" } },
+    { "name": "remoting_backend",   "hidden": true, "cacheVariables": { "GGML_REMOTINGBACKEND":      "ON" } },
 
     {
         "name": "x64-windows-llvm", "hidden": true,
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 000000000..9de8c5639
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1,13 @@
+approvers:
+- kpouget
+- cfergeau
+- praveenkumar
+- vyasgun
+- gbraad
+options: {}
+reviewers:
+- kpouget
+- cfergeau
+- praveenkumar
+- vyasgun
+- gbraad
diff --git a/build.backend.sh b/build.backend.sh
new file mode 100755
index 000000000..2904c4a15
--- /dev/null
+++ b/build.backend.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# Build the API Remoting backend targets with cmake.
+# Leaves a READY_backend marker on success, FAILED_backend on failure.
+rm -f READY_backend FAILED_backend
+
+# force isatty-->true, so that $0 |& head -50 has colors ...
+echo "int isatty(int fd) { return 1; }" | gcc -O2 -fpic -shared -ldl -o /tmp/isatty.so -xc -
+export LD_PRELOAD=/tmp/isatty.so
+
+# A non-empty PERF_MODE selects the "-prod" build directory.
+FLAVOR="${PERF_MODE:+-prod}"
+
+export SDKROOT=$(xcrun --sdk macosx --show-sdk-path)
+
+if [[ "$FLAVOR" == "-prod" ]]; then
+    cat <<EOF
+###
+### Building the prod flavor
+###
+EOF
+fi
+
+# BENCH_MODE optionally adds one more target on top of llama-run.
+TARGETS="llama-run"
+if [[ "${BENCH_MODE:-}" == "bench" ]]; then
+    TARGETS="$TARGETS llama-bench"
+elif [[ "${BENCH_MODE:-}" == "perf" ]]; then
+    TARGETS="$TARGETS test-backend-ops"
+fi
+
+# $TARGETS stays unquoted on purpose: it must split into one word per target.
+if cmake --build ../build.remoting-backend$FLAVOR --parallel 8 --target $TARGETS "$@"; then
+    touch READY_backend
+else
+    touch FAILED_backend
+fi
diff --git a/build.remoting.sh b/build.remoting.sh
new file mode 100755
index 000000000..c17ed3291
--- /dev/null
+++ b/build.remoting.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Build the API Remoting frontend targets; leaves a READY or FAILED marker file.
+rm -f READY FAILED
+
+# force isatty-->true, so that $0 |& head -50 has colors ...
+echo "int isatty(int fd) { return 1; }" | gcc -O2 -fpic -shared -ldl -o /tmp/isatty.so -xc -
+export LD_PRELOAD=/tmp/isatty.so
+
+# BUILD_TARGET (environment) replaces the default frontend library target.
+TARGETS="${BUILD_TARGET:-ggml-remotingfrontend} llama-run"
+set -x
+if [[ "${BENCH_MODE:-}" == "bench" ]]; then
+    TARGETS="$TARGETS llama-bench"
+elif [[ "${BENCH_MODE:-}" == "server" ]]; then
+    TARGETS="$TARGETS llama-server"
+elif [[ "${BENCH_MODE:-}" == "perf" ]]; then
+    TARGETS="$TARGETS test-backend-ops"
+fi
+
+# FLAVOR is not set here; it comes from the environment (empty when unset).
+if cmake --build "../build.remoting-frontend${FLAVOR:-}" --parallel 8 --target $TARGETS "$@"; then
+    touch READY
+else
+    touch FAILED
+fi
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 2ead001e2..272929e54 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -188,6 +188,8 @@ option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"
 option(GGML_WEBGPU                          "ggml: use WebGPU"                                OFF)
 option(GGML_WEBGPU_DEBUG                    "ggml: enable WebGPU debug output"                OFF)
 option(GGML_ZDNN                            "ggml: use zDNN"                                  OFF)
+option(GGML_REMOTINGFRONTEND                "ggml: use the API Remoting frontend"             OFF)
+option(GGML_REMOTINGBACKEND                 "ggml: use the API Remoting backend"              OFF)
 option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
 option(GGML_METAL_USE_BF16                  "ggml: use bfloat if available"                   OFF)
 option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
@@ -278,6 +280,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-sycl.h
     include/ggml-vulkan.h
     include/ggml-webgpu.h
+    include/ggml-remoting-frontend.h
     include/gguf.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h
index a61069442..4d21e6466 100644
--- a/ggml/include/ggml-metal.h
+++ b/ggml/include/ggml-metal.h
@@ -61,6 +61,11 @@ GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t bac
 
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);
 
+GGML_BACKEND_API void ggml_backend_metal_get_device_context(ggml_backend_dev_t dev,
+							    bool *has_simdgroup_mm, // out: has_simdgroup_mm of the device context
+							    bool *has_simdgroup_reduction, // out: has_simdgroup_reduction of the device context
+							    bool *use_bfloat); // out: use_bfloat of the device context
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/include/ggml-remoting-frontend.h b/ggml/include/ggml-remoting-frontend.h
new file mode 100644
index 000000000..4c7cd585e
--- /dev/null
+++ b/ggml/include/ggml-remoting-frontend.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_REMOTING_FRONTEND_NAME "RemotingFrontend"
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_remoting_frontend_reg(void); // (void): a real prototype when included from C
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 2b5b8169d..8b5250a69 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -384,6 +384,9 @@ ggml_add_backend(Vulkan)
 ggml_add_backend(WebGPU)
 ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
+ggml_add_backend(RemotingFrontend)
+ggml_add_backend(RemotingBackend)
+
 
 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 5f02a710a..d066f0b8c 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -69,6 +69,10 @@
 #include "ggml-cann.h"
 #endif
 
+#ifdef GGML_USE_REMOTINGFRONTEND
+#include "ggml-remoting-frontend.h"
+#endif
+
 // disable C++17 deprecation warning for std::codecvt_utf8
 #if defined(__clang__)
 #    pragma clang diagnostic push
@@ -187,6 +191,10 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_ZDNN
         register_backend(ggml_backend_zdnn_reg());
 #endif
+#ifdef GGML_USE_REMOTINGFRONTEND
+        register_backend(ggml_backend_remoting_frontend_reg());
+#endif
+
 #ifdef GGML_USE_OPENCL
         register_backend(ggml_backend_opencl_reg());
 #endif
@@ -589,6 +597,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     ggml_backend_load_best("rpc", silent, dir_path);
     ggml_backend_load_best("sycl", silent, dir_path);
     ggml_backend_load_best("vulkan", silent, dir_path);
+    ggml_backend_load_best("remotingfrontend", silent, dir_path); // name must match the ggml-remotingfrontend library
     ggml_backend_load_best("opencl", silent, dir_path);
     ggml_backend_load_best("musa", silent, dir_path);
     ggml_backend_load_best("cpu", silent, dir_path);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 1f93633d9..aef7a2e79 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -8,6 +8,9 @@
 
 #import <Metal/Metal.h>
 
+#undef GGML_LOG_DEBUG
+#define GGML_LOG_DEBUG(...) // expands to nothing: GGML_LOG_DEBUG calls below this point are compiled out
+
 #undef MIN
 #undef MAX
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -1203,8 +1206,6 @@ @implementation GGMLMetalClass
                 GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
                 return NULL; \
             } \
-        } else { \
-            GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \
         }
 
         const bool has_simdgroup_mm        = ctx_dev->has_simdgroup_mm;
@@ -5988,9 +5989,53 @@ static int ggml_metal_encode_node(
     return n_fuse;
 }
 
+long long timer_start;  // timestamp in ns written by start_timer()
+long long timer_total;  // accumulated ns; 0 also means "signal/atexit hooks not registered yet"
+long long timer_count;  // completed start/stop pairs; NOTE(review): these three have external linkage - consider static
+
+static inline void start_timer(void) {  // stamp the start of one timed interval
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);  // monotonic: unaffected by wall-clock adjustments
+  timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+}
+
+static inline void stop_timer(void) {  // add the ns elapsed since start_timer() to the totals
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);  // must be the same clock as in start_timer()
+  long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+
+  timer_total += (timer_end - timer_start);
+  timer_count += 1;
+}
+
+static void show_timer(void) {  // print the accumulated timing stats, then reset the counters
+  double ms = timer_total / 1e6;                        // ns -> ms without integer truncation
+  double itl = timer_count > 0 ? ms / timer_count : 0;  // mean ms per call; no division by zero
+  double speed = itl > 0 ? 1000 / itl : 0;              // calls per second
+
+  printf("METAL compute_graph: [%9.0f] ms for %lld invokations | ITL %.2f ms | throughput = %.2f t/s\n",ms, timer_count, itl, speed);
+
+  timer_start = 0;
+  timer_total = 1; // to avoid re-registering
+  timer_count = 0;
+}
+
+static void show_timer_signal(int sig) {  // signal-handler wrapper around show_timer()
+  GGML_UNUSED(sig);
+  show_timer();  // NOTE(review): show_timer() calls printf(), which is not async-signal-safe
+}
+
 static enum ggml_status ggml_metal_graph_compute(
             ggml_backend_t   backend,
         struct ggml_cgraph * gf) {
+
+  if (timer_total == 0) {
+    signal(SIGUSR1, show_timer_signal); // kill -USR1 $(cat /tmp/krunkit.pid)
+    atexit(show_timer);
+  }
+
+  start_timer();
+
     struct ggml_backend_metal_context        * ctx     = backend->context;
     struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
 
@@ -6118,6 +6163,8 @@ static enum ggml_status ggml_metal_graph_compute(
         }
     }
 
+  stop_timer();
+
     return GGML_STATUS_SUCCESS;
 }
 
@@ -6884,3 +6931,16 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) {
 }
 
 GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)
+
+
+// Copies three capability flags out of the Metal device context of `dev`
+// into the caller-provided output pointers.
+GGML_BACKEND_API void
+ggml_backend_metal_get_device_context(ggml_backend_dev_t dev, bool *has_simdgroup_mm,
+                                      bool *has_simdgroup_reduction, bool *use_bfloat) {
+  const struct ggml_backend_metal_device_context *metal_ctx = dev->context;
+
+  *has_simdgroup_mm = metal_ctx->has_simdgroup_mm;
+  *has_simdgroup_reduction = metal_ctx->has_simdgroup_reduction;
+  *use_bfloat = metal_ctx->use_bfloat;
+}
diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt
new file mode 100644
index 000000000..4b796ff42
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.19)
+cmake_policy(SET CMP0114 NEW)
+
+message(STATUS "Enable API Remoting backend")
+# Sources and shared protocol headers that make up the remoting backend library.
+ggml_add_backend_library(ggml-remotingbackend
+                         backend.cpp
+                         backend-dispatched.cpp
+                         backend-dispatched-backend.cpp
+                         backend-dispatched-device.cpp
+                         backend-dispatched-buffer.cpp
+                         backend-dispatched-buffer-type.cpp
+                         backend-dispatched-metal.cpp
+                         backend-utils.cpp
+                         shared/api_remoting.h
+                         shared/apir_backend.h
+                         shared/venus_cs.h
+                         venus_cs_ggml-rpc-back.cpp
+                        )
+# The library's sources are compiled with the C++20 language level.
+target_compile_options(ggml-remotingbackend PRIVATE -std=c++20)
diff --git a/ggml/src/ggml-remotingbackend/backend-convert.h b/ggml/src/ggml-remotingbackend/backend-convert.h
new file mode 100644
index 000000000..b45c27841
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/backend-convert.h
@@ -0,0 +1,15 @@
+#pragma once
+#include "shared/apir_backend.h"
+
+#define BUFFER_TO_HOST_HANDLE(name) ggml_buffer_to_apir_handle(name)
+// Returns the apir host handle that stands for `buffer`.
+static inline apir_buffer_host_handle_t
+ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) {
+  return (apir_buffer_host_handle_t) buffer; // in the backend, the buffer handle is the buffer pointer
+}
+
+static inline apir_buffer_type_host_handle_t
+ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) {
+  // in the backend, the buffer-type handle is the buffer-type pointer
+  return (apir_buffer_type_host_handle_t) buft;
+}
diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp
new file mode 100644
index 000000000..f15f39c7f
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp
@@ -0,0 +1,57 @@
+#include <cstdint>
+#include "backend-internal.h"
+#include "backend-dispatched.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+
+#include "shared/apir_backend.h"
+
+struct timer_data graph_compute_timer = {0, 0, 0, "compute_timer"};
+
+uint32_t
+backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx); // NOTE(review): ctx is in fact used below (get_shmem_ptr)
+  UNUSED(enc); // NOTE(review): enc is in fact used below (status reply)
+
+  start_timer(&graph_compute_timer); // the whole dispatch, decoding included, is timed
+
+  uint32_t shmem_res_id;
+  vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
+
+  const void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id);
+  if (!shmem_data) {
+    FATAL("Couldn't get the shmem addr from virgl :/");
+  }
+  size_t cgraph_size;
+  vn_decode_size_t(dec, &cgraph_size);
+
+  struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, cgraph_size); // the serialized graph is read from shared memory, not from `dec`
+
+  ggml_cgraph *cgraph = vn_decode_ggml_cgraph(&secondary_dec, cgraph_size);
+
+  ggml_status status;
+#if APIR_BACKEND_CHECK_SUPPORTS_OP == 1
+  for (int idx = 0; idx < cgraph->n_nodes; idx++) { // optional pre-flight: refuse the graph at the first unsupported node
+    ggml_tensor *op = ggml_graph_node(cgraph, idx);
+    if (dev->iface.supports_op(dev, op)) {
+      continue;
+    }
+    ERROR("Graph node %d (%s) not supported by the backend :/", idx, ggml_op_desc(op));
+
+    status = GGML_STATUS_ABORTED;
+    vn_encode_ggml_status(enc, &status);
+
+    stop_timer(&graph_compute_timer);
+    return 0; // the failure is carried by the encoded status, not by the return code
+  }
+#endif
+  status = bck->iface.graph_compute(bck, cgraph);
+
+  vn_encode_ggml_status(enc, &status); // reply with whatever status the backend reported
+
+  stop_timer(&graph_compute_timer);
+
+  return 0;
+}
diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp
new file mode 100644
index 000000000..f925d1e06
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp
@@ -0,0 +1,81 @@
+#include <cstdint>
+#include "backend-internal.h"
+#include "backend-dispatched.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+
+uint32_t
+backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Reply with the name of the buffer type decoded from the command stream.
+  ggml_backend_buffer_type_t buft = vn_decode_ggml_buffer_type(dec);
+  UNUSED(ctx);
+
+  const char *name = buft->iface.get_name(buft);
+  const size_t name_size = strlen(name) + 1; // include the terminating NUL
+
+  vn_encode_array_size(enc, name_size);
+  vn_encode_char_array(enc, name, name_size);
+
+  return 0;
+}
+
+uint32_t
+backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Reply with the alignment of the buffer type decoded from the command stream.
+  ggml_backend_buffer_type_t buft = vn_decode_ggml_buffer_type(dec);
+  UNUSED(ctx);
+
+  size_t alignment = buft->iface.get_alignment(buft);
+  vn_encode_size_t(enc, &alignment);
+
+  return 0;
+}
+
+uint32_t
+backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Reply with the maximum allocation size of the decoded buffer type.
+  ggml_backend_buffer_type_t buft = vn_decode_ggml_buffer_type(dec);
+  UNUSED(ctx);
+  // get_max_size is an optional iface callback: go through the public wrapper,
+  // which supplies the default instead of calling a NULL function pointer.
+  size_t value = ggml_backend_buft_get_max_size(buft);
+  vn_encode_size_t(enc, &value);
+  return 0;
+}
+
+uint32_t
+backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Reply with whether the decoded buffer type reports itself as host memory.
+  ggml_backend_buffer_type_t buft = vn_decode_ggml_buffer_type(dec);
+  UNUSED(ctx);
+  // is_host is an optional iface callback: go through the public wrapper,
+  // which answers false instead of calling a NULL function pointer.
+  bool is_host = ggml_backend_buft_is_host(buft);
+  vn_encode_bool_t(enc, &is_host);
+  return 0;
+}
+
+uint32_t
+backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Allocates a buffer of the decoded type and size, replies with the buffer,
+  // and records successful allocations with track_backend_buffer().
+  UNUSED(ctx);
+
+  ggml_backend_buffer_type_t buft = vn_decode_ggml_buffer_type(dec);
+
+  size_t size;
+  vn_decode_size_t(dec, &size);
+
+  ggml_backend_buffer_t buffer = buft->iface.alloc_buffer(buft, size);
+
+  // The reply is encoded before tracking, including when the allocation failed.
+  vn_encode_ggml_buffer(enc, buffer);
+
+  if (buffer != NULL) {
+    track_backend_buffer(buffer);
+  }
+
+  return 0;
+}
diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp
new file mode 100644
index 000000000..fc1ccaef6
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp
@@ -0,0 +1,143 @@
+#include <cstdint>
+#include "backend-internal.h"
+#include "backend-dispatched.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+
+struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"};
+struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"};
+
+uint32_t
+backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Reply with the base address of the decoded buffer, encoded as an integer.
+  ggml_backend_buffer_t buffer = vn_decode_ggml_buffer(dec);
+  UNUSED(ctx);
+
+  uintptr_t base_addr = (uintptr_t) buffer->iface.get_base(buffer);
+  vn_encode_uintptr_t(enc, &base_addr);
+
+  return 0;
+}
+
+uint32_t
+backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  UNUSED(ctx); // NOTE(review): ctx is in fact used below (get_shmem_ptr)
+  UNUSED(enc); // no reply payload is encoded by this command
+
+  start_timer(&set_tensor_timer);
+
+  ggml_backend_buffer_t buffer;
+  buffer = vn_decode_ggml_buffer(dec);
+
+  ggml_tensor *tensor;
+  // safe to remove the const qualifier here
+  tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor(dec);
+
+  uint32_t shmem_res_id;
+  vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
+
+  size_t offset;
+  vn_decode_size_t(dec, &offset);
+
+  size_t size;
+  vn_decode_size_t(dec, &size);
+
+  void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); // the tensor payload is taken from shared memory
+
+  if (!shmem_data) {
+    FATAL("Couldn't get the shmem addr from virgl :/");
+  }
+
+#if 0
+  INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu",
+       buffer, tensor, shmem_data, offset, size);
+#endif
+#if 0
+  void **addr = (void **)(uintptr_t) shmem_data;
+  for (int i = 0; i <= 10; i++) {
+    INFO("%s: %p | %llx", __func__, addr, *addr);
+    addr++;
+  }
+  INFO("\n");
+#endif
+
+  buffer->iface.set_tensor(buffer, tensor, shmem_data, offset, size); // hand the decoded arguments straight to the backend buffer
+
+  stop_timer(&set_tensor_timer);
+
+  return 0;
+}
+
+uint32_t
+backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Asks the decoded buffer to copy tensor data (`size` bytes at `offset`)
+  // into the shared-memory region named by the decoded resource id.
+  // The result travels through that region: nothing is written to `enc`.
+  // The whole call, decoding included, is accounted to get_tensor_timer.
+  UNUSED(enc);
+
+  start_timer(&get_tensor_timer);
+
+  // Arguments are decoded in order: buffer, tensor, shmem id, offset, size.
+  ggml_backend_buffer_t buffer = vn_decode_ggml_buffer(dec);
+
+  const ggml_tensor *tensor = vn_decode_ggml_tensor(dec);
+
+  uint32_t shmem_res_id;
+  vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
+
+  size_t offset;
+  vn_decode_size_t(dec, &offset);
+
+  size_t size;
+  vn_decode_size_t(dec, &size);
+
+  // A resource id that does not resolve to a mapping is treated as fatal.
+  void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id);
+  if (shmem_data == NULL) {
+    FATAL("Couldn't get the shmem addr from virgl :/");
+  }
+
+  // The shared-memory mapping is passed as the destination pointer.
+  buffer->iface.get_tensor(buffer, tensor, shmem_data, offset, size);
+
+  stop_timer(&get_tensor_timer);
+
+  return 0;
+}
+
+uint32_t
+backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Clears the decoded buffer with the decoded byte value; no reply payload is encoded.
+  UNUSED(enc);
+  UNUSED(ctx);
+
+  ggml_backend_buffer_t buffer = vn_decode_ggml_buffer(dec);
+
+  uint8_t fill_byte;
+  vn_decode_uint8_t(dec, &fill_byte);
+
+  buffer->iface.clear(buffer, fill_byte);
+
+  return 0;
+}
+
+uint32_t
+backend_buffer_free_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Frees the decoded buffer if it is currently tracked.
+  // Returns 1, without freeing anything, when the handle is not tracked.
+  UNUSED(ctx);
+  UNUSED(enc);
+  ggml_backend_buffer_t buffer = vn_decode_ggml_buffer(dec);
+
+  if (!untrack_backend_buffer(buffer)) {
+    WARNING("%s: unknown buffer %p", __func__, (void *) buffer);
+    return 1;
+  }
+  // ggml_backend_buffer_free() runs the (optional) iface.free_buffer callback
+  // and also releases the ggml_backend_buffer object itself.
+  ggml_backend_buffer_free(buffer);
+  return 0;
+}
diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp
new file mode 100644
index 000000000..473e9d2db
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp
@@ -0,0 +1,142 @@
+#include <cstdint>
+#include "backend-internal.h"
+#include "backend-dispatched.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+
+uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Reply with the number of devices reported by the backend registry.
+  UNUSED(dec);
+  UNUSED(ctx);
+
+  int32_t device_count = reg->iface.get_device_count(reg);
+  vn_encode_int32_t(enc, &device_count);
+
+  return 0;
+}
+
+uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Reply with the device name: its size first, then the characters.
+  UNUSED(dec);
+  UNUSED(ctx);
+
+  const char *name = dev->iface.get_name(dev);
+  const size_t name_size = strlen(name) + 1; // include the terminating NUL
+
+  vn_encode_array_size(enc, name_size);
+  vn_encode_char_array(enc, name, name_size);
+  return 0;
+}
+
+uint32_t
+backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Reply with the device description: its size first, then the characters.
+  UNUSED(dec);
+  UNUSED(ctx);
+
+  const char *description = dev->iface.get_description(dev);
+  const size_t description_size = strlen(description) + 1; // include the terminating NUL
+
+  vn_encode_array_size(enc, description_size);
+  vn_encode_char_array(enc, description, description_size);
+  return 0;
+}
+
+uint32_t
+backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Reply with the device type, transported as a 32-bit unsigned integer.
+  UNUSED(dec);
+  UNUSED(ctx);
+  uint32_t device_type = dev->iface.get_type(dev);
+  vn_encode_uint32_t(enc, &device_type);
+
+  return 0;
+}
+
+uint32_t
+backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Reply with the device's free memory followed by its total memory.
+  UNUSED(dec);
+  UNUSED(ctx);
+
+  size_t free_mem;
+  size_t total_mem;
+  dev->iface.get_memory(dev, &free_mem, &total_mem);
+
+  vn_encode_size_t(enc, &free_mem);
+  vn_encode_size_t(enc, &total_mem);
+  return 0;
+}
+
+uint32_t
+backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Reply with whether the device supports the operation described by the decoded tensor.
+  UNUSED(ctx);
+
+  const ggml_tensor *op_tensor = vn_decode_ggml_tensor_inplace(dec);
+  bool is_supported = dev->iface.supports_op(dev, op_tensor);
+
+  vn_encode_bool_t(enc, &is_supported);
+
+  return 0;
+}
+
+uint32_t
+backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Reply with the buffer type reported by the device.
+  UNUSED(dec);
+  UNUSED(ctx);
+
+  ggml_backend_buffer_type_t device_buft = dev->iface.get_buffer_type(dev);
+  vn_encode_ggml_buffer_type(enc, device_buft);
+
+  return 0;
+}
+
+uint32_t
+backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Reply with the four capability flags of the device properties, in the
+  // order: async, host_buffer, buffer_from_host_ptr, events.
+  UNUSED(dec);
+  UNUSED(ctx);
+
+  struct ggml_backend_dev_props device_props;
+  dev->iface.get_props(dev, &device_props);
+  vn_encode_bool_t(enc, &device_props.caps.async);
+  vn_encode_bool_t(enc, &device_props.caps.host_buffer);
+  vn_encode_bool_t(enc, &device_props.caps.buffer_from_host_ptr);
+  vn_encode_bool_t(enc, &device_props.caps.events);
+  return 0;
+}
+
+uint32_t
+backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Creates a backend buffer on top of the shared-memory region identified by the
+  // decoded resource id, and replies with the buffer and its buffer type.
+
+  uint32_t shmem_res_id;
+  vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
+
+  void *shmem_ptr = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id);
+  if (!shmem_ptr) {
+    FATAL("Couldn't get the shmem addr from virgl :/");
+  }
+
+  size_t size;
+  vn_decode_size_t(dec, &size);
+  size_t max_tensor_size;
+  vn_decode_size_t(dec, &max_tensor_size);
+
+  ggml_backend_buffer_t buffer;
+  buffer = dev->iface.buffer_from_host_ptr(dev, shmem_ptr, size, max_tensor_size);
+
+  vn_encode_ggml_buffer(enc, buffer);
+  vn_encode_ggml_buffer_type(enc, buffer ? buffer->buft : NULL); // creation may fail: never dereference a NULL buffer
+
+  if (buffer) {
+    track_backend_buffer(buffer); // only successfully created buffers are tracked
+  }
+
+  return 0;
+}
diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-metal.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-metal.cpp
new file mode 100644
index 000000000..72f672f1d
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/backend-dispatched-metal.cpp
@@ -0,0 +1,41 @@
+#include <cstdint>
+#include "backend-internal.h"
+#include "backend-dispatched.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+
+void (*ggml_backend_metal_get_device_context_fct)(ggml_backend_dev_t dev,
+						  bool *has_simdgroup_mm,
+						  bool *has_simdgroup_reduction,
+						  bool *use_bfloat) = NULL;
+
+uint32_t
+backend_metal_get_device_context(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
+  // Replies with three Metal capability flags. When the function pointer is
+  // unset, the flags are reported as false and the call returns 1.
+  UNUSED(ctx);
+  UNUSED(dec);
+
+  // Initialized so that the error path never encodes indeterminate values.
+  bool has_simdgroup_mm = false;
+  bool has_simdgroup_reduction = false;
+  bool use_bfloat = false;
+
+  uint32_t ret = 0;
+  if (ggml_backend_metal_get_device_context_fct) {
+    ggml_backend_metal_get_device_context_fct(dev,
+                                              &has_simdgroup_mm,
+                                              &has_simdgroup_reduction,
+                                              &use_bfloat);
+  } else {
+    ERROR("ggml_backend_metal_get_device_context not available :/");
+    ret = 1;
+  }
+
+  vn_encode_bool_t(enc, &has_simdgroup_mm);
+  vn_encode_bool_t(enc, &has_simdgroup_reduction);
+  vn_encode_bool_t(enc, &use_bfloat);
+  return ret;
+}
diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp
new file mode 100644
index 000000000..e93f5bcce
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp
@@ -0,0 +1,47 @@
+#include <cstdint>
+#include "backend-internal.h"
+#include "backend-dispatched.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+
+#include "ggml-metal.h"
+
+ggml_backend_reg_t reg = NULL;
+ggml_backend_dev_t dev = NULL;
+ggml_backend_t bck = NULL;
+
+long long timer_start = 0;
+long long timer_total = 0;
+long long timer_count = 0;
+
+uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p) {
+  // Registers the GGML backend, selects its first device (when it has one) and calls the
+  // backend init function with argument 0. FATAL()s on re-initialization or failed registration.
+  if (reg != NULL) {
+    FATAL("%s: already initialized :/", __func__);
+  }
+  ggml_backend_reg_t (* ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p;
+  reg = ggml_backend_reg_fct();
+  if (reg == NULL) {
+    FATAL("%s: backend registration failed :/", __func__);
+  }
+
+  if (reg->iface.get_device_count(reg)) {
+    dev = reg->iface.get_device(reg, 0);
+  }
+
+  ggml_backend_t (* ggml_backend_fct)(int) = (ggml_backend_t (*)(int)) ggml_backend_init_fct_p;
+  bck = ggml_backend_fct(0);
+  if (!bck) {
+    ERROR("%s: backend initialization failed :/", __func__);
+    return APIR_BACKEND_INITIALIZE_BACKEND_FAILED;
+  }
+  if (dev) { // the registry may expose no device: there is nothing to query in that case
+    size_t free, total;
+    dev->iface.get_memory(dev, &free, &total);
+    INFO("%s: free memory: %zu MB", __func__, free/1024/1024);
+  }
+  return APIR_BACKEND_INITIALIZE_SUCCESS;
+}
diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h
new file mode 100644
index 000000000..3420735ca
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h
@@ -0,0 +1,121 @@
+#pragma once
+
+#include <cstdint>
+#include <cstddef>
+
+#include <ggml-backend.h>
+
+#include "backend-utils.h"
+#include "backend-convert.h"
+#include "shared/apir_backend.h"
+#include "shared/venus_cs.h"
+#include "shared/venus_cs_ggml.h"
+
+uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p);
+
+typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+
+/* *** */
+
+uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+
+/* device */
+uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+
+/* buffer-type */
+uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+
+/* buffer */
+uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+uint32_t backend_buffer_free_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+
+/* backend */
+uint32_t backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+
+/* metal */
+uint32_t backend_metal_get_device_context(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx);
+
+static inline const char *backend_dispatch_command_name(ApirBackendCommandType type)
+{ // maps a backend command type to a printable name, for logging
+  switch (type) {
+  /* device */
+  case APIR_COMMAND_TYPE_DEVICE_GET_COUNT: return "backend_get_device_count";
+  case APIR_COMMAND_TYPE_DEVICE_GET_NAME: return "backend_get_device_name";
+  case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: return "backend_get_device_description";
+  case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: return "backend_device_get_type";
+  case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: return "backend_get_device_memory";
+  case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: return "backend_device_supports_op";
+  case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: return "backend_get_buffer_type";
+  case APIR_COMMAND_TYPE_DEVICE_GET_PROPS: return "backend_get_props";
+  case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR: return "backend_buffer_from_ptr";
+
+  /* buffer-type */
+  case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: return "backend_buffer_type_get_name";
+  case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT: return "backend_buffer_type_get_alignment";
+  case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE: return "backend_buffer_type_get_max_size";
+  case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST: return "backend_buffer_type_is_host";
+  case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER: return "backend_buffer_type_alloc_buffer";
+
+  /* buffer */
+  case APIR_COMMAND_TYPE_BUFFER_GET_BASE: return "backend_buffer_get_base";
+  case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR: return "backend_buffer_set_tensor";
+  case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR: return "backend_buffer_get_tensor";
+  case APIR_COMMAND_TYPE_BUFFER_CLEAR: return "backend_buffer_clear";
+  case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER: return "backend_buffer_free_buffer";
+
+  /* backend */
+  case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE: return "backend_graph_compute";
+
+  /* metal */
+  case APIR_COMMAND_TYPE_METAL_GET_DEVICE_CONTEXT: return "metal_get_device_context";
+
+  default: return "unknown"; // any value without a case above
+  }
+}
+
+static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = { // handler per command type, indexed by the command value; slots not listed stay NULL
+  /* device */
+  [APIR_COMMAND_TYPE_DEVICE_GET_COUNT] = backend_reg_get_device_count,
+  [APIR_COMMAND_TYPE_DEVICE_GET_NAME] = backend_device_get_name,
+  [APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION] = backend_device_get_description,
+  [APIR_COMMAND_TYPE_DEVICE_GET_TYPE] = backend_device_get_type,
+  [APIR_COMMAND_TYPE_DEVICE_GET_MEMORY] = backend_device_get_memory,
+  [APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP] = backend_device_supports_op,
+  [APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE] = backend_device_get_buffer_type,
+  [APIR_COMMAND_TYPE_DEVICE_GET_PROPS] = backend_device_get_props,
+  [APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR] = backend_device_buffer_from_ptr,
+
+  /* buffer-type */
+  [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME] = backend_buffer_type_get_name,
+  [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT] = backend_buffer_type_get_alignment,
+  [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE] = backend_buffer_type_get_max_size,
+  [APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST] = backend_buffer_type_is_host,
+  [APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER] = backend_buffer_type_alloc_buffer,
+
+  /* buffer */
+  [APIR_COMMAND_TYPE_BUFFER_GET_BASE] = backend_buffer_get_base,
+  [APIR_COMMAND_TYPE_BUFFER_SET_TENSOR] = backend_buffer_set_tensor,
+  [APIR_COMMAND_TYPE_BUFFER_GET_TENSOR] = backend_buffer_get_tensor,
+  [APIR_COMMAND_TYPE_BUFFER_CLEAR] = backend_buffer_clear,
+  [APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER] = backend_buffer_free_buffer,
+
+  /* backend */
+  [APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE] = backend_graph_compute,
+
+  /* metal */
+  [APIR_COMMAND_TYPE_METAL_GET_DEVICE_CONTEXT] = backend_metal_get_device_context,
+}; // NOTE(review): array designators are C99 syntax; in C++ they rely on a compiler extension
diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h
new file mode 100644
index 000000000..4d7ef19e8
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/backend-internal.h
@@ -0,0 +1,35 @@
+#include <cstdio>
+#include <cstdarg>
+#include <cstdlib>
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "shared/api_remoting.h"
+
+extern ggml_backend_reg_t reg;
+extern ggml_backend_dev_t dev;
+extern ggml_backend_t bck;
+
+#define NOT_IMPLEMENTED	/* prints a warning the first time each expansion site runs */	\
+  do {									\
+    static bool first = true; /* one flag per expansion site */	\
+    if (first) {							\
+      printf("\nWARN: ###\nWARN: ### reached unimplemented function %s\nWARN: ###\n\n", __func__); \
+      first = false;							\
+    }									\
+  } while(0)
+
+extern "C" {
+  ApirLoadLibraryReturnCode apir_backend_initialize();
+  void apir_backend_deinit(void);
+  uint32_t apir_backend_dispatcher(uint32_t cmd_type, struct virgl_apir_context *ctx,
+				   char *dec_cur, const char *dec_end,
+				   char *enc_cur, const char *enc_end,
+				   char **enc_cur_after);
+}
+
+extern void (*ggml_backend_metal_get_device_context_fct)(ggml_backend_dev_t dev,
+							 bool *has_simdgroup_mm,
+							 bool *has_simdgroup_reduction,
+							 bool *use_bfloat);
diff --git a/ggml/src/ggml-remotingbackend/backend-utils.cpp b/ggml/src/ggml-remotingbackend/backend-utils.cpp
new file mode 100644
index 000000000..e69de29bb
diff --git a/ggml/src/ggml-remotingbackend/backend-utils.h b/ggml/src/ggml-remotingbackend/backend-utils.h
new file mode 100644
index 000000000..cf2898a71
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/backend-utils.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include <cstdarg>
+#include <cstdio>
+#include <cassert>
+
+#include <ggml.h>
+
+#define UNUSED GGML_UNUSED
+#define APIR_LLAMA_CPP_LOG_TO_FILE_ENV "APIR_LLAMA_CPP_LOG_TO_FILE"
+
+static FILE *
+get_log_dest(void)
+{
+   // Log destination, resolved on first use and cached for later calls.
+   static FILE *dest = NULL;
+   if (dest) {
+      return dest;
+   }
+   // Log to the file named by the env var when it is set, to stderr otherwise.
+   const char *apir_log_to_file = getenv(APIR_LLAMA_CPP_LOG_TO_FILE_ENV);
+   dest = apir_log_to_file ? fopen(apir_log_to_file, "w") : stderr;
+   if (!dest) {
+      dest = stderr; // fopen() failed: never hand a NULL stream to fprintf()
+   }
+
+   return dest;
+}
+
+#define APIR_VA_PRINT(prefix, format) /* for variadic functions whose last named parameter is `format` */ \
+   do {                                             \
+      FILE *dest = get_log_dest();                  \
+      fprintf(dest, prefix);                        \
+      va_list argptr;                               \
+      va_start(argptr, format);                     \
+      vfprintf(dest, format, argptr);               \
+      fprintf(dest, "\n");                          \
+      va_end(argptr);                               \
+      fflush(dest); /* flushed after every message */ \
+   } while (0)
+
+inline void
+INFO(const char *format, ...) { // printf-style logger: "INFO: " prefix, newline appended, stream flushed
+  APIR_VA_PRINT("INFO: ", format);
+}
+
+inline void
+WARNING(const char *format, ...) { // printf-style logger: "WARNING: " prefix, newline appended, stream flushed
+  APIR_VA_PRINT("WARNING: ", format);
+}
+
+inline void
+ERROR(const char *format, ...) { // printf-style logger: "ERROR: " prefix, newline appended, stream flushed
+  APIR_VA_PRINT("ERROR: ", format);
+}
+
+[[noreturn]] inline void
+FATAL(const char *format, ...) { // logs a printf-style message, then aborts the process
+  APIR_VA_PRINT("FATAL: ", format); // the message is flushed by the macro before abort()
+  abort();
+}
diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp
new file mode 100644
index 000000000..a7695834d
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/backend.cpp
@@ -0,0 +1,151 @@
+#include <iostream>
+#include <dlfcn.h>
+
+#include <ggml-backend.h>
+
+#include "backend-utils.h"
+#include "backend-internal.h"
+#include "backend-dispatched.h"
+
+#include "shared/api_remoting.h"
+#include "shared/apir_backend.h"
+#include "shared/venus_cs.h"
+
+#define GGML_BACKEND_LIBRARY_PATH_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_PATH"
+#define GGML_BACKEND_LIBRARY_REG_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_REG"
+#define GGML_BACKEND_LIBRARY_INIT_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_INIT"
+
+#define GGML_BACKEND_LIBRARY_METAL_DEVICE_CONTEXT "ggml_backend_metal_get_device_context"
+
+static void *backend_library_handle = NULL;
+
+extern "C" {
+  void apir_backend_deinit(void) { // tears the backend down: frees tracked buffers, prints stats, unloads the library
+    auto buffers = get_track_backend_buffers();
+    for (const auto& buffer: buffers) {
+      untrack_backend_buffer(buffer);
+      ggml_backend_buffer_free(buffer); // also releases the buffer object, not only what iface.free_buffer frees
+    }
+
+    if (dev) {
+      size_t free, total;
+      dev->iface.get_memory(dev, &free, &total);
+      INFO("%s: free memory: %zu MB", __func__, free/1024/1024);
+    }
+
+    show_timer(&graph_compute_timer);
+    show_timer(&set_tensor_timer);
+    show_timer(&get_tensor_timer);
+    // The library is unloaded last; presumably the calls above still run code from it.
+
+    if (backend_library_handle) {
+      INFO("%s: The GGML backend library was loaded. Unloading it.", __func__);
+      dlclose(backend_library_handle);
+    }
+
+    INFO("%s: bye-bye", __func__);
+  }
+
+  ApirLoadLibraryReturnCode apir_backend_initialize() { // dlopen()s the GGML library, resolves its entry points, then initializes the dispatcher
+    const char* dlsym_error;
+    // The library path and the entry-point symbol names all come from the environment.
+    const char* library_name = getenv(GGML_BACKEND_LIBRARY_PATH_ENV);
+    const char* library_reg = getenv(GGML_BACKEND_LIBRARY_REG_ENV);
+    const char* library_init = getenv(GGML_BACKEND_LIBRARY_INIT_ENV);
+    // Any of the three may be unset at this point: never hand a NULL pointer to "%s".
+    INFO("%s: loading %s (%s|%s)", __func__, library_name ? library_name : "(null)", library_reg ? library_reg : "(null)", library_init ? library_init : "(null)");
+
+    if (!library_name) {
+      ERROR("cannot open the GGML library: env var '%s' not defined\n", GGML_BACKEND_LIBRARY_PATH_ENV);
+
+      return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
+    }
+
+    backend_library_handle = dlopen(library_name, RTLD_LAZY);
+
+    if (!backend_library_handle) {
+      ERROR("cannot open the GGML library: %s", dlerror());
+
+      return APIR_LOAD_LIBRARY_CANNOT_OPEN;
+    }
+
+    if (!library_reg) {
+      ERROR("cannot register the GGML library: env var '%s' not defined", GGML_BACKEND_LIBRARY_REG_ENV);
+
+      return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
+    }
+
+    void *ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg);
+    dlsym_error = dlerror();
+    if (dlsym_error) {
+      ERROR("cannot find the GGML backend registration symbol '%s' (from %s): %s",
+            library_reg, GGML_BACKEND_LIBRARY_REG_ENV, dlsym_error);
+
+      return APIR_LOAD_LIBRARY_SYMBOL_MISSING;
+    }
+
+    if (!library_init) {
+      ERROR("cannot initialize the GGML library: env var '%s' not defined", GGML_BACKEND_LIBRARY_INIT_ENV); // name the variable: library_init is NULL here
+
+      return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
+    }
+
+    void *ggml_backend_init_fct = dlsym(backend_library_handle, library_init);
+    dlsym_error = dlerror();
+    if (dlsym_error) {
+      ERROR("cannot find the GGML backend init symbol '%s' (from %s): %s",
+            library_init, GGML_BACKEND_LIBRARY_INIT_ENV, dlsym_error);
+
+      return APIR_LOAD_LIBRARY_SYMBOL_MISSING;
+    }
+
+    ggml_backend_metal_get_device_context_fct = (void (*)(ggml_backend_dev_t, bool *, bool *, bool *)) dlsym(backend_library_handle, GGML_BACKEND_LIBRARY_METAL_DEVICE_CONTEXT);
+    dlsym_error = dlerror();
+    if (dlsym_error) {
+      ERROR("cannot find the GGML device context symbol '%s': %s\n",
+            GGML_BACKEND_LIBRARY_METAL_DEVICE_CONTEXT, dlsym_error);
+
+      return APIR_LOAD_LIBRARY_SYMBOL_MISSING;
+    }
+
+    uint32_t ret = backend_dispatch_initialize(ggml_backend_reg_fct, ggml_backend_init_fct);
+
+    return (ApirLoadLibraryReturnCode) (APIR_LOAD_LIBRARY_INIT_BASE_INDEX + ret); // backend init codes are reported above the loader's own codes
+  }
+
+  uint32_t apir_backend_dispatcher(uint32_t cmd_type, struct virgl_apir_context *ctx,
+				   char *dec_cur, const char *dec_end,
+				   char *enc_cur, const char *enc_end,
+				   char **enc_cur_after) {
+    // Builds the encoder/decoder over the caller's buffers, runs the handler registered
+    // for cmd_type, and reports through *enc_cur_after how far the reply was written.
+    struct vn_cs_encoder _enc = {
+      .cur = enc_cur,
+      .end = enc_end,
+    };
+    struct vn_cs_encoder *enc = &_enc;
+    struct vn_cs_decoder _dec = {
+      .cur = dec_cur,
+      .end = dec_end,
+    };
+    struct vn_cs_decoder *dec = &_dec;
+
+    if (cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) {
+      ERROR("Received an invalid dispatch index (%d >= %d)\n", cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT);
+      return APIR_BACKEND_FORWARD_INDEX_INVALID;
+    }
+#if 0
+    static long long count = 0;
+    INFO("[%lld] Calling %s", count, backend_dispatch_command_name((ApirBackendCommandType) cmd_type));
+    count += 1;
+#endif
+    backend_dispatch_t forward_fct = apir_backend_dispatch_table[cmd_type];
+    if (!forward_fct) { // slot left unpopulated in the dispatch table: do not call through NULL
+      ERROR("No dispatch function registered for index %u\n", cmd_type);
+      return APIR_BACKEND_FORWARD_INDEX_INVALID;
+    }
+    uint32_t ret = forward_fct(enc, dec, ctx);
+    *enc_cur_after = enc->cur;
+    return ret;
+  }
+}
diff --git a/ggml/src/ggml-remotingbackend/shared/api_remoting.h b/ggml/src/ggml-remotingbackend/shared/api_remoting.h
new file mode 100644
index 000000000..fe9d89bdc
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/api_remoting.h
@@ -0,0 +1,88 @@
+#pragma once
+
+/* the rest of this file must match virglrenderer/src/apir-protocol.h */
+
+#include <unistd.h>
+
+#define VENUS_COMMAND_TYPE_LENGTH 331
+
+#define APIR_PROTOCOL_MAJOR 0
+#define APIR_PROTOCOL_MINOR 1
+
+#define APIR_HANDSHAKE_MAGIC 0xab1e
+
+typedef enum {
+    APIR_COMMAND_TYPE_HandShake = 0,
+    APIR_COMMAND_TYPE_LoadLibrary = 1,
+    APIR_COMMAND_TYPE_Forward = 2,
+
+    APIR_COMMAND_TYPE_LENGTH = 3,
+} ApirCommandType;
+
+typedef uint64_t ApirCommandFlags;
+
+typedef enum {
+    APIR_LOAD_LIBRARY_SUCCESS = 0,
+    APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR = 1,
+    APIR_LOAD_LIBRARY_ALREADY_LOADED = 2,
+    APIR_LOAD_LIBRARY_ENV_VAR_MISSING = 3,
+    APIR_LOAD_LIBRARY_CANNOT_OPEN = 4,
+    APIR_LOAD_LIBRARY_SYMBOL_MISSING = 5,
+    APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6, // anything above this is a APIR backend library initialization return code
+} ApirLoadLibraryReturnCode;
+
+typedef enum {
+    APIR_FORWARD_SUCCESS = 0,
+    APIR_FORWARD_NO_DISPATCH_FCT = 1,
+    APIR_FORWARD_TIMEOUT = 2,
+
+    APIR_FORWARD_BASE_INDEX = 3, // anything above this is a APIR backend library forward return code
+} ApirForwardReturnCode;
+
+__attribute__((unused))
+static inline const char *apir_command_name(ApirCommandType type)
+{
+  switch (type) {
+  case APIR_COMMAND_TYPE_HandShake: return "HandShake";
+  case APIR_COMMAND_TYPE_LoadLibrary: return "LoadLibrary";
+  case APIR_COMMAND_TYPE_Forward: return "Forward";
+  default: return "unknown";
+  }
+}
+
+__attribute__((unused))
+static const char *apir_load_library_error(ApirLoadLibraryReturnCode code) {
+#define APIR_LOAD_LIBRARY_ERROR(code_name) \
+  do {						 \
+    if (code == code_name) return #code_name;	 \
+  } while (0)					 \
+
+  APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_SUCCESS);
+  APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR);
+  APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_ALREADY_LOADED);
+  APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_ENV_VAR_MISSING);
+  APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_CANNOT_OPEN);
+  APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_SYMBOL_MISSING);
+  APIR_LOAD_LIBRARY_ERROR(APIR_LOAD_LIBRARY_INIT_BASE_INDEX);
+
+  return "Unknown APIR_COMMAND_TYPE_LoadLibrary error";
+
+#undef APIR_LOAD_LIBRARY_ERROR
+}
+
+__attribute__((unused))
+static const char *apir_forward_error(ApirForwardReturnCode code) {
+#define APIR_FORWARD_ERROR(code_name) \
+  do {						 \
+    if (code == code_name) return #code_name;	 \
+  } while (0)					 \
+
+  APIR_FORWARD_ERROR(APIR_FORWARD_SUCCESS);
+  APIR_FORWARD_ERROR(APIR_FORWARD_NO_DISPATCH_FCT);
+  APIR_FORWARD_ERROR(APIR_FORWARD_TIMEOUT);
+  APIR_FORWARD_ERROR(APIR_FORWARD_BASE_INDEX);
+
+  return "Unknown APIR_COMMAND_TYPE_Forward error";
+
+#undef APIR_FORWARD_ERROR
+}
diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h
new file mode 100644
index 000000000..32553e49e
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h
@@ -0,0 +1,139 @@
+#pragma once
+
+#define APIR_BACKEND_INITIALIZE_SUCCESS 0
+#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY 1
+#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY 2
+#define APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS 3
+#define APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS 4
+
+#define APIR_BACKEND_INITIALIZE_BACKEND_FAILED 5
+// new entries here need to be added to the apir_backend_initialize_error function below
+
+#define APIR_BACKEND_FORWARD_INDEX_INVALID 6
+
+// 0 is fast, 1 keeps the backend from crashing when an unsupported tensor is received
+#define APIR_BACKEND_CHECK_SUPPORTS_OP 0
+
+typedef uintptr_t apir_buffer_type_host_handle_t;
+typedef uintptr_t apir_buffer_host_handle_t;
+
+typedef struct {
+  apir_buffer_host_handle_t host_handle;
+
+  struct vn_renderer_shmem *shmem;
+  apir_buffer_type_host_handle_t buft_host_handle;
+} apir_buffer_context_t;
+
+struct vn_dispatch_context;
+struct virgl_apir_context;
+
+typedef enum ApirBackendCommandType {
+  /* device */
+  APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 0,
+  APIR_COMMAND_TYPE_DEVICE_GET_NAME = 1,
+  APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 2,
+  APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 3,
+  APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 4,
+  APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 5,
+  APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = 6,
+  APIR_COMMAND_TYPE_DEVICE_GET_PROPS = 7,
+  APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR = 8,
+
+  /* buffer-type */
+  APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 9,
+  APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 10,
+  APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 11,
+  APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 12,
+  APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 13,
+
+  /* buffer */
+  APIR_COMMAND_TYPE_BUFFER_GET_BASE = 14,
+  APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 15,
+  APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 16,
+  APIR_COMMAND_TYPE_BUFFER_CLEAR = 17,
+  APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = 18,
+
+  /* backend */
+  APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 19,
+
+  /* metal */
+  APIR_COMMAND_TYPE_METAL_GET_DEVICE_CONTEXT = 20,
+
+  // last command_type index + 1
+  APIR_BACKEND_DISPATCH_TABLE_COUNT = 21,
+} ApirBackendCommandType;
+
+
+struct virgl_apir_callbacks {
+  void *(*get_shmem_ptr)(struct vn_dispatch_context *ctx, uint32_t res_id);
+};
+
+struct virgl_apir_context {
+  struct vn_dispatch_context *virgl_ctx;
+
+  struct virgl_apir_callbacks iface;
+};
+
+struct timer_data {
+  long long start;
+  long long total;
+  long long count;
+  const char *name;
+};
+
+extern struct timer_data graph_compute_timer;
+extern struct timer_data get_tensor_timer;
+extern struct timer_data set_tensor_timer;
+extern struct timer_data wait_host_reply_timer;
+extern struct timer_data get_tensor_from_ptr_timer;
+extern struct timer_data set_tensor_from_ptr_timer;
+
+static inline void start_timer(struct timer_data *timer) {
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  timer->start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+}
+
+// returns the duration in ns
+static inline long long stop_timer(struct timer_data *timer) {
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+
+  long long duration = (timer_end - timer->start);
+  timer->total += duration;
+  timer->count += 1;
+
+  return duration;
+}
+
+// Logs the totals accumulated in `timer`; prints nothing for an unused timer.
+static inline void show_timer(struct timer_data *timer) {
+  if (!timer->total || !timer->count) {
+    return; // also keeps the per-call divisions below away from a zero count
+  }
+  double ms = timer->total / 1000000.0; // ns -> ms without integer truncation
+  double itl = ms / timer->count;
+  double speed = 1 / itl * 1000;
+
+  INFO("%15s [%9.0f] ms for %4lld invocations | ITL %2.2f ms | throughput = %4.2f t/s (%4.2f ms/call)",
+       timer->name, ms, timer->count, itl, speed, ms/timer->count);
+}
+
+// Maps an APIR_BACKEND_INITIALIZE_* code to its name; unlisted codes get a generic string.
+__attribute__((unused))
+static const char *apir_backend_initialize_error(int code) {
+#define APIR_BACKEND_INITIALIZE_ERROR(code_name) \
+  do {						 \
+    if (code == code_name) return #code_name;	 \
+  } while (0)
+
+  APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_SUCCESS);
+  APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY);
+  APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY);
+  APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS);
+  APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS);
+  APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_FAILED);
+  return "Unknown APIR_BACKEND_INITIALIZE error:/";
+#undef APIR_BACKEND_INITIALIZE_ERROR
+}
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h
new file mode 100644
index 000000000..e67c99a46
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h
@@ -0,0 +1,554 @@
+#pragma once
+
+#include <cassert>
+#include <cstring>
+
+// needs UNUSED to be defined
+// needs FATAL to be defined
+
+#define likely(x)   __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+struct vn_cs_encoder {
+  char* cur;
+  const char *start;
+  const char* end;
+};
+
+struct vn_cs_decoder {
+  const char* cur;
+  const char* end;
+};
+
+/*
+ * new encoder and decoder
+ */
+
+static struct vn_cs_decoder
+vn_cs_new_decoder(const char *ptr, size_t size) {
+  struct vn_cs_decoder dec = {
+      .cur = ptr,
+      .end = ptr + size,
+  };
+
+  return dec;
+}
+
+static struct vn_cs_encoder
+vn_cs_new_encoder(char *ptr, size_t size) {
+  struct vn_cs_encoder enc = {
+      .cur = ptr,
+      .start = ptr,
+      .end = ptr + size,
+  };
+
+  return enc;
+}
+
+/*
+ * decoder peek
+ */
+
+static inline bool
+vn_cs_decoder_peek_internal(const struct vn_cs_decoder *dec,
+                            size_t size,
+                            void *val,
+                            size_t val_size)
+{
+  assert(val_size <= size);
+
+  if (unlikely(size > (size_t) (dec->end - dec->cur))) {
+    FATAL("READING TOO MUCH FROM THE DECODER :/");
+    //vn_cs_decoder_set_fatal(dec);
+    memset(val, 0, val_size);
+    return false;
+  }
+
+  /* we should not rely on the compiler to optimize away memcpy... */
+  memcpy(val, dec->cur, val_size);
+  return true;
+}
+
+static inline void
+vn_cs_decoder_peek(const struct vn_cs_decoder *dec,
+                   size_t size,
+                   void *val,
+                   size_t val_size)
+{
+  vn_cs_decoder_peek_internal(dec, size, val, val_size);
+}
+
+/* Consumes `size` bytes and returns where they start, without copying; NULL if they do not fit. */
+static inline const void *
+vn_cs_decoder_use_inplace(struct vn_cs_decoder *dec, size_t size)
+{
+  if (unlikely(size > (size_t) (dec->end - dec->cur))) {
+    FATAL("READING TOO MUCH FROM THE DECODER :/");
+    return NULL; /* if FATAL returns, never hand out a range past dec->end */
+  }
+  const void *addr = dec->cur;
+  dec->cur += size;
+  return addr;
+}
+
+/*
+ * read/write
+ */
+
+static inline void
+vn_cs_decoder_read(struct vn_cs_decoder *dec,
+                   size_t size,
+                   void *val,
+                   size_t val_size)
+{
+  if (vn_cs_decoder_peek_internal(dec, size, val, val_size))
+    dec->cur += size;
+}
+
+/* Copies val_size bytes of `val` at the cursor, advances it by `size`; NULL if `size` does not fit. */
+static inline char *
+vn_cs_encoder_write(struct vn_cs_encoder *enc, size_t size,
+                    const void *val, size_t val_size)
+{
+  assert(val_size <= size);
+  /* checked at runtime, like the decoder side: assert() vanishes under NDEBUG */
+  if (unlikely(size > (size_t) (enc->end - enc->cur))) {
+    FATAL("WRITING TOO MUCH INTO THE ENCODER :/");
+    return NULL;
+  }
+  char *write_addr = enc->cur;
+  memcpy(write_addr, val, val_size);
+  enc->cur += size;
+  return write_addr;
+}
+
+/*
+ * encode/decode
+ */
+
+static inline void
+vn_decode(struct vn_cs_decoder *dec, size_t size, void *data, size_t data_size)
+{
+  assert(size % 4 == 0);
+  vn_cs_decoder_read(dec, size, data, data_size);
+}
+
+static inline void
+vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_size)
+{
+  assert(size % 4 == 0);
+  /* TODO check if the generated code is optimal */
+  vn_cs_encoder_write(enc, size, data, data_size);
+}
+
+/*
+ * typed encode/decode
+ */
+
+/* uint8_t */
+
+static inline void
+vn_encode_uint8_t(struct vn_cs_encoder *enc, const uint8_t *val)
+{
+  vn_encode(enc, sizeof(int), val, sizeof(*val));
+}
+
+static inline void
+vn_decode_uint8_t(struct vn_cs_decoder *dec, uint8_t *val)
+{
+  vn_decode(dec, sizeof(int), val, sizeof(*val));
+}
+
+/* uint64_t */
+
+static inline size_t
+vn_sizeof_uint64_t(const uint64_t *val)
+{
+  assert(sizeof(*val) == 8);
+#ifdef NDEBUG
+  UNUSED(val);
+#endif
+  return 8;
+}
+
+static inline void
+vn_encode_uint64_t(struct vn_cs_encoder *enc, const uint64_t *val)
+{
+  vn_encode(enc, 8, val, sizeof(*val));
+}
+
+static inline void
+vn_decode_uint64_t(struct vn_cs_decoder *dec, uint64_t *val)
+{
+  vn_decode(dec, 8, val, sizeof(*val));
+}
+
+static inline size_t
+vn_sizeof_uint64_t_array(const uint64_t *val, uint32_t count)
+{
+  assert(sizeof(*val) == 8);
+  const size_t size = sizeof(*val) * count;
+  assert(size >= count);
+  return size;
+}
+
+static inline void
+vn_encode_uint64_t_array(struct vn_cs_encoder *enc, const uint64_t *val, uint32_t count)
+{
+  const size_t size = sizeof(*val) * count;
+  assert(size >= count);
+  vn_encode(enc, size, val, size);
+}
+
+static inline void
+vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t count)
+{
+  const size_t size = sizeof(*val) * count;
+  assert(size >= count);
+  vn_decode(dec, size, val, size);
+}
+
+static inline const uint64_t *
+vn_decode_uint64_t_array_inplace(struct vn_cs_decoder *dec, uint32_t count)
+{
+  return (uint64_t *)(uintptr_t) vn_cs_decoder_use_inplace(dec, count * sizeof(uint64_t));
+}
+
+/* int32_t */
+
+static inline size_t
+vn_sizeof_int32_t(const int32_t *val)
+{
+  assert(sizeof(*val) == 4);
+#ifdef NDEBUG
+  UNUSED(val);
+#endif
+  return 4;
+}
+
+static inline void
+vn_encode_int32_t(struct vn_cs_encoder *enc, const int32_t *val)
+{
+  vn_encode(enc, 4, val, sizeof(*val));
+}
+
+static inline void
+vn_decode_int32_t(struct vn_cs_decoder *dec, int32_t *val)
+{
+  vn_decode(dec, 4, val, sizeof(*val));
+}
+
+static inline size_t
+vn_sizeof_int32_t_array(const int32_t *val, uint32_t count)
+{
+  assert(sizeof(*val) == 4);
+  const size_t size = sizeof(*val) * count;
+  assert(size >= count);
+  return size;
+}
+
+static inline void
+vn_encode_int32_t_array(struct vn_cs_encoder *enc, const int32_t *val, uint32_t count)
+{
+  const size_t size = sizeof(*val) * count;
+  assert(size >= count);
+  vn_encode(enc, size, val, size);
+}
+
+static inline void
+vn_decode_int32_t_array(struct vn_cs_decoder *dec, int32_t *val, uint32_t count)
+{
+  const size_t size = sizeof(*val) * count;
+  assert(size >= count);
+  vn_decode(dec, size, val, size);
+}
+
+/* array size (uint64_t) */
+
+static inline size_t
+vn_sizeof_array_size(uint64_t size)
+{
+  return vn_sizeof_uint64_t(&size);
+}
+
+static inline void
+vn_encode_array_size(struct vn_cs_encoder *enc, uint64_t size)
+{
+  vn_encode_uint64_t(enc, &size);
+}
+
+/* Decodes an array size that must equal expected_size; a mismatch is fatal (0 if FATAL returns). */
+static inline uint64_t
+vn_decode_array_size(struct vn_cs_decoder *dec, uint64_t expected_size)
+{
+  uint64_t size;
+  vn_decode_uint64_t(dec, &size);
+  if (size != expected_size) {
+    FATAL("UNEXPECTED ARRAY SIZE IN THE DECODER :/");
+    size = 0;
+  }
+  return size;
+}
+
+static inline uint64_t
+vn_decode_array_size_unchecked(struct vn_cs_decoder *dec)
+{
+  uint64_t size;
+  vn_decode_uint64_t(dec, &size);
+  return size;
+}
+
+static inline uint64_t
+vn_peek_array_size(struct vn_cs_decoder *dec)
+{
+  uint64_t size;
+  vn_cs_decoder_peek(dec, sizeof(size), &size, sizeof(size));
+  return size;
+}
+
+/* non-array pointer */
+
+static inline size_t
+vn_sizeof_simple_pointer(const void *val)
+{
+  return vn_sizeof_array_size(val ? 1 : 0);
+}
+
+static inline bool
+vn_encode_simple_pointer(struct vn_cs_encoder *enc, const void *val)
+{
+  vn_encode_array_size(enc, val ? 1 : 0);
+  return val;
+}
+
+static inline bool
+vn_decode_simple_pointer(struct vn_cs_decoder *dec)
+{
+  return vn_decode_array_size_unchecked(dec);
+}
+
+/* uint32_t */
+
+static inline size_t
+vn_sizeof_uint32_t(const uint32_t *val)
+{
+  assert(sizeof(*val) == 4);
+#ifdef NDEBUG
+  UNUSED(val);
+#endif
+  return 4;
+}
+
+static inline void
+vn_encode_uint32_t(struct vn_cs_encoder *enc, const uint32_t *val)
+{
+  vn_encode(enc, 4, val, sizeof(*val));
+}
+
+static inline void
+vn_decode_uint32_t(struct vn_cs_decoder *dec, uint32_t *val)
+{
+  vn_decode(dec, 4, val, sizeof(*val));
+}
+
+static inline size_t
+vn_sizeof_uint32_t_array(const uint32_t *val, uint32_t count)
+{
+  assert(sizeof(*val) == 4);
+  const size_t size = sizeof(*val) * count;
+  assert(size >= count);
+  return size;
+}
+
+static inline void
+vn_encode_uint32_t_array(struct vn_cs_encoder *enc, const uint32_t *val, uint32_t count)
+{
+  const size_t size = sizeof(*val) * count;
+  assert(size >= count);
+  vn_encode(enc, size, val, size);
+}
+
+static inline void
+vn_decode_uint32_t_array(struct vn_cs_decoder *dec, uint32_t *val, uint32_t count)
+{
+  const size_t size = sizeof(*val) * count;
+  assert(size >= count);
+  vn_decode(dec, size, val, size);
+}
+
+/* size_t */
+
+static inline size_t
+vn_sizeof_size_t(const size_t *val)
+{
+    return sizeof(*val);
+}
+
+static inline void
+vn_encode_size_t(struct vn_cs_encoder *enc, const size_t *val)
+{
+    const uint64_t tmp = *val;
+    vn_encode_uint64_t(enc, &tmp);
+}
+
+static inline void
+vn_decode_size_t(struct vn_cs_decoder *dec, size_t *val)
+{
+    uint64_t tmp;
+    vn_decode_uint64_t(dec, &tmp);
+    *val = tmp;
+}
+
+static inline size_t
+vn_sizeof_size_t_array(const size_t *val, uint32_t count)
+{
+    return vn_sizeof_size_t(val) * count;
+}
+
+static inline void
+vn_encode_size_t_array(struct vn_cs_encoder *enc, const size_t *val, uint32_t count)
+{
+    if (sizeof(size_t) == sizeof(uint64_t)) {
+        vn_encode_uint64_t_array(enc, (const uint64_t *)val, count);
+    } else {
+        for (uint32_t i = 0; i < count; i++)
+            vn_encode_size_t(enc, &val[i]);
+    }
+}
+
+static inline void
+vn_decode_size_t_array(struct vn_cs_decoder *dec, size_t *val, uint32_t count)
+{
+    if (sizeof(size_t) == sizeof(uint64_t)) {
+        vn_decode_uint64_t_array(dec, (uint64_t *)val, count);
+    } else {
+        for (uint32_t i = 0; i < count; i++)
+            vn_decode_size_t(dec, &val[i]);
+    }
+}
+
+/* opaque blob */
+
+static inline size_t
+vn_sizeof_blob_array(const void *val, size_t size)
+{
+  UNUSED(val);
+  return (size + 3) & ~3;
+}
+
+static inline void
+vn_encode_blob_array(struct vn_cs_encoder *enc, const void *val, size_t size)
+{
+  vn_encode(enc, (size + 3) & ~3, val, size);
+}
+
+static inline void
+vn_decode_blob_array(struct vn_cs_decoder *dec, void *val, size_t size)
+{
+  vn_decode(dec, (size + 3) & ~3, val, size);
+}
+
+/* string */
+
+static inline size_t
+vn_sizeof_char_array(const char *val, size_t size)
+{
+  return vn_sizeof_blob_array(val, size);
+}
+
+static inline void
+vn_encode_char_array(struct vn_cs_encoder *enc, const char *val, size_t size)
+{
+  assert(size && strlen(val) < size);
+  vn_encode_blob_array(enc, val, size);
+}
+
+/* Decodes `size` bytes into val and forces NUL termination; a zero size is fatal. */
+static inline void
+vn_decode_char_array(struct vn_cs_decoder *dec, char *val, size_t size)
+{
+  vn_decode_blob_array(dec, val, size);
+  if (size)
+    val[size - 1] = '\0';
+  else {
+    FATAL("Couldn't decode the blob array");
+  }
+}
+
+/* (temp) buffer allocation */
+
+static inline void *
+vkr_cs_decoder_alloc_array(struct vkr_cs_decoder *dec, size_t size, size_t count)
+{
+  UNUSED(dec);
+  size_t alloc_size;
+  if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) {
+    FATAL("overflow in array allocation of %zu * %zu bytes", size, count);
+    return NULL;
+  }
+
+  return malloc(alloc_size);
+}
+
+static inline void *
+vn_cs_decoder_alloc_array(struct vn_cs_decoder *dec, size_t size, size_t count)
+{
+  struct vkr_cs_decoder *d = (struct vkr_cs_decoder *)dec;
+  return vkr_cs_decoder_alloc_array(d, size, count);
+}
+
+/* bool */
+
+static inline void
+vn_encode_bool_t(struct vn_cs_encoder *enc, const bool *val)
+{
+  vn_encode(enc, sizeof(int), val, sizeof(bool));
+}
+
+static inline void
+vn_decode_bool_t(struct vn_cs_decoder *dec, bool *val)
+{
+  vn_decode(dec, sizeof(int), val, sizeof(bool));
+}
+
+/* apir_buffer_type_host_handle_t */
+
+static inline void
+vn_encode_apir_buffer_type_host_handle_t(struct vn_cs_encoder *enc, const apir_buffer_type_host_handle_t *val)
+{
+  vn_encode(enc, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
+}
+
+static inline void
+vn_decode_apir_buffer_type_host_handle_t(struct vn_cs_decoder *dec, apir_buffer_type_host_handle_t *val)
+{
+  vn_decode(dec, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
+}
+
+/* apir_buffer_host_handle_t */
+
+static inline void
+vn_encode_apir_buffer_host_handle_t(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *val)
+{
+  vn_encode(enc, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t));
+}
+
+static inline void
+vn_decode_apir_buffer_host_handle_t(struct vn_cs_decoder *dec, apir_buffer_host_handle_t *val)
+{
+  vn_decode(dec, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t));
+}
+
+/* uintptr_t */
+
+static inline void
+vn_encode_uintptr_t(struct vn_cs_encoder *enc, const uintptr_t *val)
+{
+  vn_encode(enc, sizeof(*val), val, sizeof(*val));
+}
+
+static inline void
+vn_decode_uintptr_t(struct vn_cs_decoder *dec, uintptr_t *val)
+{
+  vn_decode(dec, sizeof(*val), val, sizeof(*val));
+}
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp
new file mode 100644
index 000000000..196cd7095
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp
@@ -0,0 +1,167 @@
+#include <vector>
+#include <unordered_set>
+#include <unordered_map>
+#include <cinttypes>
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "venus_cs_ggml-rpc.h"
+
+std::unordered_set<ggml_backend_buffer_t> backend_buffers;
+
+void
+track_backend_buffer(ggml_backend_buffer_t buffer) {
+  backend_buffers.insert(buffer);
+}
+
+// Flattens a ggml_tensor into an rpc_tensor; pointers are carried as 64-bit integers.
+rpc_tensor
+serialize_tensor(const ggml_tensor * tensor) {
+  rpc_tensor result;
+  // rpc_tensor is encoded byte-for-byte: zero it so padding and the tail of name are defined
+  memset(&result, 0, sizeof(result));
+  result.id = reinterpret_cast<uint64_t>(tensor);
+  result.type = tensor->type;
+  if (tensor->buffer) { // stays 0 when the tensor has no buffer
+    ggml_backend_buffer_t buffer = tensor->buffer;
+    result.buffer = BUFFER_TO_HANDLE(buffer);
+  }
+  for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+    result.ne[i] = tensor->ne[i];
+    result.nb[i] = tensor->nb[i];
+  }
+  result.op = tensor->op;
+  for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+    result.op_params[i] = tensor->op_params[i];
+  }
+  result.flags = tensor->flags;
+  for (uint32_t i = 0; i < GGML_MAX_SRC; i++) {
+    result.src[i] = reinterpret_cast<uint64_t>(tensor->src[i]);
+  }
+  result.view_src = reinterpret_cast<uint64_t>(tensor->view_src);
+  result.view_offs = tensor->view_offs;
+  result.data = reinterpret_cast<uint64_t>(tensor->data);
+  snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name);
+  return result;
+}
+
+// Rebuilds a ggml_tensor inside ctx from its rpc_tensor form (src/view links are not set here).
+ggml_tensor *
+deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
+  ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
+                                            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+  for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+    result->nb[i] = tensor->nb[i];
+  }
+  // only handles recorded through track_backend_buffer() are kept; any other one is dropped
+  result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
+  if (result->buffer && backend_buffers.find(result->buffer) == backend_buffers.end()) {
+    printf("WARNING: BUFFER NOT FOUND | %p\n", (void *)result->buffer);
+    result->buffer = nullptr;
+  }
+  if (result->buffer) {
+    // require that the tensor data does not go beyond the buffer end
+    uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
+    uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
+    uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
+    GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
+    GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
+  }
+  result->op = (ggml_op) tensor->op;
+  for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+    result->op_params[i] = tensor->op_params[i];
+  }
+  result->flags = tensor->flags;
+  result->data = reinterpret_cast<void *>(tensor->data);
+  ggml_set_name(result, tensor->name);
+  return result;
+}
+
+void
+add_tensor(ggml_tensor * tensor, std::vector<rpc_tensor> & tensors, std::unordered_set<ggml_tensor*> & visited) {
+  if (tensor == nullptr) {
+    return;
+  }
+  if (visited.find(tensor) != visited.end()) {
+    return;
+  }
+  visited.insert(tensor);
+  for (int i = 0; i < GGML_MAX_SRC; i++) {
+    add_tensor(tensor->src[i], tensors, visited);
+  }
+  add_tensor(tensor->view_src, tensors, visited);
+  tensors.push_back(serialize_tensor(tensor));
+}
+
+// Packs the graph into `output`: the node ids, then every tensor reachable from the nodes.
+void
+serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output) {
+  uint32_t n_nodes = cgraph->n_nodes;
+  std::vector<rpc_tensor> tensors;
+  std::unordered_set<ggml_tensor*> visited;
+  for (uint32_t i = 0; i < n_nodes; i++) {
+    add_tensor(cgraph->nodes[i], tensors, visited);
+  }
+  // serialization format:
+  // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t)) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
+  uint32_t n_tensors = tensors.size();
+  size_t output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor); // size_t: never narrow the byte count to int
+  output.resize(output_size, 0);
+  memcpy(output.data(), &n_nodes, sizeof(n_nodes));
+  for (uint32_t i = 0; i < n_nodes; i++) {
+    memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
+  }
+  uint8_t * tail = output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t); // first byte after the node ids
+  memcpy(tail, &n_tensors, sizeof(n_tensors));
+  memcpy(tail + sizeof(n_tensors), tensors.data(), n_tensors * sizeof(rpc_tensor));
+}
+
+// Recursively rebuilds tensor `id` together with its sources and view source.
+// Id 0 maps to nullptr; tensor_map caches every tensor that was already rebuilt.
+ggml_tensor *
+create_node(uint64_t id, struct ggml_context * ctx,
+            const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
+            std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map) {
+  if (id == 0) { return nullptr; }
+  auto known = tensor_map.find(id);
+  if (known != tensor_map.end()) {
+    return known->second;
+  }
+  const rpc_tensor * wire = tensor_ptrs.at(id); // at() throws for an id missing from the table
+  struct ggml_tensor * node = deserialize_tensor(ctx, wire);
+  if (node == nullptr) {
+    return nullptr;
+  }
+  tensor_map[id] = node; // cached before descending into the sources
+  for (int i = 0; i < GGML_MAX_SRC; i++) {
+    node->src[i] = create_node(wire->src[i], ctx, tensor_ptrs, tensor_map);
+  }
+  node->view_src = create_node(wire->view_src, ctx, tensor_ptrs, tensor_map);
+  node->view_offs = wire->view_offs;
+  return node;
+}
+
+// Rebuilds a ggml_cgraph from the decoded node ids and the rpc_tensor table.
+ggml_cgraph *
+deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes) {
+  size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
+  struct ggml_init_params params = {
+    /*.mem_size   =*/ buf_size,
+    /*.mem_buffer =*/ NULL,
+    /*.no_alloc   =*/ true,
+  };
+  struct ggml_context * ctx = ggml_init(params); // NOTE(review): not freed here; the graph and its tensors are created in it
+  struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false);
+  graph->n_nodes = n_nodes;
+  std::unordered_map<uint64_t, const rpc_tensor*> tensor_ptrs;
+  for (uint32_t i = 0; i < n_tensors; i++) {
+    tensor_ptrs[tensors[i].id] = &tensors[i];
+  }
+  std::unordered_map<uint64_t, ggml_tensor*> tensor_map;
+  for (uint32_t i = 0; i < n_nodes; i++) {
+    uint64_t id; // unsigned, like rpc_tensor::id and the create_node() parameter
+    memcpy(&id, &nodes[i], sizeof(id));
+    graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
+  }
+  return graph;
+}
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h
new file mode 100644
index 000000000..96402287a
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h
@@ -0,0 +1,45 @@
+#include <vector>
+#include <unordered_set>
+#include <unordered_map>
+
+// ggml_tensor is serialized into rpc_tensor
+struct rpc_tensor {
+  uint64_t id;
+  uint32_t type;
+  uint64_t buffer;
+  uint32_t ne[GGML_MAX_DIMS];
+  uint32_t nb[GGML_MAX_DIMS];
+  uint32_t op;
+  int32_t  op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+  int32_t  flags;
+  uint64_t src[GGML_MAX_SRC];
+  uint64_t view_src;
+  uint64_t view_offs;
+  uint64_t data;
+  char name[GGML_MAX_NAME];
+
+  char padding[4];
+};
+
+/* frontend */
+
+rpc_tensor serialize_tensor(const ggml_tensor * tensor);
+
+void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output);
+
+/* backend */
+
+void track_backend_buffer(ggml_backend_buffer_t buffer);
+bool untrack_backend_buffer(ggml_backend_buffer_t buffer);
+std::unordered_set<ggml_backend_buffer_t> get_track_backend_buffers();
+
+void add_tensor(ggml_tensor * tensor, std::vector<rpc_tensor> & tensors, std::unordered_set<ggml_tensor*> & visited);
+
+ggml_tensor *deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
+
+ggml_tensor *create_node(uint64_t id,
+			 struct ggml_context * ctx,
+			 const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
+			 std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map);
+
+ggml_cgraph *deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes);
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h
new file mode 100644
index 000000000..71c9b3f3e
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h
@@ -0,0 +1,236 @@
+// needs the ggml-backend-impl.h definition
+// needs venus_cs.h definition
+
+#include "venus_cs_ggml-rpc.h"
+
+// needs
+// ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer);
+
+static inline void
+vn_encode_ggml_buffer_host_handle(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *handle);
+
+static inline ggml_backend_buffer_t
+vn_decode_ggml_buffer(struct vn_cs_decoder *dec);
+
+/* rpc_tensor */
+
+static inline void
+vn_encode_rcp_tensor(struct vn_cs_encoder *enc, const rpc_tensor *rpc_tensor) {
+  size_t rpc_tensor_size = sizeof(*rpc_tensor);
+  vn_encode(enc, rpc_tensor_size, rpc_tensor, rpc_tensor_size);
+}
+
+static inline rpc_tensor *
+vn_decode_rpc_tensor_inplace(struct vn_cs_decoder *dec) {
+  size_t rpc_tensor_size = sizeof(rpc_tensor);
+
+  return (rpc_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, rpc_tensor_size);
+}
+
+static inline rpc_tensor *
+vn_decode_rpc_tensor_array_inplace(struct vn_cs_decoder *dec, uint32_t n_tensors) {
+  size_t rpc_tensor_size = sizeof(rpc_tensor) * n_tensors;
+
+  return (rpc_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, rpc_tensor_size);
+}
+
+/* ggml_tensor */
+
+static inline void
+vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) {
+  rpc_tensor serialized = serialize_tensor(tensor);
+
+  vn_encode_rcp_tensor(enc, &serialized);
+}
+
+static inline const ggml_tensor *
+vn_decode_ggml_tensor(struct vn_cs_decoder *dec) {
+  const rpc_tensor *rpc_tensor = vn_decode_rpc_tensor_inplace(dec);
+  struct ggml_init_params params {
+    /*.mem_size   =*/ ggml_tensor_overhead(),
+    /*.mem_buffer =*/ NULL,
+    /*.no_alloc   =*/ true,
+  };
+  struct ggml_context * ctx = ggml_init(params);
+  // NOTE(review): a new ctx is created on every call and is neither freed here nor returned to the caller - confirm who releases it.
+  const ggml_tensor *tensor = deserialize_tensor(ctx, rpc_tensor);
+
+  return tensor;
+}
+
+/* *** ggml_backend_buffer_type_t *** */
+
+// ggml_backend_buffer_type_t is a POINTER (to a struct).
+// Only the host pointer is shared between the host and guest.
+// The guest stores it in `buft->context`.
+// The host simply writes the pointer address in the buffer variable.
+
+
+static inline void
+vn_encode_ggml_buffer_type(struct vn_cs_encoder *enc, ggml_backend_buffer_type_t buft) {
+  apir_buffer_type_host_handle_t handle = ggml_buffer_type_to_apir_handle(buft);
+  vn_cs_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
+}
+
+static inline ggml_backend_buffer_type_t
+vn_decode_ggml_buffer_type(struct vn_cs_decoder *dec) {
+  apir_buffer_type_host_handle_t handle;
+
+  vn_cs_decoder_read(dec, sizeof(handle), &handle, sizeof(handle));
+
+  return (ggml_backend_buffer_type_t) handle;
+}
+
+static inline apir_buffer_type_host_handle_t
+vn_decode_apir_buffer_type_host_handle(struct vn_cs_decoder *dec) {
+  apir_buffer_type_host_handle_t handle;
+
+  vn_cs_decoder_read(dec, sizeof(handle), &handle, sizeof(handle));
+
+  return handle;
+}
+
+/* *** ggml_backend_buffer_t *** */
+
+// ggml_backend_buffer_t is a POINTER.
+// same logic as for ggml_backend_buffer_type_t
+
+static inline void
+vn_encode_ggml_buffer(struct vn_cs_encoder *enc, const ggml_backend_buffer_t buffer) {
+  apir_buffer_host_handle_t handle = BUFFER_TO_HOST_HANDLE(buffer);
+  vn_cs_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
+}
+
+static inline ggml_backend_buffer_t
+vn_decode_ggml_buffer(struct vn_cs_decoder *dec) {
+  ggml_backend_buffer_t buffer;
+  size_t buffer_ptr_size = sizeof(buffer);
+
+  vn_cs_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size);
+
+  return buffer;
+}
+
+/* enum ggml_status */
+
+static inline void
+vn_encode_ggml_status(struct vn_cs_encoder *enc, const enum ggml_status *status) {
+  vn_cs_encoder_write(enc, sizeof(*status), status, sizeof(*status));
+}
+
+static inline void
+vn_decode_ggml_status(struct vn_cs_decoder *dec, enum ggml_status *status) {
+  vn_cs_decoder_read(dec, sizeof(*status), status, sizeof(*status));
+}
+
+/* vn_renderer_shmem */
+
+static inline void
+vn_encode_virtgpu_shmem_res_id(struct vn_cs_encoder *enc, uint32_t shmem_res_id) {
+  vn_encode_uint32_t(enc, &shmem_res_id);
+}
+
+static inline void
+vn_decode_virtgpu_shmem_res_id(struct vn_cs_decoder *dec, uint32_t *shmem_res_id) {
+  vn_decode_uint32_t(dec, shmem_res_id);
+}
+
+/* ggml_cgraph */
+
+static inline size_t
+vn_serialize_ggml_cgraph(ggml_cgraph *cgraph, std::vector<uint8_t> & cgraph_data) {
+  serialize_graph(cgraph, cgraph_data);
+
+  return cgraph_data.size();
+}
+
+static inline void
+vn_encode_cgraph_data(struct vn_cs_encoder *enc, std::vector<uint8_t> & cgraph_data) {
+  size_t cgraph_size = cgraph_data.size();
+
+  vn_encode(enc, cgraph_size, cgraph_data.data(), cgraph_size);
+}
+
+static inline ggml_cgraph *
+vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, size_t cgraph_size) {
+  UNUSED(cgraph_size);
+
+  uint32_t n_nodes;
+  vn_decode_uint32_t(dec, &n_nodes);
+  const uint64_t * nodes = vn_decode_uint64_t_array_inplace(dec, n_nodes);
+
+  uint32_t n_tensors;
+  vn_decode_uint32_t(dec, &n_tensors);
+  const rpc_tensor *tensors = vn_decode_rpc_tensor_array_inplace(dec, n_tensors);
+
+  return deserialize_graph(n_nodes, n_tensors, tensors, nodes);
+}
+
+static inline void
+vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *handle) {
+  vn_cs_encoder_write(enc, sizeof(*handle), handle, sizeof(*handle)); // write the pointed-to handle value, not the bytes of the local `handle` pointer
+}
+
+static inline void
+vn_encode_ggml_tensor_inline(struct vn_cs_encoder *enc, const ggml_tensor *tensor) {
+  // Wire layout: the raw ggml_tensor, then (when set) its buffer handle, its view_src, and each non-NULL src tensor in order.
+  size_t tensor_size = sizeof(*tensor);
+
+  if (tensor->extra) {
+    FATAL("Cannot pass tensors with extra");
+  }
+
+  if (tensor->src[0] && tensor->buffer) {
+    static int first = 1;
+    if (first) {
+      // not sure if the buffer needs to be updated inside the src tensors or not
+      WARNING("Cannot pass tensors with src and buffer");
+      first = 0;
+    }
+  }
+
+  vn_cs_encoder_write(enc, tensor_size, tensor, tensor_size);
+
+  // tensor->data is a pointer inside the device buffer. No need to touch it
+  // tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence.
+  // (could also make a copy of the tensor, and update locally.)
+  if (tensor->buffer) {
+    apir_buffer_host_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer);
+    vn_encode_ggml_buffer_handle(enc, &buffer_handle);
+  }
+
+  if (tensor->view_src) {
+    vn_cs_encoder_write(enc, tensor_size, tensor->view_src, tensor_size);
+  }
+
+  for (int i = 0; i < GGML_MAX_SRC && tensor->src[i]; i++) { // src[] has no NULL terminator when every slot is in use
+    const ggml_tensor *tensor_src = tensor->src[i];
+    vn_cs_encoder_write(enc, tensor_size, tensor_src, tensor_size);
+  }
+}
+
+static inline const ggml_tensor *
+vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) {
+  // Decoding counterpart of vn_encode_ggml_tensor_inline: rewrites buffer, view_src and src[] inside the decoded tensor.
+  // it is safe to remove the `const` qualifier here, we *do* want to
+  // modify the shared memory data to fix the `src` pointers.
+  ggml_tensor *tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor));
+
+  // tensor->data is a pointer inside the device buffer. No need to touch it
+  // tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence.
+  if (tensor->buffer) {
+    tensor->buffer = vn_decode_ggml_buffer(dec);
+  }
+
+  if (tensor->view_src) {
+    ggml_tensor *tensor_view_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor));
+    tensor->view_src = tensor_view_src;
+  }
+
+  for (int i = 0; i < GGML_MAX_SRC && tensor->src[i]; i++) { // bounded: the decoded src[] may have every slot set
+    ggml_tensor *tensor_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor));
+    tensor->src[i] = tensor_src; // overwrite op->src[i] pointer with the actual location of the src tensor
+  }
+
+  return tensor;
+}
diff --git a/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp
new file mode 100644
index 000000000..30ae511aa
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp
@@ -0,0 +1,118 @@
+#include <vector>
+#include <unordered_set>
+#include <unordered_map>
+#include <cinttypes>
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "shared/venus_cs_ggml-rpc.h"
+
+std::unordered_set<ggml_backend_buffer_t> backend_buffers;
+
+void
+track_backend_buffer(ggml_backend_buffer_t buffer) {
+  backend_buffers.insert(buffer);
+}
+
+bool
+untrack_backend_buffer(ggml_backend_buffer_t buffer) {
+  auto it = backend_buffers.find(buffer);
+  if (it == backend_buffers.end()) {
+    return false;
+  }
+
+  backend_buffers.erase(it);
+  return true;
+}
+
+std::unordered_set<ggml_backend_buffer_t>
+get_track_backend_buffers() {
+  return backend_buffers;
+}
+
+ggml_tensor *
+deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
+  ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
+                                            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+  for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+    result->nb[i] = tensor->nb[i];
+  }
+  result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
+  if (result->buffer && backend_buffers.find(result->buffer) == backend_buffers.end()) {
+    printf("WARNING: BUFFER NOT FOUND | %p\n", (void *)result->buffer);
+    result->buffer = nullptr;
+  }
+
+  uint64_t tensor_data = tensor->data;
+  if (result->buffer) {
+    // require that the tensor data does not go beyond the buffer end
+    uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
+    uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
+    uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
+
+    // tensor->data is serialized as an offset to the buffer base address
+    tensor_data += buffer_start;
+
+    GGML_ASSERT(tensor_data + tensor_size >= tensor_data); // check for overflow
+    GGML_ASSERT(tensor_data >= buffer_start && tensor_data + tensor_size <= buffer_start + buffer_size);
+  }
+
+  result->op = (ggml_op) tensor->op;
+  for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+    result->op_params[i] = tensor->op_params[i];
+  }
+  result->flags = tensor->flags;
+  result->data = reinterpret_cast<void *>(tensor_data);
+  ggml_set_name(result, tensor->name);
+  return result;
+}
+
+ggml_tensor *
+create_node(uint64_t id,
+            struct ggml_context * ctx,
+            const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
+            std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map) {
+  if (id == 0) {
+    return nullptr;
+  }
+  if (tensor_map.find(id) != tensor_map.end()) {
+    return tensor_map[id];
+  }
+  const rpc_tensor * tensor = tensor_ptrs.at(id);
+  struct ggml_tensor * result = deserialize_tensor(ctx, tensor);
+  if (result == nullptr) {
+    return nullptr;
+  }
+  tensor_map[id] = result;
+  for (int i = 0; i < GGML_MAX_SRC; i++) {
+    result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map);
+  }
+  result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map);
+  result->view_offs = tensor->view_offs;
+  return result;
+}
+
+ggml_cgraph *
+deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes) {
+  size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
+  struct ggml_init_params params = {
+    /*.mem_size   =*/ buf_size,
+    /*.mem_buffer =*/ NULL,
+    /*.no_alloc   =*/ true,
+  };
+  struct ggml_context * ctx = ggml_init(params);
+  struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false);
+  graph->n_nodes = n_nodes;
+  std::unordered_map<uint64_t, const rpc_tensor*> tensor_ptrs;
+  for (uint32_t i = 0; i < n_tensors; i++) {
+    tensor_ptrs[tensors[i].id] = &tensors[i];
+  }
+  std::unordered_map<uint64_t, ggml_tensor*> tensor_map;
+  for (uint32_t i = 0; i < n_nodes; i++) {
+    int64_t id;
+    memcpy(&id, &nodes[i], sizeof(id));
+    graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
+  }
+
+  return graph;
+}
diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt
new file mode 100644
index 000000000..430d17ad9
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt
@@ -0,0 +1,34 @@
+cmake_minimum_required(VERSION 3.19)
+cmake_policy(SET CMP0114 NEW)
+
+message(STATUS "Enable API Remoting frontend")
+
+ggml_add_backend_library(ggml-remotingfrontend
+                         ggml-backend-buffer.cpp
+                         ggml-backend.cpp
+                         ggml-backend-device.cpp
+                         ggml-backend-reg.cpp
+                         ggml-backend-buffer-type.cpp
+                         ggml-backend-host-buffer-type.cpp
+                         ggml-metal-remoting.cpp
+                         virtgpu.cpp
+                         virtgpu-shm.cpp
+                         virtgpu-utils.cpp
+                         virtgpu-forward-device.cpp
+                         virtgpu-forward-buffer-type.cpp
+                         virtgpu-forward-buffer.cpp
+                         virtgpu-forward-backend.cpp
+                         virtgpu-forward-metal.cpp
+                         virtgpu-forward-impl.h
+                         ../../include/ggml-remoting-frontend.h
+                         venus_cs_ggml-rpc-front.cpp
+                        )
+
+# dnf install -y libdrm-devel
+target_link_libraries(ggml-remotingfrontend PUBLIC drm)
+target_include_directories(ggml-remotingfrontend PUBLIC /usr/include/libdrm/)
+target_include_directories(ggml-remotingfrontend PUBLIC ./include)
+
+target_include_directories(ggml-remotingfrontend PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+
+target_compile_options(ggml-remotingfrontend PRIVATE -std=c++20)
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp
new file mode 100644
index 000000000..b655b8018
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp
@@ -0,0 +1,98 @@
+#include "ggml-remoting.h"
+
+#define BUFT_TO_GPU(name) \
+  ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu
+
+static ggml_backend_buffer_t
+ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+  IMPLEMENTED_ONCE;
+  struct virtgpu *gpu = BUFT_TO_GPU(buft);
+
+  struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context));
+  if (!context) {
+    FATAL("Couldn't allocate the buffer context ...");
+  }
+
+  context->gpu = gpu;
+
+  const int USE_FROM_PTR = true;
+
+  if (USE_FROM_PTR) {
+    context->apir_context = apir_device_buffer_from_ptr(gpu, size, size);
+    context->base = context->apir_context.shmem->mmap_ptr;
+    context->is_from_ptr = true;
+  } else {
+    context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size);
+    context->is_from_ptr = false;
+    context->base = NULL;
+  }
+  context->is_host_buffer = false;
+
+  ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size);
+
+  return buffer;
+}
+
+static const char *
+ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+  IMPLEMENTED_ONCE;
+
+  struct virtgpu *gpu = BUFT_TO_GPU(buft);
+
+  return apir_buffer_type_get_name(gpu, buft);
+}
+
+static size_t
+ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+  IMPLEMENTED_ONCE;
+  struct virtgpu *gpu = BUFT_TO_GPU(buft);
+
+  static size_t align = 0;
+
+  if (align == 0) {
+    align = apir_buffer_type_get_alignment(gpu, buft);
+  }
+
+  return align;
+}
+
+static size_t
+ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+  IMPLEMENTED_ONCE;
+  struct virtgpu *gpu = BUFT_TO_GPU(buft);
+
+  static size_t max_size = 0;
+  if (max_size == 0) {
+    max_size = apir_buffer_type_get_max_size(gpu, buft);
+  }
+
+  return max_size;
+}
+
+static bool
+ggml_backend_remoting_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+  IMPLEMENTED;
+  struct virtgpu *gpu = BUFT_TO_GPU(buft);
+
+  return apir_buffer_type_is_host(gpu, buft);
+}
+
+const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = {
+  /* .get_name         = */ ggml_backend_remoting_buffer_type_get_name,
+  /* .alloc_buffer     = */ ggml_backend_remoting_buffer_type_alloc_buffer,
+  /* .get_alignment    = */ ggml_backend_remoting_buffer_type_get_alignment,
+  /* .get_max_size     = */ ggml_backend_remoting_buffer_type_get_max_size,
+  /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+  /* .is_host          = */ NULL,
+};
+
+const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface = {
+  /* .get_name         = */ ggml_backend_remoting_buffer_type_get_name,
+  /* .alloc_buffer     = */ NULL,
+  /* .get_alignment    = */ ggml_backend_remoting_buffer_type_get_alignment,
+  /* .get_max_size     = */ ggml_backend_remoting_buffer_type_get_max_size,
+  /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+  /* .is_host          = */ NULL,
+};
+
+/****************************************************************************************/
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp
new file mode 100644
index 000000000..e720efcf4
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp
@@ -0,0 +1,167 @@
+#include "ggml-remoting.h"
+
+#define BUFFER_TO_GPU(name) \
+  ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu
+
+struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"};
+struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"};
+
+struct timer_data get_tensor_from_ptr_timer = {0, 0, 0, "get_tensor_from_ptr"};
+struct timer_data set_tensor_from_ptr_timer = {0, 0, 0, "set_tensor_from_ptr"};
+
+static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) {
+  IMPLEMENTED_ONCE;
+
+  struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) buffer->context;
+  if (context->base) {
+    return context->base;
+  }
+
+  context->base = apir_buffer_get_base(BUFFER_TO_GPU(buffer),
+				       BUFFER_TO_APIR_CONTEXT(buffer));
+
+  return context->base;
+}
+
+static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+  NOT_IMPLEMENTED;
+
+  STOP_HERE;
+
+  UNUSED(buffer);
+  UNUSED(tensor);
+  UNUSED(value);
+  UNUSED(offset);
+  UNUSED(size);
+}
+
+static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+  IMPLEMENTED_ONCE;
+
+  start_timer(&set_tensor_timer);
+
+  struct virtgpu *gpu = BUFFER_TO_GPU(buffer);
+#if 0
+  INFO("%s: data=%p, offset=%lu, size=%lu\n", __func__, data, offset, size);
+#endif
+#if 0
+  void **addr = (void **)(uintptr_t)data;
+  for (int i = 0; i <= 10; i++) {
+    INFO("%s: %p | %llx", __func__, addr, *addr);
+    addr++;
+  }
+  INFO("\n");
+#endif
+  struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer);
+  if (context->is_from_ptr) {
+    memcpy((char *)tensor->data + offset, data, size);
+  } else {
+    apir_buffer_set_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size);
+  }
+
+  stop_timer(&set_tensor_timer);
+
+  return;
+}
+
+static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+  IMPLEMENTED_ONCE;
+
+  start_timer(&get_tensor_timer);
+
+  struct virtgpu *gpu = BUFFER_TO_GPU(buffer);
+  struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer);
+  if (context->is_from_ptr) {
+    memcpy(data, (const char *)tensor->data + offset, size);
+  } else {
+    apir_buffer_get_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size);
+  }
+
+  stop_timer(&get_tensor_timer);
+}
+
+static void ggml_backend_remoting_buffer_set_tensor_from_ptr(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+  IMPLEMENTED_ONCE;
+
+  start_timer(&set_tensor_from_ptr_timer);
+
+  UNUSED(buffer);
+
+  memcpy((char *)tensor->data + offset, data, size);
+
+  stop_timer(&set_tensor_from_ptr_timer);
+
+  return;
+}
+
+static void ggml_backend_remoting_buffer_get_tensor_from_ptr(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+  IMPLEMENTED_ONCE;
+
+  UNUSED(buffer);
+
+  start_timer(&get_tensor_from_ptr_timer);
+
+  memcpy(data, (const char *)tensor->data + offset, size);
+
+  stop_timer(&get_tensor_from_ptr_timer);
+}
+
+static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
+  NOT_IMPLEMENTED;
+
+  STOP_HERE;
+
+  return true;
+
+  UNUSED(buffer);
+  UNUSED(src);
+  UNUSED(dst);
+}
+
+static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+  IMPLEMENTED_ONCE;
+
+  struct virtgpu *gpu = BUFFER_TO_GPU(buffer);
+
+  apir_buffer_clear(gpu, BUFFER_TO_APIR_CONTEXT(buffer), value);
+
+  return;
+}
+
+static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+  UNUSED(buffer);
+
+  IMPLEMENTED_ONCE;
+
+  struct virtgpu *gpu = BUFFER_TO_GPU(buffer);
+
+  apir_buffer_free_buffer(gpu, BUFFER_TO_APIR_CONTEXT(buffer));
+
+  struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer);
+  free(context);
+  buffer->context = NULL;
+}
+
+const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = {
+  /* .free_buffer     = */ ggml_backend_remoting_buffer_free_buffer,
+  /* .get_base        = */ ggml_backend_remoting_buffer_get_base,
+  /* .init_tensor     = */ NULL,
+  /* .memset_tensor   = */ ggml_backend_remoting_buffer_memset_tensor,
+  /* .set_tensor      = */ ggml_backend_remoting_buffer_set_tensor,
+  /* .get_tensor      = */ ggml_backend_remoting_buffer_get_tensor,
+  /* .cpy_tensor      = */ ggml_backend_remoting_buffer_cpy_tensor,
+  /* .clear           = */ ggml_backend_remoting_buffer_clear,
+  /* .reset           = */ NULL,
+};
+
+const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface = {
+  /* .free_buffer     = */ ggml_backend_remoting_buffer_free_buffer,
+  /* .get_base        = */ ggml_backend_remoting_buffer_get_base,
+  /* .init_tensor     = */ NULL,
+  /* .memset_tensor   = */ ggml_backend_remoting_buffer_memset_tensor,
+  /* .set_tensor      = */ ggml_backend_remoting_buffer_set_tensor_from_ptr,
+  /* .get_tensor      = */ ggml_backend_remoting_buffer_get_tensor_from_ptr,
+  /* .cpy_tensor      = */ ggml_backend_remoting_buffer_cpy_tensor,
+  /* .clear           = */ ggml_backend_remoting_buffer_clear,
+  /* .reset           = */ NULL,
+};
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp
new file mode 100644
index 000000000..dce3e2c03
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp
@@ -0,0 +1,216 @@
+#include "ggml-remoting.h"
+
+static const char *
+ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) {
+  IMPLEMENTED_ONCE;
+
+  struct virtgpu *gpu = DEV_TO_GPU(dev);
+
+  return apir_device_get_name(gpu);
+}
+
+static const char *
+ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) {
+  IMPLEMENTED;
+
+  struct virtgpu *gpu = DEV_TO_GPU(dev);
+
+  return apir_device_get_description(gpu);
+}
+
+static enum ggml_backend_dev_type
+ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) {
+  IMPLEMENTED_ONCE;
+  struct virtgpu *gpu = DEV_TO_GPU(dev);
+
+  static enum ggml_backend_dev_type type;
+  static bool has_type = false;
+  if (!has_type) {
+    has_type = true;
+    type = (enum ggml_backend_dev_type) apir_device_get_type(gpu);
+  }
+
+  return type;
+}
+
+static void
+ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+  IMPLEMENTED;
+
+  struct virtgpu *gpu = DEV_TO_GPU(dev);
+
+  return apir_device_get_memory(gpu, free, total);
+}
+
+static bool
+ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+#if USE_ALWAYS_TRUE_SUPPORTS_OP == 1
+  /* ggml-rpc cheats it like this */
+  /* with the current implementation of serialize_tensor, the src/view aren't properly passed */
+  UNUSED(dev);
+  UNUSED(op);
+
+  return true;
+#elif USE_METAL_GUEST_SUPPORTS_OP == 1
+  UNUSED(dev);
+
+  struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT();
+
+  return ggml_metal_supports_op(device_ctx->metal_dev_ctx, op);
+#else
+  struct virtgpu *gpu = DEV_TO_GPU(dev);
+
+  return apir_device_supports_op(gpu, op);
+#endif
+}
+
+static bool
+ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+  //IMPLEMENTED_ONCE;
+
+#if 1
+  bool supported = buft->device == dev;
+  if (!supported) {
+    //WARNING("%s: unsupported buffer type (%s). Double check.", __func__, buft->iface.get_name(buft));
+  }
+
+  return supported;
+#else
+  UNUSED(dev);
+  UNUSED(buft);
+
+  return true;
+#endif
+}
+
+static bool
+ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+  //IMPLEMENTED_ONCE;
+
+  UNUSED(dev);
+  UNUSED(op);
+
+  // related to supports_buft, need to confirm
+
+  return false; // same as ggml-metal
+}
+
+static void
+ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+  IMPLEMENTED;
+
+  props->name        = ggml_backend_remoting_device_get_name(dev);
+  props->description = ggml_backend_remoting_device_get_description(dev);
+  props->type        = ggml_backend_remoting_device_get_type(dev);
+  ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+#if 0
+  struct virtgpu *gpu = DEV_TO_GPU(dev);
+  apir_device_get_props(gpu,
+			&props->caps.async,
+			&props->caps.host_buffer,
+			&props->caps.buffer_from_host_ptr,
+			&props->caps.events
+    );
+#else
+  // ignore the actual backend answers and set it as we provide it in
+  // the API Remoting frontend
+  props->caps.async = false;
+  props->caps.host_buffer = false;
+  props->caps.buffer_from_host_ptr = false;
+  props->caps.events = false;
+#endif
+
+  INFO("%s: async=%d, host_buffer=%d!, buffer_from_host_ptr=%d!, events=%d",
+    __func__, props->caps.async, props->caps.host_buffer,
+       props->caps.buffer_from_host_ptr, props->caps.events);
+}
+
+ggml_backend_buffer_type_t
+ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) {
+  IMPLEMENTED_ONCE;
+
+  struct virtgpu *gpu = DEV_TO_GPU(dev);
+
+  apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu);
+
+  static struct ggml_backend_buffer_type buft {
+    /* .iface    = */ ggml_backend_remoting_buffer_type_interface,
+    /* .device   = */ dev,
+    /* .context  = */ (void *) ctx,
+  };
+
+  return &buft;
+}
+
+static ggml_backend_buffer_type_t
+ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) {
+  IMPLEMENTED_ONCE;
+
+  struct virtgpu *gpu = DEV_TO_GPU(dev);
+
+  apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu);
+
+  static struct ggml_backend_buffer_type buft {
+    /* .iface    = */ ggml_backend_remoting_buffer_from_ptr_type_interface,
+    /* .device   = */ dev,
+    /* .context  = */ (void *) ctx,
+  };
+
+  return &buft;
+}
+
+static ggml_backend_buffer_t
+ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+  // Creates a from-ptr remoting buffer of `size` bytes whose base is `ptr`; the APIR context comes from apir_device_buffer_from_ptr().
+  struct virtgpu *gpu = DEV_TO_GPU(dev);
+  struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context));
+  if (!context) {
+    FATAL("Couldn't allocate the buffer context ...");
+  }
+
+  context->gpu = gpu;
+  context->apir_context = apir_device_buffer_from_ptr(gpu, size, max_tensor_size);
+  context->base = ptr;
+  context->is_from_ptr = true;
+  context->is_host_buffer = false; // malloc() leaves this field indeterminate; match the regular alloc_buffer path
+
+  ggml_backend_buffer_t buffer = ggml_backend_buffer_init(ggml_backend_remoting_device_get_buffer_from_ptr_type(dev), ggml_backend_remoting_buffer_from_ptr_interface, (void *) context, size);
+
+  INFO("#");
+  INFO("# %s(%p, %zx) --> %p", __func__, ptr, size, (void *) buffer);
+  INFO("#\n");
+
+  return buffer;
+}
+
+static ggml_backend_buffer_type_t
+ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) {
+  IMPLEMENTED_ONCE;
+
+  static struct ggml_backend_buffer_type host_bufft = {
+    /* .iface    = */ ggml_backend_remoting_host_buffer_type_interface,
+    /* .device   = */ dev,
+    /* .context  = */ nullptr,
+  };
+
+  return &host_bufft;
+}
+
+const struct ggml_backend_device_i ggml_backend_remoting_device_interface = {
+  /* .get_name             = */ ggml_backend_remoting_device_get_name,
+  /* .get_description      = */ ggml_backend_remoting_device_get_description,
+  /* .get_memory           = */ ggml_backend_remoting_device_get_memory,
+  /* .get_type             = */ ggml_backend_remoting_device_get_type,
+  /* .get_props            = */ ggml_backend_remoting_device_get_props,
+  /* .init_backend         = */ ggml_backend_remoting_device_init,
+  /* .get_buffer_type      = */ ggml_backend_remoting_device_get_buffer_type,
+  /* .get_host_buffer_type = */ NULL,
+  /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr,
+  /* .supports_op          = */ ggml_backend_remoting_device_supports_op,
+  /* .supports_buft        = */ ggml_backend_remoting_device_supports_buft,
+  /* .offload_op           = */ ggml_backend_remoting_device_offload_op,
+  /* .event_new            = */ NULL,
+  /* .event_free           = */ NULL,
+  /* .event_synchronize    = */ NULL,
+};
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp
new file mode 100644
index 000000000..c09c80d64
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp
@@ -0,0 +1,110 @@
+#include "ggml-remoting.h"
+
+#define BUFT_TO_GPU(name) \
+  ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu  // virtgpu handle of the buffer type's device
+
+extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface;
+
+// Destroys the shared-memory mapping whose address range contains buffer->context.
+static void
+ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+  BEING_IMPLEMENTED;
+
+  void *target = buffer->context;
+  if (target == nullptr) {
+    return;
+  }
+
+  struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT();
+  auto &mappings = device_ctx->shared_memory;
+
+  // Each entry is (ptr, size, shmem); stop at the first one covering [ptr, ptr + size).
+  struct vn_renderer_shmem *shmem = nullptr;
+  auto match = mappings.end();
+  for (auto it = mappings.begin(); it != mappings.end(); ++it) {
+    const uint8_t *range_begin = (const uint8_t *) std::get<0>(*it);
+    const uint8_t *range_end = range_begin + std::get<1>(*it);
+    if (target >= range_begin && target < range_end) {
+      shmem = std::get<2>(*it);
+      match = it;
+      break;
+    }
+  }
+  if (shmem == nullptr) {
+    WARNING("failed to free host shared memory: memory not in map\n");
+    return;
+  }
+
+  virtgpu_shmem_destroy(device_ctx->gpu, shmem->shmem);
+  mappings.erase(match);
+}
+
+// Allocates a host buffer of `size` bytes for `buft`; context->base is the mmap pointer of the shared memory.
+static ggml_backend_buffer_t
+ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+  IMPLEMENTED;
+  struct virtgpu *gpu = BUFT_TO_GPU(buft);
+
+  struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context));
+  if (!context) {
+    FATAL("Couldn't allocate the buffer context ...");
+  }
+  // malloc() does not zero the context, so every field (is_from_ptr included) is assigned explicitly.
+  context->gpu = gpu;
+  context->apir_context = apir_device_buffer_from_ptr(gpu, size, size);
+  context->base = context->apir_context.shmem->mmap_ptr;
+  context->is_host_buffer = true;
+  context->is_from_ptr = false;
+
+  ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size);
+  INFO("##");
+  INFO("## %s(%llx) --> %p <======================", __func__, (unsigned long long) size, buffer);  // cast matches %llx
+  INFO("##\n");
+  return buffer;
+}
+
+// Name reported for the host buffer type; `buft` is not consulted.
+static const char *
+ggml_backend_remoting_host_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+  IMPLEMENTED_ONCE;
+  UNUSED(buft);
+
+  return "GUEST host buffer";
+}
+
+// Alignment reported for the host buffer type; `buft` is not consulted.
+static size_t
+ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+  IMPLEMENTED_ONCE;
+  UNUSED(buft);
+
+  return 64; // fixed value; the author marked it as "not 100% sure"
+}
+
+// Always reports the buffer type as host memory; `buft` is not consulted.
+static bool
+ggml_backend_remoting_host_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+  IMPLEMENTED_ONCE;
+  UNUSED(buft);
+
+  return true;
+}
+
+static size_t
+ggml_backend_remoting_host_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+  UNUSED(buft);
+  // The value returned at the end is SIZE_MAX; `buft` is not consulted.
+  IMPLEMENTED;
+  STOP_HERE;
+  // NOTE(review): STOP_HERE expands to thks_bye() (ggml-remoting.h); if that call does not return, the line below is never reached — confirm.
+  return SIZE_MAX;
+}
+
+const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface = {  // callback table of the host buffer type
+    /* .get_name         = */ ggml_backend_remoting_host_buffer_type_get_name,
+    /* .alloc_buffer     = */ ggml_backend_remoting_host_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_remoting_host_buffer_type_get_alignment,
+    /* .get_max_size     = */ ggml_backend_remoting_host_buffer_type_get_max_size,
+    /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,  // taken from the CPU buffer type; the call runs when this global is initialised
+    /* .is_host          = */ ggml_backend_remoting_host_buffer_type_is_host,
+  };
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp
new file mode 100644
index 000000000..3d20d8c04
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp
@@ -0,0 +1,159 @@
+#include <mutex>
+#include <iostream>
+
+#include "ggml-remoting.h"
+#include "ggml-metal-remoting.h"
+
+// Creates the virtgpu instance on the first call and caches the outcome for later calls.
+// No locking is performed in this function.
+static struct virtgpu *apir_initialize() {
+  static struct virtgpu *cached_gpu = NULL;
+  static bool attempted = false;
+
+  if (!attempted) {
+    // Raised before create_virtgpu(): a failed attempt is remembered and not repeated.
+    attempted = true;
+
+    cached_gpu = create_virtgpu();
+    if (!cached_gpu) {
+      FATAL("failed to initialize the virtgpu :/");
+      return NULL;
+    }
+  }
+
+  return cached_gpu;
+}
+
+// Device count as returned by apir_device_get_count(); 0 when apir_initialize() yields no gpu.
+static int ggml_backend_remoting_get_device_count() {
+  IMPLEMENTED;
+
+  struct virtgpu *gpu = apir_initialize();
+  if (gpu) {
+    return apir_device_get_count(gpu);
+  }
+  WARNING("apir_initialize failed :/");
+  return 0;
+}
+
+// Registry callback: forwards to ggml_backend_remoting_get_device_count(); `reg` is unused.
+static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) {
+  IMPLEMENTED;
+  UNUSED(reg);
+
+  return ggml_backend_remoting_get_device_count();
+}
+
+static std::vector<ggml_backend_dev_t> devices;  // filled by ggml_backend_remoting_reg_init_devices()
+// Returns devices[device]; GGML_ASSERT rejects an out-of-range index.
+ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device) {
+  GGML_ASSERT(device < devices.size());
+  return devices[device];
+}
+
+// Creates one ggml_backend_device per device counted by ggml_backend_remoting_get_device_count()
+// and appends it to `devices`. The creation loop runs at most once, guarded by a mutex and a flag.
+static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) {
+  IMPLEMENTED;
+
+  if (devices.size() > 0) {
+    INFO("%s: already initialized", __func__);
+  }
+
+  struct virtgpu *gpu = apir_initialize();
+  if (!gpu) {
+    FATAL("apir_initialize failed :/");
+    return;
+  }
+
+  static bool initialized = false;
+  static std::mutex mutex;
+  std::lock_guard<std::mutex> lock(mutex);
+  if (initialized) {
+    return;
+  }
+
+  // Ask for the device count once instead of on every evaluation of the loop condition.
+  const int device_count = ggml_backend_remoting_get_device_count();
+  for (int i = 0; i < device_count; i++) {
+    ggml_backend_remoting_device_context *ctx = new ggml_backend_remoting_device_context;
+    ctx->device = i;
+    ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i);
+    ctx->description = "API Remoting device";
+    ctx->gpu = gpu;
+
+    ggml_backend_dev_t dev = new ggml_backend_device {
+      /* .iface   = */ ggml_backend_remoting_device_interface,
+      /* .reg     = */ reg,
+      /* .context = */ ctx,
+    };
+
+    // get_metal_dev_context() reads the gpu through dev->context, so ctx->gpu is set before it.
+    ctx->metal_dev_ctx = get_metal_dev_context(dev);
+    devices.push_back(dev);
+  }
+  initialized = true;
+}
+
+// Registry callback: forwards to ggml_backend_remoting_get_device(); `reg` is unused.
+static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) {
+  IMPLEMENTED;
+  UNUSED(reg);
+
+  return ggml_backend_remoting_get_device(device);
+}
+
+static const char *ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) {
+  UNUSED(reg);
+  // The registry name is the GGML_REMOTING_FRONTEND_NAME constant.
+  return GGML_REMOTING_FRONTEND_NAME;
+}
+
+static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = {  // callback table stored in the registry object built by ggml_backend_remoting_frontend_reg()
+  /* .get_name         = */ ggml_backend_remoting_reg_get_name,
+  /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count,
+  /* .get_device       = */ ggml_backend_remoting_reg_get_device,
+  /* .get_proc_address = */ NULL,  // no lookup callback is provided
+};
+
+
+static void showTime() {  // passed to atexit() by ggml_backend_remoting_frontend_reg()
+  show_timer(&graph_compute_timer);
+  show_timer(&get_tensor_timer);
+  show_timer(&set_tensor_timer);
+  show_timer(&wait_host_reply_timer);
+  // The *_from_ptr timers are shown only when get_tensor_from_ptr_timer.count is non-zero.
+  if (get_tensor_from_ptr_timer.count) {
+    show_timer(&get_tensor_from_ptr_timer);
+    show_timer(&set_tensor_from_ptr_timer);
+  }
+}
+
+// Returns the process-wide registry object. The first successful call also creates the
+// devices and registers showTime() with atexit().
+ggml_backend_reg_t ggml_backend_remoting_frontend_reg() {
+  struct virtgpu *gpu = apir_initialize();
+  if (!gpu) {
+    FATAL("apir_initialize failed :/");
+    return NULL;
+  }
+
+  // Function-local static: .context holds the gpu pointer obtained by the first call.
+  static ggml_backend_reg reg = {
+    /* .api_version = */ GGML_BACKEND_API_VERSION,
+    /* .iface       = */ ggml_backend_remoting_reg_i,
+    /* .context     = */ gpu,
+  };
+
+  static bool initialized = false;
+  if (!initialized) {
+    initialized = true;  // raised before the one-time setup below
+    ggml_backend_remoting_reg_init_devices(&reg);
+
+    int cr = atexit(showTime);
+    GGML_ASSERT(cr == 0);
+    MESSAGE("%s: initialzed", __func__);
+  }
+
+  return &reg;
+}
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp
new file mode 100644
index 000000000..14f95ec88
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp
@@ -0,0 +1,70 @@
+#include "ggml-remoting.h"
+
+static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) {
+  UNUSED(backend);
+  // Fixed display name; call tracing is left disabled for this function.
+  //IMPLEMENTED_ONCE;
+
+  return "API Remoting backend";
+}
+
+static void ggml_backend_remoting_free(ggml_backend_t backend) {
+  IMPLEMENTED;
+  // Only the backend object is deleted; backend->context is not released here.
+  delete backend;
+}
+
+struct timer_data graph_compute_timer = {0, 0, 0, "compute_timer"};  // started/stopped around apir_backend_graph_compute() in ggml_backend_remoting_graph_compute()
+
+// Runs `cgraph` through apir_backend_graph_compute() on the backend device's gpu.
+static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+  struct virtgpu *gpu = DEV_TO_GPU(backend->device);
+
+  IMPLEMENTED_ONCE;
+
+  // Only the apir_backend_graph_compute() call sits between the timer start and stop.
+  start_timer(&graph_compute_timer);
+  const ggml_status status = apir_backend_graph_compute(gpu, cgraph);
+  stop_timer(&graph_compute_timer);
+
+  return status;
+}
+
+static ggml_backend_i ggml_backend_remoting_interface = {  // backend callback table; only get_name, free and graph_compute are provided
+  /* .get_name                = */ ggml_backend_remoting_get_name,
+  /* .free                    = */ ggml_backend_remoting_free,
+  /* .set_tensor_async        = */ NULL,  // ggml_backend_remoting_set_tensor_async,
+  /* .get_tensor_async        = */ NULL,  // ggml_backend_remoting_get_tensor_async,
+  /* .cpy_tensor_async        = */ NULL,  // ggml_backend_remoting_cpy_tensor_async,
+  /* .synchronize             = */ NULL,  // ggml_backend_remoting_synchronize,
+  /* .graph_plan_create       = */ NULL,
+  /* .graph_plan_free         = */ NULL,
+  /* .graph_plan_update       = */ NULL,
+  /* .graph_plan_compute      = */ NULL,
+  /* .graph_compute           = */ ggml_backend_remoting_graph_compute,
+  /* .event_record            = */ NULL,
+  /* .event_wait              = */ NULL,
+};
+
+static ggml_guid_t ggml_backend_remoting_guid() {
+  static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x14, 0x03, 0x86, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
+  // The same static object is returned on every call.
+  return &guid;
+}
+
+
+// Creates a backend object for `dev`; `params` is ignored. The .device field is fetched from
+// the registry by ctx->device instead of reusing `dev` itself.
+ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) {
+  UNUSED(params);
+  IMPLEMENTED;
+  ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *)dev->context;
+  ggml_backend_dev_t registered_dev = ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), ctx->device);
+
+  return new ggml_backend {
+    /* .guid      = */ ggml_backend_remoting_guid(),
+    /* .interface = */ ggml_backend_remoting_interface,
+    /* .device    = */ registered_dev,
+    /* .context   = */ ctx,
+  };
+}
diff --git a/ggml/src/ggml-remotingfrontend/ggml-metal-remoting.cpp b/ggml/src/ggml-remotingfrontend/ggml-metal-remoting.cpp
new file mode 100644
index 000000000..97723eabf
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-metal-remoting.cpp
@@ -0,0 +1,243 @@
+#include "ggml-remoting.h"
+#include "ggml-metal-remoting.h"
+
+// Returns the Metal device context, filled through `dev`'s gpu on the first call only.
+// One static object is shared: later calls return it whatever `dev` they are given.
+const struct ggml_backend_metal_device_context *get_metal_dev_context(const ggml_backend_dev_t dev) {
+  static struct ggml_backend_metal_device_context cached_ctx;
+  static bool fetched = false;
+
+  if (!fetched) {
+    // Marked before the fetch, so apir_metal_get_device_context() is issued at most once.
+    fetched = true;
+    struct virtgpu *gpu = DEV_TO_GPU(dev);
+    apir_metal_get_device_context(gpu, &cached_ctx);
+  }
+
+  return &cached_ctx;
+}
+
+// Reports whether `op` is supported, based on the capability flags in ctx_dev and on the op's kind, types and layout.
+bool ggml_metal_supports_op(const struct ggml_backend_metal_device_context * ctx_dev, const struct ggml_tensor * op) {
+    const bool has_simdgroup_mm        = ctx_dev->has_simdgroup_mm;
+    const bool has_simdgroup_reduction = ctx_dev->has_simdgroup_reduction;
+    const bool use_bfloat              = ctx_dev->use_bfloat;
+    // Without bfloat support, reject any op whose result or one of its first three sources is BF16.
+    if (!use_bfloat) {
+        if (op->type == GGML_TYPE_BF16) {
+            return false;
+        }
+
+        for (size_t i = 0, n = 3; i < n; ++i) { // only src[0..2] are inspected
+            if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
+                return false;
+            }
+        }
+    }
+    // Per-op rules; any op not listed below reaches the final default and is rejected.
+    switch (op->op) {
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_TANH:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_GELU_ERF:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_ELU:
+                case GGML_UNARY_OP_NEG:
+                case GGML_UNARY_OP_ABS:
+                case GGML_UNARY_OP_SGN:
+                case GGML_UNARY_OP_STEP:
+                case GGML_UNARY_OP_HARDSWISH:
+                case GGML_UNARY_OP_HARDSIGMOID:
+                case GGML_UNARY_OP_EXP:
+                    return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
+                default:
+                    return false;
+            }
+        case GGML_OP_GLU:
+            switch (ggml_get_glu_op(op)) {
+                case GGML_GLU_OP_REGLU:
+                case GGML_GLU_OP_GEGLU:
+                case GGML_GLU_OP_SWIGLU:
+                case GGML_GLU_OP_SWIGLU_OAI:
+                case GGML_GLU_OP_GEGLU_ERF:
+                case GGML_GLU_OP_GEGLU_QUICK:
+                    return ggml_is_contiguous_1(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
+               default:
+                    return false;
+            }
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_CONCAT:
+            return true;
+        case GGML_OP_ADD:
+        case GGML_OP_SUB:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_ADD_ID:
+            return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_ACC:
+        case GGML_OP_REPEAT:
+        case GGML_OP_SCALE:
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            return true;
+        case GGML_OP_CLAMP:
+            return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_SQR:
+        case GGML_OP_SQRT:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
+            return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_LOG:
+            return false; // TODO: implement
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_MEAN:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_GROUP_NORM:
+            return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]);
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_L2_NORM:
+            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
+        case GGML_OP_ARGMAX:
+            return true;
+        case GGML_OP_NORM:
+            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
+        case GGML_OP_ROPE:
+            return true;
+        case GGML_OP_IM2COL:
+            return ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);
+        case GGML_OP_POOL_1D:
+            return false;
+        case GGML_OP_UPSCALE:
+            return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
+        case GGML_OP_POOL_2D:
+        case GGML_OP_PAD:
+        case GGML_OP_PAD_REFLECT_1D:
+        case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_LEAKY_RELU:
+            return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_ARANGE:
+            return true;
+        case GGML_OP_FLASH_ATTN_EXT:
+            if (op->src[0]->ne[0] == 32) {
+                // head size == 32 (e.g. bert-bge-small)
+                // TODO: not sure if it is worth adding kernels for this size
+                return false;
+            }
+            if (op->src[0]->ne[0] == 576) {
+                // DeepSeek sizes
+                // TODO: disabled for now, until optimized
+                return false;
+            }
+            if (op->src[1]->type != op->src[2]->type) {
+                return false;
+            }
+            return has_simdgroup_mm; // TODO: over-restricted for vec-kernels
+        case GGML_OP_SSM_CONV:
+        case GGML_OP_SSM_SCAN:
+        case GGML_OP_RWKV_WKV6:
+        case GGML_OP_RWKV_WKV7:
+            return true;
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+            return has_simdgroup_reduction &&
+                (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32); // an F32 src[0] requires an F32 src[1]
+        case GGML_OP_CPY:
+        case GGML_OP_DUP:
+        case GGML_OP_CONT:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F32:
+                        switch (op->type) {
+                           case GGML_TYPE_F32:
+                           case GGML_TYPE_F16:
+                           case GGML_TYPE_BF16:
+                           case GGML_TYPE_Q8_0:
+                           case GGML_TYPE_Q4_0:
+                           case GGML_TYPE_Q4_1:
+                           case GGML_TYPE_Q5_0:
+                           case GGML_TYPE_Q5_1:
+                           case GGML_TYPE_IQ4_NL:
+                                return true;
+                           default:
+                                return false;
+                        }
+                    case GGML_TYPE_F16:
+                        switch (op->type) {
+                            case GGML_TYPE_F32:
+                            case GGML_TYPE_F16:
+                                return true;
+                            default:
+                                return false;
+                        }
+                    case GGML_TYPE_BF16:
+                        switch (op->type) {
+                            case GGML_TYPE_F32:
+                            case GGML_TYPE_BF16:
+                                return true;
+                            default:
+                                return false;
+                        }
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                        switch (op->type) {
+                            case GGML_TYPE_F32:
+                            case GGML_TYPE_F16:
+                                return true;
+                            default:
+                                return false;
+                        }
+                    default:
+                        return false;
+                };
+            }
+        case GGML_OP_SET:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_I32:
+                        return true;
+                    default:
+                        return false;
+                };
+            }
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_GET_ROWS:
+            {
+                return op->ne[3] == 1; // accepted only when the result's ne[3] is 1
+            }
+        case GGML_OP_SET_ROWS:
+            {
+                if (op->src[0]->type != GGML_TYPE_F32) {
+                    return false;
+                }
+
+                switch (op->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_BF16:
+                    case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_IQ4_NL:
+                        return true;
+                    default:
+                        return false;
+                };
+            }
+        default:
+            return false;
+    }
+}
diff --git a/ggml/src/ggml-remotingfrontend/ggml-metal-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-metal-remoting.h
new file mode 100644
index 000000000..ac41823b5
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-metal-remoting.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+
+struct ggml_backend_metal_device_context {  // capability flags read by ggml_metal_supports_op()
+  bool has_simdgroup_mm;         // required there for GGML_OP_FLASH_ATTN_EXT
+  bool has_simdgroup_reduction;  // required there for the reduction/norm ops and for MUL_MAT / MUL_MAT_ID
+  bool use_bfloat;               // when false, ops with a BF16 result or BF16 src[0..2] are rejected
+};
+
+// Returns the Metal device context; see ggml-metal-remoting.cpp for how it is obtained through `dev`.
+const struct ggml_backend_metal_device_context *get_metal_dev_context(const ggml_backend_dev_t dev);
+// Returns true when `op` is supported given the capability flags in ctx_dev.
+bool ggml_metal_supports_op(const struct ggml_backend_metal_device_context * ctx_dev, const struct ggml_tensor * op);
diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp
new file mode 100644
index 000000000..87679fe59
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp
@@ -0,0 +1,26 @@
+#include <ostream>
+#include <iostream>
+#include <mutex>
+#include <memory>
+#include <chrono>
+#include <thread>
+#include <unistd.h>
+#include <sys/sysmacros.h>
+#include <sys/stat.h>
+
+#include "ggml-remoting-frontend.h"
+#include "remoting.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+
+
+// NOTE(review): the definition in ggml-backend-reg.cpp is `static`, so this external declaration cannot link to it — confirm whether it is still needed.
+int ggml_backend_remoting_get_device_count();
+
+
+
+// Holds a single mutex; presumably the definition of the type forward-declared in ggml-remoting.h — TODO confirm.
+struct remoting_device_struct {
+    std::mutex mutex;
+};
diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h
new file mode 100644
index 000000000..c6f39a533
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h
@@ -0,0 +1,139 @@
+#pragma once
+
+#include <string>
+#include <memory>
+
+#include "ggml-remoting-frontend.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-metal-remoting.h"
+#include "virtgpu.h"
+
+
+// 1 is fast, 0 avoids micro-benchmark crashes (presumably refers to USE_ALWAYS_TRUE_SUPPORTS_OP — TODO confirm)
+#define USE_ALWAYS_TRUE_SUPPORTS_OP 0
+#define USE_METAL_GUEST_SUPPORTS_OP 1  // presumably selects ggml_metal_supports_op() in the frontend — TODO confirm
+
+#define DEV_TO_GPU(name) \
+  (((struct ggml_backend_remoting_device_context *) (name)->context)->gpu)  // virtgpu handle stored in a device's context
+
+#define BUFFER_TO_GGML_CONTEXT(name) \
+  ((struct ggml_backend_remoting_buffer_context *) (name)->context)  // a buffer's remoting context pointer
+
+#define BUFFER_TO_APIR_CONTEXT(name) \
+  (&((struct ggml_backend_remoting_buffer_context *) (name)->context)->apir_context)  // address of the embedded apir context; parenthesised so a trailing -> binds to the result
+
+#define BUFFER_TO_HOST_HANDLE(name) \
+  (((struct ggml_backend_remoting_buffer_context *) (name)->context)->apir_context.host_handle)  // host handle inside the apir context
+
+#define GET_DEVICE_CONTEXT() \
+  ((struct ggml_backend_remoting_device_context *) ggml_backend_remoting_get_device(0)->context)  // context of device 0; parenthesised so GET_DEVICE_CONTEXT()->field applies to the cast result
+
+static inline apir_buffer_type_host_handle_t
+ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) {
+  // in the backend, the buffer handle is the buffer pointer; here it is the value stored in buft->context
+  return (apir_buffer_type_host_handle_t) buft->context;
+}
+
+#define NOT_IMPLEMENTED							\
+  do {									\
+    static bool first = true;						\
+    if (first) {							\
+      printf("\nWARN: ###\nWARN: ### reached unimplemented function %s\nWARN: ###\n\n", __func__); \
+      first = false;							\
+    }									\
+  } while(0)  // prints the warning only the first time a given expansion is executed (static flag)
+
+#define BEING_IMPLEMENTED							\
+  do {									\
+      printf("\nINFO: ###\nINFO: ### function being implemented: %s\nINFO: ###\n\n", __func__); \
+  } while(0)  // prints on every execution, in every build configuration
+
+#define NEXT  // expands to nothing
+// Calls thks_bye(), which is not declared in this header.
+#define STOP_HERE \
+  thks_bye()
+// Calls breakpoint(), which is not declared in this header.
+#define BREAKPOINT \
+  breakpoint()
+
+#ifndef NDEBUG  // call tracing is compiled in only when NDEBUG is not defined
+#define IMPLEMENTED							\
+  printf("INFO: ### reached implemented function %s\n", __func__)
+#else
+#define IMPLEMENTED							\
+  do {} while(0)
+#endif
+
+#ifndef NDEBUG  // same trace as IMPLEMENTED, but printed only the first time a given expansion is executed
+#define IMPLEMENTED_ONCE						\
+  do {									\
+    static bool first = true;						\
+    if (first) {							\
+      printf("INFO: ### reached implemented function %s\n", __func__);  \
+      first = false;							\
+    }									\
+  } while(0)
+#else
+#define IMPLEMENTED_ONCE			\
+  do {} while(0)
+#endif
+
+#define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl  // uses std::cerr; this header does not include <iostream> directly
+
+struct ggml_backend_remoting_device_context {
+  size_t device;  // index assigned in ggml_backend_remoting_reg_init_devices()
+  std::string name;
+  std::string description;
+  // (ptr, size, shmem) entries; ggml_backend_remoting_host_buffer_free_buffer() searches them by address range
+  std::vector<std::tuple<void*, size_t, struct vn_renderer_shmem *>> shared_memory;
+  // virtgpu handle obtained from apir_initialize() when the device is created
+  struct virtgpu *gpu;
+  // set from get_metal_dev_context() in ggml_backend_remoting_reg_init_devices()
+  const struct ggml_backend_metal_device_context *metal_dev_ctx;
+};
+
+struct ggml_backend_remoting_buffer_context {
+  apir_buffer_context_t apir_context;  // its host_handle is what BUFFER_TO_HOST_HANDLE() reads
+  // virtgpu handle recorded when the buffer context is filled in
+  struct virtgpu *gpu;
+  // in the host buffer path: the mmap pointer of the backing shared memory
+  void *base;
+  // the host buffer path allocates this struct with malloc(), so the flags are not zeroed automatically
+  bool is_host_buffer;
+  bool is_from_ptr;
+};
+
+extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface;
+extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface;
+extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface;
+extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface;
+extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface;
+extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface;
+
+ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device);
+ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type();
+ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params);
+ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev);
+
+struct remoting_buffer_struct;  // only forward-declared in this header
+typedef std::shared_ptr<remoting_buffer_struct> remoting_buffer;
+typedef std::weak_ptr<remoting_buffer_struct> remoting_buffer_ref;
+// Declared here; no definition is visible in this header.
+void ggml_remoting_destroy_buffer(remoting_buffer& buf);
+// Shared/weak aliases for the device type, which is only forward-declared in this header.
+struct remoting_device_struct;
+typedef std::shared_ptr<remoting_device_struct> remoting_device;
+typedef std::weak_ptr<remoting_device_struct> remoting_device_ref;
+// Context type with a single int member, plus its shared/weak aliases.
+struct remoting_context_struct {
+  int i;
+};
+typedef std::shared_ptr<remoting_context_struct> remoting_context;
+typedef std::weak_ptr<remoting_context_struct> remoting_context_ref;
+
+static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) {  // function form of BUFFER_TO_HOST_HANDLE()
+  return BUFFER_TO_HOST_HANDLE(buffer);
+}
diff --git a/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h b/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h
new file mode 100644
index 000000000..4e4f7c2c3
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h
@@ -0,0 +1,1408 @@
+/*
+ * Header for the Direct Rendering Manager
+ *
+ * Author: Rickard E. (Rik) Faith <faith@valinux.com>
+ *
+ * Acknowledgments:
+ * Dec 1999, Richard Henderson <rth@twiddle.net>, move to generic cmpxchg.
+ */
+
+/*
+ * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas.
+ * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California.
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DRM_H_
+#define _DRM_H_
+
+#if   defined(__linux__)
+
+#include <linux/types.h>
+#include <asm/ioctl.h>
+typedef unsigned int drm_handle_t;
+
+#else /* One of the BSDs */
+
+#include <stdint.h>
+#include <sys/ioccom.h>
+#include <sys/types.h>
+typedef int8_t   __s8;
+typedef uint8_t  __u8;
+typedef int16_t  __s16;
+typedef uint16_t __u16;
+typedef int32_t  __s32;
+typedef uint32_t __u32;
+typedef int64_t  __s64;
+typedef uint64_t __u64;
+typedef size_t   __kernel_size_t;
+typedef unsigned long drm_handle_t;
+
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DRM_NAME	"drm"	  /**< Name in kernel, /dev, and /proc */
+#define DRM_MIN_ORDER	5	  /**< At least 2^5 bytes = 32 bytes */
+#define DRM_MAX_ORDER	22	  /**< Up to 2^22 bytes = 4MB */
+#define DRM_RAM_PERCENT 10	  /**< How much system ram can we lock? */
+
+#define _DRM_LOCK_HELD	0x80000000U /**< Hardware lock is held */
+#define _DRM_LOCK_CONT	0x40000000U /**< Hardware lock is contended */
+#define _DRM_LOCK_IS_HELD(lock)	   ((lock) & _DRM_LOCK_HELD)
+#define _DRM_LOCK_IS_CONT(lock)	   ((lock) & _DRM_LOCK_CONT)
+#define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT))
+
+typedef unsigned int drm_context_t;
+typedef unsigned int drm_drawable_t;
+typedef unsigned int drm_magic_t;
+
+/*
+ * Cliprect.
+ *
+ * \warning: If you change this structure, make sure you change
+ * XF86DRIClipRectRec in the server as well
+ *
+ * \note KW: Actually it's illegal to change either for
+ * backwards-compatibility reasons.
+ */
+struct drm_clip_rect {
+	unsigned short x1;
+	unsigned short y1;
+	unsigned short x2;
+	unsigned short y2;
+};
+
+/*
+ * Drawable information.
+ */
+struct drm_drawable_info {
+	unsigned int num_rects;
+	struct drm_clip_rect *rects;
+};
+
+/*
+ * Texture region,
+ */
+struct drm_tex_region {
+	unsigned char next;
+	unsigned char prev;
+	unsigned char in_use;
+	unsigned char padding;
+	unsigned int age;
+};
+
+/*
+ * Hardware lock.
+ *
+ * The lock structure is a simple cache-line aligned integer.  To avoid
+ * processor bus contention on a multiprocessor system, there should not be any
+ * other data stored in the same cache line.
+ */
+struct drm_hw_lock {
+	__volatile__ unsigned int lock;		/**< lock variable */
+	char padding[60];			/**< Pad to cache line */
+};
+
+/*
+ * DRM_IOCTL_VERSION ioctl argument type.
+ *
+ * \sa drmGetVersion().
+ */
+struct drm_version {
+	int version_major;	  /**< Major version */
+	int version_minor;	  /**< Minor version */
+	int version_patchlevel;	  /**< Patch level */
+	__kernel_size_t name_len;	  /**< Length of name buffer */
+	char *name;	  /**< Name of driver */
+	__kernel_size_t date_len;	  /**< Length of date buffer */
+	char *date;	  /**< User-space buffer to hold date */
+	__kernel_size_t desc_len;	  /**< Length of desc buffer */
+	char *desc;	  /**< User-space buffer to hold desc */
+};
+
+/*
+ * DRM_IOCTL_GET_UNIQUE ioctl argument type.
+ *
+ * \sa drmGetBusid() and drmSetBusId().
+ */
+struct drm_unique {
+	__kernel_size_t unique_len;	  /**< Length of unique */
+	char *unique;	  /**< Unique name for driver instantiation */
+};
+
+struct drm_list {
+	int count;		  /**< Length of user-space structures */
+	struct drm_version *version;
+};
+
+struct drm_block {
+	int unused;
+};
+
+/*
+ * DRM_IOCTL_CONTROL ioctl argument type.
+ *
+ * \sa drmCtlInstHandler() and drmCtlUninstHandler().
+ */
+struct drm_control {
+	enum {
+		DRM_ADD_COMMAND,
+		DRM_RM_COMMAND,
+		DRM_INST_HANDLER,
+		DRM_UNINST_HANDLER
+	} func;
+	int irq;
+};
+
+/*
+ * Type of memory to map.
+ */
+enum drm_map_type {
+	_DRM_FRAME_BUFFER = 0,	  /**< WC (no caching), no core dump */
+	_DRM_REGISTERS = 1,	  /**< no caching, no core dump */
+	_DRM_SHM = 2,		  /**< shared, cached */
+	_DRM_AGP = 3,		  /**< AGP/GART */
+	_DRM_SCATTER_GATHER = 4,  /**< Scatter/gather memory for PCI DMA */
+	_DRM_CONSISTENT = 5	  /**< Consistent memory for PCI DMA */
+};
+
+/*
+ * Memory mapping flags.
+ */
+enum drm_map_flags {
+	_DRM_RESTRICTED = 0x01,	     /**< Cannot be mapped to user-virtual */
+	_DRM_READ_ONLY = 0x02,
+	_DRM_LOCKED = 0x04,	     /**< shared, cached, locked */
+	_DRM_KERNEL = 0x08,	     /**< kernel requires access */
+	_DRM_WRITE_COMBINING = 0x10, /**< use write-combining if available */
+	_DRM_CONTAINS_LOCK = 0x20,   /**< SHM page that contains lock */
+	_DRM_REMOVABLE = 0x40,	     /**< Removable mapping */
+	_DRM_DRIVER = 0x80	     /**< Managed by driver */
+};
+
+struct drm_ctx_priv_map {
+	unsigned int ctx_id;	 /**< Context requesting private mapping */
+	void *handle;		 /**< Handle of map */
+};
+
+/*
+ * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls
+ * argument type.
+ *
+ * \sa drmAddMap().
+ */
+struct drm_map {
+	unsigned long offset;	 /**< Requested physical address (0 for SAREA)*/
+	unsigned long size;	 /**< Requested physical size (bytes) */
+	enum drm_map_type type;	 /**< Type of memory to map */
+	enum drm_map_flags flags;	 /**< Flags */
+	void *handle;		 /**< User-space: "Handle" to pass to mmap() */
+				 /**< Kernel-space: kernel-virtual address */
+	int mtrr;		 /**< MTRR slot used */
+	/*   Private data */
+};
+
+/*
+ * DRM_IOCTL_GET_CLIENT ioctl argument type.
+ */
+struct drm_client {
+	int idx;		/**< Which client desired? */
+	int auth;		/**< Is client authenticated? */
+	unsigned long pid;	/**< Process ID */
+	unsigned long uid;	/**< User ID */
+	unsigned long magic;	/**< Magic */
+	unsigned long iocs;	/**< Ioctl count */
+};
+
+enum drm_stat_type {
+	_DRM_STAT_LOCK,
+	_DRM_STAT_OPENS,
+	_DRM_STAT_CLOSES,
+	_DRM_STAT_IOCTLS,
+	_DRM_STAT_LOCKS,
+	_DRM_STAT_UNLOCKS,
+	_DRM_STAT_VALUE,	/**< Generic value */
+	_DRM_STAT_BYTE,		/**< Generic byte counter (1024bytes/K) */
+	_DRM_STAT_COUNT,	/**< Generic non-byte counter (1000/k) */
+
+	_DRM_STAT_IRQ,		/**< IRQ */
+	_DRM_STAT_PRIMARY,	/**< Primary DMA bytes */
+	_DRM_STAT_SECONDARY,	/**< Secondary DMA bytes */
+	_DRM_STAT_DMA,		/**< DMA */
+	_DRM_STAT_SPECIAL,	/**< Special DMA (e.g., priority or polled) */
+	_DRM_STAT_MISSED	/**< Missed DMA opportunity */
+	    /* Add to the *END* of the list */
+};
+
+/*
+ * DRM_IOCTL_GET_STATS ioctl argument type.
+ */
+struct drm_stats {
+	unsigned long count;	/* presumably the number of data[] entries in use -- confirm */
+	struct {
+		unsigned long value;	/* Value recorded for this entry */
+		enum drm_stat_type type;	/* Statistic type of this entry */
+	} data[15];	/* 15 slots, equal to the number of enum drm_stat_type values */
+};
+
+/*
+ * Hardware locking flags.
+ */
+enum drm_lock_flags {
+	_DRM_LOCK_READY = 0x01,	     /**< Wait until hardware is ready for DMA */
+	_DRM_LOCK_QUIESCENT = 0x02,  /**< Wait until hardware quiescent */
+	_DRM_LOCK_FLUSH = 0x04,	     /**< Flush this context's DMA queue first */
+	_DRM_LOCK_FLUSH_ALL = 0x08,  /**< Flush all DMA queues first */
+	/* These *HALT* flags aren't supported yet
+	   -- they will be used to support the
+	   full-screen DGA-like mode. */
+	_DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */
+	_DRM_HALT_CUR_QUEUES = 0x20  /**< Halt all current queues */
+};
+
+/*
+ * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type.
+ *
+ * \sa drmGetLock() and drmUnlock().
+ */
+struct drm_lock {
+	int context;
+	enum drm_lock_flags flags;
+};
+
+/*
+ * DMA flags
+ *
+ * \warning
+ * These values \e must match xf86drm.h.
+ *
+ * \sa drm_dma.
+ */
+enum drm_dma_flags {
+	/* Flags for DMA buffer dispatch */
+	_DRM_DMA_BLOCK = 0x01,	      /**<
+				       * Block until buffer dispatched.
+				       *
+				       * \note The buffer may not yet have
+				       * been processed by the hardware --
+				       * getting a hardware lock with the
+				       * hardware quiescent will ensure
+				       * that the buffer has been
+				       * processed.
+				       */
+	_DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */
+	_DRM_DMA_PRIORITY = 0x04,     /**< High priority dispatch */
+
+	/* Flags for DMA buffer request */
+	_DRM_DMA_WAIT = 0x10,	      /**< Wait for free buffers */
+	_DRM_DMA_SMALLER_OK = 0x20,   /**< Smaller-than-requested buffers OK */
+	_DRM_DMA_LARGER_OK = 0x40     /**< Larger-than-requested buffers OK */
+};
+
+/*
+ * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type.
+ *
+ * \sa drmAddBufs().
+ */
+struct drm_buf_desc {
+	int count;		 /**< Number of buffers of this size */
+	int size;		 /**< Size in bytes */
+	int low_mark;		 /**< Low water mark */
+	int high_mark;		 /**< High water mark */
+	enum {
+		_DRM_PAGE_ALIGN = 0x01,	/**< Align on page boundaries for DMA */
+		_DRM_AGP_BUFFER = 0x02,	/**< Buffer is in AGP space */
+		_DRM_SG_BUFFER = 0x04,	/**< Scatter/gather memory buffer */
+		_DRM_FB_BUFFER = 0x08,	/**< Buffer is in frame buffer */
+		_DRM_PCI_BUFFER_RO = 0x10 /**< Map PCI DMA buffer read-only */
+	} flags;
+	unsigned long agp_start; /**<
+				  * Start address of where the AGP buffers are
+				  * in the AGP aperture
+				  */
+};
+
+/*
+ * DRM_IOCTL_INFO_BUFS ioctl argument type.
+ */
+struct drm_buf_info {
+	int count;		/**< Entries in list */
+	struct drm_buf_desc *list;
+};
+
+/*
+ * DRM_IOCTL_FREE_BUFS ioctl argument type.
+ */
+struct drm_buf_free {
+	int count;
+	int *list;
+};
+
+/*
+ * Buffer information
+ *
+ * \sa drm_buf_map.
+ */
+struct drm_buf_pub {
+	int idx;		       /**< Index into the master buffer list */
+	int total;		       /**< Buffer size */
+	int used;		       /**< Amount of buffer in use (for DMA) */
+	void *address;	       /**< Address of buffer */
+};
+
+/*
+ * DRM_IOCTL_MAP_BUFS ioctl argument type.
+ */
+struct drm_buf_map {
+	int count;		/**< Length of the buffer list */
+#ifdef __cplusplus
+	void *virt;		/**< Same slot as 'virtual' below; renamed because 'virtual' is a C++ keyword */
+#else
+	void *virtual;		/**< Mmap'd area in user-virtual */
+#endif
+	struct drm_buf_pub *list;	/**< Buffer information */
+};
+
+/*
+ * DRM_IOCTL_DMA ioctl argument type.
+ *
+ * Indices here refer to the offset into the buffer list in drm_buf_get.
+ *
+ * \sa drmDMA().
+ */
+struct drm_dma {
+	int context;			  /**< Context handle */
+	int send_count;			  /**< Number of buffers to send */
+	int *send_indices;	  /**< List of handles to buffers */
+	int *send_sizes;		  /**< Lengths of data to send */
+	enum drm_dma_flags flags;	  /**< Flags */
+	int request_count;		  /**< Number of buffers requested */
+	int request_size;		  /**< Desired size for buffers */
+	int *request_indices;	  /**< Buffer information */
+	int *request_sizes;
+	int granted_count;		  /**< Number of buffers granted */
+};
+
+enum drm_ctx_flags {
+	_DRM_CONTEXT_PRESERVED = 0x01,
+	_DRM_CONTEXT_2DONLY = 0x02
+};
+
+/*
+ * DRM_IOCTL_ADD_CTX ioctl argument type (also used by DRM_IOCTL_{RM,MOD,GET,SWITCH,NEW}_CTX).
+ *
+ * \sa drmCreateContext() and drmDestroyContext().
+ */
+struct drm_ctx {
+	drm_context_t handle;
+	enum drm_ctx_flags flags;
+};
+
+/*
+ * DRM_IOCTL_RES_CTX ioctl argument type.
+ */
+struct drm_ctx_res {
+	int count;
+	struct drm_ctx *contexts;
+};
+
+/*
+ * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type.
+ */
+struct drm_draw {
+	drm_drawable_t handle;
+};
+
+/*
+ * DRM_IOCTL_UPDATE_DRAW ioctl argument type.
+ */
+typedef enum {
+	DRM_DRAWABLE_CLIPRECTS
+} drm_drawable_info_type_t;
+
+struct drm_update_draw {	/* Argument type for DRM_IOCTL_UPDATE_DRAW */
+	drm_drawable_t handle;
+	unsigned int type;	/* presumably a drm_drawable_info_type_t value -- confirm */
+	unsigned int num;
+	unsigned long long data;
+};
+
+/*
+ * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type.
+ */
+struct drm_auth {
+	drm_magic_t magic;
+};
+
+/*
+ * DRM_IOCTL_IRQ_BUSID ioctl argument type.
+ *
+ * \sa drmGetInterruptFromBusID().
+ */
+struct drm_irq_busid {
+	int irq;	/**< IRQ number */
+	int busnum;	/**< bus number */
+	int devnum;	/**< device number */
+	int funcnum;	/**< function number */
+};
+
+enum drm_vblank_seq_type {
+	_DRM_VBLANK_ABSOLUTE = 0x0,	/**< Wait for specific vblank sequence number */
+	_DRM_VBLANK_RELATIVE = 0x1,	/**< Wait for given number of vblanks */
+	/* bits 1-5 (mask 0x3e) are reserved for high crtcs */
+	_DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e,
+	_DRM_VBLANK_EVENT = 0x4000000,   /**< Send event instead of blocking */
+	_DRM_VBLANK_FLIP = 0x8000000,   /**< Scheduled buffer swap should flip */
+	_DRM_VBLANK_NEXTONMISS = 0x10000000,	/**< If missed, wait for next vblank */
+	_DRM_VBLANK_SECONDARY = 0x20000000,	/**< Secondary display controller */
+	_DRM_VBLANK_SIGNAL = 0x40000000	/**< Send signal instead of blocking, unsupported */
+};
+#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1
+
+#define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE)
+#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \
+				_DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS)
+
+struct drm_wait_vblank_request {
+	enum drm_vblank_seq_type type;
+	unsigned int sequence;
+	unsigned long signal;
+};
+
+struct drm_wait_vblank_reply {
+	enum drm_vblank_seq_type type;
+	unsigned int sequence;
+	long tval_sec;
+	long tval_usec;
+};
+
+/*
+ * DRM_IOCTL_WAIT_VBLANK ioctl argument type.
+ *
+ * \sa drmWaitVBlank().
+ */
+union drm_wait_vblank {
+	struct drm_wait_vblank_request request;
+	struct drm_wait_vblank_reply reply;
+};
+
+#define _DRM_PRE_MODESET 1
+#define _DRM_POST_MODESET 2
+
+/*
+ * DRM_IOCTL_MODESET_CTL ioctl argument type
+ *
+ * \sa drmModesetCtl().
+ */
+struct drm_modeset_ctl {
+	__u32 crtc;
+	__u32 cmd;	/* presumably _DRM_PRE_MODESET or _DRM_POST_MODESET -- confirm */
+};
+
+/*
+ * DRM_IOCTL_AGP_ENABLE ioctl argument type.
+ *
+ * \sa drmAgpEnable().
+ */
+struct drm_agp_mode {
+	unsigned long mode;	/**< AGP mode */
+};
+
+/*
+ * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type.
+ *
+ * \sa drmAgpAlloc() and drmAgpFree().
+ */
+struct drm_agp_buffer {
+	unsigned long size;	/**< In bytes -- will round to page boundary */
+	unsigned long handle;	/**< Used for binding / unbinding */
+	unsigned long type;	/**< Type of memory to allocate */
+	unsigned long physical;	/**< Physical used by i810 */
+};
+
+/*
+ * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type.
+ *
+ * \sa drmAgpBind() and drmAgpUnbind().
+ */
+struct drm_agp_binding {
+	unsigned long handle;	/**< From drm_agp_buffer */
+	unsigned long offset;	/**< In bytes -- will round to page boundary */
+};
+
+/*
+ * DRM_IOCTL_AGP_INFO ioctl argument type.
+ *
+ * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(),
+ * drmAgpBase(), drmAgpSize(), drmAgpMemoryUsed(), drmAgpMemoryAvail(),
+ * drmAgpVendorId() and drmAgpDeviceId().
+ */
+struct drm_agp_info {
+	int agp_version_major;
+	int agp_version_minor;
+	unsigned long mode;
+	unsigned long aperture_base;	/* physical address */
+	unsigned long aperture_size;	/* bytes */
+	unsigned long memory_allowed;	/* bytes */
+	unsigned long memory_used;
+
+	/* PCI information */
+	unsigned short id_vendor;
+	unsigned short id_device;
+};
+
+/*
+ * DRM_IOCTL_SG_ALLOC and DRM_IOCTL_SG_FREE ioctl argument type.
+ */
+struct drm_scatter_gather {
+	unsigned long size;	/**< In bytes -- will round to page boundary */
+	unsigned long handle;	/**< Used for mapping / unmapping */
+};
+
+/*
+ * DRM_IOCTL_SET_VERSION ioctl argument type.
+ */
+struct drm_set_version {
+	int drm_di_major;
+	int drm_di_minor;
+	int drm_dd_major;
+	int drm_dd_minor;
+};
+
+/* DRM_IOCTL_GEM_CLOSE ioctl argument type */
+struct drm_gem_close {
+	/** Handle of the object to be closed. */
+	__u32 handle;
+	__u32 pad;
+};
+
+/* DRM_IOCTL_GEM_FLINK ioctl argument type */
+struct drm_gem_flink {
+	/** Handle for the object being named */
+	__u32 handle;
+
+	/** Returned global name */
+	__u32 name;
+};
+
+/* DRM_IOCTL_GEM_OPEN ioctl argument type */
+struct drm_gem_open {
+	/** Name of object being opened */
+	__u32 name;
+
+	/** Returned handle for the object */
+	__u32 handle;
+
+	/** Returned size of the object */
+	__u64 size;
+};
+
+/**
+ * DRM_CAP_DUMB_BUFFER
+ *
+ * If set to 1, the driver supports creating dumb buffers via the
+ * &DRM_IOCTL_MODE_CREATE_DUMB ioctl.
+ */
+#define DRM_CAP_DUMB_BUFFER		0x1
+/**
+ * DRM_CAP_VBLANK_HIGH_CRTC
+ *
+ * If set to 1, the kernel supports specifying a :ref:`CRTC index<crtc_index>`
+ * in the high bits of &drm_wait_vblank_request.type.
+ *
+ * Starting kernel version 2.6.39, this capability is always set to 1.
+ */
+#define DRM_CAP_VBLANK_HIGH_CRTC	0x2
+/**
+ * DRM_CAP_DUMB_PREFERRED_DEPTH
+ *
+ * The preferred bit depth for dumb buffers.
+ *
+ * The bit depth is the number of bits used to indicate the color of a single
+ * pixel excluding any padding. This is different from the number of bits per
+ * pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per
+ * pixel.
+ *
+ * Note that this preference only applies to dumb buffers, it's irrelevant for
+ * other types of buffers.
+ */
+#define DRM_CAP_DUMB_PREFERRED_DEPTH	0x3
+/**
+ * DRM_CAP_DUMB_PREFER_SHADOW
+ *
+ * If set to 1, the driver prefers userspace to render to a shadow buffer
+ * instead of directly rendering to a dumb buffer. For best speed, userspace
+ * should do streaming ordered memory copies into the dumb buffer and never
+ * read from it.
+ *
+ * Note that this preference only applies to dumb buffers, it's irrelevant for
+ * other types of buffers.
+ */
+#define DRM_CAP_DUMB_PREFER_SHADOW	0x4
+/**
+ * DRM_CAP_PRIME
+ *
+ * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT
+ * and &DRM_PRIME_CAP_EXPORT.
+ *
+ * Starting from kernel version 6.6, both &DRM_PRIME_CAP_IMPORT and
+ * &DRM_PRIME_CAP_EXPORT are always advertised.
+ *
+ * PRIME buffers are exposed as dma-buf file descriptors.
+ * See :ref:`prime_buffer_sharing`.
+ */
+#define DRM_CAP_PRIME			0x5
+/**
+ * DRM_PRIME_CAP_IMPORT
+ *
+ * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME
+ * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl.
+ *
+ * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME.
+ */
+#define  DRM_PRIME_CAP_IMPORT		0x1
+/**
+ * DRM_PRIME_CAP_EXPORT
+ *
+ * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME
+ * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl.
+ *
+ * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME.
+ */
+#define  DRM_PRIME_CAP_EXPORT		0x2
+/**
+ * DRM_CAP_TIMESTAMP_MONOTONIC
+ *
+ * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in
+ * struct drm_event_vblank. If set to 1, the kernel will report timestamps with
+ * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these
+ * clocks.
+ *
+ * Starting from kernel version 2.6.39, the default value for this capability
+ * is 1. Starting kernel version 4.15, this capability is always set to 1.
+ */
+#define DRM_CAP_TIMESTAMP_MONOTONIC	0x6
+/**
+ * DRM_CAP_ASYNC_PAGE_FLIP
+ *
+ * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for legacy
+ * page-flips.
+ */
+#define DRM_CAP_ASYNC_PAGE_FLIP		0x7
+/**
+ * DRM_CAP_CURSOR_WIDTH
+ *
+ * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid
+ * width x height combination for the hardware cursor. The intention is that a
+ * hardware agnostic userspace can query a cursor plane size to use.
+ *
+ * Note that the cross-driver contract is to merely return a valid size;
+ * drivers are free to attach another meaning on top, eg. i915 returns the
+ * maximum plane size.
+ */
+#define DRM_CAP_CURSOR_WIDTH		0x8
+/**
+ * DRM_CAP_CURSOR_HEIGHT
+ *
+ * See &DRM_CAP_CURSOR_WIDTH.
+ */
+#define DRM_CAP_CURSOR_HEIGHT		0x9
+/**
+ * DRM_CAP_ADDFB2_MODIFIERS
+ *
+ * If set to 1, the driver supports supplying modifiers in the
+ * &DRM_IOCTL_MODE_ADDFB2 ioctl.
+ */
+#define DRM_CAP_ADDFB2_MODIFIERS	0x10
+/**
+ * DRM_CAP_PAGE_FLIP_TARGET
+ *
+ * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and
+ * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in
+ * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP
+ * ioctl.
+ */
+#define DRM_CAP_PAGE_FLIP_TARGET	0x11
+/**
+ * DRM_CAP_CRTC_IN_VBLANK_EVENT
+ *
+ * If set to 1, the kernel supports reporting the CRTC ID in
+ * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and
+ * &DRM_EVENT_FLIP_COMPLETE events.
+ *
+ * Starting kernel version 4.12, this capability is always set to 1.
+ */
+#define DRM_CAP_CRTC_IN_VBLANK_EVENT	0x12
+/**
+ * DRM_CAP_SYNCOBJ
+ *
+ * If set to 1, the driver supports sync objects. See :ref:`drm_sync_objects`.
+ */
+#define DRM_CAP_SYNCOBJ		0x13
+/**
+ * DRM_CAP_SYNCOBJ_TIMELINE
+ *
+ * If set to 1, the driver supports timeline operations on sync objects. See
+ * :ref:`drm_sync_objects`.
+ */
+#define DRM_CAP_SYNCOBJ_TIMELINE	0x14
+/**
+ * DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP
+ *
+ * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for atomic
+ * commits.
+ */
+#define DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP	0x15
+
+/* DRM_IOCTL_GET_CAP ioctl argument type */
+struct drm_get_cap {
+	__u64 capability;
+	__u64 value;
+};
+
+/**
+ * DRM_CLIENT_CAP_STEREO_3D
+ *
+ * If set to 1, the DRM core will expose the stereo 3D capabilities of the
+ * monitor by advertising the supported 3D layouts in the flags of struct
+ * drm_mode_modeinfo. See ``DRM_MODE_FLAG_3D_*``.
+ *
+ * This capability is always supported for all drivers starting from kernel
+ * version 3.13.
+ */
+#define DRM_CLIENT_CAP_STEREO_3D	1
+
+/**
+ * DRM_CLIENT_CAP_UNIVERSAL_PLANES
+ *
+ * If set to 1, the DRM core will expose all planes (overlay, primary, and
+ * cursor) to userspace.
+ *
+ * This capability has been introduced in kernel version 3.15. Starting from
+ * kernel version 3.17, this capability is always supported for all drivers.
+ */
+#define DRM_CLIENT_CAP_UNIVERSAL_PLANES  2
+
+/**
+ * DRM_CLIENT_CAP_ATOMIC
+ *
+ * If set to 1, the DRM core will expose atomic properties to userspace. This
+ * implicitly enables &DRM_CLIENT_CAP_UNIVERSAL_PLANES and
+ * &DRM_CLIENT_CAP_ASPECT_RATIO.
+ *
+ * If the driver doesn't support atomic mode-setting, enabling this capability
+ * will fail with -EOPNOTSUPP.
+ *
+ * This capability has been introduced in kernel version 4.0. Starting from
+ * kernel version 4.2, this capability is always supported for atomic-capable
+ * drivers.
+ */
+#define DRM_CLIENT_CAP_ATOMIC	3
+
+/**
+ * DRM_CLIENT_CAP_ASPECT_RATIO
+ *
+ * If set to 1, the DRM core will provide aspect ratio information in modes.
+ * See ``DRM_MODE_FLAG_PIC_AR_*``.
+ *
+ * This capability is always supported for all drivers starting from kernel
+ * version 4.18.
+ */
+#define DRM_CLIENT_CAP_ASPECT_RATIO    4
+
+/**
+ * DRM_CLIENT_CAP_WRITEBACK_CONNECTORS
+ *
+ * If set to 1, the DRM core will expose special connectors to be used for
+ * writing back to memory the scene setup in the commit. The client must enable
+ * &DRM_CLIENT_CAP_ATOMIC first.
+ *
+ * This capability is always supported for atomic-capable drivers starting from
+ * kernel version 4.19.
+ */
+#define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS	5
+
+/**
+ * DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT
+ *
+ * Drivers for para-virtualized hardware (e.g. vmwgfx, qxl, virtio and
+ * virtualbox) have additional restrictions for cursor planes (thus
+ * making cursor planes on those drivers not truly universal,) e.g.
+ * they need cursor planes to act like one would expect from a mouse
+ * cursor and have correctly set hotspot properties.
+ * If this client cap is not set the DRM core will hide cursor plane on
+ * those virtualized drivers because not setting it implies that the
+ * client is not capable of dealing with those extra restrictions.
+ * Clients which do set cursor hotspot and treat the cursor plane
+ * like a mouse cursor should set this property.
+ * The client must enable &DRM_CLIENT_CAP_ATOMIC first.
+ *
+ * Setting this property on drivers which do not special case
+ * cursor planes (i.e. non-virtualized drivers) will return
+ * EOPNOTSUPP, which can be used by userspace to gauge
+ * requirements of the hardware/drivers they're running on.
+ *
+ * This capability is always supported for atomic-capable virtualized
+ * drivers starting from kernel version 6.6.
+ */
+#define DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT	6
+
+/* DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */
+struct drm_set_client_cap {
+	__u64 capability;
+	__u64 value;
+};
+
+#define DRM_RDWR O_RDWR
+#define DRM_CLOEXEC O_CLOEXEC
+struct drm_prime_handle {	/* Argument type for DRM_IOCTL_PRIME_HANDLE_TO_FD and DRM_IOCTL_PRIME_FD_TO_HANDLE */
+	__u32 handle;	/* GEM handle: supplied to HANDLE_TO_FD, returned by FD_TO_HANDLE */
+
+	/** Flags.. only applicable for handle->fd (unused by FD_TO_HANDLE) */
+	__u32 flags;
+
+	/** dmabuf file descriptor: returned by HANDLE_TO_FD, supplied to FD_TO_HANDLE */
+	__s32 fd;
+};
+
+struct drm_syncobj_create {
+	__u32 handle;
+#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0)
+	__u32 flags;
+};
+
+struct drm_syncobj_destroy {
+	__u32 handle;
+	__u32 pad;
+};
+
+#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0)
+#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0)
+struct drm_syncobj_handle {	/* Argument type for DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD and DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE */
+	__u32 handle;
+	__u32 flags;
+
+	__s32 fd;
+	__u32 pad;
+};
+
+struct drm_syncobj_transfer {
+	__u32 src_handle;
+	__u32 dst_handle;
+	__u64 src_point;
+	__u64 dst_point;
+	__u32 flags;
+	__u32 pad;
+};
+
+#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0)
+#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1)
+#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE (1 << 2) /* wait for time point to become available */
+#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE (1 << 3) /* set fence deadline to deadline_nsec */
+struct drm_syncobj_wait {	/* Argument type for DRM_IOCTL_SYNCOBJ_WAIT */
+	__u64 handles;
+	/* absolute timeout */
+	__s64 timeout_nsec;
+	__u32 count_handles;
+	__u32 flags;
+	__u32 first_signaled; /* only valid when not waiting all */
+	__u32 pad;
+	/**
+	 * @deadline_nsec - fence deadline hint
+	 *
+	 * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing
+	 * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is
+	 * set.
+	 */
+	__u64 deadline_nsec;
+};
+
+struct drm_syncobj_timeline_wait {	/* Argument type for DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT */
+	__u64 handles;
+	/* timeline point to wait on for each of the handles */
+	__u64 points;
+	/* absolute timeout */
+	__s64 timeout_nsec;
+	__u32 count_handles;
+	__u32 flags;
+	__u32 first_signaled; /* only valid when not waiting all */
+	__u32 pad;
+	/**
+	 * @deadline_nsec - fence deadline hint
+	 *
+	 * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing
+	 * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is
+	 * set.
+	 */
+	__u64 deadline_nsec;
+};
+
+/**
+ * struct drm_syncobj_eventfd
+ * @handle: syncobj handle.
+ * @flags: Zero to wait for the point to be signalled, or
+ *         &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE to wait for a fence to be
+ *         available for the point.
+ * @point: syncobj timeline point (set to zero for binary syncobjs).
+ * @fd: Existing eventfd to send events to.
+ * @pad: Must be zero.
+ *
+ * Register an eventfd to be signalled by a syncobj. The eventfd counter will
+ * be incremented by one.
+ */
+struct drm_syncobj_eventfd {
+	__u32 handle;
+	__u32 flags;
+	__u64 point;
+	__s32 fd;
+	__u32 pad;
+};
+
+
+struct drm_syncobj_array {	/* Argument type for DRM_IOCTL_SYNCOBJ_RESET and DRM_IOCTL_SYNCOBJ_SIGNAL */
+	__u64 handles;
+	__u32 count_handles;
+	__u32 pad;
+};
+
+#define DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED (1 << 0) /* last available point on timeline syncobj */
+struct drm_syncobj_timeline_array {
+	__u64 handles;
+	__u64 points;
+	__u32 count_handles;
+	__u32 flags;
+};
+
+
+/* Query current scanout sequence number */
+struct drm_crtc_get_sequence {
+	__u32 crtc_id;		/* requested crtc_id */
+	__u32 active;		/* return: crtc output is active */
+	__u64 sequence;		/* return: most recent vblank sequence */
+	__s64 sequence_ns;	/* return: most recent time of first pixel out */
+};
+
+/* Queue event to be delivered at specified sequence. Time stamp marks
+ * when the first pixel of the refresh cycle leaves the display engine
+ * for the display
+ */
+#define DRM_CRTC_SEQUENCE_RELATIVE		0x00000001	/* sequence is relative to current */
+#define DRM_CRTC_SEQUENCE_NEXT_ON_MISS		0x00000002	/* Use next sequence if we've missed */
+
+struct drm_crtc_queue_sequence {
+	__u32 crtc_id;
+	__u32 flags;
+	__u64 sequence;		/* on input, target sequence. on output, actual sequence */
+	__u64 user_data;	/* user data passed to event */
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "drm_mode.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DRM_IOCTL_BASE			'd'
+#define DRM_IO(nr)			_IO(DRM_IOCTL_BASE,nr)
+#define DRM_IOR(nr,type)		_IOR(DRM_IOCTL_BASE,nr,type)
+#define DRM_IOW(nr,type)		_IOW(DRM_IOCTL_BASE,nr,type)
+#define DRM_IOWR(nr,type)		_IOWR(DRM_IOCTL_BASE,nr,type)
+
+#define DRM_IOCTL_VERSION		DRM_IOWR(0x00, struct drm_version)
+#define DRM_IOCTL_GET_UNIQUE		DRM_IOWR(0x01, struct drm_unique)
+#define DRM_IOCTL_GET_MAGIC		DRM_IOR( 0x02, struct drm_auth)
+#define DRM_IOCTL_IRQ_BUSID		DRM_IOWR(0x03, struct drm_irq_busid)
+#define DRM_IOCTL_GET_MAP               DRM_IOWR(0x04, struct drm_map)
+#define DRM_IOCTL_GET_CLIENT            DRM_IOWR(0x05, struct drm_client)
+#define DRM_IOCTL_GET_STATS             DRM_IOR( 0x06, struct drm_stats)
+#define DRM_IOCTL_SET_VERSION		DRM_IOWR(0x07, struct drm_set_version)
+#define DRM_IOCTL_MODESET_CTL           DRM_IOW(0x08, struct drm_modeset_ctl)
+/**
+ * DRM_IOCTL_GEM_CLOSE - Close a GEM handle.
+ *
+ * GEM handles are not reference-counted by the kernel. User-space is
+ * responsible for managing their lifetime. For example, if user-space imports
+ * the same memory object twice on the same DRM file description, the same GEM
+ * handle is returned by both imports, and user-space needs to ensure
+ * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen
+ * when a memory object is allocated, then exported and imported again on the
+ * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception
+ * and always returns fresh new GEM handles even if an existing GEM handle
+ * already refers to the same memory object before the IOCTL is performed.
+ */
+#define DRM_IOCTL_GEM_CLOSE		DRM_IOW (0x09, struct drm_gem_close)
+#define DRM_IOCTL_GEM_FLINK		DRM_IOWR(0x0a, struct drm_gem_flink)
+#define DRM_IOCTL_GEM_OPEN		DRM_IOWR(0x0b, struct drm_gem_open)
+#define DRM_IOCTL_GET_CAP		DRM_IOWR(0x0c, struct drm_get_cap)
+#define DRM_IOCTL_SET_CLIENT_CAP	DRM_IOW( 0x0d, struct drm_set_client_cap)
+
+#define DRM_IOCTL_SET_UNIQUE		DRM_IOW( 0x10, struct drm_unique)
+#define DRM_IOCTL_AUTH_MAGIC		DRM_IOW( 0x11, struct drm_auth)
+#define DRM_IOCTL_BLOCK			DRM_IOWR(0x12, struct drm_block)
+#define DRM_IOCTL_UNBLOCK		DRM_IOWR(0x13, struct drm_block)
+#define DRM_IOCTL_CONTROL		DRM_IOW( 0x14, struct drm_control)
+#define DRM_IOCTL_ADD_MAP		DRM_IOWR(0x15, struct drm_map)
+#define DRM_IOCTL_ADD_BUFS		DRM_IOWR(0x16, struct drm_buf_desc)
+#define DRM_IOCTL_MARK_BUFS		DRM_IOW( 0x17, struct drm_buf_desc)
+#define DRM_IOCTL_INFO_BUFS		DRM_IOWR(0x18, struct drm_buf_info)
+#define DRM_IOCTL_MAP_BUFS		DRM_IOWR(0x19, struct drm_buf_map)
+#define DRM_IOCTL_FREE_BUFS		DRM_IOW( 0x1a, struct drm_buf_free)
+
+#define DRM_IOCTL_RM_MAP		DRM_IOW( 0x1b, struct drm_map)
+
+#define DRM_IOCTL_SET_SAREA_CTX		DRM_IOW( 0x1c, struct drm_ctx_priv_map)
+#define DRM_IOCTL_GET_SAREA_CTX 	DRM_IOWR(0x1d, struct drm_ctx_priv_map)
+
+#define DRM_IOCTL_SET_MASTER            DRM_IO(0x1e)
+#define DRM_IOCTL_DROP_MASTER           DRM_IO(0x1f)
+
+#define DRM_IOCTL_ADD_CTX		DRM_IOWR(0x20, struct drm_ctx)
+#define DRM_IOCTL_RM_CTX		DRM_IOWR(0x21, struct drm_ctx)
+#define DRM_IOCTL_MOD_CTX		DRM_IOW( 0x22, struct drm_ctx)
+#define DRM_IOCTL_GET_CTX		DRM_IOWR(0x23, struct drm_ctx)
+#define DRM_IOCTL_SWITCH_CTX		DRM_IOW( 0x24, struct drm_ctx)
+#define DRM_IOCTL_NEW_CTX		DRM_IOW( 0x25, struct drm_ctx)
+#define DRM_IOCTL_RES_CTX		DRM_IOWR(0x26, struct drm_ctx_res)
+#define DRM_IOCTL_ADD_DRAW		DRM_IOWR(0x27, struct drm_draw)
+#define DRM_IOCTL_RM_DRAW		DRM_IOWR(0x28, struct drm_draw)
+#define DRM_IOCTL_DMA			DRM_IOWR(0x29, struct drm_dma)
+#define DRM_IOCTL_LOCK			DRM_IOW( 0x2a, struct drm_lock)
+#define DRM_IOCTL_UNLOCK		DRM_IOW( 0x2b, struct drm_lock)
+#define DRM_IOCTL_FINISH		DRM_IOW( 0x2c, struct drm_lock)
+
+/**
+ * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD.
+ *
+ * User-space sets &drm_prime_handle.handle with the GEM handle to export and
+ * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in
+ * &drm_prime_handle.fd.
+ *
+ * The export can fail for any driver-specific reason, e.g. because export is
+ * not supported for this specific GEM handle (but might be for others).
+ *
+ * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT.
+ */
+#define DRM_IOCTL_PRIME_HANDLE_TO_FD    DRM_IOWR(0x2d, struct drm_prime_handle)
+/**
+ * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle.
+ *
+ * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to
+ * import, and gets back a GEM handle in &drm_prime_handle.handle.
+ * &drm_prime_handle.flags is unused.
+ *
+ * If an existing GEM handle refers to the memory object backing the DMA-BUF,
+ * that GEM handle is returned. Therefore user-space which needs to handle
+ * arbitrary DMA-BUFs must have a user-space lookup data structure to manually
+ * reference-count duplicated GEM handles. For more information see
+ * &DRM_IOCTL_GEM_CLOSE.
+ *
+ * The import can fail for any driver-specific reason, e.g. because import is
+ * only supported for DMA-BUFs allocated on this DRM device.
+ *
+ * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT.
+ */
+#define DRM_IOCTL_PRIME_FD_TO_HANDLE    DRM_IOWR(0x2e, struct drm_prime_handle)
+
+#define DRM_IOCTL_AGP_ACQUIRE		DRM_IO(  0x30)
+#define DRM_IOCTL_AGP_RELEASE		DRM_IO(  0x31)
+#define DRM_IOCTL_AGP_ENABLE		DRM_IOW( 0x32, struct drm_agp_mode)
+#define DRM_IOCTL_AGP_INFO		DRM_IOR( 0x33, struct drm_agp_info)
+#define DRM_IOCTL_AGP_ALLOC		DRM_IOWR(0x34, struct drm_agp_buffer)
+#define DRM_IOCTL_AGP_FREE		DRM_IOW( 0x35, struct drm_agp_buffer)
+#define DRM_IOCTL_AGP_BIND		DRM_IOW( 0x36, struct drm_agp_binding)
+#define DRM_IOCTL_AGP_UNBIND		DRM_IOW( 0x37, struct drm_agp_binding)
+
+#define DRM_IOCTL_SG_ALLOC		DRM_IOWR(0x38, struct drm_scatter_gather)
+#define DRM_IOCTL_SG_FREE		DRM_IOW( 0x39, struct drm_scatter_gather)
+
+#define DRM_IOCTL_WAIT_VBLANK		DRM_IOWR(0x3a, union drm_wait_vblank)
+
+#define DRM_IOCTL_CRTC_GET_SEQUENCE	DRM_IOWR(0x3b, struct drm_crtc_get_sequence)
+#define DRM_IOCTL_CRTC_QUEUE_SEQUENCE	DRM_IOWR(0x3c, struct drm_crtc_queue_sequence)
+
+#define DRM_IOCTL_UPDATE_DRAW		DRM_IOW(0x3f, struct drm_update_draw)
+
+#define DRM_IOCTL_MODE_GETRESOURCES	DRM_IOWR(0xA0, struct drm_mode_card_res)
+#define DRM_IOCTL_MODE_GETCRTC		DRM_IOWR(0xA1, struct drm_mode_crtc)
+#define DRM_IOCTL_MODE_SETCRTC		DRM_IOWR(0xA2, struct drm_mode_crtc)
+#define DRM_IOCTL_MODE_CURSOR		DRM_IOWR(0xA3, struct drm_mode_cursor)
+#define DRM_IOCTL_MODE_GETGAMMA		DRM_IOWR(0xA4, struct drm_mode_crtc_lut)
+#define DRM_IOCTL_MODE_SETGAMMA		DRM_IOWR(0xA5, struct drm_mode_crtc_lut)
+#define DRM_IOCTL_MODE_GETENCODER	DRM_IOWR(0xA6, struct drm_mode_get_encoder)
+#define DRM_IOCTL_MODE_GETCONNECTOR	DRM_IOWR(0xA7, struct drm_mode_get_connector)
+#define DRM_IOCTL_MODE_ATTACHMODE	DRM_IOWR(0xA8, struct drm_mode_mode_cmd) /* deprecated (never worked) */
+#define DRM_IOCTL_MODE_DETACHMODE	DRM_IOWR(0xA9, struct drm_mode_mode_cmd) /* deprecated (never worked) */
+
+#define DRM_IOCTL_MODE_GETPROPERTY	DRM_IOWR(0xAA, struct drm_mode_get_property)
+#define DRM_IOCTL_MODE_SETPROPERTY	DRM_IOWR(0xAB, struct drm_mode_connector_set_property)
+#define DRM_IOCTL_MODE_GETPROPBLOB	DRM_IOWR(0xAC, struct drm_mode_get_blob)
+#define DRM_IOCTL_MODE_GETFB		DRM_IOWR(0xAD, struct drm_mode_fb_cmd)
+#define DRM_IOCTL_MODE_ADDFB		DRM_IOWR(0xAE, struct drm_mode_fb_cmd)
+/**
+ * DRM_IOCTL_MODE_RMFB - Remove a framebuffer.
+ *
+ * This removes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL
+ * argument is a framebuffer object ID.
+ *
+ * Warning: removing a framebuffer currently in-use on an enabled plane will
+ * disable that plane. The CRTC the plane is linked to may also be disabled
+ * (depending on driver capabilities).
+ */
+#define DRM_IOCTL_MODE_RMFB		DRM_IOWR(0xAF, unsigned int)
+#define DRM_IOCTL_MODE_PAGE_FLIP	DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip)
+#define DRM_IOCTL_MODE_DIRTYFB		DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd)
+
+/**
+ * DRM_IOCTL_MODE_CREATE_DUMB - Create a new dumb buffer object.
+ *
+ * KMS dumb buffers provide a very primitive way to allocate a buffer object
+ * suitable for scanout and map it for software rendering. KMS dumb buffers are
+ * not suitable for hardware-accelerated rendering nor video decoding. KMS dumb
+ * buffers are not suitable to be displayed on any other device than the KMS
+ * device where they were allocated from. Also see
+ * :ref:`kms_dumb_buffer_objects`.
+ *
+ * The IOCTL argument is a struct drm_mode_create_dumb.
+ *
+ * User-space is expected to create a KMS dumb buffer via this IOCTL, then add
+ * it as a KMS framebuffer via &DRM_IOCTL_MODE_ADDFB and map it via
+ * &DRM_IOCTL_MODE_MAP_DUMB.
+ *
+ * &DRM_CAP_DUMB_BUFFER indicates whether this IOCTL is supported.
+ * &DRM_CAP_DUMB_PREFERRED_DEPTH and &DRM_CAP_DUMB_PREFER_SHADOW indicate
+ * driver preferences for dumb buffers.
+ */
+#define DRM_IOCTL_MODE_CREATE_DUMB DRM_IOWR(0xB2, struct drm_mode_create_dumb)
+#define DRM_IOCTL_MODE_MAP_DUMB    DRM_IOWR(0xB3, struct drm_mode_map_dumb)
+#define DRM_IOCTL_MODE_DESTROY_DUMB    DRM_IOWR(0xB4, struct drm_mode_destroy_dumb)
+#define DRM_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xB5, struct drm_mode_get_plane_res)
+#define DRM_IOCTL_MODE_GETPLANE	DRM_IOWR(0xB6, struct drm_mode_get_plane)
+#define DRM_IOCTL_MODE_SETPLANE	DRM_IOWR(0xB7, struct drm_mode_set_plane)
+#define DRM_IOCTL_MODE_ADDFB2		DRM_IOWR(0xB8, struct drm_mode_fb_cmd2)
+#define DRM_IOCTL_MODE_OBJ_GETPROPERTIES	DRM_IOWR(0xB9, struct drm_mode_obj_get_properties)
+#define DRM_IOCTL_MODE_OBJ_SETPROPERTY	DRM_IOWR(0xBA, struct drm_mode_obj_set_property)
+#define DRM_IOCTL_MODE_CURSOR2		DRM_IOWR(0xBB, struct drm_mode_cursor2)
+#define DRM_IOCTL_MODE_ATOMIC		DRM_IOWR(0xBC, struct drm_mode_atomic)
+#define DRM_IOCTL_MODE_CREATEPROPBLOB	DRM_IOWR(0xBD, struct drm_mode_create_blob)
+#define DRM_IOCTL_MODE_DESTROYPROPBLOB	DRM_IOWR(0xBE, struct drm_mode_destroy_blob)
+
+#define DRM_IOCTL_SYNCOBJ_CREATE	DRM_IOWR(0xBF, struct drm_syncobj_create)
+#define DRM_IOCTL_SYNCOBJ_DESTROY	DRM_IOWR(0xC0, struct drm_syncobj_destroy)
+#define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD	DRM_IOWR(0xC1, struct drm_syncobj_handle)
+#define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE	DRM_IOWR(0xC2, struct drm_syncobj_handle)
+#define DRM_IOCTL_SYNCOBJ_WAIT		DRM_IOWR(0xC3, struct drm_syncobj_wait)
+#define DRM_IOCTL_SYNCOBJ_RESET		DRM_IOWR(0xC4, struct drm_syncobj_array)
+#define DRM_IOCTL_SYNCOBJ_SIGNAL	DRM_IOWR(0xC5, struct drm_syncobj_array)
+
+#define DRM_IOCTL_MODE_CREATE_LEASE	DRM_IOWR(0xC6, struct drm_mode_create_lease)
+#define DRM_IOCTL_MODE_LIST_LESSEES	DRM_IOWR(0xC7, struct drm_mode_list_lessees)
+#define DRM_IOCTL_MODE_GET_LEASE	DRM_IOWR(0xC8, struct drm_mode_get_lease)
+#define DRM_IOCTL_MODE_REVOKE_LEASE	DRM_IOWR(0xC9, struct drm_mode_revoke_lease)
+
+#define DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT	DRM_IOWR(0xCA, struct drm_syncobj_timeline_wait)
+#define DRM_IOCTL_SYNCOBJ_QUERY		DRM_IOWR(0xCB, struct drm_syncobj_timeline_array)
+#define DRM_IOCTL_SYNCOBJ_TRANSFER	DRM_IOWR(0xCC, struct drm_syncobj_transfer)
+#define DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL	DRM_IOWR(0xCD, struct drm_syncobj_timeline_array)
+
+/**
+ * DRM_IOCTL_MODE_GETFB2 - Get framebuffer metadata.
+ *
+ * This queries metadata about a framebuffer. User-space fills
+ * &drm_mode_fb_cmd2.fb_id as the input, and the kernel fills the rest of the
+ * struct as the output.
+ *
+ * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles
+ * will be filled with GEM buffer handles. Fresh new GEM handles are always
+ * returned, even if another GEM handle referring to the same memory object
+ * already exists on the DRM file description. The caller is responsible for
+ * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same
+ * new handle will be returned for multiple planes in case they use the same
+ * memory object. Planes are valid until one has a zero handle -- this can be
+ * used to compute the number of planes.
+ *
+ * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid
+ * until one has a zero &drm_mode_fb_cmd2.pitches.
+ *
+ * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set
+ * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the
+ * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier.
+ *
+ * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space
+ * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately
+ * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not
+ * double-close handles which are specified multiple times in the array.
+ */
+#define DRM_IOCTL_MODE_GETFB2		DRM_IOWR(0xCE, struct drm_mode_fb_cmd2)
+
+#define DRM_IOCTL_SYNCOBJ_EVENTFD	DRM_IOWR(0xCF, struct drm_syncobj_eventfd)
+
+/**
+ * DRM_IOCTL_MODE_CLOSEFB - Close a framebuffer.
+ *
+ * This closes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL
+ * argument is a framebuffer object ID.
+ *
+ * This IOCTL is similar to &DRM_IOCTL_MODE_RMFB, except it doesn't disable
+ * planes and CRTCs. As long as the framebuffer is used by a plane, it's kept
+ * alive. When the plane no longer uses the framebuffer (because the
+ * framebuffer is replaced with another one, or the plane is disabled), the
+ * framebuffer is cleaned up.
+ *
+ * This is useful to implement flicker-free transitions between two processes.
+ *
+ * Depending on the threat model, user-space may want to ensure that the
+ * framebuffer doesn't expose any sensitive user information: closed
+ * framebuffers attached to a plane can be read back by the next DRM master.
+ */
+#define DRM_IOCTL_MODE_CLOSEFB		DRM_IOWR(0xD0, struct drm_mode_closefb)
+
+/*
+ * Device specific ioctls should only be in their respective headers
+ * The device specific ioctl range is from 0x40 to 0x9f.
+ * Generic IOCTLS restart at 0xA0.
+ *
+ * \sa drmCommandNone(), drmCommandRead(), drmCommandWrite(), and
+ * drmCommandReadWrite().
+ */
+#define DRM_COMMAND_BASE                0x40
+#define DRM_COMMAND_END			0xA0
+
+/**
+ * struct drm_event - Header for DRM events
+ * @type: event type.
+ * @length: total number of payload bytes (including header).
+ *
+ * This struct is a header for events written back to user-space on the DRM FD.
+ * A read on the DRM FD will always only return complete events: e.g. if the
+ * read buffer is 100 bytes large and there are two 64 byte events pending,
+ * only one will be returned.
+ *
+ * Event types 0 - 0x7fffffff are generic DRM events, 0x80000000 and
+ * up are chipset specific. Generic DRM events include &DRM_EVENT_VBLANK,
+ * &DRM_EVENT_FLIP_COMPLETE and &DRM_EVENT_CRTC_SEQUENCE.
+ */
+struct drm_event {
+	__u32 type;
+	__u32 length;
+};
+
+/**
+ * DRM_EVENT_VBLANK - vertical blanking event
+ *
+ * This event is sent in response to &DRM_IOCTL_WAIT_VBLANK with the
+ * &_DRM_VBLANK_EVENT flag set.
+ *
+ * The event payload is a struct drm_event_vblank.
+ */
+#define DRM_EVENT_VBLANK 0x01
+/**
+ * DRM_EVENT_FLIP_COMPLETE - page-flip completion event
+ *
+ * This event is sent in response to an atomic commit or legacy page-flip with
+ * the &DRM_MODE_PAGE_FLIP_EVENT flag set.
+ *
+ * The event payload is a struct drm_event_vblank.
+ */
+#define DRM_EVENT_FLIP_COMPLETE 0x02
+/**
+ * DRM_EVENT_CRTC_SEQUENCE - CRTC sequence event
+ *
+ * This event is sent in response to &DRM_IOCTL_CRTC_QUEUE_SEQUENCE.
+ *
+ * The event payload is a struct drm_event_crtc_sequence.
+ */
+#define DRM_EVENT_CRTC_SEQUENCE	0x03
+
+struct drm_event_vblank {
+	struct drm_event base;
+	__u64 user_data;
+	__u32 tv_sec;
+	__u32 tv_usec;
+	__u32 sequence;
+	__u32 crtc_id; /* 0 on older kernels that do not support this */
+};
+
+/* Event delivered at sequence. Time stamp marks when the first pixel
+ * of the refresh cycle leaves the display engine for the display
+ */
+struct drm_event_crtc_sequence {
+	struct drm_event	base;
+	__u64			user_data;
+	__s64			time_ns;
+	__u64			sequence;
+};
+
+/* typedef area */
+typedef struct drm_clip_rect drm_clip_rect_t;
+typedef struct drm_drawable_info drm_drawable_info_t;
+typedef struct drm_tex_region drm_tex_region_t;
+typedef struct drm_hw_lock drm_hw_lock_t;
+typedef struct drm_version drm_version_t;
+typedef struct drm_unique drm_unique_t;
+typedef struct drm_list drm_list_t;
+typedef struct drm_block drm_block_t;
+typedef struct drm_control drm_control_t;
+typedef enum drm_map_type drm_map_type_t;
+typedef enum drm_map_flags drm_map_flags_t;
+typedef struct drm_ctx_priv_map drm_ctx_priv_map_t;
+typedef struct drm_map drm_map_t;
+typedef struct drm_client drm_client_t;
+typedef enum drm_stat_type drm_stat_type_t;
+typedef struct drm_stats drm_stats_t;
+typedef enum drm_lock_flags drm_lock_flags_t;
+typedef struct drm_lock drm_lock_t;
+typedef enum drm_dma_flags drm_dma_flags_t;
+typedef struct drm_buf_desc drm_buf_desc_t;
+typedef struct drm_buf_info drm_buf_info_t;
+typedef struct drm_buf_free drm_buf_free_t;
+typedef struct drm_buf_pub drm_buf_pub_t;
+typedef struct drm_buf_map drm_buf_map_t;
+typedef struct drm_dma drm_dma_t;
+typedef union drm_wait_vblank drm_wait_vblank_t;
+typedef struct drm_agp_mode drm_agp_mode_t;
+typedef enum drm_ctx_flags drm_ctx_flags_t;
+typedef struct drm_ctx drm_ctx_t;
+typedef struct drm_ctx_res drm_ctx_res_t;
+typedef struct drm_draw drm_draw_t;
+typedef struct drm_update_draw drm_update_draw_t;
+typedef struct drm_auth drm_auth_t;
+typedef struct drm_irq_busid drm_irq_busid_t;
+typedef enum drm_vblank_seq_type drm_vblank_seq_type_t;
+
+typedef struct drm_agp_buffer drm_agp_buffer_t;
+typedef struct drm_agp_binding drm_agp_binding_t;
+typedef struct drm_agp_info drm_agp_info_t;
+typedef struct drm_scatter_gather drm_scatter_gather_t;
+typedef struct drm_set_version drm_set_version_t;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h b/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h
new file mode 100644
index 000000000..9debb320c
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright 2013 Red Hat
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef VIRTGPU_DRM_H
+#define VIRTGPU_DRM_H
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Please note that modifications to all structs defined here are
+ * subject to backwards-compatibility constraints.
+ *
+ * Do not use pointers, use __u64 instead for 32 bit / 64 bit user/kernel
+ * compatibility. Keep fields aligned to their size.
+ */
+
+#define DRM_VIRTGPU_MAP         0x01
+#define DRM_VIRTGPU_EXECBUFFER  0x02
+#define DRM_VIRTGPU_GETPARAM    0x03
+#define DRM_VIRTGPU_RESOURCE_CREATE 0x04
+#define DRM_VIRTGPU_RESOURCE_INFO     0x05
+#define DRM_VIRTGPU_TRANSFER_FROM_HOST 0x06
+#define DRM_VIRTGPU_TRANSFER_TO_HOST 0x07
+#define DRM_VIRTGPU_WAIT     0x08
+#define DRM_VIRTGPU_GET_CAPS  0x09
+#define DRM_VIRTGPU_RESOURCE_CREATE_BLOB 0x0a
+#define DRM_VIRTGPU_CONTEXT_INIT 0x0b
+
+#define VIRTGPU_EXECBUF_FENCE_FD_IN	0x01
+#define VIRTGPU_EXECBUF_FENCE_FD_OUT	0x02
+#define VIRTGPU_EXECBUF_RING_IDX	0x04
+#define VIRTGPU_EXECBUF_FLAGS  (\
+		VIRTGPU_EXECBUF_FENCE_FD_IN |\
+		VIRTGPU_EXECBUF_FENCE_FD_OUT |\
+		VIRTGPU_EXECBUF_RING_IDX |\
+		0)
+
+struct drm_virtgpu_map {
+	__u64 offset; /* use for mmap system call */
+	__u32 handle;
+	__u32 pad;
+};
+
+#define VIRTGPU_EXECBUF_SYNCOBJ_RESET		0x01
+#define VIRTGPU_EXECBUF_SYNCOBJ_FLAGS ( \
+		VIRTGPU_EXECBUF_SYNCOBJ_RESET | \
+		0)
+struct drm_virtgpu_execbuffer_syncobj {
+	__u32 handle;
+	__u32 flags;
+	__u64 point;
+};
+
+/* fence_fd is modified on success if VIRTGPU_EXECBUF_FENCE_FD_OUT flag is set. */
+struct drm_virtgpu_execbuffer {
+	__u32 flags;
+	__u32 size;
+	__u64 command; /* void* */
+	__u64 bo_handles;
+	__u32 num_bo_handles;
+	__s32 fence_fd; /* in/out fence fd (see VIRTGPU_EXECBUF_FENCE_FD_IN/OUT) */
+	__u32 ring_idx; /* command ring index (see VIRTGPU_EXECBUF_RING_IDX) */
+	__u32 syncobj_stride; /* size of @drm_virtgpu_execbuffer_syncobj */
+	__u32 num_in_syncobjs;
+	__u32 num_out_syncobjs;
+	__u64 in_syncobjs;
+	__u64 out_syncobjs;
+};
+
+#define VIRTGPU_PARAM_3D_FEATURES 1 /* do we have 3D features in the hw */
+#define VIRTGPU_PARAM_CAPSET_QUERY_FIX 2 /* do we have the capset fix */
+#define VIRTGPU_PARAM_RESOURCE_BLOB 3 /* DRM_VIRTGPU_RESOURCE_CREATE_BLOB */
+#define VIRTGPU_PARAM_HOST_VISIBLE 4 /* Host blob resources are mappable */
+#define VIRTGPU_PARAM_CROSS_DEVICE 5 /* Cross virtio-device resource sharing  */
+#define VIRTGPU_PARAM_CONTEXT_INIT 6 /* DRM_VIRTGPU_CONTEXT_INIT */
+#define VIRTGPU_PARAM_SUPPORTED_CAPSET_IDs 7 /* Bitmask of supported capability set ids */
+#define VIRTGPU_PARAM_EXPLICIT_DEBUG_NAME 8 /* Ability to set debug name from userspace */
+
+struct drm_virtgpu_getparam {
+	__u64 param;
+	__u64 value;
+};
+
+/* NO_BO flags? NO resource flag? */
+/* resource flag for y_0_top */
+struct drm_virtgpu_resource_create {
+	__u32 target;
+	__u32 format;
+	__u32 bind;
+	__u32 width;
+	__u32 height;
+	__u32 depth;
+	__u32 array_size;
+	__u32 last_level;
+	__u32 nr_samples;
+	__u32 flags;
+	__u32 bo_handle; /* if this is set - recreate a new resource attached to this bo ? */
+	__u32 res_handle;  /* returned by kernel */
+	__u32 size;        /* validate transfer in the host */
+	__u32 stride;      /* validate transfer in the host */
+};
+
+struct drm_virtgpu_resource_info {
+	__u32 bo_handle;
+	__u32 res_handle;
+	__u32 size;
+	__u32 blob_mem;
+};
+
+struct drm_virtgpu_3d_box {
+	__u32 x;
+	__u32 y;
+	__u32 z;
+	__u32 w;
+	__u32 h;
+	__u32 d;
+};
+
+struct drm_virtgpu_3d_transfer_to_host {
+	__u32 bo_handle;
+	struct drm_virtgpu_3d_box box;
+	__u32 level;
+	__u32 offset;
+	__u32 stride;
+	__u32 layer_stride;
+};
+
+struct drm_virtgpu_3d_transfer_from_host {
+	__u32 bo_handle;
+	struct drm_virtgpu_3d_box box;
+	__u32 level;
+	__u32 offset;
+	__u32 stride;
+	__u32 layer_stride;
+};
+
+#define VIRTGPU_WAIT_NOWAIT 1 /* like it */
+struct drm_virtgpu_3d_wait {
+	__u32 handle; /* 0 is an invalid handle */
+	__u32 flags;
+};
+
+#define VIRTGPU_DRM_CAPSET_VIRGL 1
+#define VIRTGPU_DRM_CAPSET_VIRGL2 2
+#define VIRTGPU_DRM_CAPSET_GFXSTREAM_VULKAN 3
+#define VIRTGPU_DRM_CAPSET_VENUS 4
+#define VIRTGPU_DRM_CAPSET_CROSS_DOMAIN 5
+#define VIRTGPU_DRM_CAPSET_DRM 6
+struct drm_virtgpu_get_caps {
+	__u32 cap_set_id;
+	__u32 cap_set_ver;
+	__u64 addr;
+	__u32 size;
+	__u32 pad;
+};
+
+struct drm_virtgpu_resource_create_blob {
+#define VIRTGPU_BLOB_MEM_GUEST             0x0001
+#define VIRTGPU_BLOB_MEM_HOST3D            0x0002
+#define VIRTGPU_BLOB_MEM_HOST3D_GUEST      0x0003
+
+#define VIRTGPU_BLOB_FLAG_USE_MAPPABLE     0x0001
+#define VIRTGPU_BLOB_FLAG_USE_SHAREABLE    0x0002
+#define VIRTGPU_BLOB_FLAG_USE_CROSS_DEVICE 0x0004
+	/* zero is invalid blob_mem */
+	__u32 blob_mem;
+	__u32 blob_flags;
+	__u32 bo_handle;
+	__u32 res_handle;
+	__u64 size;
+
+	/*
+	 * for 3D contexts with VIRTGPU_BLOB_MEM_HOST3D_GUEST and
+	 * VIRTGPU_BLOB_MEM_HOST3D; otherwise, must be zero.
+	 */
+	__u32 pad;
+	__u32 cmd_size;
+	__u64 cmd;
+	__u64 blob_id;
+};
+
+#define VIRTGPU_CONTEXT_PARAM_CAPSET_ID       0x0001
+#define VIRTGPU_CONTEXT_PARAM_NUM_RINGS       0x0002
+#define VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK 0x0003
+#define VIRTGPU_CONTEXT_PARAM_DEBUG_NAME      0x0004
+struct drm_virtgpu_context_set_param {
+	__u64 param;
+	__u64 value;
+};
+
+struct drm_virtgpu_context_init {
+	__u32 num_params;
+	__u32 pad;
+
+	/* pointer to drm_virtgpu_context_set_param array */
+	__u64 ctx_set_params;
+};
+
+/*
+ * Event code that's given when VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK is in
+ * effect.  The event size is sizeof(drm_event), since there is no additional
+ * payload.
+ */
+#define VIRTGPU_EVENT_FENCE_SIGNALED 0x90000000
+
+#define DRM_IOCTL_VIRTGPU_MAP \
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, struct drm_virtgpu_map)
+
+#define DRM_IOCTL_VIRTGPU_EXECBUFFER \
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_EXECBUFFER,\
+		struct drm_virtgpu_execbuffer)
+
+#define DRM_IOCTL_VIRTGPU_GETPARAM \
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GETPARAM,\
+		struct drm_virtgpu_getparam)
+
+#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE			\
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE,	\
+		struct drm_virtgpu_resource_create)
+
+#define DRM_IOCTL_VIRTGPU_RESOURCE_INFO \
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_INFO, \
+		 struct drm_virtgpu_resource_info)
+
+#define DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST \
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_FROM_HOST,	\
+		struct drm_virtgpu_3d_transfer_from_host)
+
+#define DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST \
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_TO_HOST,	\
+		struct drm_virtgpu_3d_transfer_to_host)
+
+#define DRM_IOCTL_VIRTGPU_WAIT				\
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_WAIT,	\
+		struct drm_virtgpu_3d_wait)
+
+#define DRM_IOCTL_VIRTGPU_GET_CAPS \
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GET_CAPS, \
+	struct drm_virtgpu_get_caps)
+
+#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB				\
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE_BLOB,	\
+		struct drm_virtgpu_resource_create_blob)
+
+#define DRM_IOCTL_VIRTGPU_CONTEXT_INIT					\
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_CONTEXT_INIT,		\
+		struct drm_virtgpu_context_init)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/ggml/src/ggml-remotingfrontend/include/venus_hw.h b/ggml/src/ggml-remotingfrontend/include/venus_hw.h
new file mode 100644
index 000000000..3ef774b82
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/include/venus_hw.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2020 Chromium
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef VENUS_HW_H
+#define VENUS_HW_H
+
+#include <stdint.h>
+
+struct virgl_renderer_capset_venus {
+   uint32_t wire_format_version;
+   uint32_t vk_xml_version;
+   uint32_t vk_ext_command_serialization_spec_version;
+   uint32_t vk_mesa_venus_protocol_spec_version;
+
+   /* This flag indicates render server config, and will be needed until drm
+    * virtio-gpu blob mem gets fixed to attach_resource before resource_map.
+    */
+   uint32_t supports_blob_id_0;
+
+   /* Extension number N, where N is defined by the Vulkan spec, corresponds
+    * to bit [N / 32] & (1 << N % 32). The below mask1 covers the first 1023
+    * Vulkan extensions (numbered from 1 to 1023).
+    *
+    * Bit (mask1[0] & 0x1) is used for backward compatibility purpose. When
+    * that bit is set, the extension mask(s) are valid. Otherwise, all the
+    * extensions are assumed to be supported by the renderer side protocol.
+    */
+   uint32_t vk_extension_mask1[32];
+
+   /* The single-threaded renderer cannot afford potential blocking calls. It
+    * also leads to GPU lost if the wait depends on a following command. This
+    * capset allows such blocking calls to passthrough from the clients, and
+    * shifts the responsibilities to the client drivers.
+    */
+   uint32_t allow_vk_wait_syncs;
+
+   /* This flag indicates that the renderer supports multiple fencing
+    * timelines. The client driver is expected to associate each VkQueue with
+    * one of these timelines at queue creation by binding it with an unused
+    * ring_idx. Queues created without a ring_idx binding are associated to a
+    * shared legacy timeline. The special ring_idx==0 is reserved for CPU
+    * fences that are signaled by the renderer immediately upon consumption of
+    * the associated renderer submission.
+    */
+   uint32_t supports_multiple_timelines;
+
+   /* This flag indicates to the guest that hypervisor does not support memory
+    * pages injections and blob allocations must be done by guest from the
+    * dedicated heap (Host visible memory).
+    */
+   uint32_t use_guest_vram;
+};
+
+#endif /* VENUS_HW_H */
diff --git a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp
new file mode 100644
index 000000000..7ce0dbb7f
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp
@@ -0,0 +1,87 @@
+#include <vector>
+#include <unordered_set>
+#include <unordered_map>
+#include <cinttypes>
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "../ggml-remotingbackend/shared/venus_cs_ggml-rpc.h"
+
+#include "ggml-remoting.h"
+
+// Flatten `tensor` into its rpc_tensor wire form. Pointers (the tensor itself, src[],
+// view_src) are stored as 64-bit integer ids and `data` as an offset from the buffer
+// base address. FATALs if the tensor has data but no buffer.
+rpc_tensor
+serialize_tensor(const ggml_tensor * tensor) {
+  rpc_tensor result = {};  // zero-init: the struct is copied out byte-wise, leave nothing indeterminate
+  result.id = reinterpret_cast<uint64_t>(tensor);
+  result.type = tensor->type;
+  if (tensor->buffer) {
+    ggml_backend_buffer_t buffer = tensor->buffer;
+    result.buffer = BUFFER_TO_HOST_HANDLE(buffer);
+  }  // otherwise result.buffer keeps its zero value
+  for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+    result.ne[i] = tensor->ne[i];
+    result.nb[i] = tensor->nb[i];
+  }
+  result.op = tensor->op;
+  for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+    result.op_params[i] = tensor->op_params[i];
+  }
+  result.flags = tensor->flags;
+  for (uint32_t i = 0; i < GGML_MAX_SRC; i++) {
+    result.src[i] = reinterpret_cast<uint64_t>(tensor->src[i]);
+  }
+  result.view_src = reinterpret_cast<uint64_t>(tensor->view_src);
+  result.view_offs = tensor->view_offs;
+  result.data = reinterpret_cast<uint64_t>(tensor->data);
+  if (tensor->data) {
+    if (!tensor->buffer) {
+      FATAL("tensor has data but not buffer :/");
+    }
+    // tensor->data is serialized as an offset to the buffer base address
+    result.data -= reinterpret_cast<uint64_t>(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base);
+  }
+  snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name);
+  return result;
+}
+
+// Post-order walk: record `tensor` after its src[] entries and its view_src,
+// each tensor at most once (tracked in `visited`); a null tensor is ignored.
+void
+add_tensor(ggml_tensor * tensor, std::vector<rpc_tensor> & tensors, std::unordered_set<ggml_tensor*> & visited) {
+  // insert().second is false when the tensor had already been visited
+  if (!tensor || !visited.insert(tensor).second) {
+    return;
+  }
+  for (int s = 0; s < GGML_MAX_SRC; ++s) {
+    add_tensor(tensor->src[s], tensors, visited);
+  }
+  add_tensor(tensor->view_src, tensors, visited);
+  // dependencies are in place, now append the tensor itself
+  tensors.push_back(serialize_tensor(tensor));
+}
+
+// Serialize `cgraph` (its node pointer values plus every tensor reachable from them) into `output`:
+// | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t)) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
+void
+serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output) {
+  uint32_t n_nodes = cgraph->n_nodes;
+  std::vector<rpc_tensor> tensors;
+  std::unordered_set<ggml_tensor*> visited;
+  for (uint32_t i = 0; i < n_nodes; i++) {
+    add_tensor(cgraph->nodes[i], tensors, visited);
+  }
+  uint32_t n_tensors = tensors.size();
+  const size_t nodes_off = sizeof(n_nodes);  // offsets/sizes in size_t: an int total could overflow
+  const size_t n_tensors_off = nodes_off + n_nodes * sizeof(uint64_t);
+  const size_t tensors_off = n_tensors_off + sizeof(n_tensors);
+  output.resize(tensors_off + n_tensors * sizeof(rpc_tensor), 0);
+  memcpy(output.data(), &n_nodes, sizeof(n_nodes));
+  for (uint32_t i = 0; i < n_nodes; i++) {
+    memcpy(output.data() + nodes_off + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
+  }
+  memcpy(output.data() + n_tensors_off, &n_tensors, sizeof(n_tensors));
+  memcpy(output.data() + tensors_off, tensors.data(), n_tensors * sizeof(rpc_tensor));
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp
new file mode 100644
index 000000000..8a7c9bea6
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp
@@ -0,0 +1,54 @@
+#include "virtgpu-forward-impl.h"
+
+static long long current_time_ms() {  // wall-clock time in milliseconds since the epoch
+  struct timespec ts;
+  clock_gettime(CLOCK_REALTIME, &ts);  // Use CLOCK_MONOTONIC for elapsed time
+  return (long long)ts.tv_sec * 1000LL + ts.tv_nsec / 1000000LL;  // s -> ms, ns -> ms
+}
+
+// Forward a graph computation: serialize `cgraph`, stage the bytes in guest-host shared
+// memory (gpu->data_shmem, or a temporary shmem when the graph does not fit), issue the
+// remote call and return the decoded status (GGML_STATUS_ABORTED if none is decoded).
+ggml_status
+apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) {
+  struct vn_cs_encoder *encoder;
+  struct vn_cs_decoder *decoder;
+  ApirForwardReturnCode ret;
+
+  REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE);
+
+  std::vector<uint8_t> cgraph_data;
+  size_t cgraph_size = vn_serialize_ggml_cgraph(cgraph, cgraph_data);
+
+  struct vn_renderer_shmem *shmem;
+  if (cgraph_size > gpu->data_shmem->mmap_size) {
+    shmem = virtgpu_shmem_create(gpu, cgraph_size);
+    WARNING("%s: 0x%zx | %zukB | %zuMB", __func__, cgraph_size, cgraph_size/1024, cgraph_size/1024/1024);  // size_t needs %z
+    if (!shmem) {
+      FATAL("Couldn't allocate the guest-host shared buffer :/");
+    }
+  } else {
+    shmem = gpu->data_shmem;
+  }
+
+  vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id);
+  vn_encode_size_t(encoder, &cgraph_size);
+
+  char *shmem_data = (char *) shmem->mmap_ptr;
+  struct vn_cs_encoder secondary_enc = vn_cs_new_encoder(shmem_data, cgraph_size);
+
+  vn_encode_cgraph_data(&secondary_enc, cgraph_data);
+
+  REMOTE_CALL(gpu, encoder, decoder, ret);
+
+  ggml_status status = GGML_STATUS_ABORTED;
+  vn_decode_ggml_status(decoder, &status);
+
+  remote_call_finish(gpu, encoder, decoder);
+
+  if (shmem != gpu->data_shmem) {
+    virtgpu_shmem_destroy(gpu, shmem->shmem);
+  }
+
+  return status;
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp
new file mode 100644
index 000000000..4b635f21a
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp
@@ -0,0 +1,119 @@
+#include "virtgpu-forward-impl.h"
+
+// Query the host for the name of buffer type `buft` and return it as a C string.
+// NOTE(review): the string is allocated through the decoder; confirm it stays valid
+// after remote_call_finish().
+const char *
+apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) {
+  struct vn_cs_encoder *encoder;
+  struct vn_cs_decoder *decoder;
+  ApirForwardReturnCode ret;
+
+  // request: prepare the GET_NAME command, then encode the buffer type
+  REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME);
+  vn_encode_ggml_buffer_type(encoder, buft);
+
+  REMOTE_CALL(gpu, encoder, decoder, ret);
+
+  // reply: the array length first, then the characters themselves
+  const size_t name_len = vn_decode_array_size_unchecked(decoder);
+  char *name = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), name_len);
+  if (!name) {
+    FATAL("%s: Could not allocate the device name buffer", __func__);
+  }
+  vn_decode_char_array(decoder, name, name_len);
+
+  remote_call_finish(gpu, encoder, decoder);
+
+  return name;
+}
+
+// Ask the host for the alignment of buffer type `buft`.
+// Returns the decoded value, or 0 if the decoder leaves it untouched.
+size_t
+apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) {
+  struct vn_cs_encoder *encoder;
+  struct vn_cs_decoder *decoder;
+  ApirForwardReturnCode ret;
+
+  REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT);
+  vn_encode_ggml_buffer_type(encoder, buft);
+
+  REMOTE_CALL(gpu, encoder, decoder, ret);
+
+  size_t alignment = 0;  // never return an indeterminate value if decoding writes nothing
+  vn_decode_size_t(decoder, &alignment);
+  INFO("%s: Forward BUFT ALIGNMENT --> %zu ", __func__, alignment);
+
+  remote_call_finish(gpu, encoder, decoder);
+
+  return alignment;
+}
+
+// Ask the host for the maximum allocation size of buffer type `buft`.
+// Returns the decoded value, or 0 if the decoder leaves it untouched.
+size_t
+apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) {
+  struct vn_cs_encoder *encoder;
+  struct vn_cs_decoder *decoder;
+  ApirForwardReturnCode ret;
+
+  REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE);
+  vn_encode_ggml_buffer_type(encoder, buft);
+
+  REMOTE_CALL(gpu, encoder, decoder, ret);
+
+  size_t max_size = 0;  // never return an indeterminate value if decoding writes nothing
+  vn_decode_size_t(decoder, &max_size);
+  INFO("%s: Forward BUFT MAX SIZE --> %zu ", __func__, max_size);
+
+  remote_call_finish(gpu, encoder, decoder);
+
+  return max_size;
+}
+
+// Ask the host whether buffer type `buft` is a host buffer type.
+// Returns the decoded flag, or false if the decoder leaves it untouched.
+bool
+apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) {
+  struct vn_cs_encoder *encoder;
+  struct vn_cs_decoder *decoder;
+  ApirForwardReturnCode ret;
+
+  REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST);
+  vn_encode_ggml_buffer_type(encoder, buft);
+
+  REMOTE_CALL(gpu, encoder, decoder, ret);
+
+  bool is_host = false;  // never return an indeterminate value if decoding writes nothing
+  vn_decode_bool_t(decoder, &is_host);
+  INFO("%s: buffer is host? %d", __func__, is_host);
+
+  remote_call_finish(gpu, encoder, decoder);
+
+  return is_host;
+}
+
+// Forward an ALLOC_BUFFER request of `size` bytes for buffer type `buft`.
+// Returns a buffer context whose host_handle is the decoded reply; other fields are zero.
+apir_buffer_context_t
+apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buft, size_t size) {
+  struct vn_cs_encoder *encoder;
+  struct vn_cs_decoder *decoder;
+  ApirForwardReturnCode ret;
+
+  apir_buffer_context_t buffer_context = {};  // returned by value: leave no field indeterminate
+  INFO("%s: allocate device memory (%zu)", __func__,  size);
+
+  REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER);
+
+  vn_encode_ggml_buffer_type(encoder, buft);
+  vn_encode_size_t(encoder, &size);
+
+  REMOTE_CALL(gpu, encoder, decoder, ret);
+
+  vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle);
+  remote_call_finish(gpu, encoder, decoder);
+
+  return buffer_context;
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp
new file mode 100644
index 000000000..cf160b133
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp
@@ -0,0 +1,148 @@
+#include "virtgpu-forward-impl.h"
+
+/* Fetch the base address the host reports for the buffer behind `buffer_context`. */
+void *
+apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_context_t *buffer_context) {
+  struct vn_cs_encoder *enc;
+  struct vn_cs_decoder *dec;
+  ApirForwardReturnCode rc;
+
+  REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_BUFFER_GET_BASE);
+  vn_encode_apir_buffer_host_handle_t(enc, &buffer_context->host_handle);
+
+  REMOTE_CALL(gpu, enc, dec, rc);
+
+  /* The reply carries the address as an integer; hand it back as a pointer. */
+  uintptr_t base_addr;
+  vn_decode_uintptr_t(dec, &base_addr);
+
+  remote_call_finish(gpu, enc, dec);
+  return (void *) base_addr;
+}
+
+/*
+ * Forward a set_tensor call: ship `size` bytes from `data` to the host,
+ * together with `tensor` and `offset`.
+ * The payload travels through guest-host shared memory: the preallocated
+ * gpu->data_shmem when it is large enough, otherwise a temporary region that
+ * is destroyed once the call has completed.
+ */
+void
+apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context,
+		       ggml_tensor *tensor, const void *data, size_t offset, size_t size) {
+  struct vn_cs_encoder *enc;
+  struct vn_cs_decoder *dec;
+  ApirForwardReturnCode rc;
+
+  REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR);
+
+  vn_encode_apir_buffer_host_handle_t(enc, &buffer_context->host_handle);
+  vn_encode_ggml_tensor(enc, tensor);
+
+  /* Pick the staging area for the tensor bytes. */
+  struct vn_renderer_shmem *staging = gpu->data_shmem;
+  if (size > staging->mmap_size) {
+    staging = virtgpu_shmem_create(gpu, size);
+    if (!staging) {
+      FATAL("Couldn't allocate the guest-host shared buffer :/");
+    }
+  }
+
+  /* Stage the data, then tell the host where to find it. */
+  memcpy(staging->mmap_ptr, data, size);
+  vn_encode_virtgpu_shmem_res_id(enc, staging->res_id);
+  vn_encode_size_t(enc, &offset);
+  vn_encode_size_t(enc, &size);
+
+  REMOTE_CALL(gpu, enc, dec, rc);
+
+  remote_call_finish(gpu, enc, dec);
+
+  /* Only a temporary region is ours to release. */
+  if (staging != gpu->data_shmem) {
+    virtgpu_shmem_destroy(gpu, staging->shmem);
+  }
+}
+
+#if false
+void
+apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context,
+		       const ggml_tensor *tensor, void *data, size_t offset, size_t size) {
+  UNUSED(gpu);
+  UNUSED(tensor);
+  char *buffer_base_addr = (char *) buffer_context->shmem->mmap_ptr;
+
+  memcpy(data, buffer_base_addr+offset, size);
+}
+#else
+/* Forward a get_tensor call and copy `size` bytes from the shared staging
+ * region into `data` once the call has returned. */
+void
+apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context,
+		       const ggml_tensor *tensor, void *data, size_t offset, size_t size) {
+  struct vn_cs_encoder *encoder;
+  struct vn_cs_decoder *decoder;
+  ApirForwardReturnCode ret;
+
+  REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR);
+  vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle);
+  vn_encode_ggml_tensor(encoder, tensor);
+
+  struct vn_renderer_shmem *shmem;
+  if (size > gpu->data_shmem->mmap_size) {
+    shmem = virtgpu_shmem_create(gpu, size);
+    /* %zx/%zu match size_t and avoid truncating sizes >= 2 GiB through int. */
+    WARNING("%s: 0x%zx | %zukB | %zuMB", __func__, size, size / 1024, size / 1024 / 1024);
+    if (!shmem) {
+      FATAL("Couldn't allocate the guest-host shared buffer :/");
+    }
+  } else {
+    shmem = gpu->data_shmem;
+  }
+
+  vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id);
+  vn_encode_size_t(encoder, &offset);
+  vn_encode_size_t(encoder, &size);
+
+  REMOTE_CALL(gpu, encoder, decoder, ret);
+  memcpy(data, shmem->mmap_ptr, size); /* read back what the call left in the staging area */
+  remote_call_finish(gpu, encoder, decoder);
+
+  if (shmem != gpu->data_shmem) {
+    virtgpu_shmem_destroy(gpu, shmem->shmem);
+  }
+}
+#endif
+
+/* Forward BUFFER_CLEAR for the buffer behind `buffer_context`, passing `value` along. */
+void
+apir_buffer_clear(struct virtgpu *gpu, apir_buffer_context_t *buffer_context,
+		  uint8_t value) {
+  struct vn_cs_encoder *enc;
+  struct vn_cs_decoder *dec;
+  ApirForwardReturnCode rc;
+
+  REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_BUFFER_CLEAR);
+  vn_encode_apir_buffer_host_handle_t(enc, &buffer_context->host_handle);
+  vn_encode_uint8_t(enc, &value);
+
+  /* No reply payload is decoded for this command. */
+  REMOTE_CALL(gpu, enc, dec, rc);
+  remote_call_finish(gpu, enc, dec);
+}
+
+
+/* Forward BUFFER_FREE_BUFFER for the buffer identified by `buffer_context->host_handle`. */
+void
+apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_context_t *buffer_context) {
+  struct vn_cs_encoder *enc;
+  struct vn_cs_decoder *dec;
+  ApirForwardReturnCode rc;
+
+  REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER);
+  vn_encode_apir_buffer_host_handle_t(enc, &buffer_context->host_handle);
+
+  REMOTE_CALL(gpu, enc, dec, rc);
+
+  remote_call_finish(gpu, enc, dec);
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp
new file mode 100644
index 000000000..e0b5745bf
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp
@@ -0,0 +1,238 @@
+#include "virtgpu-forward-impl.h"
+
+/* Return the device count reported by the host. A reply other than -1 is kept
+ * in a function-local static and reused by later calls. */
+int
+apir_device_get_count(struct virtgpu *gpu) {
+  static int32_t cached_count = -1;
+
+  if (cached_count == -1) {
+    struct vn_cs_encoder *enc;
+    struct vn_cs_decoder *dec;
+    ApirForwardReturnCode rc;
+
+    REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_DEVICE_GET_COUNT);
+    REMOTE_CALL(gpu, enc, dec, rc);
+
+    vn_decode_int32_t(dec, &cached_count);
+    INFO("%s: Forward DEV COUNT --> %d ", __func__, cached_count);
+    remote_call_finish(gpu, enc, dec);
+  } else {
+    CACHED;
+  }
+
+  return cached_count;
+}
+
+/* Return the device name reported by the host; the pointer is cached in a
+ * function-local static. NOTE(review): the storage comes from the decoder --
+ * confirm it remains valid after remote_call_finish(). */
+const char *
+apir_device_get_name(struct virtgpu *gpu) {
+  static char *cached_name = nullptr;
+  if (cached_name) {
+    CACHED;
+    return cached_name;
+  }
+  struct vn_cs_encoder *enc;
+  struct vn_cs_decoder *dec;
+  ApirForwardReturnCode rc;
+  REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_DEVICE_GET_NAME);
+  REMOTE_CALL(gpu, enc, dec, rc);
+
+  const size_t name_len = vn_decode_array_size_unchecked(dec);
+  cached_name = (char *) vn_cs_decoder_alloc_array(dec, sizeof(char), name_len);
+  if (!cached_name) {
+    FATAL("%s: Could not allocate the device name buffer", __func__);
+  }
+  vn_decode_char_array(dec, cached_name, name_len);
+  INFO("%s: Forward DEV NAME --> %s", __func__, cached_name);
+
+  remote_call_finish(gpu, enc, dec);
+  return cached_name;
+}
+
+/* Return the device description reported by the host (queried on every call).
+ * The returned storage is obtained from the reply decoder. */
+const char *
+apir_device_get_description(struct virtgpu *gpu) {
+  struct vn_cs_encoder *enc;
+  struct vn_cs_decoder *dec;
+  ApirForwardReturnCode rc;
+
+  REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION);
+  REMOTE_CALL(gpu, enc, dec, rc);
+
+  const size_t descr_len = vn_decode_array_size_unchecked(dec);
+  char *descr = (char *) vn_cs_decoder_alloc_array(dec, sizeof(char), descr_len);
+  if (!descr) {
+    FATAL("%s: Could not allocate the device description buffer", __func__);
+  }
+  vn_decode_char_array(dec, descr, descr_len);
+
+  INFO("%s: Forward DEV DESCR --> %s", __func__, descr);
+
+  remote_call_finish(gpu, enc, dec);
+  return descr;
+}
+
+/* Return the device type reported by the host; any answer other than the
+ * 255 sentinel is cached in a function-local static. */
+uint32_t
+apir_device_get_type(struct virtgpu *gpu) {
+  static uint32_t dev_type = 255;
+  if (dev_type != 255) {
+    CACHED;
+    return dev_type;
+  }
+
+  struct vn_cs_encoder *encoder;
+  struct vn_cs_decoder *decoder;
+  ApirForwardReturnCode ret;
+
+  REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_TYPE);
+  REMOTE_CALL(gpu, encoder, decoder, ret);
+
+  vn_decode_uint32_t(decoder, &dev_type);
+  /* dev_type is unsigned: print it with %u rather than %d. */
+  INFO("%s: Forward DEV TYPE --> %u ", __func__, (unsigned) dev_type);
+
+  remote_call_finish(gpu, encoder, decoder);
+  return dev_type;
+}
+
+/*
+ * Query the host for the device's free and total memory and store the two
+ * decoded values in *free and *total.
+ *
+ * The host is asked on every call: a cached variant was disabled because it is
+ * unclear whether llama.cpp expects fresh information for the free memory.
+ * The static locals therefore only hold the most recent reply.
+ */
+void
+apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) {
+  static size_t dev_free = 0;
+  static size_t dev_total = 0;
+
+  struct vn_cs_encoder *enc;
+  struct vn_cs_decoder *dec;
+  ApirForwardReturnCode rc;
+
+  REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_DEVICE_GET_MEMORY);
+
+  REMOTE_CALL(gpu, enc, dec, rc);
+
+  /* Reply order: free, then total. */
+  vn_decode_size_t(dec, &dev_free);
+  vn_decode_size_t(dec, &dev_total);
+
+  *free = dev_free;
+  *total = dev_total;
+
+  /* Logged after dividing by 1024 * 1024. */
+  INFO("%s: Forward DEV FREE  mem --> %zu MB", __func__, dev_free / 1024 / 1024);
+  INFO("%s: Forward DEV TOTAL mem --> %zu MB", __func__, dev_total / 1024 / 1024);
+
+  remote_call_finish(gpu, enc, dec);
+
+  return;
+}
+
+/* Forward DEVICE_SUPPORTS_OP for `op` (sent with the inline tensor encoder) and return the reply. */
+bool
+apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) {
+  struct vn_cs_encoder *enc;
+  struct vn_cs_decoder *dec;
+  ApirForwardReturnCode rc;
+
+  REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP);
+  vn_encode_ggml_tensor_inline(enc, op);
+
+  REMOTE_CALL(gpu, enc, dec, rc);
+
+  bool op_supported;
+  vn_decode_bool_t(dec, &op_supported);
+
+  remote_call_finish(gpu, enc, dec);
+
+  return op_supported;
+}
+
+/* Forward DEVICE_GET_BUFFER_TYPE and return the buffer-type handle decoded from the reply. */
+apir_buffer_type_host_handle_t
+apir_device_get_buffer_type(struct virtgpu *gpu) {
+  struct vn_cs_encoder *enc;
+  struct vn_cs_decoder *dec;
+  ApirForwardReturnCode rc;
+
+  REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE);
+  REMOTE_CALL(gpu, enc, dec, rc);
+
+  apir_buffer_type_host_handle_t handle;
+  vn_decode_apir_buffer_type_host_handle_t(dec, &handle);
+
+  remote_call_finish(gpu, enc, dec);
+
+  return handle;
+}
+
+/*
+ * Fetch four boolean device properties from the host. They are decoded from
+ * the reply in this order: async, host_buffer, buffer_from_host_ptr, events.
+ */
+void
+apir_device_get_props(struct virtgpu *gpu,
+		      bool *async,
+		      bool *host_buffer,
+		      bool *buffer_from_host_ptr,
+		      bool *events) {
+  struct vn_cs_encoder *enc;
+  struct vn_cs_decoder *dec;
+  ApirForwardReturnCode rc;
+
+  REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_DEVICE_GET_PROPS);
+  REMOTE_CALL(gpu, enc, dec, rc);
+
+  bool *const outputs[] = { async, host_buffer, buffer_from_host_ptr, events };
+  for (bool *out : outputs) {
+    vn_decode_bool_t(dec, out);
+  }
+
+  remote_call_finish(gpu, enc, dec);
+}
+
+/*
+ * Create a guest-host shared memory region of `size` bytes and forward
+ * DEVICE_BUFFER_FROM_PTR for it. The returned context holds that region plus
+ * the buffer and buffer-type handles decoded from the reply. Aborts via
+ * FATAL() when the region cannot be created.
+ */
+apir_buffer_context_t
+apir_device_buffer_from_ptr(struct virtgpu *gpu,
+			    size_t size,
+			    size_t max_tensor_size) {
+  struct vn_cs_encoder *enc;
+  struct vn_cs_decoder *dec;
+  ApirForwardReturnCode rc;
+  apir_buffer_context_t ctx;
+
+  REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR);
+
+  ctx.shmem = virtgpu_shmem_create(gpu, size);
+  if (!ctx.shmem) {
+    FATAL("Couldn't allocate the guest-host shared buffer :/");
+  }
+
+  /* Request payload: shmem resource id, size, max tensor size. */
+  vn_encode_virtgpu_shmem_res_id(enc, ctx.shmem->res_id);
+  vn_encode_size_t(enc, &size);
+  vn_encode_size_t(enc, &max_tensor_size);
+
+  REMOTE_CALL(gpu, enc, dec, rc);
+
+  vn_decode_apir_buffer_host_handle_t(dec, &ctx.host_handle);
+  ctx.buft_host_handle = vn_decode_apir_buffer_type_host_handle(dec);
+
+  remote_call_finish(gpu, enc, dec);
+  return ctx;
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h
new file mode 100644
index 000000000..7edae38e7
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h
@@ -0,0 +1,31 @@
+#include "ggml-backend-impl.h"
+#include "ggml-remoting.h"
+#include "virtgpu.h"
+#include "../ggml-remotingbackend/shared/apir_backend.h"
+#include "../ggml-remotingbackend/shared/venus_cs_ggml.h"
+
+#define CACHED
+// CACHED expands to nothing; it marks the spots where a cached reply is returned (formerly a debug printf).
+
+/* Start a forwarded call: obtain an encoder for the given command type, or abort via FATAL(). */
+#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__)		\
+  do {									\
+    int32_t forward_flag = (int32_t) apir_command_type__;		\
+    encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_Forward, forward_flag); \
+    if (!encoder_name) {							\
+      FATAL("%s: failed to prepare the remote call encoder :/", __func__); \
+    }									\
+  } while(0)
+
+/* Issue the call; FATAL() on a missing decoder or a code below APIR_FORWARD_BASE_INDEX, then rebase ret_name. */
+#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name, ret_name) \
+  do {									\
+    ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \
+    if (!decoder_name) {						\
+      FATAL("%s: failed to kick the remote call :/", __func__);		\
+    }									\
+    if (ret_name < APIR_FORWARD_BASE_INDEX) {				\
+      FATAL("%s: failed to forward the API call: %s: code %d", __func__, apir_forward_error(ret_name), ret_name); \
+    }									\
+    ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \
+  } while(0)
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-metal.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-metal.cpp
new file mode 100644
index 000000000..a66344aee
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-metal.cpp
@@ -0,0 +1,20 @@
+#include "virtgpu-forward-impl.h"
+
+/* Fill the three capability flags of `metal_dev_ctx` from the host's reply; always returns true. */
+bool
+apir_metal_get_device_context(struct virtgpu *gpu, struct ggml_backend_metal_device_context *metal_dev_ctx) {
+  struct vn_cs_encoder *enc;
+  struct vn_cs_decoder *dec;
+  ApirForwardReturnCode rc;
+
+  REMOTE_CALL_PREPARE(gpu, enc, APIR_COMMAND_TYPE_METAL_GET_DEVICE_CONTEXT);
+  REMOTE_CALL(gpu, enc, dec, rc);
+
+  /* The flags are read from the reply in this fixed order. */
+  vn_decode_bool_t(dec, &metal_dev_ctx->has_simdgroup_mm);
+  vn_decode_bool_t(dec, &metal_dev_ctx->has_simdgroup_reduction);
+  vn_decode_bool_t(dec, &metal_dev_ctx->use_bfloat);
+
+  remote_call_finish(gpu, enc, dec);
+  return true;
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h
new file mode 100644
index 000000000..4cbb6341e
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h
@@ -0,0 +1,50 @@
+#include "ggml.h"
+#include "ggml-impl.h"
+#include "ggml-alloc.h"
+
+#include "virtgpu-utils.h"
+
+#include "../ggml-remotingbackend/shared/apir_backend.h"
+
+/* device: queries forwarded to the host about the remote device */
+int apir_device_get_count(struct virtgpu *gpu);
+const char *apir_device_get_name(struct virtgpu *gpu);
+const char *apir_device_get_description(struct virtgpu *gpu);
+uint32_t apir_device_get_type(struct virtgpu *gpu);
+void apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total);
+bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op);
+apir_buffer_type_host_handle_t apir_device_get_buffer_type(struct virtgpu *gpu);
+void apir_device_get_props(struct virtgpu *gpu,
+			   bool *async,
+			   bool *host_buffer,
+			   bool *buffer_from_host_ptr,
+			   bool *events);
+apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu *gpu,
+						  size_t size,
+						  size_t max_tensor_size);
+/* buffer-type: queries and allocation forwarded for a given buffer type */
+const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft);
+size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft);
+size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft);
+bool apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft);
+apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buffer_buft, size_t size);
+
+/* buffer: operations on one host buffer, identified through its apir_buffer_context_t */
+
+void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_context_t *buffer_context);
+enum ggml_status apir_buffer_init_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, ggml_tensor *tensor);
+void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context,
+			    ggml_tensor *tensor, const void *data, size_t offset, size_t size);
+void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context,
+			    const ggml_tensor *tensor, void *data, size_t offset, size_t size);
+void apir_buffer_clear(struct virtgpu *gpu, apir_buffer_context_t *buffer_context,
+		       uint8_t value);
+void apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_context_t *buffer_context);
+
+/* backend: graph execution */
+
+ggml_status apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph);
+
+/* metal: capability flags of the host's Metal device context */
+
+bool apir_metal_get_device_context(struct virtgpu *gpu, struct ggml_backend_metal_device_context *metal_dev_ctx);
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp
new file mode 100644
index 000000000..a09fd2237
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp
@@ -0,0 +1,111 @@
+#include <assert.h>
+
+#include "virtgpu-shm.h"
+
+/* Create a blob resource; returns its GEM handle (0 on ioctl failure) and stores the resource id in *res_id. */
+static uint32_t
+virtgpu_ioctl_resource_create_blob(struct virtgpu *gpu,
+                                   uint32_t blob_mem,
+                                   uint32_t blob_flags,
+                                   size_t blob_size,
+                                   uint64_t blob_id,
+                                   uint32_t *res_id)
+{
+#ifdef SIMULATE_BO_SIZE_FIX
+   blob_size = align64(blob_size, 4096);
+#endif
+   struct drm_virtgpu_resource_create_blob request = {
+      .blob_mem = blob_mem,
+      .blob_flags = blob_flags,
+      .bo_handle = 0,
+      .res_handle = 0,
+      .size = blob_size,
+      .pad = 0,
+      .cmd_size = 0,
+      .cmd = 0,
+      .blob_id = blob_id,
+   };
+
+   if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB, &request)) {
+      return 0;
+   }
+   *res_id = request.res_handle;
+   return request.bo_handle;
+}
+
+/* Close a GEM handle with DRM_IOCTL_GEM_CLOSE. A failing ioctl only trips the
+ * assert in debug builds; it is ignored when NDEBUG is defined. */
+static void
+virtgpu_ioctl_gem_close(struct virtgpu *gpu, uint32_t gem_handle)
+{
+   struct drm_gem_close request = {
+      .handle = gem_handle,
+      .pad = 0,
+   };
+   const int close_ret = virtgpu_ioctl(gpu, DRM_IOCTL_GEM_CLOSE, &request);
+
+   assert(!close_ret);
+   UNUSED(close_ret); /* keeps NDEBUG builds free of unused-variable warnings */
+}
+
+/* Map `size` bytes of the GEM object into this process: DRM_IOCTL_VIRTGPU_MAP
+ * yields the offset, which is then mmap()ed on gpu->fd. Returns NULL on failure. */
+static void *
+virtgpu_ioctl_map(struct virtgpu *gpu, uint32_t gem_handle, size_t size)
+{
+   struct drm_virtgpu_map map_req = {
+      .offset = 0,
+      .handle = gem_handle,
+      .pad = 0,
+   };
+   if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_MAP, &map_req)) {
+      return NULL;
+   }
+
+   void *mapping = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                        gpu->fd, map_req.offset);
+   /* mmap reports failure with MAP_FAILED, not NULL; normalize for callers. */
+   return (mapping == MAP_FAILED) ? NULL : mapping;
+}
+
+void
+virtgpu_shmem_destroy(struct virtgpu *gpu,
+                      struct virtgpu_shmem *shmem)
+{
+  munmap(shmem->base.mmap_ptr, shmem->base.mmap_size); /* drop the mapping first (result not checked) */
+  virtgpu_ioctl_gem_close(gpu, shmem->gem_handle); /* then close the GEM handle */
+}
+
+/* Create and map a blob of `size` bytes (rounded up to a multiple of 16384), recorded in
+ * gpu->shmem_array under its GEM handle. Returns NULL on any failure. */
+struct vn_renderer_shmem *
+virtgpu_shmem_create(struct virtgpu *gpu, size_t size)
+{
+   /* Checked up front so nothing leaks; the former assert() vanished under NDEBUG. */
+   if (gpu->shmem_array.elem_size == 0) {
+      ERROR("%s: gpu->shmem_array.elem_size == 0 | Not working :/", __func__);
+      return NULL;
+   }
+   size = align64(size, 16384);
+   uint32_t res_id;
+   uint32_t gem_handle = virtgpu_ioctl_resource_create_blob(
+      gpu, gpu->shmem_blob_mem, VIRTGPU_BLOB_FLAG_USE_MAPPABLE, size, 0,
+      &res_id);
+   if (!gem_handle)
+      return NULL;
+
+   void *ptr = virtgpu_ioctl_map(gpu, gem_handle, size);
+   if (!ptr) {
+      virtgpu_ioctl_gem_close(gpu, gem_handle);
+      return NULL;
+   }
+   struct virtgpu_shmem *shmem = (struct virtgpu_shmem *) util_sparse_array_get(&gpu->shmem_array, gem_handle);
+   shmem->gem_handle = gem_handle;
+   shmem->base.res_id = res_id;
+   shmem->base.mmap_size = size;
+   shmem->base.mmap_ptr = ptr;
+   shmem->base.refcount.count = 1;
+   shmem->base.gem_handle = gem_handle;
+   shmem->base.shmem = shmem;
+   return &shmem->base;
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.h b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h
new file mode 100644
index 000000000..52217f5b7
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <cstddef>
+#include <atomic>
+#include <sys/mman.h>
+
+#include "virtgpu.h"
+#include "virtgpu-utils.h"
+
+struct vn_refcount {
+   int count; //atomic_int
+};
+
+/* One mapped guest-host shared memory region, as filled in by virtgpu_shmem_create(). */
+struct vn_renderer_shmem {
+   struct vn_refcount refcount; /* count is set to 1 on creation */
+
+   uint32_t res_id; /* resource id; this is what gets encoded to name the region to the host */
+   size_t mmap_size; /* for internal use only (i.e., munmap) */
+   void *mmap_ptr; /* start of the mapping in this process */
+
+   struct list_head cache_head;
+   int64_t cache_timestamp;
+
+   uint32_t gem_handle; /* same value as the enclosing virtgpu_shmem's gem_handle */
+
+   struct virtgpu_shmem *shmem; /* back-pointer to the enclosing virtgpu_shmem */
+};
+
+struct vn_renderer_shmem *virtgpu_shmem_create(struct virtgpu *gpu, size_t size);
+void virtgpu_shmem_destroy(struct virtgpu *gpu, struct virtgpu_shmem *shmem);
+
+/* Region description plus its GEM handle; instances live in gpu->shmem_array, indexed by that handle. */
+struct virtgpu_shmem {
+   struct vn_renderer_shmem base;
+   uint32_t gem_handle;
+};
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp
new file mode 100644
index 000000000..833f0e468
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp
@@ -0,0 +1,200 @@
+#include "virtgpu-utils.h"
+#include <malloc.h>
+#include <cstring>
+#include <stdlib.h>
+
+#define NODE_ALLOC_ALIGN 64
+#define NODE_PTR_MASK (~((uintptr_t)NODE_ALLOC_ALIGN - 1))
+#define NODE_LEVEL_MASK ((uintptr_t)NODE_ALLOC_ALIGN - 1)
+#define NULL_NODE 0
+/* A node handle is a NODE_ALLOC_ALIGN-aligned pointer with the node level stored in its low bits. */
+/* os_malloc_aligned() is the posix_memalign-based function below; no macro must shadow its name. */
+#define os_free_aligned(_ptr) free(_ptr)
+#define p_atomic_cmpxchg(v, old, _new) \
+   __sync_val_compare_and_swap((v), (old), (_new))
+
+/* floor(log2(n)) for n >= 1; returns 0 for n == 0. */
+static inline uint64_t
+util_logbase2_64(uint64_t n)
+{
+#if defined(HAVE___BUILTIN_CLZLL)
+   return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1));
+#else
+   /* Binary search for the highest set bit: test against 2^32, 2^16, ...,
+    * 2^1, shifting n down whenever the upper part is non-empty. */
+   uint64_t log2_floor = 0;
+   for (unsigned shift = 32; shift != 0; shift >>= 1) {
+      if (n >= (1ull << shift)) { n >>= shift; log2_floor += shift; }
+   }
+   return log2_floor;
+#endif
+}
+
+/* Reset `arr` to an empty array of `elem_size`-byte elements; `node_size` must be a power of two >= 2. */
+void
+util_sparse_array_init(struct util_sparse_array *arr, size_t elem_size, size_t node_size)
+{
+   memset(arr, 0, sizeof(*arr));
+   arr->elem_size = elem_size;
+   arr->node_size_log2 = util_logbase2_64(node_size);
+   assert(node_size >= 2 && node_size == (1ull << arr->node_size_log2));
+}
+
+/* posix_memalign wrapper: rounds `alignment` up to a multiple of sizeof(void *); NULL on failure. */
+static inline void *
+os_malloc_aligned(size_t size, size_t alignment)
+{
+   const size_t ptr_mask = sizeof(void*) - 1;
+   void *block;
+   const int rc = posix_memalign(&block, (alignment + ptr_mask) & ~ptr_mask, size);
+   return rc == 0 ? block : NULL;
+}
+
+static inline void *
+_util_sparse_array_node_data(uintptr_t handle)
+{
+   return (void *)(handle & NODE_PTR_MASK); /* clear the level bits, keep the pointer */
+}
+
+static inline unsigned
+_util_sparse_array_node_level(uintptr_t handle)
+{
+   return handle & NODE_LEVEL_MASK; /* the low bits of a handle hold the node level */
+}
+
+/* Recursively free `node` and every non-null child below it. */
+static inline void
+_util_sparse_array_node_finish(struct util_sparse_array *arr, uintptr_t node)
+{
+   void *payload = _util_sparse_array_node_data(node);
+   if (_util_sparse_array_node_level(node) > 0) {
+      const uintptr_t *kids = (const uintptr_t *) payload;
+      const size_t fanout = 1ull << arr->node_size_log2;
+      for (size_t slot = 0; slot < fanout; slot++) {
+         if (kids[slot] != 0)
+            _util_sparse_array_node_finish(arr, kids[slot]);
+      }
+   }
+   os_free_aligned(payload);
+}
+
+/* Pack an aligned data pointer and a level into one handle (level in the low bits). */
+static inline uintptr_t
+_util_sparse_array_node(void *data, unsigned level)
+{
+   const uintptr_t addr = (uintptr_t)data;
+   assert(data != NULL && (addr & NODE_LEVEL_MASK) == 0 && (level & NODE_PTR_MASK) == 0);
+   return addr | level;
+}
+
+/* Allocate a zeroed node: element storage at level 0, child handles above. Aborts on allocation failure. */
+inline uintptr_t
+_util_sparse_array_node_alloc(struct util_sparse_array *arr,
+                              unsigned level)
+{
+   const size_t slot_size = (level == 0) ? arr->elem_size : sizeof(uintptr_t);
+   const size_t size = slot_size << arr->node_size_log2;
+
+   void *data = os_malloc_aligned(size, NODE_ALLOC_ALIGN);
+   if (!data) {
+      /* memset(NULL, ...) is undefined behavior; fail loudly instead. */
+      FATAL("%s: could not allocate a %zu-byte sparse array node", __func__, size);
+   }
+   memset(data, 0, size);
+   return _util_sparse_array_node(data, level);
+}
+
+/* Publish `node` into *node_ptr if it still holds `cmp_node`. Returns `node` on
+ * success; otherwise frees `node` and returns the value found in *node_ptr. */
+static inline uintptr_t
+_util_sparse_array_set_or_free_node(uintptr_t *node_ptr,
+                                    uintptr_t cmp_node,
+                                    uintptr_t node)
+{
+   const uintptr_t seen = p_atomic_cmpxchg(node_ptr, cmp_node, node);
+   if (seen == cmp_node) {
+      /* The swap happened: our node is now the published one. */
+      return node;
+   }
+
+   /* We lost the race: discard our node and use the one already installed. */
+   os_free_aligned(_util_sparse_array_node_data(node));
+   return seen;
+}
+/* Return a pointer to element `idx`, creating zero-filled nodes on the way when they do not exist yet. */
+void *
+util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx)
+{
+   const unsigned node_size_log2 = arr->node_size_log2;
+   uintptr_t root = p_atomic_read(&arr->root);
+   if (unlikely(!root)) {
+      unsigned root_level = 0;
+      uint64_t idx_iter = idx >> node_size_log2;
+      while (idx_iter) {
+         idx_iter >>= node_size_log2;
+         root_level++;
+      }
+      uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level);
+      root = _util_sparse_array_set_or_free_node(&arr->root,
+                                                 NULL_NODE, new_root);
+   }
+   /* Grow upward, one level at a time, until the root spans idx. */
+   while (1) {
+      unsigned root_level = _util_sparse_array_node_level(root);
+      uint64_t root_idx = idx >> (root_level * node_size_log2);
+      if (likely(root_idx < (1ull << node_size_log2)))
+         break;
+
+      /* In this case, we have a root but its level is low enough that the
+       * requested index is out-of-bounds.
+       */
+      uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level + 1);
+
+      uintptr_t *new_root_children = (uintptr_t *) _util_sparse_array_node_data(new_root);
+      new_root_children[0] = root;
+
+      /* We only add one at a time instead of the whole tree because it's
+       * easier to ensure correctness of both the tree building and the
+       * clean-up path.  Because we're only adding one node we never have to
+       * worry about trying to free multiple things without freeing the old
+       * things.
+       */
+      root = _util_sparse_array_set_or_free_node(&arr->root, root, new_root);
+   }
+   /* Walk down to the leaf, allocating any missing child on the way. */
+   void *node_data = _util_sparse_array_node_data(root);
+   unsigned node_level = _util_sparse_array_node_level(root);
+   while (node_level > 0) {
+      uint64_t child_idx = (idx >> (node_level * node_size_log2)) &
+                           ((1ull << node_size_log2) - 1);
+
+      uintptr_t *children = (uintptr_t *) node_data;
+      uintptr_t child = p_atomic_read(&children[child_idx]);
+
+      if (unlikely(!child)) {
+         child = _util_sparse_array_node_alloc(arr, node_level - 1);
+         child = _util_sparse_array_set_or_free_node(&children[child_idx],
+                                                     NULL_NODE, child);
+      }
+
+      node_data = _util_sparse_array_node_data(child);
+      node_level = _util_sparse_array_node_level(child);
+   }
+   /* The low node_size_log2 bits of idx select the element inside the leaf. */
+   uint64_t elem_idx = idx & ((1ull << node_size_log2) - 1);
+   return (void *)((char *)node_data + (elem_idx * arr->elem_size));
+}
+
+void *something = NULL; // never assigned in this file; only read by thks_bye()
+void thks_bye () {
+  // Debug helper: log a farewell and exit(0). Convenient place for a debugger breakpoint.
+  INFO("thks bye, stopping early and happilly :)");
+  if (!something) { // avoid the [[noreturn]] detection mechanism
+    exit(0);
+  }
+}
+
+void breakpoint() {
+  // Debug helper: only logs; set a debugger breakpoint on this function.
+  INFO("breakpoint here :)");
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h
new file mode 100644
index 000000000..dd911a63b
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h
@@ -0,0 +1,133 @@
+#pragma once
+
+#include <cstdint>
+#include <cassert>
+#include <cstddef>
+#include <ctime>
+#include <cerrno>
+#include <atomic>
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#define likely(x) __builtin_expect(!!(x), 1)
+
+#ifndef UNUSED
+#define UNUSED(x) (void)(x)
+#endif
+
+/** Checks if a value is a power of two. Zero is not handled: it is reported as one. */
+#define IS_POT(v) (((v) & ((v) - 1)) == 0)
+
+/** Checks if a value is a power of two. Zero is handled (it yields false). */
+#define IS_POT_NONZERO(v) ((v) != 0 && IS_POT(v))
+
+/** Round x up to a multiple of pot_align, which must be a power of two. */
+#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1))
+
+#define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE)
+/* Debugging helpers. */
+void thks_bye();
+void breakpoint();
+
+/* Debug-only logger: prints "INFO: <message>\n" to stderr; an empty function under NDEBUG. */
+#ifndef NDEBUG
+inline void
+INFO(const char *format, ...) {
+  va_list args;
+  va_start(args, format);
+  fputs("INFO: ", stderr);
+  vfprintf(stderr, format, args);
+  fputc('\n', stderr);
+  va_end(args);
+}
+#else
+inline void
+INFO(...) {}
+#endif
+
+/* Print "APIR: <message>\n" to stderr (printf-style format). */
+inline void
+MESSAGE(const char *format, ...) {
+  va_list args;
+  va_start(args, format);
+  fputs("APIR: ", stderr);
+  vfprintf(stderr, format, args);
+  fputc('\n', stderr);
+  va_end(args);
+}
+
+/* Print "WARNING: <message>\n" to stderr (printf-style format). */
+inline void
+WARNING(const char *format, ...) {
+  va_list args;
+  va_start(args, format);
+  fputs("WARNING: ", stderr);
+  vfprintf(stderr, format, args);
+  fputc('\n', stderr);
+  va_end(args);
+}
+
+/* Print "ERROR: <message>\n" to stderr (printf-style format); does not abort. */
+inline void
+ERROR(const char *format, ...) {
+  va_list args;
+  va_start(args, format);
+  fputs("ERROR: ", stderr);
+  vfprintf(stderr, format, args);
+  fputc('\n', stderr);
+  va_end(args);
+}
+
+/* Print "FATAL: <message>\n" to stderr (printf-style format), then abort(). */
+inline void
+FATAL(const char *format, ...) {
+  va_list args;
+  va_start(args, format);
+  fputs("FATAL: ", stderr);
+  vfprintf(stderr, format, args);
+  fputc('\n', stderr);
+  va_end(args);
+
+  abort();
+}
+
+static inline bool
+util_is_power_of_two_nonzero64(uint64_t v)
+{
+   return v != 0 && (v & (v - 1)) == 0; /* exactly one bit set */
+}
+
+static inline uint64_t
+align64(uint64_t value, uint64_t alignment)
+{
+   assert(util_is_power_of_two_nonzero64(alignment)); /* alignment must be a non-zero power of two */
+   return (value + alignment - 1) & ~(alignment - 1); /* round value up to a multiple of alignment */
+}
+
+struct list_head /* pair of prev/next links */
+{
+    struct list_head *prev;
+    struct list_head *next;
+};
+/* Sparse array stored as a tree of fixed-size nodes (implementation in virtgpu-utils.cpp). */
+struct util_sparse_array {
+   size_t elem_size; /* bytes per element */
+   unsigned node_size_log2; /* log2 of the number of slots per node */
+
+   uintptr_t root; /* handle of the root node; 0 until the first node is created */
+};
+
+void *util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx);
+void util_sparse_array_init(struct util_sparse_array *arr,
+			    size_t elem_size, size_t node_size);
+
+inline void
+os_time_sleep(int64_t usecs)
+{
+   struct timespec remaining; /* relative sleep of `usecs` microseconds on CLOCK_MONOTONIC */
+   remaining.tv_sec = usecs / 1000000;
+   remaining.tv_nsec = (usecs % 1000000) * 1000;
+   while (clock_nanosleep(CLOCK_MONOTONIC, 0, &remaining, &remaining) == EINTR) {}
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp
new file mode 100644
index 000000000..b3b0ab2dc
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp
@@ -0,0 +1,681 @@
+#include <stdio.h>
+#include <cassert>
+#include <cerrno>
+#include <unistd.h>
+
+#include <cstdlib>
+
+#include "virtgpu.h"
+
+static virt_gpu_result_t virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev);
+static virt_gpu_result_t virtgpu_open(struct virtgpu *gpu);
+
+
+static virt_gpu_result_t virtgpu_init_params(struct virtgpu *gpu);
+static virt_gpu_result_t virtgpu_init_capset(struct virtgpu *gpu);
+static virt_gpu_result_t virtgpu_init_context(struct virtgpu *gpu);
+
+static int virtgpu_ioctl_context_init(struct virtgpu *gpu,
+				      enum virgl_renderer_capset capset_id);
+static int
+virtgpu_ioctl_get_caps(struct virtgpu *gpu,
+                       enum virgl_renderer_capset id,
+                       uint32_t version,
+                       void *capset,
+                       size_t capset_size);
+static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param);
+static void virtgpu_init_renderer_info(struct virtgpu *gpu);
+
+struct timer_data wait_host_reply_timer = {0, 0, 0, "wait_host_reply"};
+
+static void log_call_duration(long long call_duration_ns, const char *name);
+
+const uint64_t APIR_HANDSHAKE_MAX_WAIT_MS = 15*1000; // 15s
+const uint64_t APIR_LOADLIBRARY_MAX_WAIT_MS = 60*1000; // 60s
+
+static inline void
+virtgpu_init_shmem_blob_mem(struct virtgpu *gpu)
+{
+   /* VIRTGPU_BLOB_MEM_GUEST allocates from the guest system memory.  They are
+    * logically contiguous in the guest but are sglists (iovecs) in the host.
+    * That makes them slower to process in the host.  With host process
+    * isolation, it also becomes impossible for the host to access sglists
+    * directly.
+    *
+    * While there are ideas (and shipped code in some cases) such as creating
+    * udmabufs from sglists, or having a dedicated guest heap, it seems the
+    * easiest way is to reuse VIRTGPU_BLOB_MEM_HOST3D.  That is, when the
+    * renderer sees a request to export a blob where
+    *
+    *  - blob_mem is VIRTGPU_BLOB_MEM_HOST3D
+    *  - blob_flags is VIRTGPU_BLOB_FLAG_USE_MAPPABLE
+    *  - blob_id is 0
+    *
+    * it allocates a host shmem.
+    *
+    * supports_blob_id_0 has been enforced by mandated render server config.
+    */
+   assert(gpu->capset.data.supports_blob_id_0);
+   gpu->shmem_blob_mem = VIRTGPU_BLOB_MEM_HOST3D;
+}
+
+static int
+virtgpu_handshake(struct virtgpu *gpu) {
+  struct vn_cs_encoder *encoder;
+  struct vn_cs_decoder *decoder;
+
+  encoder = remote_call_prepare(gpu,  APIR_COMMAND_TYPE_HandShake, 0);
+  if (!encoder) {
+    FATAL("%s: failed to prepare the remote call encoder :/", __func__);
+    return 1;
+  }
+
+  /* write handshake props */
+
+  uint32_t guest_major = APIR_PROTOCOL_MAJOR;
+  uint32_t guest_minor = APIR_PROTOCOL_MINOR;
+  vn_encode_uint32_t(encoder, &guest_major);
+  vn_encode_uint32_t(encoder, &guest_minor);
+
+  /* *** */
+
+
+  uint32_t ret_magic;
+  long long call_duration_ns;
+  ret_magic = remote_call(gpu, encoder, &decoder, APIR_HANDSHAKE_MAX_WAIT_MS, &call_duration_ns);
+  log_call_duration(call_duration_ns, "API Remoting handshake");
+
+  if (!decoder) {
+    FATAL("%s: failed to initiate the communication with the virglrenderer library. "
+	  "Most likely, the wrong virglrenderer library was loaded in the hypervisor.", __func__);
+    return 1;
+  }
+
+  /* read handshake return values */
+
+  uint32_t host_major;
+  uint32_t host_minor;
+
+  if (ret_magic != APIR_HANDSHAKE_MAGIC) {
+    FATAL("%s: handshake with the virglrenderer failed (code=%d | %s):/",
+	  __func__, ret_magic, apir_backend_initialize_error(ret_magic));
+  } else {
+    vn_decode_uint32_t(decoder, &host_major);
+    vn_decode_uint32_t(decoder, &host_minor);
+  }
+
+  /* *** */
+
+  remote_call_finish(gpu, encoder, decoder);
+
+  if (ret_magic != APIR_HANDSHAKE_MAGIC) {
+    return 1;
+  }
+
+  /* *** */
+
+  INFO("%s: Guest is running with %u.%u", __func__, guest_major, guest_minor);
+  INFO("%s: Host is running with %u.%u", __func__, host_major, host_minor);
+
+  if (guest_major != host_major) {
+    ERROR("Host major (%d) and guest major (%d) version differ", host_major, guest_major);
+  } else if (guest_minor != host_minor) {
+    WARNING("Host minor (%d) and guest minor (%d) version differ", host_minor, guest_minor);
+  }
+
+  INFO("Handshake with the host virglrenderer library completed.");
+
+  return 0;
+}
+
+static ApirLoadLibraryReturnCode
+virtgpu_load_library(struct virtgpu *gpu) {
+  struct vn_cs_encoder *encoder;
+  struct vn_cs_decoder *decoder;
+  ApirLoadLibraryReturnCode ret;
+
+  encoder = remote_call_prepare(gpu,  APIR_COMMAND_TYPE_LoadLibrary, 0);
+  if (!encoder) {
+    FATAL("%s: hypercall error: failed to prepare the remote call encoder :/", __func__);
+    return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR;
+  }
+
+  long long call_duration_ns;
+
+  ret = (ApirLoadLibraryReturnCode) remote_call(gpu, encoder, &decoder,
+                                                APIR_LOADLIBRARY_MAX_WAIT_MS, &call_duration_ns);
+  log_call_duration(call_duration_ns, "API Remoting LoadLibrary");
+
+  if (!decoder) {
+    FATAL("%s: hypercall error: failed to kick the API remoting hypercall. :/", __func__);
+    return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR;
+  }
+
+  remote_call_finish(gpu, encoder, decoder);
+
+  if (ret == APIR_LOAD_LIBRARY_SUCCESS) {
+    INFO("%s: The API Remoting backend was successfully loaded and initialized", __func__);
+
+    return ret;
+  }
+
+  // something wrong happened, find out what.
+
+  if (ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
+    FATAL("%s: virglrenderer could not load the API Remoting backend library: %s (code %d)",
+	  __func__, apir_load_library_error(ret), ret);
+    return ret;
+  }
+
+  INFO("%s: virglrenderer successfully loaded the API Remoting backend library", __func__);
+
+  ApirLoadLibraryReturnCode apir_ret = (ApirLoadLibraryReturnCode) (ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX);
+
+  if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
+    FATAL("%s: the API Remoting backend library couldn't load the backend library: apir code=%d | %s):/",
+	  __func__, apir_ret, apir_load_library_error(apir_ret));
+  } else {
+    uint32_t lib_ret = apir_ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX;
+    FATAL("%s: the API Remoting backend library initialize its backend library: apir code=%d):/",
+	  __func__, lib_ret);
+  }
+  return ret;
+}
+
+/* Open the virtgpu device, set up its context and shared pages, then handshake and load the backend. */
+struct virtgpu *
+create_virtgpu() {
+  struct virtgpu *gpu = new struct virtgpu();
+
+  util_sparse_array_init(&gpu->shmem_array, sizeof(struct virtgpu_shmem),
+			 1024);
+
+  /* FATAL() aborts the process, so the returns below are only a safety net. */
+  if (virtgpu_open(gpu) != APIR_SUCCESS) {
+    FATAL("%s: failed to open the virtgpu device :/", __func__);
+    return NULL;
+  }
+
+  /* Checked explicitly: assert() compiles out under NDEBUG and would hide these failures. */
+  if (virtgpu_init_params(gpu) != APIR_SUCCESS) {
+    FATAL("%s: failed to query the virtgpu kernel parameters :/", __func__);
+    return NULL;
+  }
+  if (virtgpu_init_capset(gpu) != APIR_SUCCESS) {
+    FATAL("%s: failed to get the virtgpu capset :/", __func__);
+    return NULL;
+  }
+  if (virtgpu_init_context(gpu) != APIR_SUCCESS) {
+    FATAL("%s: failed to initialize the virtgpu context :/", __func__);
+    return NULL;
+  }
+
+  virtgpu_init_shmem_blob_mem(gpu);
+
+  gpu->reply_shmem = virtgpu_shmem_create(gpu, 0x4000);
+  gpu->data_shmem = virtgpu_shmem_create(gpu, 0x1830000); // 24MiB
+  if (!gpu->reply_shmem) {
+    FATAL("%s: failed to create the shared reply memory pages :/", __func__);
+    return NULL;
+  }
+  if (!gpu->data_shmem) {
+    FATAL("%s: failed to create the shared data memory pages :/", __func__);
+    return NULL;
+  }
+
+  if (virtgpu_handshake(gpu)) {
+    FATAL("%s: failed to handshake with the virglrenderer library :/", __func__);
+    return NULL;
+  }
+
+  if (virtgpu_load_library(gpu) != APIR_LOAD_LIBRARY_SUCCESS) {
+    FATAL("%s: failed to load the backend library :/", __func__);
+    return NULL;
+  }
+
+  return gpu;
+}
+
+static virt_gpu_result_t
+virtgpu_open(struct virtgpu *gpu)
+{  /* try each enumerated DRM device in turn and stop at the first one virtgpu_open_device() accepts */
+   drmDevicePtr devs[8];  /* at most ARRAY_SIZE(devs) devices are requested */
+   int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs));
+   if (count < 0) {
+     ERROR("%s: failed to enumerate DRM devices", __func__);
+     return APIR_ERROR_INITIALIZATION_FAILED;
+   }
+
+   virt_gpu_result_t result = APIR_ERROR_INITIALIZATION_FAILED;  /* stays a failure when count == 0 */
+   for (int i = 0; i < count; i++) {
+      result = virtgpu_open_device(gpu, devs[i]);
+      if (result == APIR_SUCCESS)
+         break;
+   }
+
+   drmFreeDevices(devs, count);  /* the device list is released whether or not a device was opened */
+
+   return result;
+}
+
+static virt_gpu_result_t
+virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev)
+{
+   bool supported_bus = false;
+
+   switch (dev->bustype) {
+   case DRM_BUS_PCI:
+      if (dev->deviceinfo.pci->vendor_id == VIRTGPU_PCI_VENDOR_ID &&
+          dev->deviceinfo.pci->device_id == VIRTGPU_PCI_DEVICE_ID)
+         supported_bus = true;
+      break;
+   case DRM_BUS_PLATFORM:
+      supported_bus = true;
+      break;
+   default:
+      break;
+   }
+
+   if (!supported_bus || !(dev->available_nodes & (1 << DRM_NODE_RENDER))) {
+      if (VN_DEBUG(INIT)) {
+         const char *name = "unknown";
+         for (uint32_t i = 0; i < DRM_NODE_MAX; i++) {
+            if (dev->available_nodes & (1 << i)) {
+               name = dev->nodes[i];
+               break;
+            }
+         }
+         vn_log(gpu->instance, "skipping DRM device %s", name);
+      }
+      return APIR_ERROR_INITIALIZATION_FAILED;
+   }
+
+   const char *primary_path = dev->nodes[DRM_NODE_PRIMARY];
+   const char *node_path = dev->nodes[DRM_NODE_RENDER];
+
+   int fd = open(node_path, O_RDWR | O_CLOEXEC);
+   if (fd < 0) {
+      if (VN_DEBUG(INIT))
+         vn_log(gpu->instance, "failed to open %s", node_path);
+      return APIR_ERROR_INITIALIZATION_FAILED;
+   }
+
+   drmVersionPtr version = drmGetVersion(fd);
+   if (!version || strcmp(version->name, "virtio_gpu") ||
+       version->version_major != 0) {
+      if (VN_DEBUG(INIT)) {
+         if (version) {
+            vn_log(gpu->instance, "unknown DRM driver %s version %d",
+                   version->name, version->version_major);
+         } else {
+            vn_log(gpu->instance, "failed to get DRM driver version");
+         }
+      }
+      if (version)
+         drmFreeVersion(version);
+      close(fd);
+      return APIR_ERROR_INITIALIZATION_FAILED;
+   }
+
+   gpu->fd = fd;
+
+   struct stat st;
+   if (stat(primary_path, &st) == 0) {
+      gpu->has_primary = true;
+      gpu->primary_major = major(st.st_rdev);
+      gpu->primary_minor = minor(st.st_rdev);
+   } else {
+      gpu->has_primary = false;
+      gpu->primary_major = 0;
+      gpu->primary_minor = 0;
+   }
+   stat(node_path, &st);
+   gpu->render_major = major(st.st_rdev);
+   gpu->render_minor = minor(st.st_rdev);
+
+   gpu->bustype = dev->bustype;
+   if (dev->bustype == DRM_BUS_PCI)
+      gpu->pci_bus_info = *dev->businfo.pci;
+
+   drmFreeVersion(version);
+
+   MESSAGE("using DRM device %s", node_path);
+
+   return APIR_SUCCESS;
+}
+
+/* Print a printf-style message on stdout as one line, prefixed with "<INST>" when an instance is given. */
+void
+vn_log(struct remoting_dev_instance *instance, const char *format, ...)
+{
+   /* instance may be NULL or partially initialized: only the pointer is tested */
+   if (instance) {
+     printf("<INST>");
+   }
+
+   va_list ap;
+   va_start(ap, format);
+   vprintf(format, ap);
+   va_end(ap);
+   printf("\n"); /* callers pass messages without a trailing newline */
+}
+
+static virt_gpu_result_t
+virtgpu_init_context(struct virtgpu *gpu)
+{
+   assert(!gpu->capset.version);
+   const int ret = virtgpu_ioctl_context_init(gpu, gpu->capset.id);
+   if (ret) {
+      if (VN_DEBUG(INIT)) {
+         vn_log(gpu->instance, "failed to initialize context: %s",
+                strerror(errno));
+      }
+      return APIR_ERROR_INITIALIZATION_FAILED;
+   }
+
+   return APIR_SUCCESS;
+}
+
+static virt_gpu_result_t
+virtgpu_init_capset(struct virtgpu *gpu)
+{
+   gpu->capset.id = VIRGL_RENDERER_CAPSET_VENUS;
+   gpu->capset.version = 0;
+
+   const int ret =
+      virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version,
+                             &gpu->capset.data, sizeof(gpu->capset.data));
+   if (ret) {
+      if (VN_DEBUG(INIT)) {
+         vn_log(gpu->instance, "failed to get venus v%d capset: %s",
+                gpu->capset.version, strerror(errno));
+      }
+      return APIR_ERROR_INITIALIZATION_FAILED;
+   }
+
+   return APIR_SUCCESS;
+}
+
+static virt_gpu_result_t
+virtgpu_init_params(struct virtgpu *gpu)
+{
+   const uint64_t required_params[] = {
+      VIRTGPU_PARAM_3D_FEATURES,   VIRTGPU_PARAM_CAPSET_QUERY_FIX,
+      VIRTGPU_PARAM_RESOURCE_BLOB, VIRTGPU_PARAM_CONTEXT_INIT,
+   };
+   uint64_t val;
+   for (uint32_t i = 0; i < ARRAY_SIZE(required_params); i++) {
+      val = virtgpu_ioctl_getparam(gpu, required_params[i]);
+      if (!val) {
+         if (VN_DEBUG(INIT)) {
+            vn_log(gpu->instance, "required kernel param %d is missing",
+                   (int)required_params[i]);
+         }
+         return APIR_ERROR_INITIALIZATION_FAILED;
+      }
+   }
+
+   val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_HOST_VISIBLE);
+   if (val) {
+      gpu->bo_blob_mem = VIRTGPU_BLOB_MEM_HOST3D;
+   } else {
+      val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_GUEST_VRAM);
+      if (val) {
+         gpu->bo_blob_mem = VIRTGPU_BLOB_MEM_GUEST_VRAM;
+      }
+   }
+
+   if (!val) {
+      vn_log(gpu->instance,
+             "one of required kernel params (%d or %d) is missing",
+             (int)VIRTGPU_PARAM_HOST_VISIBLE, (int)VIRTGPU_PARAM_GUEST_VRAM);
+      return APIR_ERROR_INITIALIZATION_FAILED;
+   }
+
+   /* Cross-device feature is optional.  It enables sharing dma-bufs
+    * with other virtio devices, like virtio-wl or virtio-video used
+    * by ChromeOS VMs.  Qemu doesn't support cross-device sharing.
+    */
+   val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_CROSS_DEVICE);
+   if (val)
+      gpu->supports_cross_device = true;
+
+   /* implied by CONTEXT_INIT uapi */
+   gpu->max_timeline_count = 64;
+
+   return APIR_SUCCESS;
+}
+
+static int
+virtgpu_ioctl_context_init(struct virtgpu *gpu,
+                           enum virgl_renderer_capset capset_id)
+{
+   struct drm_virtgpu_context_set_param ctx_set_params[3] = {
+      {
+         .param = VIRTGPU_CONTEXT_PARAM_CAPSET_ID,
+         .value = capset_id,
+      },
+      {
+         .param = VIRTGPU_CONTEXT_PARAM_NUM_RINGS,
+         .value = 64,
+      },
+      {
+         .param = VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK,
+         .value = 0, /* don't generate drm_events on fence signaling */
+      },
+   };
+
+   struct drm_virtgpu_context_init args = {
+      .num_params = ARRAY_SIZE(ctx_set_params),
+      .pad = 0,
+      .ctx_set_params = (uintptr_t)&ctx_set_params,
+   };
+
+   return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_CONTEXT_INIT, &args);
+}
+
+static int
+virtgpu_ioctl_get_caps(struct virtgpu *gpu,
+                       enum virgl_renderer_capset id,
+                       uint32_t version,
+                       void *capset,
+                       size_t capset_size)
+{
+   struct drm_virtgpu_get_caps args = {
+      .cap_set_id = id,
+      .cap_set_ver = version,
+      .addr = (uintptr_t)capset,
+      .size = (__u32) capset_size,
+      .pad = 0,
+   };
+
+   return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GET_CAPS, &args);
+}
+
+static uint64_t
+virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param)
+{
+   /* val must be zeroed because kernel only writes the lower 32 bits */
+   uint64_t val = 0;
+   struct drm_virtgpu_getparam args = {
+      .param = param,
+      .value = (uintptr_t)&val,
+   };
+
+   const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GETPARAM, &args);
+   return ret ? 0 : val;
+}
+
+
+struct vn_cs_encoder *
+remote_call_prepare(
+  struct virtgpu *gpu,
+  ApirCommandType apir_cmd_type,
+  int32_t cmd_flags)
+{
+
+  if (!gpu->reply_shmem) {
+    FATAL("%s: the reply shmem page can't be null", __func__);
+  }
+
+  /*
+   * Prepare the command encoder and its buffer
+   */
+
+  static char encoder_buffer[4096];
+
+  static struct vn_cs_encoder enc;
+  enc = {
+    encoder_buffer,
+    encoder_buffer,
+    encoder_buffer + sizeof(encoder_buffer),
+  };
+
+  /*
+   * Fill the command encoder with the common args:
+   * - cmd_type (int32_t)
+   * - cmd_flags (int32_t)
+   * - reply res id (uint32_t)
+   */
+
+  int32_t cmd_type = VENUS_COMMAND_TYPE_LENGTH + apir_cmd_type;
+  vn_encode_int32_t(&enc, &cmd_type);
+  vn_encode_int32_t(&enc, &cmd_flags);
+
+  uint32_t reply_res_id = gpu->reply_shmem->res_id;
+  vn_encode_uint32_t(&enc, &reply_res_id);
+
+  return &enc;
+}
+
+void
+remote_call_finish(
+  struct virtgpu *gpu,
+  struct vn_cs_encoder *enc,
+  struct vn_cs_decoder *dec) {
+  UNUSED(gpu);
+
+  if (!enc) {
+    ERROR("Invalid (null) encoder :/");
+  }
+
+  if (!dec) {
+    ERROR("Invalid (null) decoder :/");
+  }
+
+  // encoder and decoder are statically allocated, nothing to do to release them
+}
+
+/*
+ * Send the encoded command to the host through the EXECBUFFER ioctl and poll the
+ * reply page until the host posts a non-zero notification or max_wait_ms elapses
+ * (max_wait_ms == 0 disables the timeout check).
+ *
+ * On success *decoder points at the reply payload and the host return value
+ * (notification - 1) is returned. On timeout *decoder stays NULL and
+ * APIR_FORWARD_TIMEOUT is returned. call_duration_ns (optional) receives the
+ * value returned by stop_timer() for the wait.
+ */
+uint32_t
+remote_call(
+  struct virtgpu *gpu,
+  struct vn_cs_encoder *encoder,
+  struct vn_cs_decoder **decoder,
+  float max_wait_ms,
+  long long *call_duration_ns)
+{
+  /* Prepare the reply notification pointer */
+  volatile std::atomic_uint *atomic_reply_notif = (volatile std::atomic_uint *) gpu->reply_shmem->mmap_ptr;
+  *atomic_reply_notif = 0;
+
+  /* Trigger the execbuf ioctl */
+  struct drm_virtgpu_execbuffer args = {
+    .flags = VIRTGPU_EXECBUF_RING_IDX,
+    .size = (uint32_t) (encoder->cur - encoder->start),
+    .command = (uintptr_t) encoder->start,
+
+    .bo_handles = 0,
+    .num_bo_handles = 0,
+
+    .fence_fd = 0,
+    .ring_idx = 0,
+    .syncobj_stride = 0,
+    .num_in_syncobjs = 0,
+    .num_out_syncobjs = 0,
+    .in_syncobjs = 0,
+    .out_syncobjs = 0,
+  };
+
+  *decoder = NULL;
+
+  int ret = drmIoctl(gpu->fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &args);
+  if (ret != 0) {
+    /* FATAL() aborts; the format consumes both the function name and the code */
+    FATAL("%s: the virtgpu EXECBUFFER ioctl failed (%d) :/ \n", __func__, ret);
+  }
+
+  /* Wait for the response notification */
+  start_timer(&wait_host_reply_timer);
+
+  struct timespec ts_start, ts_end;
+  clock_gettime(CLOCK_MONOTONIC, &ts_start);
+  long long start_time = (long long)ts_start.tv_sec * 1000000000LL + ts_start.tv_nsec;
+
+  bool timedout = false;
+  uint32_t notif_value = 0;
+  while (true) {
+    notif_value = std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire);
+
+    if (notif_value != 0) {
+      break;
+    }
+
+    int64_t base_sleep_us = 15;
+
+    os_time_sleep(base_sleep_us);
+
+    if (max_wait_ms) {
+      clock_gettime(CLOCK_MONOTONIC, &ts_end);
+      long long end_time = (long long)ts_end.tv_sec * 1000000000LL + ts_end.tv_nsec;
+      /* divide in floating point so the sub-millisecond part is not truncated */
+      float duration_ms = (float) (end_time - start_time) / 1e6f;
+
+      if (duration_ms > max_wait_ms) {
+        timedout = true;
+        break;
+      }
+    }
+  }
+
+  if (call_duration_ns) {
+    *call_duration_ns = stop_timer(&wait_host_reply_timer);
+  }
+
+  if (max_wait_ms && timedout) {
+      ERROR("timed out waiting for the host answer...");
+      return APIR_FORWARD_TIMEOUT;
+  }
+
+  /* Prepare the decoder */
+  static struct vn_cs_decoder response_dec;
+  response_dec.cur = (char *) gpu->reply_shmem->mmap_ptr + sizeof(*atomic_reply_notif);
+  response_dec.end = (char *) gpu->reply_shmem->mmap_ptr + gpu->reply_shmem->mmap_size;
+  *decoder = &response_dec;
+
+  // extract the actual return value from the notif flag
+  uint32_t returned_value = notif_value - 1;
+  return returned_value;
+}
+
+static void log_call_duration(long long call_duration_ns, const char *name) {  // report a wait time through MESSAGE(), in seconds, milliseconds or nanoseconds
+  double call_duration_ms = (double) call_duration_ns / 1e6;  // 1 millisecond = 1e6 nanoseconds
+  double call_duration_s  = (double) call_duration_ns / 1e9;  // 1 second = 1e9 nanoseconds
+
+  if (call_duration_s > 1) {  // more than one second: print seconds
+    MESSAGE("%s: waited %.2fs for the %s host reply...", __func__, call_duration_s, name);
+  } else if (call_duration_ms > 1) {  // more than one millisecond: print milliseconds
+    MESSAGE("%s: waited %.2fms for the %s host reply...", __func__, call_duration_ms, name);
+  } else {  // otherwise print the raw nanosecond count
+    MESSAGE("%s: waited %lldns for the %s host reply...", __func__, call_duration_ns, name);
+  }
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h
new file mode 100644
index 000000000..7a8cfc3d7
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu.h
@@ -0,0 +1,125 @@
+#pragma once
+
+#include <xf86drm.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <threads.h>
+#include <cstring>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+
+#include "virtgpu-forward.h"
+#include "virtgpu-utils.h"
+#include "../ggml-remotingbackend/shared/api_remoting.h"
+#include "../ggml-remotingbackend/shared/venus_cs.h"
+
+#include "virtgpu-shm.h"
+
+#define VIRGL_RENDERER_UNSTABLE_APIS 1
+#include "drm-uapi/virtgpu_drm.h"
+#include "venus_hw.h"
+
+// must match https://gitlab.freedesktop.org/kpouget/virglrenderer/-/blob/main/src/virglrenderer_hw.h?ref_type=heads
+enum virgl_renderer_capset {
+  VIRGL_RENDERER_CAPSET_VIRGL                   = 1,
+  VIRGL_RENDERER_CAPSET_VIRGL2                  = 2,
+  /* 3 is reserved for gfxstream */
+  VIRGL_RENDERER_CAPSET_VENUS                   = 4,
+  /* 5 is reserved for cross-domain */
+  VIRGL_RENDERER_CAPSET_DRM                     = 6,
+};
+
+/* from src/virtio/vulkan/vn_renderer_virtgpu.c */
+#define VIRTGPU_PCI_VENDOR_ID 0x1af4
+#define VIRTGPU_PCI_DEVICE_ID 0x1050
+#define VIRTGPU_BLOB_MEM_GUEST_VRAM 0x0004
+#define VIRTGPU_PARAM_GUEST_VRAM 9
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define VN_DEBUG(what) true
+
+typedef enum virt_gpu_result_t {
+  APIR_SUCCESS = 0,
+  APIR_ERROR_INITIALIZATION_FAILED = -1,
+} virt_gpu_result_t;
+
+
+struct remoting_dev_instance {
+  int yes;
+};
+
+#define PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a)))
+
+/* Not inline: the single definition is out-of-line in virtgpu.cpp. Arguments are printf-checked. */
+void
+vn_log(struct remoting_dev_instance *instance, const char *format, ...)
+  PRINTFLIKE(2, 3);
+
+struct virtgpu {
+  struct remoting_dev_instance *instance;
+
+  int fd;
+
+  bool has_primary;
+  int primary_major;
+  int primary_minor;
+  int render_major;
+  int render_minor;
+
+  int bustype;
+  drmPciBusInfo pci_bus_info;
+
+  uint32_t max_timeline_count;
+
+  struct {
+    enum virgl_renderer_capset id;
+    uint32_t version;
+    struct virgl_renderer_capset_venus data;
+  } capset;
+
+  uint32_t shmem_blob_mem;
+  uint32_t bo_blob_mem;
+
+  /* note that we use gem_handle instead of res_id to index because
+   * res_id is monotonically increasing by default (see
+   * virtio_gpu_resource_id_get)
+   */
+  struct util_sparse_array shmem_array;
+
+  mtx_t dma_buf_import_mutex;
+
+  bool supports_cross_device;
+
+  /* APIR */
+  struct vn_renderer_shmem *reply_shmem;
+  struct vn_renderer_shmem *data_shmem;
+};
+
+
+static inline int
+virtgpu_ioctl(struct virtgpu *gpu, unsigned long request, void *args)
+{
+  return drmIoctl(gpu->fd, request, args);
+}
+
+struct virtgpu *create_virtgpu();
+
+struct vn_cs_encoder *remote_call_prepare(
+  struct virtgpu *gpu,
+  ApirCommandType apir_cmd_type,
+  int32_t cmd_flags);
+
+uint32_t remote_call(
+  struct virtgpu *gpu,
+  struct vn_cs_encoder *enc,
+  struct vn_cs_decoder **dec,
+  float max_wait_ms,
+  long long *call_duration_ns
+);
+
+void remote_call_finish(
+  struct virtgpu *gpu,
+  struct vn_cs_encoder *enc,
+  struct vn_cs_decoder *dec);
diff --git a/podman_compile.sh b/podman_compile.sh
new file mode 100755
index 000000000..9dd04b42a
--- /dev/null
+++ b/podman_compile.sh
@@ -0,0 +1,39 @@
+#! /bin/bash
+
+
+set -o pipefail
+set -o errexit
+set -o nounset
+set -o errtrace
+
+opts=""
+opts="$opts --device /dev/dri "
+
+#IMAGE=quay.io/ramalama/remoting:latest
+IMAGE=localhost/mesa:compile
+
+what=${1:-}
+if [[ -z "$what" ]]; then
+    what=remoting
+fi
+
+cmd="bash ./build.$what.sh"
+
+POD_NAME=mac_ai_compiling
+podman machine ssh podman rm $POD_NAME --force
+
+set -x
+podman run \
+--name $POD_NAME \
+--user root:root \
+--cgroupns host \
+--security-opt label=disable \
+--env HOME="$HOME" \
+--env PERF_MODE="${PERF_MODE:-}" \
+--env BENCH_MODE="${BENCH_MODE:-}" \
+-v "$HOME":"$HOME":Z \
+-w "$PWD" \
+-it --rm \
+$opts \
+$IMAGE \
+$cmd
diff --git a/prepare.backend.sh b/prepare.backend.sh
new file mode 100755
index 000000000..8bc5be19e
--- /dev/null
+++ b/prepare.backend.sh
@@ -0,0 +1,17 @@
+if [[ "${PERF_MODE:-}" ]]; then
+    FLAVOR="-prod"
+else
+    FLAVOR=""
+fi
+
+cmake -S . -B ../build.remoting-backend$FLAVOR \
+      -DGGML_REMOTINGBACKEND=ON \
+      -DGGML_NATIVE=OFF \
+      -DGGML_METAL=ON \
+      -DGGML_BACKEND_DL=OFF \
+      -DLLAMA_CURL=OFF \
+      -DGGML_VULKAN=OFF -DVulkan_INCLUDE_DIR=/opt/homebrew/include/ -DVulkan_LIBRARY=/opt/homebrew/lib/libMoltenVK.dylib \
+      "$@"
+
+#      -DCMAKE_BUILD_TYPE=Debug \
+#
diff --git a/prepare.remoting.sh b/prepare.remoting.sh
new file mode 100755
index 000000000..5ab734704
--- /dev/null
+++ b/prepare.remoting.sh
@@ -0,0 +1,8 @@
+cmake -S . -B ../build.remoting-frontend \
+      -DGGML_REMOTINGFRONTEND=ON \
+      -DGGML_CPU_ARM_ARCH=native \
+      -DGGML_NATIVE=OFF \
+      -DGGML_OPENMP=OFF \
+      -DLLAMA_CURL=OFF \
+      -DCMAKE_BUILD_TYPE=Debug \
+      "$@"
diff --git a/prepare.sh b/prepare.sh
new file mode 100644
index 000000000..2fb46cefd
--- /dev/null
+++ b/prepare.sh
@@ -0,0 +1 @@
+cmake -S . -B ./build -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DGGML_METAL=OFF #-DCMAKE_BUILD_TYPE=Debug #-DGGML_VULKAN_DEBUG=1
diff --git a/prepare.vulkan.sh b/prepare.vulkan.sh
new file mode 100644
index 000000000..c12b0abe0
--- /dev/null
+++ b/prepare.vulkan.sh
@@ -0,0 +1,7 @@
+cmake -S . \
+      -B ../build.vulkan \
+      -DGGML_VULKAN=ON \
+      -DGGML_NATIVE=OFF \
+      -DGGML_METAL=OFF \
+      -DLLAMA_CURL=OFF \
+      -DCMAKE_BUILD_TYPE=Debug
diff --git a/run.ramalama.sh b/run.ramalama.sh
new file mode 100755
index 000000000..ea3d5b0ee
--- /dev/null
+++ b/run.ramalama.sh
@@ -0,0 +1,4 @@
+ICD_DIR=/Users/kevinpouget/.local/share/vulkan/icd.d
+export VK_ICD_FILENAMES=$ICD_DIR/virtio_icd.cont.aarch64.json
+
+llama-run ~/models/llama3.2 "say nothing" --ngl 99
diff --git a/run.remoting.sh b/run.remoting.sh
new file mode 100755
index 000000000..44c693bb9
--- /dev/null
+++ b/run.remoting.sh
@@ -0,0 +1,63 @@
+#! /bin/bash
+#clear
+if [[ ${1:-} == "strace" ]]; then
+    prefix="strace"
+elif [[ ${1:-} == "gdb" ]]; then
+    prefix="gdb --args"
+else
+    prefix=""
+fi
+
+MODEL=${MODEL:-llama3.2}
+
+LLAMA_BUILD_DIR=../build.remoting-frontend$FLAVOR
+
+MODEL_HOME="$HOME/models"
+
+set -x
+if [[ "${BENCH_MODE:-}" == "server" ]]; then  # BENCH_MODE: server | bench | perf | anything else runs llama-run
+    cat <<EOF
+###
+### Running llama-server
+###
+
+EOF
+    $prefix \
+        $LLAMA_BUILD_DIR/bin/llama-server \
+        --host 0.0.0.0 \
+        --port 8080 \
+        --model "$MODEL_HOME/$MODEL" \
+        --n-gpu-layers 99 \
+        --threads 1
+elif [[ "${BENCH_MODE:-}" == "bench" ]]; then
+    cat <<EOF
+###
+### Running llama-bench
+###
+
+EOF
+    $prefix \
+        $LLAMA_BUILD_DIR/bin/llama-bench \
+        --model "$MODEL_HOME/$MODEL" \
+        --n-gpu-layers 99
+elif [[ "${BENCH_MODE:-}" == "perf" ]]; then
+    cat <<EOF
+###
+### Running test-backend-ops perf
+###
+
+EOF
+    $prefix \
+        $LLAMA_BUILD_DIR/bin/test-backend-ops perf
+
+else
+    PROMPT="say nothing"
+    #PROMPT="tell what's Apple metal API"
+    $prefix \
+        $LLAMA_BUILD_DIR/bin/llama-run \
+        --ngl 99 \
+        --verbose \
+        --context-size 4096 \
+        "$MODEL_HOME/$MODEL" \
+        "$PROMPT"
+fi
diff --git a/run.sh b/run.sh
new file mode 100755
index 000000000..13d8c0425
--- /dev/null
+++ b/run.sh
@@ -0,0 +1 @@
+./build/bin/llama-run --ngl 999 --verbose ~/models/llama3.2 "say nothing"
diff --git a/run.vulkan.sh b/run.vulkan.sh
new file mode 100755
index 000000000..9254bf975
--- /dev/null
+++ b/run.vulkan.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+if [[ ${1:-} == "strace" ]]; then  # optional first argument picks a wrapper: strace | gdb | gdbr
+    prefix=(strace)
+elif [[ ${1:-} == "gdb" ]]; then
+    prefix=(gdb --args)
+elif [[ ${1:-} == "gdbr" ]]; then
+    prefix=(gdb -ex='set confirm on' -ex=run -ex=quit --args)  # an array keeps the quoted -ex argument as a single word
+else
+    prefix=()
+fi
+
+#rm -f /usr/lib64/libvulkan_virtio.so
+
+ICD_DIR=/Users/kevinpouget/.local/share/vulkan/icd.d
+
+MESA_FLAVOR=good
+if [[ "$MESA_FLAVOR" == "work" ]]; then
+    export VK_ICD_FILENAMES=$ICD_DIR/virtio_icd.aarch64.json
+elif [[ "$MESA_FLAVOR" == "good" ]]; then
+    export VK_ICD_FILENAMES=$ICD_DIR/virtio_icd.good.aarch64.json
+elif [[ "$MESA_FLAVOR" == "cont" ]]; then
+    export VK_ICD_FILENAMES=$ICD_DIR/virtio_icd.cont.aarch64.json
+else
+    echo "ERROR: invalid MESA_FLAVOR=$MESA_FLAVOR"
+    exit 1
+fi
+
+# init result vtest wsi no_abort log_ctx_info cache no_sparse no_gpl
+export VN_DEBUG=vtest
+"${prefix[@]}" ../build.vulkan/bin/llama-run --verbose ~/models/llama3.2 "say nothing" --ngl 99
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 99bfed751..ed1c5f479 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -122,7 +122,7 @@ llama_context::llama_context(
     }
 
     const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
-
+/*
     LLAMA_LOG_INFO("%s: n_seq_max     = %u\n",   __func__, cparams.n_seq_max);
     LLAMA_LOG_INFO("%s: n_ctx         = %u\n",   __func__, cparams.n_ctx);
     LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n",   __func__, n_ctx_per_seq);
@@ -133,7 +133,7 @@ llama_context::llama_context(
     LLAMA_LOG_INFO("%s: kv_unified    = %s\n",   __func__, cparams.kv_unified ? "true" : "false");
     LLAMA_LOG_INFO("%s: freq_base     = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale    = %g\n",   __func__, cparams.rope_freq_scale);
-
+*/
     if (n_ctx_per_seq < hparams.n_ctx_train) {
         LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index d7ab56ccd..e235547eb 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -116,7 +116,9 @@ llama_kv_cache::llama_kv_cache(
             dev_name = ggml_backend_dev_name(dev);
         }
 
+	/*
         LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, il, dev_name);
+	*/
 
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index f71c40f8e..7ee203f52 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -677,7 +677,7 @@ llama_model_loader::llama_model_loader(
             }
         }
 
-        LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        //LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
 
         for (int i = 0; i < n_kv; i++) {
             const char * name           = gguf_get_key(meta.get(), i);
@@ -694,7 +694,7 @@ llama_model_loader::llama_model_loader(
             }
             replace_all(value, "\n", "\\n");
 
-            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+            //LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
         }
 
         // print type counts
@@ -703,7 +703,7 @@ llama_model_loader::llama_model_loader(
                 continue;
             }
 
-            LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+            //LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
         }
     }
 
@@ -1154,6 +1154,7 @@ std::string llama_model_loader::ftype_name() const {
 }
 
 void llama_model_loader::print_info() const {
+    return;
     LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
     LLAMA_LOG_INFO("%s: file type   = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
     if (n_bytes < GiB) {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 7d3429617..32fec73dc 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1967,12 +1967,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
         const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
-            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
+            //LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
             return {cpu_dev, &pimpl->cpu_buft_list};
         }
         const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
         auto * dev = devices.at(layer_gpu);
-        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
+        //LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
         return {dev, &pimpl->gpu_buft_list.at(dev)};
     };
 
@@ -5769,6 +5769,7 @@ uint64_t llama_model::n_elements() const {
 }
 
 void llama_model::print_info() const {
+    return;
     const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
 
     auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index de5d1681d..b0eb171d2 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -2333,8 +2333,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             } else {
                 // token is control, but not marked as EOG -> print a debug log
                 if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
-                    LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
-                            __func__, t.second, t.first.c_str());
+                    //LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                    //        __func__, t.second, t.first.c_str());
                 }
             }
         }
@@ -3180,6 +3180,7 @@ int32_t llama_vocab::impl::detokenize(
 }
 
 void llama_vocab::impl::print_info() const {
+    return;
     LLAMA_LOG_INFO("%s: vocab type       = %s\n",     __func__, type_name().c_str());
     LLAMA_LOG_INFO("%s: n_vocab          = %u\n",     __func__, vocab.n_tokens());
     LLAMA_LOG_INFO("%s: n_merges         = %u\n",     __func__, (uint32_t) bpe_ranks.size());
@@ -3545,6 +3546,7 @@ std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, boo
 }
 
 void llama_vocab::print_info() const {
+    return;  // early exit: the pimpl->print_info() call below is never reached
     pimpl->print_info();
 }
 
diff --git a/tools/run/run.cpp b/tools/run/run.cpp
index 6fe728c68..2ae30430c 100644
--- a/tools/run/run.cpp
+++ b/tools/run/run.cpp
@@ -1000,6 +1000,34 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st
     response += piece;
 }
 
+static long long timer_start = 0;  // timestamp in ns written by start_timer()
+static long long timer_total = 0;  // sum in ns of all start_timer()/stop_timer() intervals
+static long long timer_count = 0;  // number of stop_timer() calls so far
+
+// Store the current CLOCK_MONOTONIC time, in nanoseconds, as the start of a timed interval.
+static inline void start_timer(void) {
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);  // monotonic: not affected by wall-clock adjustments
+  timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+}
+// Add the nanoseconds elapsed since the last start_timer() to timer_total; count the interval.
+static inline void stop_timer(void) {
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);  // must be the same clock that start_timer() reads
+  const long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+  timer_total += (timer_end - timer_start);
+  timer_count += 1;
+}
+
+// atexit() hook: print total timed ms, invocation count, mean ms per invocation (ITL) and rate.
+static void show_timer(void) {
+  const double ms    = timer_total / 1e6;  // ns -> ms in floating point, no integer truncation
+  const double itl   = timer_count > 0 ? ms / timer_count : 0.0;  // avoid 0/0 when nothing ran
+  const double speed = itl > 0.0 ? 1000.0 / itl : 0.0;            // avoid 1/0 -> inf
+  printe("LLAMA generate [%9.0f] ms for %4lld invocations | ITL %2.2f ms | throughput = %4.2f t/s\n", ms, timer_count, itl, speed);
+}
+
+
 // helper function to evaluate a prompt and generate a response
 static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
     const llama_vocab * vocab = llama_model_get_vocab(llama_data.model.get());
@@ -1009,10 +1037,15 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
         return 1;
     }
 
+    int cr = atexit(show_timer);
+    GGML_ASSERT(cr == 0);
+
     // prepare a batch for the prompt
     llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
     llama_token new_token_id;
+
     while (true) {
+        start_timer();
         check_context_size(llama_data.context, batch);
         if (llama_decode(llama_data.context.get(), batch)) {
             printe("failed to decode\n");
@@ -1034,6 +1067,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
 
         // prepare the next batch with the sampled token
         batch = llama_batch_get_one(&new_token_id, 1);
+	stop_timer();
     }
 
     printf(LOG_COL_DEFAULT);