diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8a156e0f9..ccae921f1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,5 +1,5 @@ # ------------------------------------------------------------ -# Build workflow (Linux x64, macOS x64, Windows x64) +# Build and test workflow (Linux x64, macOS x64, Windows x64) # ------------------------------------------------------------ name: build @@ -20,7 +20,7 @@ jobs: strategy: matrix: os: [ ubuntu-latest, macos-latest, windows-latest ] - step: [ build, check ] + step: [ build, test ] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v3 @@ -33,7 +33,7 @@ jobs: if: matrix.os == 'ubuntu-latest' run: | sudo apt-get -qq update - sudo apt-get -qq install -y make pkg-config libvpx-dev libx264-dev libopus-dev libsdl2-dev libgl1-mesa-glx + sudo apt-get -qq install -y make pkg-config libvpx-dev libx264-dev libopus-dev libsdl2-dev libyuv-dev libgl1-mesa-glx - name: Get MacOS dev libraries and tools if: matrix.os == 'macos-latest' @@ -55,9 +55,10 @@ jobs: mingw-w64-x86_64-opus mingw-w64-x86_64-x264-git mingw-w64-x86_64-SDL2 + mingw-w64-x86_64-libyuv - name: Get Windows OpenGL drivers - if: matrix.step == 'check' && matrix.os == 'windows-latest' + if: matrix.step == 'test' && matrix.os == 'windows-latest' shell: msys2 {0} run: | wget -q https://github.com/pal1000/mesa-dist-win/releases/download/20.2.1/mesa3d-20.2.1-release-mingw.7z @@ -81,28 +82,28 @@ jobs: run: | make build - - name: Verify core rendering (windows-latest) - if: matrix.step == 'check' && matrix.os == 'windows-latest' && always() + - name: Test (windows-latest) + if: matrix.step == 'test' && matrix.os == 'windows-latest' && always() shell: msys2 {0} env: MESA_GL_VERSION_OVERRIDE: 3.3COMPAT run: | - GL_CTX=-autoGlContext make verify-cores + GL_CTX=-autoGlContext make test verify-cores - - name: Verify core rendering (ubuntu-latest) - if: matrix.step == 'check' && matrix.os == 'ubuntu-latest' && always() + - name: Test (ubuntu-latest) + if: matrix.step == 'test' && matrix.os == 'ubuntu-latest' && always() env: MESA_GL_VERSION_OVERRIDE: 3.3COMPAT run: | - GL_CTX=-autoGlContext xvfb-run --auto-servernum make verify-cores + GL_CTX=-autoGlContext xvfb-run --auto-servernum make test verify-cores - - name: Verify core rendering (macos-latest) - if: matrix.step == 'check' && matrix.os == 'macos-latest' && always() + - name: Test (macos-latest) + if: matrix.step == 'test' && matrix.os == 'macos-latest' && always() run: | - make verify-cores + make test verify-cores - uses: actions/upload-artifact@v3 - if: matrix.step == 'check' && always() + if: matrix.step == 'test' && always() with: name: emulator-test-frames path: _rendered/*.png diff --git a/.github/workflows/cd/cloudretro.io/config.yaml b/.github/workflows/cd/cloudretro.io/config.yaml index 9cfb0e7b7..fa8b21a59 100644 --- a/.github/workflows/cd/cloudretro.io/config.yaml +++ b/.github/workflows/cd/cloudretro.io/config.yaml @@ -24,14 +24,16 @@ worker: domain: cloudretro.io emulator: - threads: 4 libretro: logLevel: 1 cores: list: mame: options: - "fbneo-cpu-speed-adjust": "200%" "fbneo-diagnostic-input": "Hold Start" + nes: + scale: 2 pcsx: altRepo: true + snes: + scale: 2 diff --git a/Dockerfile b/Dockerfile index 98a8a807b..d874271d5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -51,6 +51,7 @@ RUN apt-get -q update && apt-get -q install --no-install-recommends -y \ libopus-dev \ libsdl2-dev \ libvpx-dev \ + libyuv-dev \ libx264-dev \ pkg-config \ && rm -rf /var/lib/apt/lists/* diff --git 
a/Makefile b/Makefile index f8097ac01..f0afe6ad7 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,8 @@ CGO_CFLAGS='-g -O3 -funroll-loops' CGO_LDFLAGS='-g -O3' GO_TAGS=static +.PHONY: clean test + fmt: @goimports -w cmd pkg tests @gofmt -s -w cmd pkg tests @@ -32,6 +34,9 @@ build.worker: build: build.coordinator build.worker +test: + go test -v ./pkg/... + verify-cores: go test -run TestAll ./pkg/worker/room -v -renderFrames $(GL_CTX) -outputPath "../../../_rendered" diff --git a/README.md b/README.md index 66944fdce..b3f181c31 100644 --- a/README.md +++ b/README.md @@ -61,13 +61,13 @@ a better sense of performance. ``` # Ubuntu / Windows (WSL2) -apt-get install -y make gcc pkg-config libvpx-dev libx264-dev libopus-dev libsdl2-dev +apt-get install -y make gcc pkg-config libvpx-dev libx264-dev libopus-dev libsdl2-dev libyuv-dev # MacOS brew install pkg-config libvpx x264 opus sdl2 # Windows (MSYS2) -pacman -Sy --noconfirm --needed git make mingw-w64-x86_64-{gcc,pkgconf,dlfcn,libvpx,opus,x264-git,SDL2} +pacman -Sy --noconfirm --needed git make mingw-w64-x86_64-{gcc,pkgconf,dlfcn,libvpx,opus,x264-git,SDL2,libyuv} ``` Because the coordinator and workers need to run simultaneously. Workers connect to the coordinator. diff --git a/pkg/config/config.yaml b/pkg/config/config.yaml index 481269ad9..30737b082 100644 --- a/pkg/config/config.yaml +++ b/pkg/config/config.yaml @@ -99,12 +99,9 @@ worker: tag: emulator: - # set output viewport scale factor - scale: 1 - # set the total number of threads for the image processing - # (experimental) - threads: 4 + # (removed) + threads: 0 aspectRatio: # enable aspect ratio changing @@ -163,6 +160,7 @@ emulator: # - altRepo (bool) prioritize secondary repo as the download source # - lib (string) # - roms ([]string) + # - scale (int) scales the output video frames by this factor. # - folder (string) # By default emulator selection is based on the folder named as cores # in the list (i.e. 
nes, snes) but if you specify folder param, @@ -244,8 +242,6 @@ encoder: video: # h264, vpx (VP8) codec: h264 - # concurrent execution units (0 - disabled) - concurrency: 0 # see: https://trac.ffmpeg.org/wiki/Encode/H.264 h264: # Constant Rate Factor (CRF) 0-51 (default: 23) @@ -273,12 +269,6 @@ encoder: # one additional FFMPEG concat demux file recording: enabled: false - # image compression level: - # 0 - default compression - # -1 - no compression - # -2 - best speed - # -3 - best compression - compressLevel: 0 # name contains the name of the recording dir (or zip) # format: # %date:go_time_format% -- refer: https://go.dev/src/time/format.go diff --git a/pkg/config/emulator.go b/pkg/config/emulator.go index 3b9025188..dda7b4865 100644 --- a/pkg/config/emulator.go +++ b/pkg/config/emulator.go @@ -7,7 +7,6 @@ import ( ) type Emulator struct { - Scale int Threads int AspectRatio struct { Keep bool @@ -54,6 +53,7 @@ type LibretroCoreConfig struct { Lib string Options map[string]string Roms []string + Scale float64 UsesLibCo bool VFR bool Width int diff --git a/pkg/config/shared.go b/pkg/config/shared.go index 856479f4e..026b79d3d 100644 --- a/pkg/config/shared.go +++ b/pkg/config/shared.go @@ -41,11 +41,10 @@ type Server struct { } type Recording struct { - Enabled bool - CompressLevel int - Name string - Folder string - Zip bool + Enabled bool + Name string + Folder string + Zip bool } func (s *Server) WithFlags() { diff --git a/pkg/config/worker.go b/pkg/config/worker.go index ed0145d49..ab6af2cc8 100644 --- a/pkg/config/worker.go +++ b/pkg/config/worker.go @@ -52,9 +52,8 @@ type Audio struct { } type Video struct { - Codec string - Concurrency int - H264 struct { + Codec string + H264 struct { Crf uint8 LogLevel int32 Preset string diff --git a/pkg/encoder/color/bgra/bgra.go b/pkg/encoder/color/bgra/bgra.go new file mode 100644 index 000000000..39a50c228 --- /dev/null +++ b/pkg/encoder/color/bgra/bgra.go @@ -0,0 +1,56 @@ +package bgra + +import ( + "image" + "image/color" +) + +type BGRA struct { + image.RGBA +} + +var BGRAModel = color.ModelFunc(func(c color.Color) color.Color { + if _, ok := c.(BGRAColor); ok { + return c + } + r, g, b, a := c.RGBA() + return BGRAColor{uint8(r >> 8), uint8(g >> 8), uint8(b >> 8), uint8(a >> 8)} +}) + +// BGRAColor represents a BGRA color. +type BGRAColor struct { + R, G, B, A uint8 +} + +func (c BGRAColor) RGBA() (r, g, b, a uint32) { + r = uint32(c.B) + r |= r << 8 + g = uint32(c.G) + g |= g << 8 + b = uint32(c.R) + b |= b << 8 + a = uint32(255) //uint32(c.A) + a |= a << 8 + return +} + +func NewBGRA(r image.Rectangle) *BGRA { + return &BGRA{*image.NewRGBA(r)} +} + +func (p *BGRA) ColorModel() color.Model { return BGRAModel } +func (p *BGRA) At(x, y int) color.Color { + i := p.PixOffset(x, y) + s := p.Pix[i : i+4 : i+4] + return BGRAColor{s[0], s[1], s[2], s[3]} +} + +func (p *BGRA) Set(x, y int, c color.Color) { + i := p.PixOffset(x, y) + c1 := BGRAModel.Convert(c).(BGRAColor) + s := p.Pix[i : i+4 : i+4] + s[0] = c1.R + s[1] = c1.G + s[2] = c1.B + s[3] = 255 +} diff --git a/pkg/encoder/color/rgb565/rgb565.go b/pkg/encoder/color/rgb565/rgb565.go new file mode 100644 index 000000000..11c66c8bf --- /dev/null +++ b/pkg/encoder/color/rgb565/rgb565.go @@ -0,0 +1,62 @@ +package rgb565 + +import ( + "encoding/binary" + "image" + "image/color" + "math" +) + +// RGB565 is an in-memory image whose At method returns RGB565 values. +type RGB565 struct { + // Pix holds the image's pixels, as RGB565 values in big-endian format. 
The pixel at + // (x, y) starts at Pix[(y-p.Rect.Min.Y)*p.Stride + (x-p.Rect.Min.X)*2]. + Pix []uint8 + // Stride is the Pix stride (in bytes) between vertically adjacent pixels. + Stride int + // Rect is the image's bounds. + Rect image.Rectangle +} + +// Model is the model for RGB565 colors. +var Model = color.ModelFunc(func(c color.Color) color.Color { + //if _, ok := c.(Color); ok { + // return c + //} + r, g, b, _ := c.RGBA() + return Color(uint16((r<<8)&rMask | (g<<3)&gMask | (b>>3)&bMask)) +}) + +const ( + rMask = 0b1111100000000000 + gMask = 0b0000011111100000 + bMask = 0b0000000000011111 +) + +// Color represents an RGB565 color. +type Color uint16 + +func (c Color) RGBA() (r, g, b, a uint32) { + return uint32(math.Round(float64(c&rMask>>11)*255.0/31.0)) << 8, + uint32(math.Round(float64(c&gMask>>5)*255.0/63.0)) << 8, + uint32(math.Round(float64(c&bMask)*255.0/31.0)) << 8, + 0xffff +} + +func NewRGB565(r image.Rectangle) *RGB565 { + return &RGB565{Pix: make([]uint8, r.Dx()*r.Dy()<<1), Stride: r.Dx() << 1, Rect: r} +} + +func (p *RGB565) Bounds() image.Rectangle { return p.Rect } +func (p *RGB565) ColorModel() color.Model { return Model } +func (p *RGB565) PixOffset(x, y int) int { return (x-p.Rect.Min.X)<<1 + (y-p.Rect.Min.Y)*p.Stride } + +func (p *RGB565) At(x, y int) color.Color { + i := p.PixOffset(x, y) + return Color(binary.LittleEndian.Uint16(p.Pix[i : i+2])) +} + +func (p *RGB565) Set(x, y int, c color.Color) { + i := p.PixOffset(x, y) + binary.LittleEndian.PutUint16(p.Pix[i:i+2], uint16(Model.Convert(c).(Color))) +} diff --git a/pkg/encoder/color/rgba/rgba.go b/pkg/encoder/color/rgba/rgba.go new file mode 100644 index 000000000..c37d62181 --- /dev/null +++ b/pkg/encoder/color/rgba/rgba.go @@ -0,0 +1,24 @@ +package rgba + +import ( + "image" + "image/color" +) + +func ToRGBA(img image.Image, flipped bool) *image.RGBA { + bounds := img.Bounds() + sw, sh := bounds.Dx(), bounds.Dy() + dst := image.NewRGBA(image.Rect(0, 0, sw, sh)) + for y := 0; y < sh; y++ { + yy := y + if flipped { + yy = sh - y + } + for x := 0; x < sw; x++ { + px := img.At(x, y) + rgba := color.RGBAModel.Convert(px).(color.RGBA) + dst.Set(x, yy, rgba) + } + } + return dst +} diff --git a/pkg/encoder/encoder.go b/pkg/encoder/encoder.go index 66827d9e0..60e960d01 100644 --- a/pkg/encoder/encoder.go +++ b/pkg/encoder/encoder.go @@ -1,7 +1,7 @@ package encoder import ( - "image" + "fmt" "sync" "sync/atomic" @@ -10,7 +10,7 @@ import ( ) type ( - InFrame *image.RGBA + InFrame yuv.RawFrame OutFrame []byte Encoder interface { LoadBuf(input []byte) @@ -21,11 +21,13 @@ type ( } ) -type VideoEncoder struct { - encoder Encoder +type Video struct { + codec Encoder log *logger.Logger stopped atomic.Bool - y yuv.ImgProcessor + y yuv.Conv + pf yuv.PixFmt + rot uint mu sync.Mutex } @@ -41,39 +43,63 @@ const ( // converts them into YUV I420 format, // encodes with provided video encoder, and // puts the result into the output channel. 
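The comment above describes the new pipeline: raw frames are converted to YUV I420 and then handed to the codec. As a rough illustration of the API this patch introduces (not code taken from the patch itself), a caller sitting alongside pkg/encoder could drive the reworked Video type as sketched below; the helper name encodeFrame is hypothetical, and construction of the concrete codec and of yuv.RawFrame is elided because their definitions lie outside this diff.

```go
// encodeFrame is an illustrative helper (not in the patch): it wires a codec
// that satisfies the Encoder interface into the new Video pipeline and encodes
// a single raw frame into compressed bytes.
func encodeFrame(codec Encoder, frame yuv.RawFrame, w, h int, scale float64, log *logger.Logger) OutFrame {
	v := NewVideoEncoder(codec, w, h, scale, log) // wraps yuv.NewYuvConv(w, h, scale)
	v.SetPixFormat(1) // 1 -> yuv.FourccArgb, 2 -> yuv.FourccRgbp, default -> yuv.FourccAbgr (see SetPixFormat below)
	v.SetRot(90)      // stored internally as 270: the encoder de-rotates the core's output
	v.SetFlip(false)
	out := v.Encode(InFrame(frame)) // raw frame -> I420 -> codec bitstream
	v.Stop()
	return out
}
```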
-func NewVideoEncoder(enc Encoder, w, h int, concurrency int, log *logger.Logger) *VideoEncoder { - y := yuv.NewYuvImgProcessor(w, h, &yuv.Options{Threads: concurrency}) - if concurrency > 0 { - log.Info().Msgf("Use concurrent image processor: %v", concurrency) - } - return &VideoEncoder{encoder: enc, y: y, log: log} +func NewVideoEncoder(codec Encoder, w, h int, scale float64, log *logger.Logger) *Video { + return &Video{codec: codec, y: yuv.NewYuvConv(w, h, scale), log: log} } -func (vp *VideoEncoder) Encode(img InFrame) OutFrame { - vp.mu.Lock() - defer vp.mu.Unlock() - if vp.stopped.Load() { +func (v *Video) Encode(frame InFrame) OutFrame { + v.mu.Lock() + defer v.mu.Unlock() + if v.stopped.Load() { return nil } - yCbCr := vp.y.Process(img) - vp.encoder.LoadBuf(yCbCr) - vp.y.Put(&yCbCr) + yCbCr := v.y.Process(yuv.RawFrame(frame), v.rot, v.pf) + v.codec.LoadBuf(yCbCr) + v.y.Put(&yCbCr) - if frame := vp.encoder.Encode(); len(frame) > 0 { - return frame + if bytes := v.codec.Encode(); len(bytes) > 0 { + return bytes } return nil } -func (vp *VideoEncoder) SetFlip(b bool) { vp.encoder.SetFlip(b) } +func (v *Video) Info() string { return fmt.Sprintf("libyuv: %v", v.y.Version()) } + +func (v *Video) SetPixFormat(f uint32) { + switch f { + case 1: + v.pf = yuv.PixFmt(yuv.FourccArgb) + case 2: + v.pf = yuv.PixFmt(yuv.FourccRgbp) + default: + v.pf = yuv.PixFmt(yuv.FourccAbgr) + } +} + +// SetRot sets the rotation angle of the frames. +func (v *Video) SetRot(r uint) { + switch r { + // de-rotate + case 90: + v.rot = 270 + case 270: + v.rot = 90 + default: + v.rot = r + } +} + +// SetFlip tells the encoder to flip the frames vertically. +func (v *Video) SetFlip(b bool) { v.codec.SetFlip(b) } -func (vp *VideoEncoder) Stop() { - vp.stopped.Store(true) - vp.mu.Lock() - defer vp.mu.Unlock() +func (v *Video) Stop() { + v.stopped.Store(true) + v.mu.Lock() + defer v.mu.Unlock() + v.rot = 0 - if err := vp.encoder.Shutdown(); err != nil { - vp.log.Error().Err(err).Msg("failed to close the encoder") + if err := v.codec.Shutdown(); err != nil { + v.log.Error().Err(err).Msg("failed to close the encoder") } } diff --git a/pkg/encoder/yuv/libyuv/LICENSE b/pkg/encoder/yuv/libyuv/LICENSE new file mode 100644 index 000000000..c911747a6 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/LICENSE @@ -0,0 +1,29 @@ +Copyright 2011 The LibYuv Project Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/encoder/yuv/libyuv/basic_types.h b/pkg/encoder/yuv/libyuv/basic_types.h new file mode 100644 index 000000000..9c66a132a --- /dev/null +++ b/pkg/encoder/yuv/libyuv/basic_types.h @@ -0,0 +1,29 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ +#define INCLUDE_LIBYUV_BASIC_TYPES_H_ + +#include // For size_t and NULL + +#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG) +#define INT_TYPES_DEFINED + +#include // for uintptr_t and C99 types + +#endif // INT_TYPES_DEFINED + +#if !defined(LIBYUV_API) +#define LIBYUV_API +#endif // LIBYUV_API + +#define LIBYUV_BOOL int + +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/pkg/encoder/yuv/libyuv/convert.c b/pkg/encoder/yuv/libyuv/convert.c new file mode 100644 index 000000000..c59da3b1b --- /dev/null +++ b/pkg/encoder/yuv/libyuv/convert.c @@ -0,0 +1,336 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "convert.h" + +#include "basic_types.h" +#include "cpu_id.h" +#include "planar_functions.h" +#include "row.h" + +// Subsample amount uses a shift. +// v is value +// a is amount to add to round up +// s is shift to subsample down +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// Copy I420 with optional flipping. +// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure +// is does row coalescing. +LIBYUV_API +int I420Copy(const uint8_t *src_y, + int src_stride_y, + const uint8_t *src_u, + int src_stride_u, + const uint8_t *src_v, + int src_stride_v, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// Convert ARGB to I420. +LIBYUV_API +int ARGBToI420(const uint8_t *src_argb, + int src_stride_argb, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t *src_argb0, int src_stride_argb, + uint8_t *dst_u, uint8_t *dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t *src_argb, uint8_t *dst_y, int width) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + } + return 0; +} + +// Convert ABGR to I420. +LIBYUV_API +int ABGRToI420(const uint8_t *src_abgr, + int src_stride_abgr, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*ABGRToUVRow)(const uint8_t *src_abgr0, int src_stride_abgr, + uint8_t *dst_u, uint8_t *dst_v, int width) = + ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8_t *src_abgr, uint8_t *dst_y, int width) = + ABGRToYRow_C; + if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToYRow = ABGRToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToYRow = ABGRToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToYRow = ABGRToYRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVRow = ABGRToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_AVX2; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); + src_abgr += src_stride_abgr * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + } + return 0; +} + +// Convert RGB565 to I420. +LIBYUV_API +int RGB565ToI420(const uint8_t *src_rgb565, + int src_stride_rgb565, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*RGB565ToARGBRow)(const uint8_t *src_rgb, uint8_t *dst_argb, + int width) = RGB565ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t *src_argb0, int src_stride_argb, + uint8_t *dst_u, uint8_t *dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t *src_argb, uint8_t *dst_y, int width) = + ARGBToYRow_C; + if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif + { +#if !(defined(HAS_RGB565TOYROW_NEON)) + // Allocate 2 rows of ARGB. 
+ const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); +#endif + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RGB565TOYROW_NEON)) +#else + RGB565ToARGBRow(src_rgb565, row, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + row_size, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); +#endif + src_rgb565 += src_stride_rgb565 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_RGB565TOYROW_NEON)) +#else + RGB565ToARGBRow(src_rgb565, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !(defined(HAS_RGB565TOYROW_NEON)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} diff --git a/pkg/encoder/yuv/libyuv/convert.h b/pkg/encoder/yuv/libyuv/convert.h new file mode 100644 index 000000000..9a81c509c --- /dev/null +++ b/pkg/encoder/yuv/libyuv/convert.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_H_ +#define INCLUDE_LIBYUV_CONVERT_H_ + +#include "rotate.h" // For enum RotationMode. + +// Copy I420 to I420. +#define I420ToI420 I420Copy +LIBYUV_API +int I420Copy(const uint8_t *src_y, + int src_stride_y, + const uint8_t *src_u, + int src_stride_u, + const uint8_t *src_v, + int src_stride_v, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int width, + int height); + +// ARGB little endian (bgra in memory) to I420. +LIBYUV_API +int ARGBToI420(const uint8_t *src_argb, + int src_stride_argb, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int width, + int height); + +// ABGR little endian (rgba in memory) to I420. +LIBYUV_API +int ABGRToI420(const uint8_t *src_abgr, + int src_stride_abgr, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int width, + int height); + +// RGB16 (RGBP fourcc) little endian to I420. +LIBYUV_API +int RGB565ToI420(const uint8_t *src_rgb565, + int src_stride_rgb565, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int width, + int height); + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// "src_size" is needed to parse MJPG. +// "dst_stride_y" number of bytes in a row of the dst_y plane. +// Normally this would be the same as dst_width, with recommended alignment +// to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. The caller should +// allocate the I420 buffer according to rotation. +// "dst_stride_u" number of bytes in a row of the dst_u plane. +// Normally this would be the same as (dst_width + 1) / 2, with +// recommended alignment to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. +// "crop_x" and "crop_y" are starting position for cropping. 
+// To center, crop_x = (src_width - dst_width) / 2 +// crop_y = (src_height - dst_height) / 2 +// "src_width" / "src_height" is size of src_frame in pixels. +// "src_height" can be negative indicating a vertically flipped image source. +// "crop_width" / "crop_height" is the size to crop the src to. +// Must be less than or equal to src_width/src_height +// Cropping parameters are pre-rotation. +// "rotation" can be 0, 90, 180 or 270. +// "fourcc" is a fourcc. ie 'I420', 'YUY2' +// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. +LIBYUV_API +int ConvertToI420(const uint8_t *sample, + size_t sample_size, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, + enum RotationMode rotation, + uint32_t fourcc); + +#endif // INCLUDE_LIBYUV_CONVERT_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/convert_argb.h b/pkg/encoder/yuv/libyuv/convert_argb.h new file mode 100644 index 000000000..ac8e97169 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/convert_argb.h @@ -0,0 +1,24 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ +#define INCLUDE_LIBYUV_CONVERT_ARGB_H_ + +#include "basic_types.h" + +// Conversion matrix for YVU to BGR +LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601 +LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // BT.601 full +LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709 +LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full +LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020 +LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full + +#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ diff --git a/pkg/encoder/yuv/libyuv/convert_to_i420.c b/pkg/encoder/yuv/libyuv/convert_to_i420.c new file mode 100644 index 000000000..848021427 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/convert_to_i420.c @@ -0,0 +1,116 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "convert.h" +#include "video_common.h" + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// src_width is used for source stride computation +// src_height is used to compute location of planes, and indicate inversion +// sample_size is measured in bytes and is the size of the frame. +// With MJPEG it is the compressed size of the frame. 
+LIBYUV_API +int ConvertToI420(const uint8_t *sample, + size_t sample_size, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, + enum RotationMode rotation, + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); + const uint8_t *src; + // TODO(nisse): Why allow crop_height < 0? + const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; + int r = 0; + LIBYUV_BOOL need_buf = + (rotation && format != FOURCC_I420 && format != FOURCC_NV12 && + format != FOURCC_NV21 && format != FOURCC_YV12) || + dst_y == sample; + uint8_t *tmp_y = dst_y; + uint8_t *tmp_u = dst_u; + uint8_t *tmp_v = dst_v; + int tmp_y_stride = dst_stride_y; + int tmp_u_stride = dst_stride_u; + int tmp_v_stride = dst_stride_v; + uint8_t *rotate_buffer = NULL; + const int inv_crop_height = + (src_height < 0) ? -abs_crop_height : abs_crop_height; + + if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || + crop_width <= 0 || src_height == 0 || crop_height == 0) { + return -1; + } + + // One pass rotation is available for some formats. For the rest, convert + // to I420 (with optional vertical flipping) into a temporary I420 buffer, + // and then rotate the I420 to the final destination buffer. + // For in-place conversion, if destination dst_y is same as source sample, + // also enable temporary buffer. + if (need_buf) { + int y_size = crop_width * abs_crop_height; + int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); + rotate_buffer = (uint8_t *) malloc(y_size + uv_size * 2); /* NOLINT */ + if (!rotate_buffer) { + return 1; // Out of memory runtime error. + } + dst_y = rotate_buffer; + dst_u = dst_y + y_size; + dst_v = dst_u + uv_size; + dst_stride_y = crop_width; + dst_stride_u = dst_stride_v = ((crop_width + 1) / 2); + } + + switch (format) { + // Single plane formats + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_ARGB: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); + break; + default: + r = -1; // unknown fourcc - return failure code. + } + + if (need_buf) { + if (!r) { + r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride, + tmp_v, tmp_v_stride, crop_width, abs_crop_height, + rotation); + } + free(rotate_buffer); + } + + return r; +} diff --git a/pkg/encoder/yuv/libyuv/cpu_id.c b/pkg/encoder/yuv/libyuv/cpu_id.c new file mode 100644 index 000000000..166057de5 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/cpu_id.c @@ -0,0 +1,204 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "cpu_id.h" + +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \ + defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) +#include // For _xgetbv() +#endif + +// For ArmCpuCaps() but unittested on all platforms +#include // For fopen() +#include + +// For functions that use the stack and have runtime checks for overflow, +// use SAFEBUFFERS to avoid additional check. +#define SAFEBUFFERS + +// cpu_info_ variable for SIMD instruction sets detected. +LIBYUV_API int cpu_info_ = 0; + +// Low level cpuid for X86. +#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__)) && \ + !defined(__pnacl__) && !defined(__CLR_VER) +LIBYUV_API +void CpuId(int info_eax, int info_ecx, int *cpu_info) { +#if defined(_MSC_VER) + // GCC version uses inline x86 assembly. +#else // defined(_MSC_VER) + int info_ebx, info_edx; + asm volatile( +#if defined(__i386__) && defined(__PIC__) + // Preserve ebx for fpic 32 bit. + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" + : "=D"(info_ebx), +#else + "cpuid \n" + : "=b"(info_ebx), +#endif // defined( __i386__) && defined(__PIC__) + "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx)); + cpu_info[0] = info_eax; + cpu_info[1] = info_ebx; + cpu_info[2] = info_ecx; + cpu_info[3] = info_edx; +#endif // defined(_MSC_VER) +} + +#else // (defined(_M_IX86) || defined(_M_X64) ... +LIBYUV_API +void CpuId(int eax, int ecx, int* cpu_info) { + (void)eax; + (void)ecx; + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; +} +#endif + +// For VS2010 and earlier emit can be used: +// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. +// __asm { +// xor ecx, ecx // xcr 0 +// xgetbv +// mov xcr0, eax +// } +// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code. +// https://code.google.com/p/libyuv/issues/detail?id=529 +#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900) +#pragma optimize("g", off) +#endif +#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__)) && \ + !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) + +// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. +static int GetXCR0() { + int xcr0 = 0; +#if defined(__i386__) || defined(__x86_64__) + asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); +#endif // defined(__i386__) || defined(__x86_64__) + return xcr0; +} + +#else +// xgetbv unavailable to query for OSSave support. Return 0. +#define GetXCR0() 0 +#endif // defined(_M_IX86) || defined(_M_X64) .. +// Return optimization to previous setting. +#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900) +#pragma optimize("g", on) +#endif + +// Based on libvpx arm_cpudetect.c +// For Arm, but public to allow testing on any CPU +LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char *cpuinfo_name) { + char cpuinfo_line[512]; + FILE *f = fopen(cpuinfo_name, "re"); + if (!f) { + // Assume Neon if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. 
+ return kCpuHasNEON; + } + memset(cpuinfo_line, 0, sizeof(cpuinfo_line)); + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) { + if (memcmp(cpuinfo_line, "Features", 8) == 0) { + char *p = strstr(cpuinfo_line, " neon"); + if (p && (p[5] == ' ' || p[5] == '\n')) { + fclose(f); + return kCpuHasNEON; + } + // aarch64 uses asimd for Neon. + p = strstr(cpuinfo_line, " asimd"); + if (p) { + fclose(f); + return kCpuHasNEON; + } + } + } + fclose(f); + return 0; +} + +static SAFEBUFFERS int GetCpuFlags(void) { + int cpu_info = 0; +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86)) + int cpu_info0[4] = {0, 0, 0, 0}; + int cpu_info1[4] = {0, 0, 0, 0}; + int cpu_info7[4] = {0, 0, 0, 0}; + CpuId(0, 0, cpu_info0); + CpuId(1, 0, cpu_info1); + if (cpu_info0[0] >= 7) { + CpuId(7, 0, cpu_info7); + } + cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | + ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | + ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | + ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | + ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0); + + // AVX requires OS saves YMM registers. + if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave + ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers + cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | + ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0); + + // Detect AVX512bw + if ((GetXCR0() & 0xe0) == 0xe0) { + cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0; + cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; + cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; + cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; + cpu_info |= (cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0; + cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; + cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; + cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; + } + } +#endif +#if defined(__arm__) || defined(__aarch64__) + // gcc -mfpu=neon defines __ARM_NEON__ + // __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. + // For Linux, /proc/cpuinfo can be tested but without that assume Neon. +#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) + cpu_info = kCpuHasNEON; + // For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon + // flag in it. + // So for aarch64, neon enabling is hard coded here. +#endif +#if defined(__aarch64__) + cpu_info = kCpuHasNEON; +#else + // Linux arm parse text file for neon detect. + cpu_info = ArmCpuCaps("/proc/cpuinfo"); +#endif + cpu_info |= kCpuHasARM; +#endif // __arm__ + cpu_info |= kCpuInitialized; + return cpu_info; +} + +// Note that use of this function is not thread safe. +LIBYUV_API +int MaskCpuFlags(int enable_flags) { + int cpu_info = GetCpuFlags() & enable_flags; + SetCpuFlags(cpu_info); + return cpu_info; +} + +LIBYUV_API +int InitCpuFlags(void) { + return MaskCpuFlags(-1); +} diff --git a/pkg/encoder/yuv/libyuv/cpu_id.h b/pkg/encoder/yuv/libyuv/cpu_id.h new file mode 100644 index 000000000..bf50b9cd1 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/cpu_id.h @@ -0,0 +1,106 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ +#define INCLUDE_LIBYUV_CPU_ID_H_ + +#include "basic_types.h" + +// Internal flag to indicate cpuid requires initialization. +static const int kCpuInitialized = 0x1; + +// These flags are only valid on ARM processors. +static const int kCpuHasARM = 0x2; +static const int kCpuHasNEON = 0x4; +// 0x8 reserved for future ARM flag. + +// These flags are only valid on x86 processors. +static const int kCpuHasX86 = 0x10; +static const int kCpuHasSSE2 = 0x20; +static const int kCpuHasSSSE3 = 0x40; +static const int kCpuHasSSE41 = 0x80; +static const int kCpuHasSSE42 = 0x100; // unused at this time. +static const int kCpuHasAVX = 0x200; +static const int kCpuHasAVX2 = 0x400; +static const int kCpuHasERMS = 0x800; +static const int kCpuHasFMA3 = 0x1000; +static const int kCpuHasF16C = 0x2000; +static const int kCpuHasGFNI = 0x4000; +static const int kCpuHasAVX512BW = 0x8000; +static const int kCpuHasAVX512VL = 0x10000; +static const int kCpuHasAVX512VNNI = 0x20000; +static const int kCpuHasAVX512VBMI = 0x40000; +static const int kCpuHasAVX512VBMI2 = 0x80000; +static const int kCpuHasAVX512VBITALG = 0x100000; +static const int kCpuHasAVX512VPOPCNTDQ = 0x200000; + +// Optional init function. TestCpuFlag does an auto-init. +// Returns cpu_info flags. +LIBYUV_API +int InitCpuFlags(void); + +// Detect CPU has SSE2 etc. +// Test_flag parameter should be one of kCpuHas constants above. +// Returns non-zero if instruction set is detected +static __inline int TestCpuFlag(int test_flag) { + LIBYUV_API extern int cpu_info_; +#ifdef __ATOMIC_RELAXED + int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED); +#else + int cpu_info = cpu_info_; +#endif + return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag; +} + +// Internal function for parsing /proc/cpuinfo. +LIBYUV_API +int ArmCpuCaps(const char *cpuinfo_name); + +// For testing, allow CPU flags to be disabled. +// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. +// MaskCpuFlags(-1) to enable all cpu specific optimizations. +// MaskCpuFlags(1) to disable all cpu specific optimizations. +// MaskCpuFlags(0) to reset state so next call will auto init. +// Returns cpu_info flags. +LIBYUV_API +int MaskCpuFlags(int enable_flags); + +// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags| +// should be a valid combination of the kCpuHas constants above and include +// kCpuInitialized. Use this method when running in a sandboxed process where +// the detection code might fail (as it might access /proc/cpuinfo). In such +// cases the cpu_info can be obtained from a non sandboxed process by calling +// InitCpuFlags() and passed to the sandboxed process (via command line +// parameters, IPC...) which can then call this method to initialize the CPU +// flags. +// Notes: +// - when specifying 0 for |cpu_flags|, the auto initialization is enabled +// again. +// - enabling CPU features that are not supported by the CPU will result in +// undefined behavior. +// TODO(fbarchard): consider writing a helper function that translates from +// other library CPU info to libyuv CPU info and add a .md doc that explains +// CPU detection. 
+static __inline void SetCpuFlags(int cpu_flags) { + LIBYUV_API extern int cpu_info_; +#ifdef __ATOMIC_RELAXED + __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED); +#else + cpu_info_ = cpu_flags; +#endif +} + +// Low level cpuid for X86. Returns zeros on other CPUs. +// eax is the info type that you want. +// ecx is typically the cpu number, and should normally be zero. +LIBYUV_API +void CpuId(int info_eax, int info_ecx, int *cpu_info); + +#endif // INCLUDE_LIBYUV_CPU_ID_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/libyuv.go b/pkg/encoder/yuv/libyuv/libyuv.go new file mode 100644 index 000000000..98d4276ff --- /dev/null +++ b/pkg/encoder/yuv/libyuv/libyuv.go @@ -0,0 +1,142 @@ +//go:build !darwin && !no_libyuv + +package libyuv + +// see: https://chromium.googlesource.com/libyuv/libyuv + +/* +#cgo CFLAGS: -Wall +#cgo LDFLAGS: -lyuv + +#include +#include "libyuv/version.h" +#include "libyuv/video_common.h" + +// +typedef enum RotationMode { + kRotate0 = 0, // No rotation. + kRotate90 = 90, // Rotate 90 degrees clockwise. + kRotate180 = 180, // Rotate 180 degrees. + kRotate270 = 270, // Rotate 270 degrees clockwise. +} RotationModeEnum; + +// +LIBYUV_API +int ConvertToI420(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, + enum RotationMode rotation, + uint32_t fourcc); + +// Supported filtering. +typedef enum FilterMode { + kFilterNone = 0, // Point sample; Fastest. + kFilterLinear = 1, // Filter horizontally only. + kFilterBilinear = 2, // Faster than box, but lower quality scaling down. + kFilterBox = 3 // Highest quality. 
+} FilterModeEnum; + +LIBYUV_API +int I420Scale(const uint8_t *src_y, + int src_stride_y, + const uint8_t *src_u, + int src_stride_u, + const uint8_t *src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering); +*/ +import "C" +import "fmt" + +const FourccRgbp uint32 = C.FOURCC_RGBP +const FourccArgb uint32 = C.FOURCC_ARGB +const FourccAbgr uint32 = C.FOURCC_ABGR + +func Y420(src []byte, dst []byte, _, h, stride int, dw, dh int, rot uint, pix uint32, cx, cy int) { + cw := (dw + 1) / 2 + ch := (dh + 1) / 2 + i0 := dw * dh + i1 := i0 + cw*ch + yStride := dw + cStride := cw + + C.ConvertToI420( + (*C.uchar)(&src[0]), + C.size_t(0), + (*C.uchar)(&dst[0]), + C.int(yStride), + (*C.uchar)(&dst[i0]), + C.int(cStride), + (*C.uchar)(&dst[i1]), + C.int(cStride), + C.int(0), + C.int(0), + C.int(stride), + C.int(h), + C.int(cx), + C.int(cy), + C.enum_RotationMode(rot), + C.uint32_t(pix)) +} + +func Y420Scale(src []byte, dst []byte, w, h int, dw, dh int) { + srcWidthUV, dstWidthUV := (w+1)>>1, (dw+1)>>1 + srcHeightUV, dstHeightUV := (h+1)>>1, (dh+1)>>1 + + srcYPlaneSize, dstYPlaneSize := w*h, dw*dh + srcUVPlaneSize, dstUVPlaneSize := srcWidthUV*srcHeightUV, dstWidthUV*dstHeightUV + + srcStrideY, dstStrideY := w, dw + srcStrideU, dstStrideU := srcWidthUV, dstWidthUV + srcStrideV, dstStrideV := srcWidthUV, dstWidthUV + + srcY := (*C.uchar)(&src[0]) + srcU := (*C.uchar)(&src[srcYPlaneSize]) + srcV := (*C.uchar)(&src[srcYPlaneSize+srcUVPlaneSize]) + + dstY := (*C.uchar)(&dst[0]) + dstU := (*C.uchar)(&dst[dstYPlaneSize]) + dstV := (*C.uchar)(&dst[dstYPlaneSize+dstUVPlaneSize]) + + C.I420Scale( + srcY, + C.int(srcStrideY), + srcU, + C.int(srcStrideU), + srcV, + C.int(srcStrideV), + C.int(w), + C.int(h), + dstY, + C.int(dstStrideY), + dstU, + C.int(dstStrideU), + dstV, + C.int(dstStrideV), + C.int(dw), + C.int(dh), + C.enum_FilterMode(C.kFilterNone)) +} + +func Version() string { return fmt.Sprintf("%v", int(C.LIBYUV_VERSION)) } diff --git a/pkg/encoder/yuv/libyuv/libyuv2.go b/pkg/encoder/yuv/libyuv/libyuv2.go new file mode 100644 index 000000000..f4f6a68b5 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/libyuv2.go @@ -0,0 +1,89 @@ +//go:build darwin || no_libyuv + +package libyuv + +/* +#cgo CFLAGS: -Wall + +#include "basic_types.h" +#include "version.h" +#include "video_common.h" +#include "rotate.h" +#include "scale.h" +#include "convert.h" + +*/ +import "C" +import "fmt" + +const FourccRgbp uint32 = C.FOURCC_RGBP +const FourccArgb uint32 = C.FOURCC_ARGB +const FourccAbgr uint32 = C.FOURCC_ABGR + +func Y420(src []byte, dst []byte, _, h, stride int, dw, dh int, rot uint, pix uint32, cx, cy int) { + cw := (dw + 1) / 2 + ch := (dh + 1) / 2 + i0 := dw * dh + i1 := i0 + cw*ch + yStride := dw + cStride := cw + + C.ConvertToI420( + (*C.uchar)(&src[0]), + C.size_t(0), + (*C.uchar)(&dst[0]), + C.int(yStride), + (*C.uchar)(&dst[i0]), + C.int(cStride), + (*C.uchar)(&dst[i1]), + C.int(cStride), + C.int(0), + C.int(0), + C.int(stride), + C.int(h), + C.int(cx), + C.int(cy), + C.enum_RotationMode(rot), + C.uint32_t(pix)) +} + +func Y420Scale(src []byte, dst []byte, w, h int, dw, dh int) { + srcWidthUV, dstWidthUV := (w+1)>>1, (dw+1)>>1 + srcHeightUV, dstHeightUV := (h+1)>>1, (dh+1)>>1 + + srcYPlaneSize, dstYPlaneSize := w*h, dw*dh + srcUVPlaneSize, dstUVPlaneSize := srcWidthUV*srcHeightUV, dstWidthUV*dstHeightUV + + srcStrideY, dstStrideY := w, dw + 
srcStrideU, dstStrideU := srcWidthUV, dstWidthUV + srcStrideV, dstStrideV := srcWidthUV, dstWidthUV + + srcY := (*C.uchar)(&src[0]) + srcU := (*C.uchar)(&src[srcYPlaneSize]) + srcV := (*C.uchar)(&src[srcYPlaneSize+srcUVPlaneSize]) + + dstY := (*C.uchar)(&dst[0]) + dstU := (*C.uchar)(&dst[dstYPlaneSize]) + dstV := (*C.uchar)(&dst[dstYPlaneSize+dstUVPlaneSize]) + + C.I420Scale( + srcY, + C.int(srcStrideY), + srcU, + C.int(srcStrideU), + srcV, + C.int(srcStrideV), + C.int(w), + C.int(h), + dstY, + C.int(dstStrideY), + dstU, + C.int(dstStrideU), + dstV, + C.int(dstStrideV), + C.int(dw), + C.int(dh), + C.enum_FilterMode(C.kFilterNone)) +} + +func Version() string { return fmt.Sprintf("%v mod", int(C.LIBYUV_VERSION)) } diff --git a/pkg/encoder/yuv/libyuv/planar_functions.c b/pkg/encoder/yuv/libyuv/planar_functions.c new file mode 100644 index 000000000..a5d543cc5 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/planar_functions.c @@ -0,0 +1,68 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "planar_functions.h" + +#include "cpu_id.h" +#include "row.h" + +// Copy a plane of data +LIBYUV_API +void CopyPlane(const uint8_t *src_y, + int src_stride_y, + uint8_t *dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*CopyRow)(const uint8_t *src, uint8_t *dst, int width) = CopyRow_C; + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } + // Nothing to do. + if (src_y == dst_y && src_stride_y == dst_stride_y) { + return; + } + +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif + + // Copy plane + for (y = 0; y < height; ++y) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} diff --git a/pkg/encoder/yuv/libyuv/planar_functions.h b/pkg/encoder/yuv/libyuv/planar_functions.h new file mode 100644 index 000000000..222109cfc --- /dev/null +++ b/pkg/encoder/yuv/libyuv/planar_functions.h @@ -0,0 +1,46 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ +#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ + +#include "basic_types.h" + +// TODO(fbarchard): Move cpu macros to row.h +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) +#define LIBYUV_DISABLE_X86 +#endif +// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) +#define LIBYUV_DISABLE_X86 +#endif +#endif +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_ARGBAFFINEROW_SSE2 +#endif + +// Copy a plane of data. +LIBYUV_API +void CopyPlane(const uint8_t *src_y, + int src_stride_y, + uint8_t *dst_y, + int dst_stride_y, + int width, + int height); + +#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/rotate.c b/pkg/encoder/yuv/libyuv/rotate.c new file mode 100644 index 000000000..4aabae5b0 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/rotate.c @@ -0,0 +1,217 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "rotate.h" + +#include "convert.h" +#include "cpu_id.h" +#include "rotate_row.h" +#include "row.h" + +LIBYUV_API +void TransposePlane(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width, + int height) { + int i = height; + + void (*TransposeWx8)(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int width) = TransposeWx8_C; + +#if defined(HAS_TRANSPOSEWX8_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + TransposeWx8 = TransposeWx8_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + TransposeWx8 = TransposeWx8_SSSE3; + } + } +#endif +#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + TransposeWx8 = TransposeWx8_Fast_SSSE3; + } + } +#endif + + // Work across the source in 8x8 tiles + while (i >= 8) { + TransposeWx8(src, src_stride, dst, dst_stride, width); + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. + i -= 8; + } + + if (i > 0) { + TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); + } +} + +LIBYUV_API +void RotatePlane90(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width, + int height) { + // Rotate by 90 is a transpose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane270(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width, + int height) { + // Rotate by 270 is a transpose with the destination written + // from bottom to top. 
So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. + dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane180(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width, + int height) { + // Swap top and bottom row and mirror the content. Uses a temporary row. + align_buffer_64(row, width); + const uint8_t *src_bot = src + src_stride * (height - 1); + uint8_t *dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + void (*MirrorRow)(const uint8_t *src, uint8_t *dst, int width) = MirrorRow_C; + void (*CopyRow)(const uint8_t *src, uint8_t *dst, int width) = CopyRow_C; +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorRow = MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSSE3; + } + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorRow = MirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) +#endif + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + CopyRow(src, row, width); // Copy top row into buffer + MirrorRow(src_bot, dst, width); // Mirror bottom row into top row + MirrorRow(row, dst_bot, width); // Mirror buffer into bottom row + src += src_stride; + dst += dst_stride; + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64(row); +} + +LIBYUV_API +int I420Rotate(const uint8_t *src_y, + int src_stride_y, + const uint8_t *src_u, + int src_stride_u, + const uint8_t *src_v, + int src_stride_v, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if ((!src_y && dst_y) || !src_u || !src_v || width <= 0 || height == 0 || + !dst_y || !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. 
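+  // Worked example: with height = -480 the block below flips to height = 480,
+  // halfheight = 240, moves src_y to row 479 and src_u/src_v to row 239 of
+  // their planes, and negates the strides so every plane is read bottom-up.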
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + default: + break; + } + return -1; +} diff --git a/pkg/encoder/yuv/libyuv/rotate.h b/pkg/encoder/yuv/libyuv/rotate.h new file mode 100644 index 000000000..59b9ec3cb --- /dev/null +++ b/pkg/encoder/yuv/libyuv/rotate.h @@ -0,0 +1,79 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_H_ +#define INCLUDE_LIBYUV_ROTATE_H_ + +#include "basic_types.h" + +// Supported rotation. +typedef enum RotationMode { + kRotate0 = 0, // No rotation. + kRotate90 = 90, // Rotate 90 degrees clockwise. + kRotate180 = 180, // Rotate 180 degrees. + kRotate270 = 270, // Rotate 270 degrees clockwise. +} RotationModeEnum; + +// Rotate I420 frame. +LIBYUV_API +int I420Rotate(const uint8_t *src_y, + int src_stride_y, + const uint8_t *src_u, + int src_stride_u, + const uint8_t *src_v, + int src_stride_v, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); + +// Rotate planes by 90, 180, 270. Deprecated. +LIBYUV_API +void RotatePlane90(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width, + int height); + +LIBYUV_API +void RotatePlane180(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width, + int height); + +LIBYUV_API +void RotatePlane270(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width, + int height); + +// The 90 and 270 functions are based on transposes. +// Doing a transpose with reversing the read/write +// order will result in a rotation by +- 90 degrees. +// Deprecated. 
+LIBYUV_API +void TransposePlane(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width, + int height); + +#endif // INCLUDE_LIBYUV_ROTATE_H_ diff --git a/pkg/encoder/yuv/libyuv/rotate_any.c b/pkg/encoder/yuv/libyuv/rotate_any.c new file mode 100644 index 000000000..9af8c04ab --- /dev/null +++ b/pkg/encoder/yuv/libyuv/rotate_any.c @@ -0,0 +1,54 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "rotate_row.h" + +#define TANY(NAMEANY, TPOS_SIMD, MASK) \ + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \ + int dst_stride, int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ + } \ + TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ + } + +#ifdef HAS_TRANSPOSEWX8_SSSE3 + +TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) + +#endif +#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 + +TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) + +#endif +#undef TANY + +#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \ + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \ + int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \ + } \ + TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \ + dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \ + } + +#ifdef HAS_TRANSPOSEUVWX8_SSE2 + +TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) + +#endif +#undef TUVANY diff --git a/pkg/encoder/yuv/libyuv/rotate_common.c b/pkg/encoder/yuv/libyuv/rotate_common.c new file mode 100644 index 000000000..20c1481a7 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/rotate_common.c @@ -0,0 +1,77 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "rotate_row.h" + +void TransposeWx8_C(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * src_stride]; + ++src; + dst += dst_stride; + } +} + +void TransposeUVWx8_C(const uint8_t *src, + int src_stride, + uint8_t *dst_a, + int dst_stride_a, + uint8_t *dst_b, + int dst_stride_b, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_a[0] = src[0 * src_stride + 0]; + dst_b[0] = src[0 * src_stride + 1]; + dst_a[1] = src[1 * src_stride + 0]; + dst_b[1] = src[1 * src_stride + 1]; + dst_a[2] = src[2 * src_stride + 0]; + dst_b[2] = src[2 * src_stride + 1]; + dst_a[3] = src[3 * src_stride + 0]; + dst_b[3] = src[3 * src_stride + 1]; + dst_a[4] = src[4 * src_stride + 0]; + dst_b[4] = src[4 * src_stride + 1]; + dst_a[5] = src[5 * src_stride + 0]; + dst_b[5] = src[5 * src_stride + 1]; + dst_a[6] = src[6 * src_stride + 0]; + dst_b[6] = src[6 * src_stride + 1]; + dst_a[7] = src[7 * src_stride + 0]; + dst_b[7] = src[7 * src_stride + 1]; + src += 2; + dst_a += dst_stride_a; + dst_b += dst_stride_b; + } +} + +void TransposeWxH_C(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width, + int height) { + int i; + for (i = 0; i < width; ++i) { + int j; + for (j = 0; j < height; ++j) { + dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } +} diff --git a/pkg/encoder/yuv/libyuv/rotate_gcc.c b/pkg/encoder/yuv/libyuv/rotate_gcc.c new file mode 100644 index 000000000..54fdafff8 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/rotate_gcc.c @@ -0,0 +1,370 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "rotate_row.h" +#include "row.h" + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. +#if defined(HAS_TRANSPOSEWX8_SSSE3) + +void TransposeWx8_SSSE3(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. 
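+      // Each round doubles the interleave width: the punpcklbw round above
+      // paired bytes from adjacent rows, the punpcklwd round below pairs the
+      // resulting 16-bit units, and the final punpckldq round leaves one
+      // transposed 8-byte row per qword for the movq stores.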
+ "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif // defined(HAS_TRANSPOSEWX8_SSSE3) + +// Transpose 16x8. 64 bit +#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) + +void TransposeWx8_Fast_SSSE3(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqu (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqu (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqu (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqu (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqu (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqu (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" + // Second round of bit swap. 
+ "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm15"); +} + +#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) + +// Transpose UV 8x8. 64 bit. +#if defined(HAS_TRANSPOSEUVWX8_SSE2) + +void TransposeUVWx8_SSE2(const uint8_t *src, + int src_stride, + uint8_t *dst_a, + int dst_stride_a, + uint8_t *dst_b, + int dst_stride_b, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : "r"((intptr_t) (src_stride)), // %4 + "r"((intptr_t) (dst_stride_a)), // %5 + "r"((intptr_t) (dst_stride_b)) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9"); +} + +#endif // defined(HAS_TRANSPOSEUVWX8_SSE2) + +#endif // defined(__x86_64__) || defined(__i386__) diff --git a/pkg/encoder/yuv/libyuv/rotate_row.h b/pkg/encoder/yuv/libyuv/rotate_row.h new file mode 100644 index 000000000..afdae49f0 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/rotate_row.h @@ -0,0 +1,106 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ +#define INCLUDE_LIBYUV_ROTATE_ROW_H_ + +#include "basic_types.h" + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) +#define LIBYUV_DISABLE_X86 +#endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif +// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) +#define LIBYUV_DISABLE_X86 +#endif +#endif + +// The following are available for GCC 32 or 64 bit: +#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) +#define HAS_TRANSPOSEWX8_SSSE3 +#endif + +// The following are available for 64 bit GCC: +#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) +#define HAS_TRANSPOSEWX8_FAST_SSSE3 +#define HAS_TRANSPOSEUVWX8_SSE2 +#endif + +void TransposeWxH_C(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width, + int height); + +void TransposeWx8_C(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width); + +void TransposeWx8_SSSE3(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width); + +void TransposeWx8_Fast_SSSE3(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width); + +void TransposeWx8_Any_SSSE3(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width); + +void TransposeWx8_Fast_Any_SSSE3(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride, + int width); + +void TransposeUVWx8_C(const uint8_t *src, + int src_stride, + uint8_t *dst_a, + int dst_stride_a, + uint8_t *dst_b, + int dst_stride_b, + int width); + +void TransposeUVWx8_SSE2(const uint8_t *src, + int src_stride, + uint8_t *dst_a, + int dst_stride_a, + uint8_t *dst_b, + int dst_stride_b, + int width); + +void TransposeUVWx8_Any_SSE2(const uint8_t *src, + int src_stride, + uint8_t *dst_a, + int dst_stride_a, + uint8_t *dst_b, + int dst_stride_b, + int width); + +#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ diff --git a/pkg/encoder/yuv/libyuv/row.h b/pkg/encoder/yuv/libyuv/row.h new file mode 100644 index 000000000..ca1c0c298 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/row.h @@ -0,0 +1,426 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROW_H_ +#define INCLUDE_LIBYUV_ROW_H_ + +#include // For NULL +#include // For malloc + +#include "basic_types.h" + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) +#define LIBYUV_DISABLE_X86 +#endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif +// MemorySanitizer does not support assembly code yet. 
http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) +#define LIBYUV_DISABLE_X86 +#endif +#endif + +// GCC >= 4.7.0 required for AVX2. +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +#define GCC_HAS_AVX2 1 +#endif // GNUC >= 4.7 +#endif // __GNUC__ + +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +// Conversions: +#define HAS_ABGRTOYROW_SSSE3 +#define HAS_ARGBTOYROW_SSSE3 +#define HAS_BGRATOYROW_SSSE3 +#define HAS_COPYROW_ERMS +#define HAS_COPYROW_SSE2 +#define HAS_INTERPOLATEROW_SSSE3 +#define HAS_MIRRORROW_SSSE3 +#define HAS_MIRRORSPLITUVROW_SSSE3 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ABGRTOUVROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 +#endif + +// Effects: +#define HAS_ARGBGRAYROW_SSSE3 +#define HAS_ARGBMIRRORROW_SSE2 + +#endif + +// The following are available on all x86 platforms, but +// require VS2012, clang 3.4 or gcc 4.7. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ + defined(GCC_HAS_AVX2)) +#define HAS_ARGBEXTRACTALPHAROW_AVX2 +#define HAS_ARGBMIRRORROW_AVX2 +#define HAS_ARGBTOYROW_AVX2 +#define HAS_COPYROW_AVX +#define HAS_INTERPOLATEROW_AVX2 +#define HAS_MIRRORROW_AVX2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ARGBTOUVROW_AVX2 +#endif + +#endif + +// The following are available for gcc/clang x86 platforms: +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#define HAS_MIRRORUVROW_SSSE3 + +#endif + +// The following are available for AVX2 gcc/clang x86 platforms: +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && \ + (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_ABGRTOYROW_AVX2 +#define HAS_MIRRORUVROW_AVX2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ABGRTOUVROW_AVX2 +#endif + +#endif + +#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) + #if defined(VISUALC_HAS_AVX2) +#define SIMD_ALIGNED(var) __declspec(align(32)) var +#else +#define SIMD_ALIGNED(var) __declspec(align(16)) var +#endif +#define LIBYUV_NOINLINE __declspec(noinline) +typedef __declspec(align(16)) int16_t vec16[8]; +typedef __declspec(align(16)) int32_t vec32[4]; +typedef __declspec(align(16)) float vecf32[4]; +typedef __declspec(align(16)) int8_t vec8[16]; +typedef __declspec(align(16)) uint16_t uvec16[8]; +typedef __declspec(align(16)) uint32_t uvec32[4]; +typedef __declspec(align(16)) uint8_t uvec8[16]; +typedef __declspec(align(32)) int16_t lvec16[16]; +typedef __declspec(align(32)) int32_t lvec32[8]; +typedef __declspec(align(32)) int8_t lvec8[32]; +typedef __declspec(align(32)) uint16_t ulvec16[16]; +typedef __declspec(align(32)) uint32_t ulvec32[8]; +typedef __declspec(align(32)) uint8_t ulvec8[32]; +#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__)) +// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. 
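+// SIMD_ALIGNED aligns a variable to the widest vector register in use: 32
+// bytes (one ymm) when AVX2 is available, 16 bytes (one xmm) otherwise. The
+// row wrappers in row_any.c rely on it for their temporary vin/vout buffers.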
+#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2) +#define SIMD_ALIGNED(var) var __attribute__((aligned(32))) +#else +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +#endif +#define LIBYUV_NOINLINE __attribute__((noinline)) +typedef int16_t __attribute__((vector_size(16))) vec16; +typedef int32_t __attribute__((vector_size(16))) vec32; +typedef float __attribute__((vector_size(16))) vecf32; +typedef int8_t __attribute__((vector_size(16))) vec8; +typedef uint16_t __attribute__((vector_size(16))) uvec16; +typedef uint32_t __attribute__((vector_size(16))) uvec32; +typedef uint8_t __attribute__((vector_size(16))) uvec8; +typedef int16_t __attribute__((vector_size(32))) lvec16; +typedef int32_t __attribute__((vector_size(32))) lvec32; +typedef int8_t __attribute__((vector_size(32))) lvec8; +typedef uint16_t __attribute__((vector_size(32))) ulvec16; +typedef uint32_t __attribute__((vector_size(32))) ulvec32; +typedef uint8_t __attribute__((vector_size(32))) ulvec8; +#else +#define SIMD_ALIGNED(var) var +#define LIBYUV_NOINLINE +typedef int16_t vec16[8]; +typedef int32_t vec32[4]; +typedef float vecf32[4]; +typedef int8_t vec8[16]; +typedef uint16_t uvec16[8]; +typedef uint32_t uvec32[4]; +typedef uint8_t uvec8[16]; +typedef int16_t lvec16[16]; +typedef int32_t lvec32[8]; +typedef int8_t lvec8[32]; +typedef uint16_t ulvec16[16]; +typedef uint32_t ulvec32[8]; +typedef uint8_t ulvec8[32]; +#endif + +#if !defined(__aarch64__) || !defined(__arm__) +// This struct is for Intel color conversion. +struct YuvConstants { + uint8_t kUVToB[32]; + uint8_t kUVToG[32]; + uint8_t kUVToR[32]; + int16_t kYToRgb[16]; + int16_t kYBiasToRgb[16]; +}; +#endif + +#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) + +#define align_buffer_64(var, size) \ + void* var##_mem = malloc((size) + 63); /* NOLINT */ \ + uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */ + +#define free_aligned_buffer_64(var) \ + free(var##_mem); \ + var = NULL + +#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) +#define OMITFP +#else +#define OMITFP __attribute__((optimize("omit-frame-pointer"))) +#endif + +// NaCL macros for GCC x86 and x64. 
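+// LABELALIGN aligns the "1:" loop labels inside the inline-assembly kernels
+// to a 32-byte boundary (.p2align 5) on Native Client and expands to nothing
+// everywhere else.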
+#if defined(__native_client__) +#define LABELALIGN ".p2align 5\n" +#else +#define LABELALIGN +#endif + +void ARGBToYRow_AVX2(const uint8_t *src_argb, uint8_t *dst_y, int width); + +void ARGBToYRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); + +void ABGRToYRow_AVX2(const uint8_t *src_abgr, uint8_t *dst_y, int width); + +void ABGRToYRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); + +void ARGBToYRow_SSSE3(const uint8_t *src_argb, uint8_t *dst_y, int width); + +void ABGRToYRow_SSSE3(const uint8_t *src_abgr, uint8_t *dst_y, int width); + +void BGRAToYRow_SSSE3(const uint8_t *src_bgra, uint8_t *dst_y, int width); + +void ABGRToYRow_SSSE3(const uint8_t *src_abgr, uint8_t *dst_y, int width); + +void ARGBToYRow_C(const uint8_t *src_rgb, uint8_t *dst_y, int width); + +void ABGRToYRow_C(const uint8_t *src_rgb, uint8_t *dst_y, int width); + +void RGB565ToYRow_C(const uint8_t *src_rgb565, uint8_t *dst_y, int width); + +void ARGBToYRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); + +void BGRAToYRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); + +void ABGRToYRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); + +void ARGBToUVRow_AVX2(const uint8_t *src_argb, + int src_stride_argb, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void ABGRToUVRow_AVX2(const uint8_t *src_abgr, + int src_stride_abgr, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void ARGBToUVRow_SSSE3(const uint8_t *src_argb, + int src_stride_argb, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void BGRAToUVRow_SSSE3(const uint8_t *src_bgra, + int src_stride_bgra, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void ABGRToUVRow_SSSE3(const uint8_t *src_abgr, + int src_stride_abgr, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void RGBAToUVRow_SSSE3(const uint8_t *src_rgba, + int src_stride_rgba, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void ARGBToUVRow_Any_AVX2(const uint8_t *src_ptr, + int src_stride, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void ABGRToUVRow_Any_AVX2(const uint8_t *src_ptr, + int src_stride, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void ARGBToUVRow_Any_SSSE3(const uint8_t *src_ptr, + int src_stride, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void BGRAToUVRow_Any_SSSE3(const uint8_t *src_ptr, + int src_stride, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void ABGRToUVRow_Any_SSSE3(const uint8_t *src_ptr, + int src_stride, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void RGBAToUVRow_Any_SSSE3(const uint8_t *src_ptr, + int src_stride, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void ARGBToUVRow_C(const uint8_t *src_rgb, + int src_stride_rgb, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void ARGBToUVRow_C(const uint8_t *src_rgb, + int src_stride_rgb, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void BGRAToUVRow_C(const uint8_t *src_rgb, + int src_stride_rgb, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void ABGRToUVRow_C(const uint8_t *src_rgb, + int src_stride_rgb, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void RGBAToUVRow_C(const uint8_t *src_rgb, + int src_stride_rgb, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void RGB565ToUVRow_C(const uint8_t *src_rgb565, + int src_stride_rgb565, + uint8_t *dst_u, + uint8_t *dst_v, + int width); + +void MirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width); + +void MirrorRow_SSSE3(const uint8_t *src, uint8_t *dst, int width); + +void 
MirrorRow_C(const uint8_t *src, uint8_t *dst, int width); + +void MirrorRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); + +void MirrorRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); + +void MirrorRow_Any_SSE2(const uint8_t *src, uint8_t *dst, int width); + +void MirrorUVRow_AVX2(const uint8_t *src_uv, uint8_t *dst_uv, int width); + +void MirrorUVRow_SSSE3(const uint8_t *src_uv, uint8_t *dst_uv, int width); + +void MirrorUVRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); + +void MirrorUVRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); + +void ARGBMirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width); + +void ARGBMirrorRow_SSE2(const uint8_t *src, uint8_t *dst, int width); + +void ARGBMirrorRow_C(const uint8_t *src, uint8_t *dst, int width); + +void ARGBMirrorRow_Any_AVX2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int width); + +void ARGBMirrorRow_Any_SSE2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int width); + +void CopyRow_SSE2(const uint8_t *src, uint8_t *dst, int width); + +void CopyRow_AVX(const uint8_t *src, uint8_t *dst, int width); + +void CopyRow_ERMS(const uint8_t *src, uint8_t *dst, int width); + +void CopyRow_C(const uint8_t *src, uint8_t *dst, int count); + +void CopyRow_Any_SSE2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); + +void CopyRow_Any_AVX(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); + +void RGB565ToARGBRow_SSE2(const uint8_t *src, uint8_t *dst, int width); + +void RGB565ToARGBRow_AVX2(const uint8_t *src_rgb565, + uint8_t *dst_argb, + int width); + +void RGB565ToARGBRow_C(const uint8_t *src_rgb565, uint8_t *dst_argb, int width); + +void RGB565ToARGBRow_Any_SSE2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int width); + +void RGB565ToARGBRow_Any_AVX2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int width); + +// Used for I420Scale, ARGBScale, and ARGBInterpolate. +void InterpolateRow_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); + +void InterpolateRow_SSSE3(uint8_t *dst_ptr, + const uint8_t *src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction); + +void InterpolateRow_AVX2(uint8_t *dst_ptr, + const uint8_t *src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction); + +void InterpolateRow_Any_SSSE3(uint8_t *dst_ptr, + const uint8_t *src_ptr, + ptrdiff_t src_stride_ptr, + int width, + int source_y_fraction); + +void InterpolateRow_Any_AVX2(uint8_t *dst_ptr, + const uint8_t *src_ptr, + ptrdiff_t src_stride_ptr, + int width, + int source_y_fraction); + +#endif // INCLUDE_LIBYUV_ROW_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/row_any.c b/pkg/encoder/yuv/libyuv/row_any.c new file mode 100644 index 000000000..fcc49c672 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/row_any.c @@ -0,0 +1,206 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "row.h" + +#include // For memset. + +// Subsampled source needs to be increase by 1 of not even. +#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) + +// Any 1 to 1. 
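+// The wrapper runs ANY_SIMD on the largest multiple of MASK + 1 pixels, then
+// copies the leftover width & MASK pixels into the zeroed, aligned temporary
+// buffers, runs ANY_SIMD once more on a full MASK + 1 block, and copies only
+// the valid tail back to dst_ptr. CopyRow_Any_AVX, for example, copies
+// width & ~63 bytes directly and the last width & 63 bytes via the buffers.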
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[128]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(vin, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ + } + +#ifdef HAS_COPYROW_AVX + +ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) + +#endif +#ifdef HAS_COPYROW_SSE2 + +ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31) + +#endif + +#ifdef HAS_ARGBTOYROW_AVX2 + +ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) + +#endif +#ifdef HAS_ABGRTOYROW_AVX2 + +ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31) + +#endif +#ifdef HAS_ARGBTOYROW_SSSE3 + +ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) + +#endif +#ifdef HAS_BGRATOYROW_SSSE3 + +ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) + +ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) + +#endif + +#undef ANY11 + +// Any 1 to 1 interpolate. Takes 2 rows of source via stride. +#define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \ + void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \ + int width, int source_y_fraction) { \ + SIMD_ALIGNED(TS vin[64 * 2]); \ + SIMD_ALIGNED(TD vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \ + } \ + memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ + if (source_y_fraction) { \ + memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \ + r * SBPP * sizeof(TS)); \ + } \ + ANY_SIMD(vout, vin, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); \ + } + +#ifdef HAS_INTERPOLATEROW_AVX2 + +ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31) + +#endif +#ifdef HAS_INTERPOLATEROW_SSSE3 + +ANY11I(InterpolateRow_Any_SSSE3, + InterpolateRow_SSSE3, + uint8_t, + uint8_t, + 1, + 1, + 15) + +#endif + +#undef ANY11I + +// Any 1 to 1 mirror. +#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[64]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ + } \ + memcpy(vin, src_ptr, r* BPP); \ + ANY_SIMD(vin, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \ + } + +#ifdef HAS_MIRRORROW_AVX2 + +ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) + +#endif +#ifdef HAS_MIRRORROW_SSSE3 + +ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) + +#endif +#ifdef HAS_MIRRORUVROW_AVX2 + +ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15) + +#endif +#ifdef HAS_MIRRORUVROW_SSSE3 + +ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7) + +#endif +#ifdef HAS_ARGBMIRRORROW_AVX2 + +ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) + +#endif +#ifdef HAS_ARGBMIRRORROW_SSE2 + +ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) + +#endif +#undef ANY11M + +// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. +// 128 byte row allows for 32 avx ARGB pixels. 
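+// The tail handling mirrors ANY11: the remainders of the current row and of
+// the row at src_stride go into the two 128-byte halves of vin, the last
+// pixel is duplicated when width is odd so the 2x2 subsample does not average
+// in zero filler, and SS(r, 1) bytes of U and V are copied back out.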
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ + uint8_t* dst_v, int width) { \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ + SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ + memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \ + BPP); \ + memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \ + vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + } \ + ANY_SIMD(vin, 128, vout, vout + 128, MASK + 1); \ + memcpy(dst_u + (n >> 1), vout, SS(r, 1)); \ + memcpy(dst_v + (n >> 1), vout + 128, SS(r, 1)); \ + } + +#ifdef HAS_ARGBTOUVROW_AVX2 + +ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) + +#endif +#ifdef HAS_ABGRTOUVROW_AVX2 + +ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) + +#endif +#ifdef HAS_ARGBTOUVROW_SSSE3 + +ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) + +ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) + +ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) + +ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) + +#endif +#undef ANY12S diff --git a/pkg/encoder/yuv/libyuv/row_common.c b/pkg/encoder/yuv/libyuv/row_common.c new file mode 100644 index 000000000..34a93a074 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/row_common.c @@ -0,0 +1,887 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "row.h" + +#include +#include // For memcpy and memset. + +#define STATIC_CAST(type, expr) (type)(expr) + +// This macro controls YUV to RGB using unsigned math to extend range of +// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B: +// LIBYUV_UNLIMITED_DATA + +// Macros to enable unlimited data for each colorspace +// LIBYUV_UNLIMITED_BT601 +// LIBYUV_UNLIMITED_BT709 +// LIBYUV_UNLIMITED_BT2020 + +#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86)) +#define LIBYUV_ARGBTOUV_PAVGB 1 +#define LIBYUV_RGBTOU_TRUNCATE 1 +#endif +#if defined(LIBYUV_BIT_EXACT) +#define LIBYUV_UNATTENUATE_DUP 1 +#endif + +// llvm x86 is poor at ternary operator, so use branchless min/max. + +#define USE_BRANCHLESS 1 +#if USE_BRANCHLESS + +static __inline int32_t clamp0(int32_t v) { + return -(v >= 0) & v; +} + +// TODO(fbarchard): make clamp255 preserve negative values. 
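+// Branchless clamping: (v >= 255) evaluates to 1 or 0, so -(v >= 255) is
+// either all ones or zero; OR-ing it into v and masking with 255 yields 255
+// for v >= 255 and leaves 0 <= v < 255 unchanged. clamp0 above uses the same
+// trick from the other side: -(v >= 0) & v keeps v when non-negative, else 0.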
+static __inline int32_t clamp255(int32_t v) { + return (-(v >= 255) | v) & 255; +} + +static __inline int32_t clamp1023(int32_t v) { + return (-(v >= 1023) | v) & 1023; +} + +// clamp to max +static __inline int32_t ClampMax(int32_t v, int32_t max) { + return (-(v >= max) | v) & max; +} + +static __inline uint32_t Abs(int32_t v) { + int m = -(v < 0); + return (v + m) ^ m; +} + +#else // USE_BRANCHLESS +static __inline int32_t clamp0(int32_t v) { + return (v < 0) ? 0 : v; +} + +static __inline int32_t clamp255(int32_t v) { + return (v > 255) ? 255 : v; +} + +static __inline int32_t clamp1023(int32_t v) { + return (v > 1023) ? 1023 : v; +} + +static __inline int32_t ClampMax(int32_t v, int32_t max) { + return (v > max) ? max : v; +} + +static __inline uint32_t Abs(int32_t v) { + return (v < 0) ? -v : v; +} +#endif // USE_BRANCHLESS + +static __inline uint32_t Clamp(int32_t val) { + int v = clamp0(val); + return (uint32_t) (clamp255(v)); +} + +static __inline uint32_t Clamp10(int32_t val) { + int v = clamp0(val); + return (uint32_t) (clamp1023(v)); +} + +// Little Endian +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define WRITEWORD(p, v) *(uint32_t*)(p) = v +#else +static inline void WRITEWORD(uint8_t* p, uint32_t v) { + p[0] = (uint8_t)(v & 255); + p[1] = (uint8_t)((v >> 8) & 255); + p[2] = (uint8_t)((v >> 16) & 255); + p[3] = (uint8_t)((v >> 24) & 255); +} +#endif + +void RGB565ToARGBRow_C(const uint8_t *src_rgb565, + uint8_t *dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + dst_argb[1] = STATIC_CAST(uint8_t, (g << 2) | (g >> 4)); + dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); + dst_argb[3] = 255u; + dst_argb += 4; + src_rgb565 += 2; + } +} + +// 8 bit +// Intel SSE/AVX uses the following equivalent formula +// 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round. +// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) + +// 0x7e80) >> 8; + +static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8); +} + +#define AVGB(a, b) (((a) + (b) + 1) >> 1) + +// LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round. +#ifdef LIBYUV_RGBTOU_TRUNCATE + +static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8000) >> 8); +} + +static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8000) >> 8); +} + +#else +// TODO(fbarchard): Add rounding to x86 SIMD and use this +static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8); +} +static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8080) >> 8); +} +#endif + +// LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb. 
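+// With LIBYUV_ARGBTOUV_PAVGB defined, each 2x2 block is reduced with nested
+// rounding AVGB averages and fed to RGBToU/RGBToV. Without it (the ARM path),
+// the four samples are summed and halved once, and RGB2xToU/RGB2xToV below
+// compensate by using coefficients that are half as large.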
+#if !defined(LIBYUV_ARGBTOUV_PAVGB) +static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) { + return STATIC_CAST( + uint8_t, ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8); +} +static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { + return STATIC_CAST( + uint8_t, ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8); +} +#endif + +// ARGBToY_C and ARGBToUV_C +// Intel version mimic SSE/AVX which does 2 pavgb +#if LIBYUV_ARGBTOUV_PAVGB +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ + AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ + AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ + AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ + } +#else +// ARM version does sum / 2 then multiply by 2x smaller coefficients +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = src_rgb[B] + src_rgb1[B]; \ + uint16_t ag = src_rgb[G] + src_rgb1[G]; \ + uint16_t ar = src_rgb[R] + src_rgb1[R]; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + } \ + } +#endif + +MAKEROWY(ARGB, 2, 1, 0, 4) + +MAKEROWY(BGRA, 1, 2, 3, 4) + +MAKEROWY(ABGR, 0, 1, 2, 4) + +MAKEROWY(RGBA, 3, 2, 1, 4) + +#undef MAKEROWY + +// JPeg uses a variation on BT.601-1 full range +// y = 0.29900 * r + 0.58700 * g + 0.11400 * b +// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center +// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center +// BT.601 Mpeg range uses: +// b 0.1016 * 255 = 25.908 = 25 +// g 0.5078 * 255 = 129.489 = 129 +// r 0.2578 * 255 = 65.739 = 66 +// JPeg 7 bit Y (deprecated) +// b 0.11400 * 128 = 14.592 = 15 +// g 0.58700 * 128 = 75.136 = 75 +// 
r 0.29900 * 128 = 38.272 = 38 +// JPeg 8 bit Y: +// b 0.11400 * 256 = 29.184 = 29 +// g 0.58700 * 256 = 150.272 = 150 +// r 0.29900 * 256 = 76.544 = 77 +// JPeg 8 bit U: +// b 0.50000 * 255 = 127.5 = 127 +// g -0.33126 * 255 = -84.4713 = -84 +// r -0.16874 * 255 = -43.0287 = -43 +// JPeg 8 bit V: +// b -0.08131 * 255 = -20.73405 = -20 +// g -0.41869 * 255 = -106.76595 = -107 +// r 0.50000 * 255 = 127.5 = 127 + +// 8 bit +static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { + return (77 * r + 150 * g + 29 * b + 128) >> 8; +} + +#if defined(LIBYUV_ARGBTOUV_PAVGB) + +static __inline uint8_t RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { + return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; +} + +static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { + return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; +} + +#else +static __inline uint8_t RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { + return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8; +} +static __inline uint8_t RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { + return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8; +} +#endif + +// ARGBToYJ_C and ARGBToUVJ_C +// Intel version mimic SSE/AVX which does 2 pavgb +#if LIBYUV_ARGBTOUV_PAVGB +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ + AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ + AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ + AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ + } +#else +// ARM version does sum / 2 then multiply by 2x smaller coefficients +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = 
(src_rgb[B] + src_rgb1[B]); \ + uint16_t ag = (src_rgb[G] + src_rgb1[G]); \ + uint16_t ar = (src_rgb[R] + src_rgb1[R]); \ + dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + } \ + } + +#endif + +MAKEROWYJ(ARGB, 2, 1, 0, 4) + +MAKEROWYJ(ABGR, 0, 1, 2, 4) + +MAKEROWYJ(RGBA, 3, 2, 1, 4) + +MAKEROWYJ(RGB24, 2, 1, 0, 3) + +MAKEROWYJ(RAW, 0, 1, 2, 3) + +#undef MAKEROWYJ + +void RGB565ToYRow_C(const uint8_t *src_rgb565, uint8_t *dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_rgb565[0] & 0x1f; + uint8_t g = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r = src_rgb565[1] >> 3; + b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + g = STATIC_CAST(uint8_t, (g << 2) | (g >> 4)); + r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); + dst_y[0] = RGBToY(r, g, b); + src_rgb565 += 2; + dst_y += 1; + } +} + +void RGB565ToUVRow_C(const uint8_t *src_rgb565, + int src_stride_rgb565, + uint8_t *dst_u, + uint8_t *dst_v, + int width) { + const uint8_t *next_rgb565 = src_rgb565 + src_stride_rgb565; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + uint8_t b1 = STATIC_CAST(uint8_t, src_rgb565[2] & 0x1f); + uint8_t g1 = STATIC_CAST( + uint8_t, (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3)); + uint8_t r1 = STATIC_CAST(uint8_t, src_rgb565[3] >> 3); + uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3); + uint8_t b3 = STATIC_CAST(uint8_t, next_rgb565[2] & 0x1f); + uint8_t g3 = STATIC_CAST( + uint8_t, (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3)); + uint8_t r3 = STATIC_CAST(uint8_t, next_rgb565[3] >> 3); + + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2)); + g1 = STATIC_CAST(uint8_t, (g1 << 2) | (g1 >> 4)); + r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); + b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2)); + g3 = STATIC_CAST(uint8_t, (g3 << 2) | (g3 >> 4)); + r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2)); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; + uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; + uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif + + src_rgb565 += 4; + next_rgb565 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, 
next_rgb565[1] >> 3); + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); + +#if LIBYUV_ARGBTOUV_PAVGB + uint8_t ab = AVGB(b0, b2); + uint8_t ag = AVGB(g0, g2); + uint8_t ar = AVGB(r0, r2); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); +#else + uint16_t b = b0 + b2; + uint16_t g = g0 + g2; + uint16_t r = r0 + r2; + dst_u[0] = RGB2xToU(r, g, b); + dst_v[0] = RGB2xToV(r, g, b); +#endif + } +} + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v* f >> 24 + +#undef REPEAT8 +#undef SHADE + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v* f >> 16 + +#undef REPEAT8 +#undef SHADE + +#define SHADE(f, v) clamp255(v + f) + +#undef SHADE + +#define SHADE(f, v) clamp0(f - v) + +#undef SHADE + +// Macros to create SIMD specific yuv to rgb conversion constants. + +// clang-format off + +#if defined(__aarch64__) || defined(__arm__) +// Bias values include subtract 128 from U and V, bias from Y and rounding. +// For B and R bias is negative. For G bias is positive. +#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ + {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \ + {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \ + 0, 0}} +#else +#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ + {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \ + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \ + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \ + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \ + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \ + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \ + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \ + {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}} +#endif + +// clang-format on + +#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \ + const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \ + YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \ + const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \ + YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB); + +// TODO(fbarchard): Generate SIMD structures from float matrix. + +// BT.601 limited range YUV to RGB reference +// R = (Y - 16) * 1.164 + V * 1.596 +// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 +// B = (Y - 16) * 1.164 + U * 2.018 +// KR = 0.299; KB = 0.114 + +// U and V contributions to R,G,B. +#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT601) +#define UB 129 /* round(2.018 * 64) */ +#else +#define UB 128 /* max(128, round(2.018 * 64)) */ +#endif +#define UG 25 /* round(0.391 * 64) */ +#define VG 52 /* round(0.813 * 64) */ +#define VR 102 /* round(1.596 * 64) */ + +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.601 full range YUV to RGB reference (aka JPEG) +// * R = Y + V * 1.40200 +// * G = Y - U * 0.34414 - V * 0.71414 +// * B = Y + U * 1.77200 +// KR = 0.299; KB = 0.114 + +// U and V contributions to R,G,B. 
+#define UB 113 /* round(1.77200 * 64) */ +#define UG 22 /* round(0.34414 * 64) */ +#define VG 46 /* round(0.71414 * 64) */ +#define VR 90 /* round(1.40200 * 64) */ + +// Y contribution to R,G,B. Scale and bias. +#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ +#define YB 32 /* 64 / 2 */ + +MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.709 limited range YUV to RGB reference +// R = (Y - 16) * 1.164 + V * 1.793 +// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533 +// B = (Y - 16) * 1.164 + U * 2.112 +// KR = 0.2126, KB = 0.0722 + +// U and V contributions to R,G,B. +#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT709) +#define UB 135 /* round(2.112 * 64) */ +#else +#define UB 128 /* max(128, round(2.112 * 64)) */ +#endif +#define UG 14 /* round(0.213 * 64) */ +#define VG 34 /* round(0.533 * 64) */ +#define VR 115 /* round(1.793 * 64) */ + +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.709 full range YUV to RGB reference +// R = Y + V * 1.5748 +// G = Y - U * 0.18732 - V * 0.46812 +// B = Y + U * 1.8556 +// KR = 0.2126, KB = 0.0722 + +// U and V contributions to R,G,B. +#define UB 119 /* round(1.8556 * 64) */ +#define UG 12 /* round(0.18732 * 64) */ +#define VG 30 /* round(0.46812 * 64) */ +#define VR 101 /* round(1.5748 * 64) */ + +// Y contribution to R,G,B. Scale and bias. (same as jpeg) +#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ +#define YB 32 /* 64 / 2 */ + +MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.2020 limited range YUV to RGB reference +// R = (Y - 16) * 1.164384 + V * 1.67867 +// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042 +// B = (Y - 16) * 1.164384 + U * 2.14177 +// KR = 0.2627; KB = 0.0593 + +// U and V contributions to R,G,B. +#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT2020) +#define UB 137 /* round(2.142 * 64) */ +#else +#define UB 128 /* max(128, round(2.142 * 64)) */ +#endif +#define UG 12 /* round(0.187326 * 64) */ +#define VG 42 /* round(0.65042 * 64) */ +#define VR 107 /* round(1.67867 * 64) */ + +// Y contribution to R,G,B. Scale and bias. +#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */ +#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */ + +MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +// BT.2020 full range YUV to RGB reference +// R = Y + V * 1.474600 +// G = Y - U * 0.164553 - V * 0.571353 +// B = Y + U * 1.881400 +// KR = 0.2627; KB = 0.0593 + +#define UB 120 /* round(1.881400 * 64) */ +#define UG 11 /* round(0.164553 * 64) */ +#define VG 37 /* round(0.571353 * 64) */ +#define VR 94 /* round(1.474600 * 64) */ + +// Y contribution to R,G,B. Scale and bias. 
(same as jpeg) +#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ +#define YB 32 /* 64 / 2 */ + +MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR) + +#undef YG +#undef YB +#undef UB +#undef UG +#undef VG +#undef VR + +#undef BB +#undef BG +#undef BR + +#undef MAKEYUVCONSTANTS + +#if defined(__aarch64__) || defined(__arm__) +#define LOAD_YUV_CONSTANTS \ + int ub = yuvconstants->kUVCoeff[0]; \ + int vr = yuvconstants->kUVCoeff[1]; \ + int ug = yuvconstants->kUVCoeff[2]; \ + int vg = yuvconstants->kUVCoeff[3]; \ + int yg = yuvconstants->kRGBCoeffBias[0]; \ + int bb = yuvconstants->kRGBCoeffBias[1]; \ + int bg = yuvconstants->kRGBCoeffBias[2]; \ + int br = yuvconstants->kRGBCoeffBias[3] + +#define CALC_RGB16 \ + int32_t y1 = (uint32_t)(y32 * yg) >> 16; \ + int b16 = y1 + (u * ub) - bb; \ + int g16 = y1 + bg - (u * ug + v * vg); \ + int r16 = y1 + (v * vr) - br +#else +#define LOAD_YUV_CONSTANTS \ + int ub = yuvconstants->kUVToB[0]; \ + int ug = yuvconstants->kUVToG[0]; \ + int vg = yuvconstants->kUVToG[1]; \ + int vr = yuvconstants->kUVToR[1]; \ + int yg = yuvconstants->kYToRgb[0]; \ + int yb = yuvconstants->kYBiasToRgb[0] + +#define CALC_RGB16 \ + int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \ + int8_t ui = (int8_t)u; \ + int8_t vi = (int8_t)v; \ + ui -= 0x80; \ + vi -= 0x80; \ + int b16 = y1 + (ui * ub); \ + int g16 = y1 - (ui * ug + vi * vg); \ + int r16 = y1 + (vi * vr) +#endif + +void MirrorRow_C(const uint8_t *src, uint8_t *dst, int width) { + int x; + src += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst[x] = src[0]; + dst[x + 1] = src[-1]; + src -= 2; + } + if (width & 1) { + dst[width - 1] = src[0]; + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +// TODO(fbarchard): change scale to bits +#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16) + +void CopyRow_C(const uint8_t *src, uint8_t *dst, int count) { + memcpy(dst, src, count); +} + +// Divide source RGB by alpha and store to destination. +// b = (b * 255 + (a / 2)) / a; +// g = (g * 255 + (a / 2)) / a; +// r = (r * 255 + (a / 2)) / a; +// Reciprocal method is off by 1 on some values. ie 125 +// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. 
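// Illustrative note on the table below: each entry is
// T(a) = 0x01000000 + (0x10000 / a), i.e. 1.0 in the upper 16 bits and
// 65536 / a in the lower 16 bits (an 8.8 fixed-point 256 / a). For example,
// for a = 128 the low half is 512, so a premultiplied channel value b can be
// restored as roughly (b * 512) >> 8 = 2 * b versus the exact b * 255 / 128;
// that rounding gap is the "off by 1" case mentioned above.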
+#define T(a) 0x01000000 + (0x10000 / a) +const uint32_t fixed_invtbl8[256] = { + 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), + T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), + T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), + T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b), + T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22), + T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29), + T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30), + T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), + T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), + T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), + T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53), + T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a), + T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61), + T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68), + T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), + T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), + T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), + T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b), + T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92), + T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99), + T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0), + T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), + T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), + T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), + T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3), + T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca), + T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1), + T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8), + T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), + T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), + T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), + T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb), + T(0xfc), T(0xfd), T(0xfe), 0x01000100}; +#undef T + +// Blend 2 rows into 1. +static void HalfRow_C(const uint8_t *src_uv, + ptrdiff_t src_uv_stride, + uint8_t *dst_uv, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +// C version 2x2 -> 2x1. 
+void InterpolateRow_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t *src_ptr1 = src_ptr + src_stride; + int x; + assert(source_y_fraction >= 0); + assert(source_y_fraction < 256); + + if (y1_fraction == 0) { + memcpy(dst_ptr, src_ptr, width); + return; + } + if (y1_fraction == 128) { + HalfRow_C(src_ptr, src_stride, dst_ptr, width); + return; + } + for (x = 0; x < width; ++x) { + dst_ptr[0] = STATIC_CAST( + uint8_t, + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8); + ++src_ptr; + ++src_ptr1; + ++dst_ptr; + } +} + +// Work around GCC 7 punning warning -Wstrict-aliasing +#if defined(__GNUC__) +typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t; +#else +typedef uint32_t uint32_alias_t; +#endif + +#undef STATIC_CAST diff --git a/pkg/encoder/yuv/libyuv/row_gcc.c b/pkg/encoder/yuv/libyuv/row_gcc.c new file mode 100644 index 000000000..07e795e60 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/row_gcc.c @@ -0,0 +1,1090 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "row.h" + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +// Constants for ARGB +static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u, + 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u}; + + +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; + +static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, + -18, -94, 112, 0, -18, -94, 112, 0}; + +// Constants for BGRA +static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, + 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u}; + +static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; + +static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; + +// Constants for ABGR +static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, + 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u}; + +static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; + +static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; + +// Constants for RGBA. 
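// Note (scalar model of the coefficient tables above and below): each
// *ToU/*ToV vector packs the same chroma weights as the RGBToU()/RGBToV()
// helpers used in row_common.c, roughly
//   U = (112*B - 74*G - 38*R + 0x8080) >> 8
//   V = (112*R - 94*G - 18*B + 0x8080) >> 8
// permuted to match each format's byte order in memory; the RGBA vectors
// below are the ARGB ones shifted by one lane to skip the leading alpha byte.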
+//static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u, +// 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u}; + +static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; + +static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; + +static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u, + 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u}; + +static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; + +static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; + +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +// clang-format off + +// TODO(mraptis): Consider passing R, G, B multipliers as parameter. +// round parameter is register containing value to add before shift. +#define RGBTOY(round) \ + "1: \n" \ + "movdqu (%0),%%xmm0 \n" \ + "movdqu 0x10(%0),%%xmm1 \n" \ + "movdqu 0x20(%0),%%xmm2 \n" \ + "movdqu 0x30(%0),%%xmm3 \n" \ + "psubb %%xmm5,%%xmm0 \n" \ + "psubb %%xmm5,%%xmm1 \n" \ + "psubb %%xmm5,%%xmm2 \n" \ + "psubb %%xmm5,%%xmm3 \n" \ + "movdqu %%xmm4,%%xmm6 \n" \ + "pmaddubsw %%xmm0,%%xmm6 \n" \ + "movdqu %%xmm4,%%xmm0 \n" \ + "pmaddubsw %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm4,%%xmm1 \n" \ + "pmaddubsw %%xmm2,%%xmm1 \n" \ + "movdqu %%xmm4,%%xmm2 \n" \ + "pmaddubsw %%xmm3,%%xmm2 \n" \ + "lea 0x40(%0),%0 \n" \ + "phaddw %%xmm0,%%xmm6 \n" \ + "phaddw %%xmm2,%%xmm1 \n" \ + "prefetcht0 1280(%0) \n" \ + "paddw %%" #round ",%%xmm6 \n" \ + "paddw %%" #round ",%%xmm1 \n" \ + "psrlw $0x8,%%xmm6 \n" \ + "psrlw $0x8,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm6 \n" \ + "movdqu %%xmm6,(%1) \n" \ + "lea 0x10(%1),%1 \n" \ + "sub $0x10,%2 \n" \ + "jg 1b \n" + +#define RGBTOY_AVX2(round) \ + "1: \n" \ + "vmovdqu (%0),%%ymm0 \n" \ + "vmovdqu 0x20(%0),%%ymm1 \n" \ + "vmovdqu 0x40(%0),%%ymm2 \n" \ + "vmovdqu 0x60(%0),%%ymm3 \n" \ + "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \ + "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \ + "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \ + "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \ + "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \ + "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \ + "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \ + "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \ + "lea 0x80(%0),%0 \n" \ + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \ + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \ + "prefetcht0 1280(%0) \n" \ + "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \ + "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \ + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ + "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \ + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \ + "vmovdqu %%ymm0,(%1) \n" \ + "lea 0x20(%1),%1 \n" \ + "sub $0x20,%2 \n" \ + "jg 1b \n" \ + "vzeroupper \n" + +// clang-format on + +#ifdef HAS_ARGBTOYROW_SSSE3 + +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. 
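// Scalar sketch of what RGBTOY computes per ARGB pixel (illustrative only;
// the weights come from kARGBToY above, applied to the B, G, R bytes):
static __inline uint8_t ArgbPixelToY_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
}
// pmaddubsw multiplies unsigned coefficient bytes by signed pixel bytes, so
// RGBTOY first biases every pixel byte by -128 (psubb kSub128) and folds the
// bias back into the rounding constant: kAddY16 = 0x7e80
//   = 0x1080 + 128 * (25 + 129 + 66),
// leaving an effective +0x1080 (16.5 after the >>8) for limited-range luma.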
+void ARGBToYRow_SSSE3(const uint8_t *src_argb, uint8_t *dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" + + LABELALIGN RGBTOY(xmm7) + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kSub128), // %4 + "m"(kAddY16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif // HAS_ARGBTOYROW_SSSE3 + +#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \ + defined(HAS_ARGBEXTRACTALPHAROW_AVX2) +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; +#endif + +#ifdef HAS_ARGBTOYROW_AVX2 + +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +void ARGBToYRow_AVX2(const uint8_t *src_argb, uint8_t *dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vbroadcastf128 %5,%%ymm7 \n" + "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm7) "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kSub128), // %4 + "m"(kAddY16), // %5 + "m"(kPermdARGBToY_AVX) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif // HAS_ARGBTOYROW_AVX2 + +#ifdef HAS_ABGRTOYROW_AVX2 + +// Convert 32 ABGR pixels (128 bytes) to 32 Y values. +void ABGRToYRow_AVX2(const uint8_t *src_abgr, uint8_t *dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vbroadcastf128 %5,%%ymm7 \n" + "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm7) "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToY), // %3 + "m"(kSub128), // %4 + "m"(kAddY16), // %5 + "m"(kPermdARGBToY_AVX) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif // HAS_ABGRTOYROW_AVX2 + +#ifdef HAS_ARGBTOUVROW_SSSE3 + +void ARGBToUVRow_SSSE3(const uint8_t *src_argb, + int src_stride_argb, + uint8_t *dst_u, + uint8_t *dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t) (src_stride_argb)), // %4 + "m"(kARGBToV), // %5 + "m"(kARGBToU), // %6 + "m"(kAddUV128) // %7 + : 
"memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} + +#endif // HAS_ARGBTOUVROW_SSSE3 + +#if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \ + defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2) +// vpshufb for vphaddw + vpackuswb packed to shorts. +static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; +#endif + +#if defined(HAS_ARGBTOUVROW_AVX2) + +void ARGBToUVRow_AVX2(const uint8_t *src_argb, + int src_stride_argb, + uint8_t *dst_u, + uint8_t *dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t) (src_stride_argb)), // %4 + "m"(kAddUV128), // %5 + "m"(kARGBToV), // %6 + "m"(kARGBToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif // HAS_ARGBTOUVROW_AVX2 + +#ifdef HAS_ABGRTOUVROW_AVX2 + +void ABGRToUVRow_AVX2(const uint8_t *src_abgr, + int src_stride_abgr, + uint8_t *dst_u, + uint8_t *dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb 
%%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t) (src_stride_abgr)), // %4 + "m"(kAddUV128), // %5 + "m"(kABGRToV), // %6 + "m"(kABGRToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif // HAS_ABGRTOUVROW_AVX2 + +void BGRAToYRow_SSSE3(const uint8_t *src_bgra, uint8_t *dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" + + LABELALIGN RGBTOY(xmm7) + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kBGRAToY), // %3 + "m"(kSub128), // %4 + "m"(kAddY16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +void BGRAToUVRow_SSSE3(const uint8_t *src_bgra, + int src_stride_bgra, + uint8_t *dst_u, + uint8_t *dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t) (src_stride_bgra)), // %4 + "m"(kBGRAToV), // %5 + "m"(kBGRAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} + +void ABGRToYRow_SSSE3(const uint8_t *src_abgr, uint8_t *dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" + + LABELALIGN RGBTOY(xmm7) + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToY), // %3 + "m"(kSub128), // %4 + "m"(kAddY16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +void ABGRToUVRow_SSSE3(const uint8_t *src_abgr, + int src_stride_abgr, + uint8_t *dst_u, + uint8_t *dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" 
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t) (src_stride_abgr)), // %4 + "m"(kABGRToV), // %5 + "m"(kABGRToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} + +void RGBAToUVRow_SSSE3(const uint8_t *src_rgba, + int src_stride_rgba, + uint8_t *dst_u, + uint8_t *dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t) (src_stride_rgba)), // %4 + "m"(kRGBAToV), // %5 + "m"(kRGBAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} + +#ifdef HAS_MIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. 
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + +void MirrorRow_SSSE3(const uint8_t *src, uint8_t *dst, int width) { + intptr_t temp_width = (intptr_t) (width); + asm volatile( + + "movdqa %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu -0x10(%0,%2,1),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} + +#endif // HAS_MIRRORROW_SSSE3 + +#ifdef HAS_MIRRORROW_AVX2 + +void MirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width) { + intptr_t temp_width = (intptr_t) (width); + asm volatile( + + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} + +#endif // HAS_MIRRORROW_AVX2 + +#ifdef HAS_MIRRORUVROW_SSSE3 +// Shuffle table for reversing the UV. +static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, + 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; + +void MirrorUVRow_SSSE3(const uint8_t *src_uv, uint8_t *dst_uv, int width) { + intptr_t temp_width = (intptr_t) (width); + asm volatile( + + "movdqa %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu -0x10(%0,%2,2),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} + +#endif // HAS_MIRRORUVROW_SSSE3 + +#ifdef HAS_MIRRORUVROW_AVX2 + +void MirrorUVRow_AVX2(const uint8_t *src_uv, uint8_t *dst_uv, int width) { + intptr_t temp_width = (intptr_t) (width); + asm volatile( + + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} + +#endif // HAS_MIRRORUVROW_AVX2 + +#ifdef HAS_MIRRORSPLITUVROW_SSSE3 +// Shuffle table for reversing the bytes of UV channels. 
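// How the table below is used (illustrative): the source holds interleaved
// U0 V0 U1 V1 ... pairs and MirrorSplitUVRow_SSSE3 walks it backwards 16
// bytes at a time; the shuffle gathers the even (U) bytes in reverse order
// into the low 8 bytes and the odd (V) bytes into the high 8 bytes, so one
// movlpd/movhpd pair stores 8 mirrored U and 8 mirrored V samples per
// iteration.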
+static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; + +void MirrorSplitUVRow_SSSE3(const uint8_t *src, + uint8_t *dst_u, + uint8_t *dst_v, + int width) { + intptr_t temp_width = (intptr_t) (width); + asm volatile( + "movdqa %4,%%xmm1 \n" + "lea -0x10(%0,%3,2),%0 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" + "movhpd %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $8,%3 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorSplitUV) // %4 + : "memory", "cc", "xmm0", "xmm1"); +} + +#endif // HAS_MIRRORSPLITUVROW_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSE2 + +void ARGBMirrorRow_SSE2(const uint8_t *src, uint8_t *dst, int width) { + intptr_t temp_width = (intptr_t) (width); + asm volatile( + + "lea -0x10(%0,%2,4),%0 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc", "xmm0"); +} + +#endif // HAS_ARGBMIRRORROW_SSE2 + +#ifdef HAS_ARGBMIRRORROW_AVX2 +// Shuffle table for reversing the bytes. +static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + +void ARGBMirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width) { + intptr_t temp_width = (intptr_t) (width); + asm volatile( + + "vmovdqu %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm5"); +} + +#endif // HAS_ARGBMIRRORROW_AVX2 + + +#ifdef HAS_COPYROW_SSE2 + +void CopyRow_SSE2(const uint8_t *src, uint8_t *dst, int width) { + asm volatile( + "test $0xf,%0 \n" + "jne 2f \n" + "test $0xf,%1 \n" + "jne 2f \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 9f \n" + + LABELALIGN + "2: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 2b \n" + + LABELALIGN "9: \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} + +#endif // HAS_COPYROW_SSE2 + +#ifdef HAS_COPYROW_AVX + +void CopyRow_AVX(const uint8_t *src, uint8_t *dst, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} + +#endif // HAS_COPYROW_AVX + +#ifdef HAS_COPYROW_ERMS + +// Multiple of 1. 
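// (I.e. no width alignment requirement: the "+c" constraint below places the
// byte count in rcx/ecx, and "rep movsb" copies at byte granularity, which
// CPUs with enhanced rep movsb (ERMS) execute quickly, hence the suffix.)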
+void CopyRow_ERMS(const uint8_t *src, uint8_t *dst, int width) { + size_t width_tmp = (size_t) (width); + asm volatile( + + "rep movsb \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc"); +} + +#endif // HAS_COPYROW_ERMS + +#ifdef HAS_INTERPOLATEROW_SSSE3 + +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_SSSE3(uint8_t *dst_ptr, + const uint8_t *src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + asm volatile( + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "psubb %%xmm4,%%xmm0 \n" + "psubb %%xmm4,%%xmm1 \n" + "movdqa %%xmm5,%%xmm2 \n" + "movdqa %%xmm5,%%xmm3 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "pmaddubsw %%xmm1,%%xmm3 \n" + "paddw %%xmm4,%%xmm2 \n" + "paddw %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+rm"(width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t) (src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif // HAS_INTERPOLATEROW_SSSE3 + +#ifdef HAS_INTERPOLATEROW_AVX2 + +// Bilinear filter 32x2 -> 32x1 +void InterpolateRow_AVX2(uint8_t *dst_ptr, + const uint8_t *src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + asm volatile( + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "vbroadcastss %%xmm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm4 \n" + "vbroadcastss %%xmm4,%%ymm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" + "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 50 / 50. 
+ LABELALIGN + "50: \n" + "vmovdqu (%1),%%ymm0 \n" + "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 100b \n" + + "99: \n" + "vzeroupper \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t) (src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); +} + +#endif // HAS_INTERPOLATEROW_AVX2 + +#endif // defined(__x86_64__) || defined(__i386__) diff --git a/pkg/encoder/yuv/libyuv/scale.c b/pkg/encoder/yuv/libyuv/scale.c new file mode 100644 index 000000000..c4bd5b0b4 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/scale.c @@ -0,0 +1,946 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "scale.h" + +#include +#include + +#include "cpu_id.h" +#include "planar_functions.h" // For CopyPlane +#include "row.h" +#include "scale_row.h" + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) + +// Scale plane, 1/2 +// This is an optimized version for scaling down a plane to 1/2 of +// its original size. + +static void ScalePlaneDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, int dst_width) = + filtering == kFilterNone + ? ScaleRowDown2_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_C + : ScaleRowDown2Box_C); + int row_stride = src_stride * 2; + (void) src_width; + (void) src_height; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + + +#if defined(HAS_SCALEROWDOWN2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 + : ScaleRowDown2Box_Any_SSSE3); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 + : ScaleRowDown2Box_SSSE3); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_AVX2 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 + : ScaleRowDown2Box_Any_AVX2); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_AVX2 + : ScaleRowDown2Box_AVX2); + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. 
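  // Per-output-pixel behavior of the selected row function (sketch of the
  // C reference versions), with s the current source row and t the row
  // src_stride below it:
  //   kFilterNone:   d[i] = s[2*i + 1]   (odd column of the odd row, since
  //                                       src_ptr was advanced above)
  //   kFilterLinear: d[i] = (s[2*i] + s[2*i + 1] + 1) >> 1
  //   kFilterBox:    d[i] = (s[2*i] + s[2*i + 1] + t[2*i] + t[2*i + 1] + 2) >> 2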
+ for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane, 1/4 +// This is an optimized version for scaling down a plane to 1/4 of +// its original size. + +static void ScalePlaneDown4(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; + int row_stride = src_stride * 4; + (void) src_width; + (void) src_height; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN4_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane down, 3/4 +static void ScalePlaneDown34(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; + (void) src_width; + (void) src_height; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; + } + +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; + } + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + +// Scale plane, 3/8 +// This is an optimized version for scaling down a plane to 3/8 +// of its original size. +// +// Uses box filter arranges like this +// aaabbbcc -> abc +// aaabbbcc def +// aaabbbcc ghi +// dddeeeff +// dddeeeff +// dddeeeff +// ggghhhii +// ggghhhii +// Boxes are 3x3, 2x3, 3x2 and 2x2 + +static void ScalePlaneDown38(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown38_3)(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; + assert(dst_width % 3 == 0); + (void) src_width; + (void) src_height; + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; + } + +#if defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; + } + if (dst_width % 12 == 0 && !filtering) { + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; + } + if (dst_width % 6 == 0 && filtering) { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +#define MIN1(x) ((x) < 1 ? 1 : (x)) + +static __inline uint32_t SumPixels(int iboxwidth, const uint16_t *src_ptr) { + uint32_t sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t *src_ptr) { + uint32_t sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static void ScaleAddCols2_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t *src_ptr, + uint8_t *dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = dx >> 16; + int boxwidth; + scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); + scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = MIN1((x >> 16) - ix); + int scaletbl_index = boxwidth - minboxwidth; + assert((scaletbl_index == 0) || (scaletbl_index == 1)); + *dst_ptr++ = (uint8_t) (SumPixels(boxwidth, src_ptr + ix) * + scaletbl[scaletbl_index] >> + 16); + } +} + +static void ScaleAddCols0_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t *src_ptr, + uint8_t *dst_ptr) { + int scaleval = 65536 / boxheight; + int i; + (void) dx; + src_ptr += (x >> 16); + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = (uint8_t) (src_ptr[i] * scaleval >> 16); + } +} + +static void ScaleAddCols1_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t *src_ptr, + uint8_t *dst_ptr) { + int boxwidth = MIN1(dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + x >>= 16; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = (uint8_t) (SumPixels(boxwidth, src_ptr + x) * scaleval >> 16); + x += boxwidth; + } +} + +// Scale plane down to any dimensions, with interpolation. +// (boxfilter). 
+// +// Same method as SimpleScale, which is fixed point, outputting +// one pixel of destination using fixed point (16.16) to step +// through source, sampling a box of pixel with simple +// averaging. +static void ScalePlaneBox(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr) { + int j, k; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + { + // Allocate a row buffer of uint16_t. + align_buffer_64(row16, src_width * 2); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint16_t *src_ptr, uint8_t *dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_C + : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); + void (*ScaleAddRow)(const uint8_t *src_ptr, uint16_t *dst_ptr, + int src_width) = ScaleAddRow_C; +#if defined(HAS_SCALEADDROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleAddRow = ScaleAddRow_Any_SSE2; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_SSE2; + } + } +#endif +#if defined(HAS_SCALEADDROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleAddRow = ScaleAddRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + ScaleAddRow = ScaleAddRow_AVX2; + } + } +#endif + + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint8_t *src = src_ptr + iy * (int64_t) src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = MIN1((y >> 16) - iy); + memset(row16, 0, src_width * 2); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint16_t *) (row16), src_width); + src += src_stride; + } + ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t *) (row16), dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row16); + } +} + +// Scale plane down with bilinear interpolation. +static void ScalePlaneBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. + align_buffer_64(row, src_width); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint8_t *dst_ptr, const uint8_t *src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? 
ScaleFilterCols64_C : ScaleFilterCols_C; + void (*InterpolateRow)(uint8_t *dst_ptr, const uint8_t *src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif + +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif + + if (y > max_y) { + y = max_y; + } + + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8_t *src = src_ptr + yi * (int64_t) src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); +} + +// Scale up down with bilinear interpolation. +static void ScalePlaneBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr, + enum FilterMode filtering) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height - 1) << 16; + void (*InterpolateRow)(uint8_t *dst_ptr, const uint8_t *src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleFilterCols)(uint8_t *dst_ptr, const uint8_t *src_ptr, + int dst_width, int x, int dx) = + filtering ? ScaleFilterCols_C : ScaleCols_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_C; + } +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif + + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint8_t *src = src_ptr + yi * (int64_t) src_stride; + + // Allocate 2 row buffers. 
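  // How the two buffers rotate (explanatory sketch): rowptr holds the
  // horizontally scaled row for source line yi and rowptr + rowstride the one
  // for yi + 1; when yi advances, only the newly needed source row goes
  // through ScaleFilterCols and negating rowstride swaps the two slots, so
  // each source row is filtered horizontally at most once.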
+ const int row_size = (dst_width + 31) & ~31; + align_buffer_64(row, row_size * 2); + + uint8_t *rowptr = row; + int rowstride = row_size; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + if (src_height > 2) { + src += src_stride; + } + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * (int64_t) src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + if ((y + 65536) < max_y) { + src += src_stride; + } + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +// Scale plane, horizontally up by 2 times. +// Uses linear filter horizontally, nearest vertically. +// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. +// This is used to scale U and V planes of I422 to I444. +static void ScalePlaneUp2_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr) { + void (*ScaleRowUp)(const uint8_t *src_ptr, uint8_t *dst_ptr, int dst_width) = + ScaleRowUp2_Linear_Any_C; + int i; + int y; + int dy; + + (void) src_width; + // This function can only scale up by 2 times horizontally. + assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; + } +#endif + + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t) src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * (int64_t) src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + +// Scale plane, up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// This is used to scale U and V planes of I420 to I444. +static void ScalePlaneUp2_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr) { + void (*Scale2RowUp)(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_Any_C; + int x; + + (void) src_width; + // This function can only scale up by 2 times. 
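+ // The asserts below only admit destination dimensions of 2*src or
+ // 2*src - 1 in each direction (odd sizes round up); e.g. a 160x120 chroma
+ // plane of I420 may become the 320x240 chroma plane of I444.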
+ assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; + } +#endif + + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +// Scale Plane to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. + +static void ScalePlaneSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_ptr, + uint8_t *dst_ptr) { + int i; + void (*ScaleCols)(uint8_t *dst_ptr, const uint8_t *src_ptr, int dst_width, + int x, int dx) = ScaleCols_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t) src_stride, dst_width, x, + dx); + dst_ptr += dst_stride; + y += dy; + } +} + +// Scale a plane. +// This function dispatches to a specialized scaler based on scale factor. +LIBYUV_API +void ScalePlane(const uint8_t *src, + int src_stride, + int src_width, + int src_height, + uint8_t *dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * (int64_t) src_stride; + src_stride = -src_stride; + } + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); + return; + } + if (dst_width == src_width && filtering != kFilterBox) { + int dy = 0; + int y = 0; + // When scaling down, use the center 2 rows to filter. + // When scaling up, last row of destination uses the last 2 source rows. + if (dst_height <= src_height) { + dy = FixedDiv(src_height, dst_height); + y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter. + } else if (src_height > 1 && dst_height > 1) { + dy = FixedDiv1(src_height, dst_height); + } + // Arbitrary scale vertically, but unscaled horizontally. 
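+ // e.g. 640x480 -> 640x360 takes this path: dy = FixedDiv(480, 360) =
+ // 0x15555 (~1.33 source rows per output row), and y starts at dy/2 - 0.5
+ // so the two-tap vertical filter is centered on each output row.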
+ ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { + // Scale down. + if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + // 3/8 rounded up for odd sized chroma height. + if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { + // optimized, 3/8 + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && + (filtering == kFilterBox || filtering == kFilterNone)) { + // optimized, 1/4 + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); + return; + } + } + if (filtering == kFilterBox && dst_height * 2 < src_height) { + ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); + return; + } + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height) { + ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering) { + ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); +} + +LIBYUV_API +int I420Scale(const uint8_t *src_y, + int src_stride_y, + const uint8_t *src_u, + int src_stride_u, + const uint8_t *src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); + return 0; +} diff --git a/pkg/encoder/yuv/libyuv/scale.h b/pkg/encoder/yuv/libyuv/scale.h new file mode 100644 
index 000000000..ed0a1983f --- /dev/null +++ b/pkg/encoder/yuv/libyuv/scale.h @@ -0,0 +1,53 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_H_ +#define INCLUDE_LIBYUV_SCALE_H_ + +#include "basic_types.h" + +// Supported filtering. +typedef enum FilterMode { + kFilterNone = 0, // Point sample; Fastest. + kFilterLinear = 1, // Filter horizontally only. + kFilterBilinear = 2, // Faster than box, but lower quality scaling down. + kFilterBox = 3 // Highest quality. +} FilterModeEnum; + +// Scales a YUV 4:2:0 image from the src width and height to the +// dst width and height. +// If filtering is kFilterNone, a simple nearest-neighbor algorithm is +// used. This produces basic (blocky) quality at the fastest speed. +// If filtering is kFilterBilinear, interpolation is used to produce a better +// quality image, at the expense of speed. +// If filtering is kFilterBox, averaging is used to produce ever better +// quality image, at further expense of speed. +// Returns 0 if successful. + +LIBYUV_API +int I420Scale(const uint8_t *src_y, + int src_stride_y, + const uint8_t *src_u, + int src_stride_u, + const uint8_t *src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t *dst_y, + int dst_stride_y, + uint8_t *dst_u, + int dst_stride_u, + uint8_t *dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering); + +#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/pkg/encoder/yuv/libyuv/scale_any.c b/pkg/encoder/yuv/libyuv/scale_any.c new file mode 100644 index 000000000..f05e55b6e --- /dev/null +++ b/pkg/encoder/yuv/libyuv/scale_any.c @@ -0,0 +1,632 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "scale_row.h" + +// Fixed scale down. +// Mask may be non-power of 2, so use MOD +#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r); \ + } + +// Fixed scale down for odd source width. Used by I420Blend subsampling. +// Since dst_width is (width + 1) / 2, this function scales one less pixel +// and copies the last pixel. 
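+// For example, ScaleRowDown2Box_Odd_SSSE3 below is generated with FACTOR 2,
+// BPP 1 and MASK 15: for dst_width = 51 the SIMD kernel handles n = 48
+// output pixels and the wrapped C kernel handles the remaining r + 1 = 3,
+// including the special last pixel.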
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \ + int n = (dst_width - 1) - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r + 1); \ + } + +#ifdef HAS_SCALEROWDOWN2_SSSE3 + +SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) + +SDANY(ScaleRowDown2Linear_Any_SSSE3, + ScaleRowDown2Linear_SSSE3, + ScaleRowDown2Linear_C, + 2, + 1, + 15) + +SDANY(ScaleRowDown2Box_Any_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_C, + 2, + 1, + 15) + +SDODD(ScaleRowDown2Box_Odd_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) + +#endif +#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 + +SDANY(ScaleUVRowDown2Box_Any_SSSE3, + ScaleUVRowDown2Box_SSSE3, + ScaleUVRowDown2Box_C, + 2, + 2, + 3) + +#endif +#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 + +SDANY(ScaleUVRowDown2Box_Any_AVX2, + ScaleUVRowDown2Box_AVX2, + ScaleUVRowDown2Box_C, + 2, + 2, + 7) + +#endif +#ifdef HAS_SCALEROWDOWN2_AVX2 + +SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) + +SDANY(ScaleRowDown2Linear_Any_AVX2, + ScaleRowDown2Linear_AVX2, + ScaleRowDown2Linear_C, + 2, + 1, + 31) + +SDANY(ScaleRowDown2Box_Any_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_C, + 2, + 1, + 31) + +SDODD(ScaleRowDown2Box_Odd_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 31) + +#endif +#ifdef HAS_SCALEROWDOWN4_SSSE3 + +SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) + +SDANY(ScaleRowDown4Box_Any_SSSE3, + ScaleRowDown4Box_SSSE3, + ScaleRowDown4Box_C, + 4, + 1, + 7) + +#endif +#ifdef HAS_SCALEROWDOWN4_AVX2 + +SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) + +SDANY(ScaleRowDown4Box_Any_AVX2, + ScaleRowDown4Box_AVX2, + ScaleRowDown4Box_C, + 4, + 1, + 15) + +#endif +#ifdef HAS_SCALEROWDOWN34_SSSE3 + +SDANY(ScaleRowDown34_Any_SSSE3, + ScaleRowDown34_SSSE3, + ScaleRowDown34_C, + 4 / 3, + 1, + 23) + +SDANY(ScaleRowDown34_0_Box_Any_SSSE3, + ScaleRowDown34_0_Box_SSSE3, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) + +SDANY(ScaleRowDown34_1_Box_Any_SSSE3, + ScaleRowDown34_1_Box_SSSE3, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) + +#endif + +#ifdef HAS_SCALEROWDOWN38_SSSE3 + +SDANY(ScaleRowDown38_Any_SSSE3, + ScaleRowDown38_SSSE3, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) + +SDANY(ScaleRowDown38_3_Box_Any_SSSE3, + ScaleRowDown38_3_Box_SSSE3, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 5) + +SDANY(ScaleRowDown38_2_Box_Any_SSSE3, + ScaleRowDown38_2_Box_SSSE3, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 5) + +#endif + + +#undef SDANY + +// Scale down by even scale factor. +#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ + uint8_t* dst_ptr, int dst_width) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ + dst_ptr + n * BPP, r); \ + } + + + +#ifdef SASIMDONLY +// This also works and uses memcpy and SIMD instead of C, but is slower on ARM + +// Add rows box filter scale down. 
Using macro from row_any +#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint16_t dst_temp[32]); \ + SIMD_ALIGNED(uint8_t src_temp[32]); \ + memset(dst_temp, 0, 32 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(dst_temp, dst_ptr + n * BPP, r * BPP); \ + ANY_SIMD(src_temp, dst_temp, MASK + 1); \ + memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \ + } + +#ifdef HAS_SCALEADDROW_SSE2 +SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15) +#endif +#ifdef HAS_SCALEADDROW_AVX2 +SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31) +#endif +#undef SAANY + +#else + +// Add rows box filter scale down. +#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \ + int n = src_width & ~MASK; \ + if (n > 0) { \ + SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ + } \ + SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ + } + +#ifdef HAS_SCALEADDROW_SSE2 + +SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) + +#endif +#ifdef HAS_SCALEADDROW_AVX2 + +SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) + +#endif +#undef SAANY + +#endif // SASIMDONLY + +// Scale up horizontally 2 times using linear filter. +#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + dst_ptr[0] = src_ptr[0]; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(src_ptr, dst_ptr + 1, n); \ + } \ + C(src_ptr + (n / 2), dst_ptr + n + 1, r); \ + } \ + dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2]; \ + } + +// Even the C versions need to be wrapped, because boundary pixels have to +// be handled differently + +SUH2LANY(ScaleRowUp2_Linear_Any_C, + ScaleRowUp2_Linear_C, + ScaleRowUp2_Linear_C, + 0, + uint8_t) + +SUH2LANY(ScaleRowUp2_Linear_16_Any_C, + ScaleRowUp2_Linear_16_C, + ScaleRowUp2_Linear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 + +SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, + ScaleRowUp2_Linear_SSE2, + ScaleRowUp2_Linear_C, + 15, + uint8_t) + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 + +SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, + ScaleRowUp2_Linear_SSSE3, + ScaleRowUp2_Linear_C, + 15, + uint8_t) + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 + +SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, + ScaleRowUp2_Linear_12_SSSE3, + ScaleRowUp2_Linear_16_C, + 15, + uint16_t) + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 + +SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, + ScaleRowUp2_Linear_16_SSE2, + ScaleRowUp2_Linear_16_C, + 7, + uint16_t) + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 + +SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, + ScaleRowUp2_Linear_AVX2, + ScaleRowUp2_Linear_C, + 31, + uint8_t) + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2 + +SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2, + ScaleRowUp2_Linear_12_AVX2, + ScaleRowUp2_Linear_16_C, + 31, + uint16_t) + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 + +SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, + ScaleRowUp2_Linear_16_AVX2, + ScaleRowUp2_Linear_16_C, + 15, + uint16_t) + +#endif +#undef SUH2LANY + +// Scale up 2 times using bilinear filter. +// This function produces 2 rows at a time. 
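+// The first and last output columns have no horizontal neighbour, so they
+// are computed inline from a single source column using only the 3:1 / 1:3
+// vertical weights; the SIMD kernel covers the aligned interior and the
+// wrapped C kernel the remainder.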
+#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ + ptrdiff_t dst_stride, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + const PTYPE* sa = src_ptr; \ + const PTYPE* sb = src_ptr + src_stride; \ + PTYPE* da = dst_ptr; \ + PTYPE* db = dst_ptr + dst_stride; \ + da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ + db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(sa, sb - sa, da + 1, db - da, n); \ + } \ + C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \ + } \ + da[dst_width - 1] = \ + (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \ + db[dst_width - 1] = \ + (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \ + } + +SU2BLANY(ScaleRowUp2_Bilinear_Any_C, + ScaleRowUp2_Bilinear_C, + ScaleRowUp2_Bilinear_C, + 0, + uint8_t) + +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C, + ScaleRowUp2_Bilinear_16_C, + ScaleRowUp2_Bilinear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 + +SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, + ScaleRowUp2_Bilinear_SSE2, + ScaleRowUp2_Bilinear_C, + 15, + uint8_t) + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 + +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, + ScaleRowUp2_Bilinear_12_SSSE3, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 + +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2, + ScaleRowUp2_Bilinear_16_SSE2, + ScaleRowUp2_Bilinear_16_C, + 7, + uint16_t) + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 + +SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, + ScaleRowUp2_Bilinear_SSSE3, + ScaleRowUp2_Bilinear_C, + 15, + uint8_t) + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 + +SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2, + ScaleRowUp2_Bilinear_AVX2, + ScaleRowUp2_Bilinear_C, + 31, + uint8_t) + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2 + +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2, + ScaleRowUp2_Bilinear_12_AVX2, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 + +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2, + ScaleRowUp2_Bilinear_16_AVX2, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) + +#endif + +#undef SU2BLANY + +// Scale bi-planar plane up horizontally 2 times using linear filter. 
+#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + dst_ptr[0] = src_ptr[0]; \ + dst_ptr[1] = src_ptr[1]; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(src_ptr, dst_ptr + 2, n); \ + } \ + C(src_ptr + n, dst_ptr + 2 * n + 2, r); \ + } \ + dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \ + dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \ + } + +SBUH2LANY(ScaleUVRowUp2_Linear_Any_C, + ScaleUVRowUp2_Linear_C, + ScaleUVRowUp2_Linear_C, + 0, + uint8_t) + +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C, + ScaleUVRowUp2_Linear_16_C, + ScaleUVRowUp2_Linear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 + +SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3, + ScaleUVRowUp2_Linear_SSSE3, + ScaleUVRowUp2_Linear_C, + 7, + uint8_t) + +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 + +SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2, + ScaleUVRowUp2_Linear_AVX2, + ScaleUVRowUp2_Linear_C, + 15, + uint8_t) + +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 + +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE41, + ScaleUVRowUp2_Linear_16_SSE41, + ScaleUVRowUp2_Linear_16_C, + 3, + uint16_t) + +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 + +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2, + ScaleUVRowUp2_Linear_16_AVX2, + ScaleUVRowUp2_Linear_16_C, + 7, + uint16_t) + +#endif + +#undef SBUH2LANY + +// Scale bi-planar plane up 2 times using bilinear filter. +// This function produces 2 rows at a time. +#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ + ptrdiff_t dst_stride, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + const PTYPE* sa = src_ptr; \ + const PTYPE* sb = src_ptr + src_stride; \ + PTYPE* da = dst_ptr; \ + PTYPE* db = dst_ptr + dst_stride; \ + da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ + db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ + da[1] = (3 * sa[1] + sb[1] + 2) >> 2; \ + db[1] = (sa[1] + 3 * sb[1] + 2) >> 2; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(sa, sb - sa, da + 2, db - da, n); \ + } \ + C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \ + } \ + da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \ + sb[((dst_width + 1) & ~1) - 2] + 2) >> \ + 2; \ + db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \ + 3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> \ + 2; \ + da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \ + sb[((dst_width + 1) & ~1) - 1] + 2) >> \ + 2; \ + db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \ + 3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> \ + 2; \ + } + +SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C, + ScaleUVRowUp2_Bilinear_C, + ScaleUVRowUp2_Bilinear_C, + 0, + uint8_t) + +SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C, + ScaleUVRowUp2_Bilinear_16_C, + ScaleUVRowUp2_Bilinear_16_C, + 0, + uint16_t) + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 + +SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3, + ScaleUVRowUp2_Bilinear_SSSE3, + ScaleUVRowUp2_Bilinear_C, + 7, + uint8_t) + +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 + +SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2, + ScaleUVRowUp2_Bilinear_AVX2, + ScaleUVRowUp2_Bilinear_C, + 15, + uint8_t) + +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 + +SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE41, + ScaleUVRowUp2_Bilinear_16_SSE41, + 
ScaleUVRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2,
+ ScaleUVRowUp2_Bilinear_16_AVX2,
+ ScaleUVRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+
+#endif
+
+#undef SBU2BLANY
diff --git a/pkg/encoder/yuv/libyuv/scale_common.c b/pkg/encoder/yuv/libyuv/scale_common.c
new file mode 100644
index 000000000..17eedd992
--- /dev/null
+++ b/pkg/encoder/yuv/libyuv/scale_common.c
@@ -0,0 +1,930 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "scale.h"
+
+#include <assert.h>
+
+#include "cpu_id.h"
+#include "row.h"
+#include "scale_row.h"
+
+#define STATIC_CAST(type, expr) (type)(expr)
+
+// TODO(fbarchard): make clamp255 preserve negative values.
+static __inline int32_t clamp255(int32_t v) {
+ return (-(v >= 255) | v) & 255;
+}
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// CPU agnostic row functions
+void ScaleRowDown2_C(const uint8_t *src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ int dst_width) {
+ int x;
+ (void) src_stride;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[1];
+ dst[1] = src_ptr[3];
+ dst += 2;
+ src_ptr += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[1];
+ }
+}
+
+void ScaleRowDown2Linear_C(const uint8_t *src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ int dst_width) {
+ const uint8_t *s = src_ptr;
+ int x;
+ (void) src_stride;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ dst[1] = (s[2] + s[3] + 1) >> 1;
+ dst += 2;
+ s += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ }
+}
+
+void ScaleRowDown2Box_C(const uint8_t *src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ int dst_width) {
+ const uint8_t *s = src_ptr;
+ const uint8_t *t = src_ptr + src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ }
+}
+
+void ScaleRowDown2Box_Odd_C(const uint8_t *src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ int dst_width) {
+ const uint8_t *s = src_ptr;
+ const uint8_t *t = src_ptr + src_stride;
+ int x;
+ dst_width -= 1;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst += 1;
+ s += 2;
+ t += 2;
+ }
+ dst[0] = (s[0] + t[0] + 1) >> 1;
+}
+
+void ScaleRowDown4_C(const uint8_t *src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ int dst_width) {
+ int x;
+ (void) src_stride;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[2];
+ dst[1] = src_ptr[6];
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[2];
+ }
+}
+
+void ScaleRowDown4Box_C(const
uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; + } +} + +void ScaleRowDown34_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width) { + int x; + (void) src_stride; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +// Filter rows 0 and 1 together, 3 : 1 +void ScaleRowDown34_0_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *d, + int dst_width) { + const uint8_t *s = src_ptr; + const uint8_t *t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +// Filter rows 1 and 2 together, 1 : 1 +void ScaleRowDown34_1_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *d, + int dst_width) { + const uint8_t *s = src_ptr; + const uint8_t *t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +// Sample position: (O is src sample position, X is dst sample position) +// +// v dst_ptr at here v stop at here +// X O X X O X X O X X O X X O X +// ^ src_ptr at here +void ScaleRowUp2_Linear_C(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < 
src_width; ++x) { + dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; + dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; + } +} + +// Sample position: (O is src sample position, X is dst sample position) +// +// src_ptr at here +// X v X X X X X X X X X +// O O O O O +// X X X X X X X X X X +// ^ dst_ptr at here ^ stop at here +// X X X X X X X X X X +// O O O O O +// X X X X X X X X X X +void ScaleRowUp2_Bilinear_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t *s = src_ptr; + const uint8_t *t = src_ptr + src_stride; + uint8_t *d = dst_ptr; + uint8_t *e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[2 * x + 0] = + (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; + d[2 * x + 1] = + (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 0] = + (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 1] = + (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; + } +} + +// Only suitable for at most 14 bit range. +void ScaleRowUp2_Linear_16_C(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; + dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; + } +} + +// Only suitable for at most 12bit range. +void ScaleRowUp2_Bilinear_16_C(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t *s = src_ptr; + const uint16_t *t = src_ptr + src_stride; + uint16_t *d = dst_ptr; + uint16_t *e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[2 * x + 0] = + (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; + d[2 * x + 1] = + (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 0] = + (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; + e[2 * x + 1] = + (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; + } +} + +// (1-f)a + fb can be replaced with a + f(b-a) +#if defined(__arm__) || defined(__aarch64__) +#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) +#else +// Intel uses 7 bit math with rounding. 
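+// f is a 16.16 fraction; f >> 9 reduces it to 7 bits (0..127) so the C
+// fallback matches the rounding of the SSSE3 column filter. Illustrative
+// example: a = 100, b = 200, f = 0x8000 (one half) gives
+//   100 + ((64 * (200 - 100) + 0x40) >> 7) = 150.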
+#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) +#endif + +void ScaleFilterCols_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t) (x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64_t xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +#undef BLENDER + +// Same as 8 bit arm blender but return is cast to uint16_t +#define BLENDER(a, b, f) \ + (uint16_t)( \ + (int)(a) + \ + (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16)) +#undef BLENDER + +void ScaleRowDown38_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width) { + int x; + (void) src_stride; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +void ScaleRowDown38_3_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 8x2 -> 3x1 +void ScaleRowDown38_2_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> + 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleAddRow_C(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width) { + int x; + 
assert(src_width > 0); + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; + } +} + +// UV scale row functions +// same as ARGB but 2 channels + +void ScaleUVRowDown2_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width) { + int x; + (void) src_stride; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = src_uv[2]; // Store the 2nd UV + dst_uv[1] = src_uv[3]; + src_uv += 4; + dst_uv += 2; + } +} + +void ScaleUVRowDown2Linear_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width) { + int x; + (void) src_stride; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1; + dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1; + src_uv += 4; + dst_uv += 2; + } +} + +void ScaleUVRowDown2Box_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + + src_uv[src_stride + 2] + 2) >> + 2; + dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + + src_uv[src_stride + 3] + 2) >> + 2; + src_uv += 4; + dst_uv += 2; + } +} + +void ScaleUVRowDownEven_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t *dst_uv, + int dst_width) { + const uint16_t *src = (const uint16_t *) (src_uv); + uint16_t *dst = (uint16_t *) (dst_uv); + (void) src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleCols_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. 
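+// x and dx are ignored: exact 2x point upscaling simply writes each source
+// pixel twice. Illustrative example:
+//   uint8_t src[3] = {10, 20, 30};
+//   uint8_t dst[6];
+//   ScaleColsUp2_C(dst, src, 6, 0, 0);  // dst = {10, 10, 20, 20, 30, 30}
+// Callers only select this path when src_width * 2 == dst_width and
+// x < 0x8000.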
+void ScaleColsUp2_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx) { + int j; + (void) x; + (void) dx; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +void ScaleUVRowUp2_Linear_C(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[4 * x + 0] = + (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; + dst_ptr[4 * x + 1] = + (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; + dst_ptr[4 * x + 2] = + (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; + dst_ptr[4 * x + 3] = + (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; + } +} + +void ScaleUVRowUp2_Bilinear_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t *s = src_ptr; + const uint8_t *t = src_ptr + src_stride; + uint8_t *d = dst_ptr; + uint8_t *e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 1 + 8) >> + 4; + d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 1 + 8) >> + 4; + d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + + t[2 * x + 2] * 3 + 8) >> + 4; + d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + + t[2 * x + 2] * 3 + 8) >> + 4; + e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 9 + 8) >> + 4; + e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 9 + 8) >> + 4; + } +} + +void ScaleUVRowUp2_Linear_16_C(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[4 * x + 0] = + (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; + dst_ptr[4 * x + 1] = + (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; + dst_ptr[4 * x + 2] = + (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; + dst_ptr[4 * x + 3] = + (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; + } +} + +void ScaleUVRowUp2_Bilinear_16_C(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t *s = src_ptr; + const uint16_t *t = src_ptr + src_stride; + uint16_t *d = dst_ptr; + uint16_t *e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 1 + 8) >> + 4; + d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 1 + 8) >> + 4; + d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + + t[2 * x + 2] * 3 + 8) >> + 4; + d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + + t[2 
* x + 3] * 3 + 8) >> + 4; + e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + + t[2 * x + 2] * 3 + 8) >> + 4; + e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 9 + 8) >> + 4; + e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 9 + 8) >> + 4; + } +} + +// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. +// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 +#define BLENDERC(a, b, f, s) \ + (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) + +void ScaleUVFilterCols_C(uint8_t *dst_uv, + const uint8_t *src_uv, + int dst_width, + int x, + int dx) { + const uint16_t *src = (const uint16_t *) (src_uv); + uint16_t *dst = (uint16_t *) (dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} + +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + +// Scale plane vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_argb, + uint8_t *dst_argb, + int x, + int y, + int dy, + int bpp, // bytes per pixel. 4 for ARGB. + enum FilterMode filtering) { + // TODO(fbarchard): Allow higher bpp. + int dst_width_bytes = dst_width * bpp; + void (*InterpolateRow)(uint8_t *dst_argb, const uint8_t *src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; + int j; + assert(bpp >= 1 && bpp <= 4); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + src_argb += (x >> 16) * bpp; +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif + + + for (j = 0; j < dst_height; ++j) { + int yi; + int yf; + if (y > max_y) { + y = max_y; + } + yi = y >> 16; + yf = filtering ? ((y >> 8) & 255) : 0; + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, + dst_width_bytes, yf); + dst_argb += dst_stride; + y += dy; + } +} + +// Simplify the filtering based on scale factors. +enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (src_width < 0) { + src_width = -src_width; + } + if (src_height < 0) { + src_height = -src_height; + } + if (filtering == kFilterBox) { + // If scaling either axis to 0.5 or larger, switch from Box to Bilinear. 
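+ // At 1/2 or above the box spans at most about 2x2 source pixels, which
+ // bilinear handles about as well at lower cost; box is kept only when
+ // both axes shrink below half (e.g. 1000 -> 499, but not 1000 -> 500).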
+ if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) { + filtering = kFilterBilinear; + } + } + if (filtering == kFilterBilinear) { + if (src_height == 1) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to Linear. + if (dst_height == src_height || dst_height * 3 == src_height) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to + // avoid reading 2 pixels horizontally that causes memory exception. + if (src_width == 1) { + filtering = kFilterNone; + } + } + if (filtering == kFilterLinear) { + if (src_width == 1) { + filtering = kFilterNone; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to None. + if (dst_width == src_width || dst_width * 3 == src_width) { + filtering = kFilterNone; + } + } + return filtering; +} + +#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) + +// Compute slope values for stepping. +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering, + int *x, + int *y, + int *dx, + int *dy) { + assert(x != NULL); + assert(y != NULL); + assert(dx != NULL); + assert(dy != NULL); + assert(src_width != 0); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + // Check for 1 pixel and avoid FixedDiv overflow. + if (dst_width == 1 && src_width >= 32768) { + dst_width = src_width; + } + if (dst_height == 1 && src_height >= 32768) { + dst_height = src_height; + } + if (filtering == kFilterBox) { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = 0; + *y = 0; + } else if (filtering == kFilterBilinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (src_width > 1 && dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + if (dst_height <= src_height) { + *dy = FixedDiv(src_height, dst_height); + *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. + } else if (src_height > 1 && dst_height > 1) { + *dy = FixedDiv1(src_height, dst_height); + *y = 0; + } + } else if (filtering == kFilterLinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (src_width > 1 && dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + *dy = FixedDiv(src_height, dst_height); + *y = *dy >> 1; + } else { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = CENTERSTART(*dx, 0); + *y = CENTERSTART(*dy, 0); + } + // Negative src_width means horizontally mirror. + if (src_width < 0) { + *x += (dst_width - 1) * *dx; + *dx = -*dx; + // src_width = -src_width; // Caller must do this. + } +} + +#undef CENTERSTART diff --git a/pkg/encoder/yuv/libyuv/scale_gcc.c b/pkg/encoder/yuv/libyuv/scale_gcc.c new file mode 100644 index 000000000..716d6cfdb --- /dev/null +++ b/pkg/encoder/yuv/libyuv/scale_gcc.c @@ -0,0 +1,2651 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "row.h" +#include "scale_row.h" + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +// Offsets for source bytes 0 to 9 +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Offsets for source bytes 0 to 10 +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; + +// Coefficients for source bytes 0 to 10 +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; + +// Coefficients for source bytes 10 to 21 +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; + +// Coefficients for source bytes 21 to 31 +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; + +// Coefficients for source bytes 21 to 31 +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; + +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; + +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 0,1,2 +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; + +// Arrange words 0,3,6 into 3,4,5 +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x3 and 2x3 +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; + +// Arrange first value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; + +// Arrange second value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; + +// Arrange third value for pixels 0,1,2,3,4,5 +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; + +// Scaling values for boxes of 3x2 and 2x2 +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; + +// GCC versions of row functions are verbatim conversions from Visual C. +// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt + +void ScaleRowDown2_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile( + // 16 pixel loop. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleRowDown2Linear_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); +} + +void ScaleRowDown2Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +#ifdef HAS_SCALEROWDOWN2_AVX2 + +void ScaleRowDown2_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile(LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +void ScaleRowDown2Linear_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", 
"xmm0", "xmm1", "xmm4", "xmm5"); +} + +void ScaleRowDown2Box_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +#endif // HAS_SCALEROWDOWN2_AVX2 + +void ScaleRowDown4_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); +} + +void ScaleRowDown4Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + intptr_t stridex3; + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "packuswb %%xmm4,%%xmm4 \n" + "psllw $0x3,%%xmm5 \n" + "lea 0x00(%4,%4,2),%3 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%4,2),%%xmm2 \n" + "movdqu 0x10(%0,%4,2),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "=&r"(stridex3) // %3 + : "r"((intptr_t) (src_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#ifdef HAS_SCALEROWDOWN4_AVX2 + +void ScaleRowDown4_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile( + "vpcmpeqb 
%%ymm5,%%ymm5,%%ymm5 \n" + "vpsrld $0x18,%%ymm5,%%ymm5 \n" + "vpslld $0x10,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); +} + +void ScaleRowDown4Box_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpsllw $0x3,%%ymm4,%%ymm5 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x4,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (src_stride * 3)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif // HAS_SCALEROWDOWN4_AVX2 + +void ScaleRowDown34_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +void ScaleRowDown34_1_Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 
\n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +void ScaleRowDown34_0_Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +void ScaleRowDown38_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + (void) src_stride; + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 
+ "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); +} + +void ScaleRowDown38_2_Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,(%1) \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6"); +} + +void ScaleRowDown38_3_Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm6 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqu 0x00(%0,%3,2),%%xmm6 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movd %%xmm6,(%1) \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, + 10, 11, 8, 9, 14, 15, 12, 13}; + +static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, + 3, 1, 1, 3, 3, 1, 1, 3}; + +#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 + +void ScaleRowUp2_Linear_SSE2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "pxor %%xmm0,%%xmm0 \n" // 0 + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $1,%%xmm6 \n" // all 2 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm1 \n" // 01234567 + "movq 1(%0),%%xmm2 \n" // 12345678 + "movdqa %%xmm1,%%xmm3 \n" + "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 + "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 + "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 + "movdqa 
%%xmm1,%%xmm4 \n" + "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) + "paddw %%xmm5,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "paddw %%xmm6,%%xmm4 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) + "paddw %%xmm5,%%xmm5 \n" + "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo) + "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo) + + "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) + "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm2,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) + "paddw %%xmm6,%%xmm1 \n" + "paddw %%xmm3,%%xmm3 \n" + "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + + "packuswb %%xmm1,%%xmm5 \n" + "movdqu %%xmm5,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 + +void ScaleRowUp2_Bilinear_SSE2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + LABELALIGN + "1: \n" + "pxor %%xmm0,%%xmm0 \n" // 0 + // above line + "movq (%0),%%xmm1 \n" // 01234567 + "movq 1(%0),%%xmm2 \n" // 12345678 + "movdqa %%xmm1,%%xmm3 \n" + "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 + "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 + "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 + + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) + "paddw %%xmm5,%%xmm4 \n" // near+far + "movdqa %%xmm3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) + "paddw %%xmm5,%%xmm5 \n" // 2*near + "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo) + + "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) + "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm2,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) + "paddw %%xmm3,%%xmm3 \n" // 2*near + "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + // below line + "movq (%0,%3),%%xmm6 \n" // 01234567 + "movq 1(%0,%3),%%xmm2 \n" // 12345678 + "movdqa %%xmm6,%%xmm3 \n" + "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 + "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677 + "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 + + "movdqa %%xmm6,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16) + "movdqa %%xmm2,%%xmm7 \n" + "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16) + "paddw %%xmm7,%%xmm5 \n" // near+far + "movdqa %%xmm3,%%xmm7 \n" + "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16) + "paddw %%xmm7,%%xmm7 \n" // 2*near + "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo) + + "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16) + "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) + "paddw %%xmm6,%%xmm2 \n" // near+far + "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) + "paddw %%xmm3,%%xmm3 \n" // 2*near + "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi) + + // xmm4 xmm1 + // xmm5 xmm2 + "pcmpeqw %%xmm0,%%xmm0 \n" + "psrlw $15,%%xmm0 \n" + "psllw $3,%%xmm0 \n" // all 8 + + "movdqa %%xmm4,%%xmm3 \n" + "movdqa %%xmm5,%%xmm6 \n" + "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo) + "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) + "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm3 \n" // ^ div by 16 + + "movdqa %%xmm1,%%xmm7 \n" + "movdqa 
%%xmm2,%%xmm6 \n" + "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi) + "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi) + "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm7 \n" // ^ div by 16 + + "packuswb %%xmm7,%%xmm3 \n" + "movdqu %%xmm3,(%1) \n" // save above line + + "movdqa %%xmm5,%%xmm3 \n" + "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo) + "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 + + "movdqa %%xmm2,%%xmm3 \n" + "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi) + "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi) + "psrlw $4,%%xmm2 \n" // ^ div by 16 + + "packuswb %%xmm2,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4) \n" // save below line + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 + +void ScaleRowUp2_Linear_12_SSSE3(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width) { + asm volatile( + "movdqa %3,%%xmm5 \n" + "pcmpeqw %%xmm4,%%xmm4 \n" + "psrlw $15,%%xmm4 \n" + "psllw $1,%%xmm4 \n" // all 2 + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // 01234567 (16) + "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) + + "movdqa %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) + "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) + + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far) + "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far) + + "paddw %%xmm4,%%xmm1 \n" // far+2 + "paddw %%xmm4,%%xmm3 \n" // far+2 + "paddw %%xmm0,%%xmm1 \n" // near+far+2 + "paddw %%xmm2,%%xmm3 \n" // near+far+2 + "paddw %%xmm0,%%xmm0 \n" // 2*near + "paddw %%xmm2,%%xmm2 \n" // 2*near + "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo) + "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi) + + "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far + "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,16(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearShuffleFar) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 + +void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" + "psllw $3,%%xmm7 \n" // all 8 + "movdqa %5,%%xmm6 \n" + + LABELALIGN + "1: \n" + // above line + "movdqu (%0),%%xmm0 \n" // 01234567 (16) + "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) + "movdqa %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) + "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far) + "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far) + "paddw %%xmm0,%%xmm1 \n" // near+far + "paddw %%xmm2,%%xmm3 \n" // near+far + "paddw %%xmm0,%%xmm0 \n" // 
2*near + "paddw %%xmm2,%%xmm2 \n" // 2*near + "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo) + "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi) + + // below line + "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16) + "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16) + "movdqa %%xmm1,%%xmm3 \n" + "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16) + "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16) + "movdqa %%xmm3,%%xmm5 \n" + "movdqa %%xmm1,%%xmm4 \n" + "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far) + "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far) + "paddw %%xmm1,%%xmm4 \n" // near+far + "paddw %%xmm3,%%xmm5 \n" // near+far + "paddw %%xmm1,%%xmm1 \n" // 2*near + "paddw %%xmm3,%%xmm3 \n" // 2*near + "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo) + "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + // xmm0 xmm2 + // xmm1 xmm3 + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm4 \n" // ^ div by 16 + "movdqu %%xmm4,(%1) \n" + + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi) + "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm4 \n" // ^ div by 16 + "movdqu %%xmm4,0x10(%1) \n" + + "movdqa %%xmm1,%%xmm4 \n" + "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo) + "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo) + "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo) + "psrlw $4,%%xmm1 \n" // ^ div by 16 + "movdqu %%xmm1,(%1,%4,2) \n" + + "movdqa %%xmm3,%%xmm4 \n" + "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi) + "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi) + "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi) + "psrlw $4,%%xmm3 \n" // ^ div by 16 + "movdqu %%xmm3,0x10(%1,%4,2) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)), // %4 + "m"(kLinearShuffleFar) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 + +void ScaleRowUp2_Linear_16_SSE2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width) { + asm volatile( + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqd %%xmm4,%%xmm4 \n" + "psrld $31,%%xmm4 \n" + "pslld $1,%%xmm4 \n" // all 2 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0123 (16b) + "movq 2(%0),%%xmm1 \n" // 1234 (16b) + + "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b) + "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b) + + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + + "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) + + "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) + "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) + "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + + "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) 
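+      // Editor's note (clarifying comment, not in the upstream source): per
+      // output sample the linear 2x upsample above computes
+      //   dst = (3*near + far + 2) >> 2
+      // i.e. 3/4*near + 1/4*far with rounding; 32-bit lanes are used because
+      // full 16-bit samples would overflow a 16-bit accumulator. A scalar
+      // sketch (hypothetical helper, for illustration only):
+      //   static inline uint16_t ScaleUp2LinearC(uint16_t near_px, uint16_t far_px) {
+      //     return (uint16_t)((3u * near_px + far_px + 2u) >> 2);
+      //   }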
+ "packssdw %%xmm1,%%xmm0 \n" + "pshufd $0b11011000,%%xmm0,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 + +void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pxor %%xmm7,%%xmm7 \n" + "pcmpeqd %%xmm6,%%xmm6 \n" + "psrld $31,%%xmm6 \n" + "pslld $3,%%xmm6 \n" // all 8 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) + "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) + "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0),%%xmm0 \n" // 0123 (16b) + "movq 2(%0),%%xmm1 \n" // 1234 (16b) + "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b) + "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) + "paddd %%xmm0,%%xmm2 \n" // near+far (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0,%3,2),%%xmm2 \n" + "movq 2(%0,%3,2),%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b) + "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b) + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far) + "paddd %%xmm2,%%xmm4 \n" // near+far (lo) + "paddd %%xmm3,%%xmm5 \n" // near+far (hi) + "paddd %%xmm2,%%xmm2 \n" // 2*near (lo) + "paddd %%xmm3,%%xmm3 \n" // 2*near (hi) + "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) + "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm1,%%xmm0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) + "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) + 
"paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) + "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) + "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) + + "packssdw %%xmm0,%%xmm4 \n" + "pshufd $0b11011000,%%xmm4,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packssdw %%xmm2,%%xmm5 \n" + "pshufd $0b11011000,%%xmm5,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4,2) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 + +void ScaleRowUp2_Linear_SSSE3(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm4,%%xmm4 \n" + "psrlw $15,%%xmm4 \n" + "psllw $1,%%xmm4 \n" // all 2 + "movdqa %3,%%xmm3 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 01234567 + "movq 1(%0),%%xmm1 \n" // 12345678 + "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 + "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 + "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 + "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi) + "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo) + "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) + "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 + +void ScaleRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $3,%%xmm6 \n" // all 8 + "movdqa %5,%%xmm7 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 01234567 + "movq 1(%0),%%xmm1 \n" // 12345678 + "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 + "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 + "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 + "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi) + "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo) + + "movq (%0,%3),%%xmm1 \n" + "movq 1(%0,%3),%%xmm4 \n" + "punpcklwd %%xmm1,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm4 \n" + "movdqa %%xmm1,%%xmm3 \n" + "punpckhdq %%xmm4,%%xmm3 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) + "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) + + // xmm0 xmm2 + // xmm1 xmm3 + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm1,%%xmm5 \n" // 
9*near+3*far (2, lo) + "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm2,%%xmm0 \n" + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) + "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) + "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) + "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) + + "packuswb %%xmm0,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packuswb %%xmm1,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)), // %4 + "m"(kLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 + +void ScaleRowUp2_Linear_AVX2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 + "vbroadcastf128 %3,%%ymm3 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF + "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) + "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) + "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 16 sample to 32 sample + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 + +void ScaleRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrlw $15,%%ymm6,%%ymm6 \n" + "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 + "vbroadcastf128 %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF + "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) + "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) + + "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF + "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm2,%%ymm2 \n" + 
"vpermq $0b11011000,%%ymm3,%%ymm3 \n" + "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n" + "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) + "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) + + // ymm0 ymm1 + // ymm2 ymm3 + + "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 16 sample to 32 sample + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)), // %4 + "m"(kLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2 + +void ScaleRowUp2_Linear_12_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width) { + asm volatile( + "vbroadcastf128 %3,%%ymm5 \n" + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b) + "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b) + + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0 + + "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near) + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) + "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far) + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) + + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2 + "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2 + "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2 + "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2 + "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near + "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2 + "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2 + + "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far + "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,32(%1) \n" + + "lea 0x20(%0),%0 \n" + "lea 0x40(%1),%1 \n" // 16 sample to 32 sample + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" 
+ : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearShuffleFar) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2 + +void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) + "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far + "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near + "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1) + + "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b) + "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b) + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) + "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far + "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near + "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2) + + "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1) + "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2) + "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 + "vmovdqu %%ymm0,(%1) \n" // store above + + "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2) + "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1) + "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 + "vmovdqu %%ymm0,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 sample to 16 sample + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)), // %4 + "m"(kLinearShuffleFar) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 + +void ScaleRowUp2_Linear_16_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $31,%%ymm4,%%ymm4 \n" + "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) + + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + + "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) + + "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) + "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) + 
"vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) + + "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" + "vpshufd $0b11011000,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} + +#endif + +#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 + +void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrld $31,%%ymm6,%%ymm6 \n" + "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi) + + "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v) + "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far) + "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) + "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) + "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) + "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo) + "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi) + + "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" + "vpshufd $0b11011000,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" + "vpshufd 
$0b11011000,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#endif + +// Reads 16xN bytes and produces 16 shorts at a time. +void ScaleAddRow_SSE2(const uint8_t *src_ptr, + uint16_t *dst_ptr, + int src_width) { + asm volatile("pxor %%xmm5,%%xmm5 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" // src_ptr += 16 + "movdqu (%1),%%xmm0 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +#ifdef HAS_SCALEADDROW_AVX2 + +// Reads 32 bytes and accumulates to 32 shorts at a time. +void ScaleAddRow_AVX2(const uint8_t *src_ptr, + uint16_t *dst_ptr, + int src_width) { + asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm3 \n" + "lea 0x20(%0),%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddusw (%1),%%ymm2,%%ymm0 \n" + "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); +} + +#endif // HAS_SCALEADDROW_AVX2 + +// Constant for making pixels signed to avoid pmaddubsw +// saturation. +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + +// Constant for making pixels unsigned and adding .5 for rounding. +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; + +// Bilinear column filtering. SSSE3 version. +void ScaleFilterCols_SSSE3(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx) { + intptr_t x0, x1, temp_pixel; + asm volatile( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" // 0x007f007f + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" // 0x00010001 + + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movzwl 0x00(%1,%4,1),%k2 \n" + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. 
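+      // Editor's note (clarifying comment, not in the upstream source): x and
+      // dx are 16.16 fixed point. With xi = x >> 16 and f = (x >> 9) & 0x7f,
+      // each output byte is
+      //   dst = (src[xi]*(128 - f) + src[xi + 1]*f + 64) >> 7.
+      // The psubb/paddw against kFsub80/kFadd40 bias the pixels into signed
+      // range so pmaddubsw cannot saturate, then remove the bias (plus 64 for
+      // rounding) before the final shift.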
+ "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + + // 1 + "paddusb %%xmm7,%%xmm1 \n" + "pmaddubsw %%xmm0,%%xmm1 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "paddw %9,%%xmm1 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,%k2 \n" + "mov %w2,(%0) \n" + "lea 0x2(%0),%0 \n" + "subl $0x2,%5 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm2 \n" + "paddusb %%xmm7,%%xmm2 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "paddw %9,%%xmm2 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,%k2 \n" + "mov %b2,(%0) \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "=&a"(temp_pixel), // %2 + "=&r"(x0), // %3 + "=&r"(x1), // %4 +#if defined(__x86_64__) + "+rm"(dst_width) // %5 +#else + "+m"(dst_width) // %5 +#endif + : "rm"(x), // %6 + "rm"(dx), // %7 +#if defined(__x86_64__) + "x"(kFsub80), // %8 + "x"(kFadd40) // %9 +#else + "m"(kFsub80), // %8 + "m"(kFadd40) // %9 +#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleColsUp2_SSE2(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx) { + (void) x; + (void) dx; + asm volatile(LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_X86(int num, int div) { + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); + return num; +} + +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. +int FixedDiv1_X86(int num, int div) { + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); + return num; +} + +#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \ + defined(HAS_SCALEUVROWDOWN2BOX_AVX2) + +// Shuffle table for splitting UV into upper and lower part of register. 
+static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, + 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; +static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, + 6u, 14u, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}; +#endif + +#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 + +void ScaleUVRowDown2Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5, %%xmm5 \n" // zero + "movdqa %4,%%xmm1 \n" // split shuffler + "movdqa %5,%%xmm3 \n" // merge shuffler + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // 8 UV row 0 + "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 + "lea 0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv + "pshufb %%xmm1,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add + "pmaddubsw %%xmm4,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" // vertical add + "psrlw $0x1,%%xmm0 \n" // round + "pavgw %%xmm5,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" // merge uv + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" // 4 UV + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 + +#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 + +void ScaleUVRowDown2Box_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero + "vbroadcastf128 %4,%%ymm1 \n" // split shuffler + "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0 + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1 + "lea 0x20(%0),%0 \n" + "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv + "vpshufb %%ymm1,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv + "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" // 8 UV + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif // HAS_SCALEUVROWDOWN2BOX_AVX2 + +static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, + 3, 1, 3, 1, 1, 3, 1, 3}; + +#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 + +void ScaleUVRowUp2_Linear_SSSE3(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm4,%%xmm4 \n" + "psrlw $15,%%xmm4 \n" + "psllw $1,%%xmm4 \n" // all 2 + "movdqa %3,%%xmm3 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 00112233 (1u1v) + "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) + "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) + "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) + "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi) + "pmaddubsw 
%%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo) + "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) + "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kUVLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 + +void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $3,%%xmm6 \n" // all 8 + "movdqa %5,%%xmm7 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 00112233 (1u1v) + "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) + "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) + "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) + "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi) + "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo) + + "movq (%0,%3),%%xmm1 \n" + "movq 2(%0,%3),%%xmm4 \n" + "punpcklbw %%xmm4,%%xmm1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "punpckhdq %%xmm1,%%xmm3 \n" + "punpckldq %%xmm1,%%xmm1 \n" + "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) + "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) + + // xmm0 xmm2 + // xmm1 xmm3 + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm2,%%xmm0 \n" + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) + "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) + "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) + "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) + + "packuswb %%xmm0,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packuswb %%xmm1,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)), // %4 + "m"(kUVLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 + +void ScaleUVRowUp2_Linear_AVX2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 
\n" + "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 + "vbroadcastf128 %3,%%ymm3 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" + "vmovdqu 2(%0),%%xmm1 \n" + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) + "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) + "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 uv to 16 uv + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kUVLinearMadd31) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} + +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 + +void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrlw $15,%%ymm6,%%ymm6 \n" + "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 + "vbroadcastf128 %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" + "vmovdqu 2(%0),%%xmm1 \n" + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) + "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) + + "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF + "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm2,%%ymm2 \n" + "vpermq $0b11011000,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n" + "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n" + "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) + "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) + + // ymm0 ymm1 + // ymm2 ymm3 + + "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu 
%%ymm5,(%1,%4) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 uv to 16 uv + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)), // %4 + "m"(kUVLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 + +void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width) { + asm volatile( + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqd %%xmm4,%%xmm4 \n" + "psrld $31,%%xmm4 \n" + "pslld $1,%%xmm4 \n" // all 2 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + + "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v) + "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v) + + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far) + + "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) + "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) + "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + + "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + "packusdw %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 2 uv to 4 uv + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} + +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 + +void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pxor %%xmm7,%%xmm7 \n" + "pcmpeqd %%xmm6,%%xmm6 \n" + "psrld $31,%%xmm6 \n" + "pslld $3,%%xmm6 \n" // all 8 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) + "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) + "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0,%3,2),%%xmm2 \n" + "movq 4(%0,%3,2),%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo) + "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi) + "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo) + "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi) + "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo) + "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi) + "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) + "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm0,%%xmm4 \n" // 
6*near+2*far (1, lo) + "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm1,%%xmm0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) + "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) + "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) + "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) + "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) + + "packusdw %%xmm0,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packusdw %%xmm2,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4,2) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 2 uv to 4 uv + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif + +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 + +void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $31,%%ymm4,%%ymm4 \n" + "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) + "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) + + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + + "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) + "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) + + "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) + "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) + + "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} + +#endif + +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 + +void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrld $31,%%ymm6,%%ymm6 \n" + "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) + "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) + "vpmovzxwd 
%%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) + "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi) + + "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v) + "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v) + "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far) + "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far) + "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) + "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) + "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) + "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi) + + "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t) (src_stride)), // %3 + "r"((intptr_t) (dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#endif + +#endif // defined(__x86_64__) || defined(__i386__) diff --git a/pkg/encoder/yuv/libyuv/scale_row.h b/pkg/encoder/yuv/libyuv/scale_row.h new file mode 100644 index 000000000..16389cdcf --- /dev/null +++ b/pkg/encoder/yuv/libyuv/scale_row.h @@ -0,0 +1,768 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ +#define INCLUDE_LIBYUV_SCALE_ROW_H_ + +#include "basic_types.h" +#include "scale.h" + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) +#define LIBYUV_DISABLE_X86 +#endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif +// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) +#define LIBYUV_DISABLE_NEON +#endif +#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) +#define LIBYUV_DISABLE_X86 +#endif +#endif +// GCC >= 4.7.0 required for AVX2. +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +#define GCC_HAS_AVX2 1 +#endif // GNUC >= 4.7 +#endif // __GNUC__ + +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_FIXEDDIV1_X86 +#define HAS_FIXEDDIV_X86 +#define HAS_SCALEADDROW_SSE2 +#define HAS_SCALECOLSUP2_SSE2 +#define HAS_SCALEFILTERCOLS_SSSE3 +#define HAS_SCALEROWDOWN2_SSSE3 +#define HAS_SCALEROWDOWN34_SSSE3 +#define HAS_SCALEROWDOWN38_SSSE3 +#define HAS_SCALEROWDOWN4_SSSE3 +#endif + +// The following are available for gcc/clang x86 platforms: +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#define HAS_SCALEUVROWDOWN2BOX_SSSE3 +#define HAS_SCALEROWUP2_LINEAR_SSE2 +#define HAS_SCALEROWUP2_LINEAR_SSSE3 +#define HAS_SCALEROWUP2_BILINEAR_SSE2 +#define HAS_SCALEROWUP2_BILINEAR_SSSE3 +#define HAS_SCALEROWUP2_LINEAR_12_SSSE3 +#define HAS_SCALEROWUP2_BILINEAR_12_SSSE3 +#define HAS_SCALEROWUP2_LINEAR_16_SSE2 +#define HAS_SCALEROWUP2_BILINEAR_16_SSE2 +#define HAS_SCALEUVROWUP2_LINEAR_SSSE3 +#define HAS_SCALEUVROWUP2_BILINEAR_SSSE3 +#define HAS_SCALEUVROWUP2_LINEAR_16_SSE41 +#define HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 +#endif + +// The following are available for gcc/clang x86 platforms, but +// require clang 3.4 or gcc 4.7. +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && \ + (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_SCALEUVROWDOWN2BOX_AVX2 +#define HAS_SCALEROWUP2_LINEAR_AVX2 +#define HAS_SCALEROWUP2_BILINEAR_AVX2 +#define HAS_SCALEROWUP2_LINEAR_12_AVX2 +#define HAS_SCALEROWUP2_BILINEAR_12_AVX2 +#define HAS_SCALEROWUP2_LINEAR_16_AVX2 +#define HAS_SCALEROWUP2_BILINEAR_16_AVX2 +#define HAS_SCALEUVROWUP2_LINEAR_AVX2 +#define HAS_SCALEUVROWUP2_BILINEAR_AVX2 +#define HAS_SCALEUVROWUP2_LINEAR_16_AVX2 +#define HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 +#endif + +// The following are available on all x86 platforms, but +// require VS2012, clang 3.4 or gcc 4.7. +// The code supports NaCL but requires a new compiler and validator. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ + defined(GCC_HAS_AVX2)) +#define HAS_SCALEADDROW_AVX2 +#define HAS_SCALEROWDOWN2_AVX2 +#define HAS_SCALEROWDOWN4_AVX2 +#endif + +// Scale ARGB vertically with bilinear interpolation. 
+void ScalePlaneVertical(int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t *src_argb, + uint8_t *dst_argb, + int x, + int y, + int dy, + int bpp, + enum FilterMode filtering); + +// Simplify the filtering based on scale factors. +enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering); + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_X86(int num, int div); + +int FixedDiv1_X86(int num, int div); + +#ifdef HAS_FIXEDDIV_X86 +#define FixedDiv FixedDiv_X86 +#define FixedDiv1 FixedDiv1_X86 +#endif + +// Compute slope values for stepping. +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, + enum FilterMode filtering, + int *x, + int *y, + int *dx, + int *dy); + +void ScaleRowDown2_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown2Linear_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown2Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown2Box_Odd_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown4_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown4Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown34_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown34_0_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *d, + int dst_width); + +void ScaleRowDown34_1_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *d, + int dst_width); + +void ScaleRowUp2_Linear_C(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_16_C(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_16_C(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_Any_C(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_Any_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_16_Any_C(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_16_Any_C(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleCols_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx); + +void ScaleColsUp2_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int, + int); + +void ScaleFilterCols_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx); + +void ScaleFilterCols64_C(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x32, + int dx); + +void ScaleRowDown38_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst, + int dst_width); + +void ScaleRowDown38_3_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown38_2_Box_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int 
dst_width); + +void ScaleAddRow_C(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width); + +void ScaleUVRowDown2_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width); + +void ScaleUVRowDown2Linear_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width); + +void ScaleUVRowDown2Box_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width); + +void ScaleUVRowDownEven_C(const uint8_t *src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t *dst_uv, + int dst_width); + +void ScaleUVRowUp2_Linear_C(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_Any_C(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_16_C(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_16_C(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_16_Any_C(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_16_Any_C(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +// Specialized scalers for x86. +void ScaleRowDown2_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Linear_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Linear_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Box_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4Box_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown34_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown34_1_Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown34_0_Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown38_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown38_3_Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown38_2_Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Linear_SSE2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void 
ScaleRowUp2_Bilinear_SSE2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_12_SSSE3(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_16_SSE2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_SSSE3(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_AVX2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_12_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_16_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_Any_SSE2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_12_Any_SSSE3(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_Any_AVX2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_12_Any_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_12_Any_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleRowDown2_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Linear_Any_SSSE3(const uint8_t *src_ptr, 
+ ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Box_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Box_Odd_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Linear_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Box_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown2Box_Odd_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4Box_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown4Box_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown34_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown38_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleAddRow_SSE2(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width); + +void ScaleAddRow_AVX2(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width); + +void ScaleAddRow_Any_SSE2(const uint8_t *src_ptr, + uint16_t *dst_ptr, + int src_width); + +void ScaleAddRow_Any_AVX2(const uint8_t *src_ptr, + uint16_t *dst_ptr, + int src_width); + +void ScaleFilterCols_SSSE3(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx); + +void ScaleColsUp2_SSE2(uint8_t *dst_ptr, + const uint8_t *src_ptr, + int dst_width, + int x, + int dx); + +// UV Row functions +void ScaleUVRowDown2Box_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width); + +void ScaleUVRowDown2Box_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_uv, + int dst_width); + +void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleUVRowDown2Box_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Linear_SSSE3(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int 
dst_width); + +void ScaleUVRowUp2_Linear_AVX2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_Any_AVX2(const uint8_t *src_ptr, + uint8_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_Any_AVX2(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_16_Any_SSE41(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_16_Any_SSE41(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +void ScaleUVRowUp2_Linear_16_Any_AVX2(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int dst_width); + +void ScaleUVRowUp2_Bilinear_16_Any_AVX2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + +#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/version.h b/pkg/encoder/yuv/libyuv/version.h new file mode 100644 index 000000000..d45ef09d6 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/version.h @@ -0,0 +1,16 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_VERSION_H_ +#define INCLUDE_LIBYUV_VERSION_H_ + +#define LIBYUV_VERSION 1875 + +#endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/pkg/encoder/yuv/libyuv/video_common.c b/pkg/encoder/yuv/libyuv/video_common.c new file mode 100644 index 000000000..e492402e8 --- /dev/null +++ b/pkg/encoder/yuv/libyuv/video_common.c @@ -0,0 +1,50 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "video_common.h" + +struct FourCCAliasEntry { + uint32_t alias; + uint32_t canonical; +}; + +#define NUM_ALIASES 18 +static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = { + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU12, FOURCC_I420}, + {FOURCC_YU16, FOURCC_I422}, + {FOURCC_YU24, FOURCC_I444}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. 
+ {FOURCC_DMB1, FOURCC_MJPG}, + {FOURCC_BA81, FOURCC_BGGR}, // deprecated. + {FOURCC_RGB3, FOURCC_RAW}, + {FOURCC_BGR3, FOURCC_24BG}, + {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB + {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB + {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 + {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 + {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 +}; + +LIBYUV_API +uint32_t CanonicalFourCC(uint32_t fourcc) { + int i; + for (i = 0; i < NUM_ALIASES; ++i) { + if (kFourCCAliases[i].alias == fourcc) { + return kFourCCAliases[i].canonical; + } + } + // Not an alias, so return it as-is. + return fourcc; +} diff --git a/pkg/encoder/yuv/libyuv/video_common.h b/pkg/encoder/yuv/libyuv/video_common.h new file mode 100644 index 000000000..e2aacf44c --- /dev/null +++ b/pkg/encoder/yuv/libyuv/video_common.h @@ -0,0 +1,212 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Common definitions for video, including fourcc and VideoFormat. + +#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ +#define INCLUDE_LIBYUV_VIDEO_COMMON_H_ + +#include "basic_types.h" + +////////////////////////////////////////////////////////////////////////////// +// Definition of FourCC codes +////////////////////////////////////////////////////////////////////////////// + +// Convert four characters to a FourCC code. +// Needs to be a macro otherwise the OS X compiler complains when the kFormat* +// constants are used in a switch. +#ifdef __cplusplus +#define FOURCC(a, b, c, d) \ + ((static_cast<uint32_t>(a)) | (static_cast<uint32_t>(b) << 8) | \ + (static_cast<uint32_t>(c) << 16) | /* NOLINT */ \ + (static_cast<uint32_t>(d) << 24)) /* NOLINT */ +#else +#define FOURCC(a, b, c, d) \ + (((uint32_t)(a)) | ((uint32_t)(b) << 8) | /* NOLINT */ \ + ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */ +#endif + +// Some pages discussing FourCC codes: +// http://www.fourcc.org/yuv.php +// http://v4l2spec.bytesex.org/spec/book1.htm +// http://developer.apple.com/quicktime/icefloe/dispatch020.html +// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12 +// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt + +// FourCC codes grouped according to implementation efficiency. +// Primary formats should convert in 1 efficient step. +// Secondary formats are converted in 2 steps. +// Auxiliary formats call primary converters. +enum FourCC { + // 10 Primary YUV formats: 5 planar, 2 biplanar, 2 packed. + FOURCC_I420 = FOURCC('I', '4', '2', '0'), + FOURCC_I422 = FOURCC('I', '4', '2', '2'), + FOURCC_I444 = FOURCC('I', '4', '4', '4'), + FOURCC_I400 = FOURCC('I', '4', '0', '0'), + FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), + FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), + FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), + FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), + FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420 + FOURCC_I210 = FOURCC('I', '2', '1', '0'), // bt.601 10 bit 422 + + // 1 Secondary YUV format: row biplanar. deprecated. 
+ FOURCC_M420 = FOURCC('M', '4', '2', '0'), + + // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc 2 64 bpp + FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), + FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), + FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), + FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. + FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit + FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel. + FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit + FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), + FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), + FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), + FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. + FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. + FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. + + // 1 Primary Compressed YUV format. + FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), + + // 14 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. + FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), + FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), + FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), + FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420. + FOURCC_J420 = + FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_J422 = + FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_J444 = + FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_J400 = + FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_F420 = FOURCC('F', '4', '2', '0'), // bt.709 full, unofficial fourcc + FOURCC_F422 = FOURCC('F', '4', '2', '2'), // bt.709 full, unofficial fourcc + FOURCC_F444 = FOURCC('F', '4', '4', '4'), // bt.709 full, unofficial fourcc + FOURCC_H420 = FOURCC('H', '4', '2', '0'), // bt.709, unofficial fourcc + FOURCC_H422 = FOURCC('H', '4', '2', '2'), // bt.709, unofficial fourcc + FOURCC_H444 = FOURCC('H', '4', '4', '4'), // bt.709, unofficial fourcc + FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc + FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc + FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc + FOURCC_F010 = FOURCC('F', '0', '1', '0'), // bt.709 full range 10 bit 420 + FOURCC_H010 = FOURCC('H', '0', '1', '0'), // bt.709 10 bit 420 + FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020 10 bit 420 + FOURCC_F210 = FOURCC('F', '2', '1', '0'), // bt.709 full range 10 bit 422 + FOURCC_H210 = FOURCC('H', '2', '1', '0'), // bt.709 10 bit 422 + FOURCC_U210 = FOURCC('U', '2', '1', '0'), // bt.2020 10 bit 422 + FOURCC_P010 = FOURCC('P', '0', '1', '0'), + FOURCC_P210 = FOURCC('P', '2', '1', '0'), + + // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc. + FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. + FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422. + FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444. + FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2. + FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac. + FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY. + FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac. + FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG. + FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac. + FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR. + FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW. 
+ FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG. + FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB + FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB + FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO. + FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP. + FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO. + + // deprecated formats. Not supported, but defined for backward compatibility. + FOURCC_I411 = FOURCC('I', '4', '1', '1'), + FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), + FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), + FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), + FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), + FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), + FOURCC_H264 = FOURCC('H', '2', '6', '4'), + + // Match any fourcc. + FOURCC_ANY = -1, +}; + +enum FourCCBpp { + // Canonical fourcc codes used in our code. + FOURCC_BPP_I420 = 12, + FOURCC_BPP_I422 = 16, + FOURCC_BPP_I444 = 24, + FOURCC_BPP_I411 = 12, + FOURCC_BPP_I400 = 8, + FOURCC_BPP_NV21 = 12, + FOURCC_BPP_NV12 = 12, + FOURCC_BPP_YUY2 = 16, + FOURCC_BPP_UYVY = 16, + FOURCC_BPP_M420 = 12, // deprecated + FOURCC_BPP_Q420 = 12, + FOURCC_BPP_ARGB = 32, + FOURCC_BPP_BGRA = 32, + FOURCC_BPP_ABGR = 32, + FOURCC_BPP_RGBA = 32, + FOURCC_BPP_AR30 = 32, + FOURCC_BPP_AB30 = 32, + FOURCC_BPP_AR64 = 64, + FOURCC_BPP_AB64 = 64, + FOURCC_BPP_24BG = 24, + FOURCC_BPP_RAW = 24, + FOURCC_BPP_RGBP = 16, + FOURCC_BPP_RGBO = 16, + FOURCC_BPP_R444 = 16, + FOURCC_BPP_RGGB = 8, + FOURCC_BPP_BGGR = 8, + FOURCC_BPP_GRBG = 8, + FOURCC_BPP_GBRG = 8, + FOURCC_BPP_YV12 = 12, + FOURCC_BPP_YV16 = 16, + FOURCC_BPP_YV24 = 24, + FOURCC_BPP_YU12 = 12, + FOURCC_BPP_J420 = 12, + FOURCC_BPP_J400 = 8, + FOURCC_BPP_H420 = 12, + FOURCC_BPP_H422 = 16, + FOURCC_BPP_I010 = 15, + FOURCC_BPP_I210 = 20, + FOURCC_BPP_H010 = 15, + FOURCC_BPP_H210 = 20, + FOURCC_BPP_P010 = 15, + FOURCC_BPP_P210 = 20, + FOURCC_BPP_MJPG = 0, // 0 means unknown. + FOURCC_BPP_H264 = 0, + FOURCC_BPP_IYUV = 12, + FOURCC_BPP_YU16 = 16, + FOURCC_BPP_YU24 = 24, + FOURCC_BPP_YUYV = 16, + FOURCC_BPP_YUVS = 16, + FOURCC_BPP_HDYC = 16, + FOURCC_BPP_2VUY = 16, + FOURCC_BPP_JPEG = 1, + FOURCC_BPP_DMB1 = 1, + FOURCC_BPP_BA81 = 8, + FOURCC_BPP_RGB3 = 24, + FOURCC_BPP_BGR3 = 24, + FOURCC_BPP_CM32 = 32, + FOURCC_BPP_CM24 = 24, + + // Match any fourcc. + FOURCC_BPP_ANY = 0, // 0 means unknown. +}; + +// Converts fourcc aliases into canonical ones. 
+LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc); + +#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/yuv.c b/pkg/encoder/yuv/yuv.c deleted file mode 100644 index c4d918dc5..000000000 --- a/pkg/encoder/yuv/yuv.c +++ /dev/null @@ -1,130 +0,0 @@ -#include "yuv.h" - -#define Y601_STUDIO 1 - -// BT.601 STUDIO - -#ifdef Y601_STUDIO -// 66*R+129*G+25*B -static __inline int Y(uint8_t *__restrict rgb) { - int R = *rgb; - int G = *(rgb+1); - int B = *(rgb+2); - return (66*R+129*G+25*B+128)>>8; -} - -// 112*B-38*R-74G -static __inline int U(uint8_t *__restrict rgb) { - int R = *rgb; - int G = *(rgb+1); - int B = *(rgb+2); - return (-38*R-74*G+112*B+128) >> 8; -} - -// 112*R-94*G-18*B -static __inline int V(uint8_t *__restrict rgb) { - int R = 56**(rgb); - int G = 47**(rgb+1); - int B = *(rgb+2); - return (R-G-(B+(B<<3))+64) >> 7; -} - -static const int Y_MIN = 16; - -#else - -// BT.601 FULL - -// 77*R+150*G+29*B -static __inline int Y(uint8_t *rgb) { - int R = 77**(rgb); - int G = 150**(rgb+1); - int B = 29**(rgb+2); - return (R+G+B+128) >> 8; -} - -// 127*B-43*R-84*G -static __inline int U(uint8_t *rgb) { - int R = 43**(rgb); - int G = 84**(rgb+1); - int B = 127**(rgb+2); - return (-R-G+B+128) >> 8; -} - -// 127*R-106*G-21*B -static __inline int V(uint8_t *rgb) { - int R = 127**rgb; - int G = -106**(rgb+1); - int B = -21**(rgb+2); - return (G+B+R+128) >> 8; -} - -static const int Y_MIN = 0; -#endif - -static __inline void _y(uint8_t *__restrict p, uint8_t *__restrict y, int size) { - do { - *y++ = Y(p) + Y_MIN; - p += 4; - } while (--size); -} - -// It will take an average color from the 2x2 pixel group for chroma values. -// X X X X -// O O -// X X X X -static __inline void _4uv(uint8_t * __restrict p, uint8_t * __restrict u, uint8_t * __restrict v, const int w, const int h) { - uint8_t *p2, *p3, *p4; - const int row = w << 2; - const int next = 4; - - int x = w, y = h, sumU = 0, sumV = 0; - while (y > 0) { - while (x > 0) { - // xx.. - // .... - p2 = p+next; - sumU = U(p) + U(p2); - sumV = V(p) + V(p2); - // .... - // xx.. - p3 = p+row; - p4 = p3+next; - sumU += U(p3) + U(p4); - sumV += V(p3) + V(p4); - *u++ = 128 + (sumU >> 2); - *v++ = 128 + (sumV >> 2); - // ..x. - p += 8; - x -= 2; - } - p += row; - y -= 2; - x = w; - } -} - -// Converts RGBA image to YUV (I420) with BT.601 studio color range. 
-void rgbaToYuv(void *__restrict destination, void *__restrict source, const int w, const int h) { - const int image_size = w * h; - uint8_t *src = source; - uint8_t *dst_y = destination; - uint8_t *dst_u = destination + image_size; - uint8_t *dst_v = destination + image_size + image_size / 4; - _y(src, dst_y, image_size); - src = source; - _4uv(source, dst_u, dst_v, w, h); -} - -void luma(void *__restrict destination, void *__restrict source, const int pos, const int w, const int h) { - uint8_t *rgba = source + 4 * pos; - uint8_t *dst = destination + pos; - _y(rgba, dst, w*h); -} - -void chroma(void *__restrict dst, void *__restrict source, const int pos, const int deu, const int dev, const int w, const int h) { - uint8_t *src = source + 4 * pos; - uint8_t *dst_u = dst + deu + pos / 4; - uint8_t *dst_v = dst + dev + pos / 4; - _4uv(src, dst_u, dst_v, w, h); -} diff --git a/pkg/encoder/yuv/yuv.go b/pkg/encoder/yuv/yuv.go index 19a33318a..82f59ea78 100644 --- a/pkg/encoder/yuv/yuv.go +++ b/pkg/encoder/yuv/yuv.go @@ -3,123 +3,80 @@ package yuv import ( "image" "sync" - "unsafe" -) -/* -#cgo CFLAGS: -Wall -#include "yuv.h" -*/ -import "C" + "github.com/giongto35/cloud-game/v3/pkg/encoder/yuv/libyuv" +) -type ImgProcessor interface { - Process(rgba *image.RGBA) []byte - Put(*[]byte) +type Conv struct { + w, h int + sw, sh int + scale float64 + pool sync.Pool } -type Options struct { - Threads int +type RawFrame struct { + Data []byte + Stride int + W, H int } -type processor struct { - w, h int - - // cache - ww C.int - pool sync.Pool -} +type PixFmt uint32 -type threadedProcessor struct { - *processor +const FourccRgbp = libyuv.FourccRgbp +const FourccArgb = libyuv.FourccArgb +const FourccAbgr = libyuv.FourccAbgr - // threading - threads int - chunk int - - // cache - chromaU C.int - chromaV C.int - wg sync.WaitGroup +func NewYuvConv(w, h int, scale float64) Conv { + if scale < 1 { + scale = 1 + } + sw, sh := round(w, scale), round(h, scale) + bufSize := int(float64(sw) * float64(sh) * 1.5) + return Conv{ + w: w, h: h, sw: sw, sh: sh, scale: scale, + pool: sync.Pool{New: func() any { b := make([]byte, bufSize); return &b }}, + } } -// NewYuvImgProcessor creates new YUV image converter from RGBA. -func NewYuvImgProcessor(w, h int, opts *Options) ImgProcessor { - bufSize := int(float32(w*h) * 1.5) +// Process converts an image to YUV I420 format inside the internal buffer. +func (c *Conv) Process(frame RawFrame, rot uint, pf PixFmt) []byte { + dx, dy := c.w, c.h // dest + cx, cy := c.w, c.h // crop + if rot == 90 || rot == 270 { + cx, cy = cy, cx + } - processor := processor{ - w: w, - h: h, - ww: C.int(w), - pool: sync.Pool{New: func() any { - b := make([]byte, bufSize) - return &b - }}, + stride := frame.Stride >> 2 + if pf == PixFmt(libyuv.FourccRgbp) { + stride = frame.Stride >> 1 } - if opts != nil && opts.Threads > 0 { - // chunks the image evenly - chunk := h / opts.Threads - if chunk%2 != 0 { - chunk-- - } + buf := *c.pool.Get().(*[]byte) + libyuv.Y420(frame.Data, buf, frame.W, frame.H, stride, dx, dy, rot, uint32(pf), cx, cy) - return &threadedProcessor{ - chromaU: C.int(w * h), - chromaV: C.int(w*h + w*h/4), - chunk: chunk, - processor: &processor, - threads: opts.Threads, - wg: sync.WaitGroup{}, - } + if c.scale > 1 { + dstBuf := *c.pool.Get().(*[]byte) + libyuv.Y420Scale(buf, dstBuf, dx, dy, c.sw, c.sh) + c.pool.Put(&buf) + return dstBuf } - return &processor -} - -// Process converts RGBA colorspace into YUV I420 format inside the internal buffer. -// Non-threaded version. 
-func (yuv *processor) Process(rgba *image.RGBA) []byte { - buf := *yuv.pool.Get().(*[]byte) - C.rgbaToYuv(unsafe.Pointer(&buf[0]), unsafe.Pointer(&rgba.Pix[0]), yuv.ww, C.int(yuv.h)) return buf } -func (yuv *processor) Put(x *[]byte) { yuv.pool.Put(x) } +func (c *Conv) Put(x *[]byte) { c.pool.Put(x) } +func (c *Conv) Version() string { return libyuv.Version() } +func round(x int, scale float64) int { return (int(float64(x)*scale) + 1) & ^1 } -// Process converts RGBA colorspace into YUV I420 format inside the internal buffer. -// Threaded version. -// -// We divide the input image into chunks by the number of available CPUs. -// Each chunk should contain 2, 4, 6, etc. rows of the image. -// -// 8x4 CPU (2) -// x x x x x x x x | Coroutine 1 -// x x x x x x x x | Coroutine 1 -// x x x x x x x x | Coroutine 2 -// x x x x x x x x | Coroutine 2 -func (yuv *threadedProcessor) Process(rgba *image.RGBA) []byte { - src := unsafe.Pointer(&rgba.Pix[0]) - buf := *yuv.pool.Get().(*[]byte) - dst := unsafe.Pointer(&buf[0]) - yuv.wg.Add(yuv.threads << 1) - chunk := yuv.w * yuv.chunk - for i := 0; i < yuv.threads; i++ { - pos, hh := C.int(i*chunk), C.int(yuv.chunk) - if i == yuv.threads-1 { - hh = C.int(yuv.h - i*yuv.chunk) - } - go yuv.chroma_(src, dst, pos, hh) - go yuv.luma_(src, dst, pos, hh) - } - yuv.wg.Wait() - return buf -} +func ToYCbCr(bytes []byte, w, h int) *image.YCbCr { + cw, ch := (w+1)/2, (h+1)/2 -func (yuv *threadedProcessor) luma_(src unsafe.Pointer, dst unsafe.Pointer, pos C.int, hh C.int) { - C.luma(dst, src, pos, yuv.ww, hh) - yuv.wg.Done() -} + i0 := w*h + 0*cw*ch + i1 := w*h + 1*cw*ch + i2 := w*h + 2*cw*ch -func (yuv *threadedProcessor) chroma_(src unsafe.Pointer, dst unsafe.Pointer, pos C.int, hh C.int) { - C.chroma(dst, src, pos, yuv.chromaU, yuv.chromaV, yuv.ww, hh) - yuv.wg.Done() + yuv := image.NewYCbCr(image.Rect(0, 0, w, h), image.YCbCrSubsampleRatio420) + yuv.Y = bytes[:i0:i0] + yuv.Cb = bytes[i0:i1:i1] + yuv.Cr = bytes[i1:i2:i2] + return yuv } diff --git a/pkg/encoder/yuv/yuv.h deleted file mode 100644 index 6b39ec521..000000000 --- a/pkg/encoder/yuv/yuv.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef YUV_H__ -#define YUV_H__ - -#include <stdint.h> - -// Converts RGBA image to YUV (I420) with BT.601 studio color range. -void rgbaToYuv(void *destination, void *source, int width, int height); - -// Converts RGBA image chunk to YUV (I420) chroma with BT.601 studio color range. -// pos contains a shift value for chunks. -// deu, dev contains constant shifts for U, V planes in the resulting array. -// chroma (0, 1) selects chroma estimation algorithm. -void chroma(void *destination, void *source, int pos, int deu, int dev, int width, int height); - -// Converts RGBA image chunk to YUV (I420) luma with BT.601 studio color range. 
-void luma(void *destination, void *source, int pos, int width, int height); - -#endif diff --git a/pkg/encoder/yuv/yuv_test.go b/pkg/encoder/yuv/yuv_test.go index fbf53efe8..6b67c29f0 100644 --- a/pkg/encoder/yuv/yuv_test.go +++ b/pkg/encoder/yuv/yuv_test.go @@ -1,213 +1,188 @@ package yuv import ( + "archive/zip" "fmt" "image" "image/color" "image/png" + "io" "math" "math/rand" "os" - "reflect" - "runtime" + "path/filepath" "testing" - "time" -) - -func TestYuv(t *testing.T) { - size1, size2 := 32, 32 - for i := 1; i < 100; i++ { - img := generateImage(size1, size2, randomColor()) - pc := NewYuvImgProcessor(size1, size2, new(Options)) - pct := NewYuvImgProcessor(size1, size2, &Options{Threads: runtime.NumCPU()}) - - a := pc.Process(img) - b := pct.Process(img) - if !reflect.DeepEqual(a, b) { - t.Fatalf("couldn't convert %v, \n %v \n %v", img.Pix, a, b) - } - } -} + "github.com/giongto35/cloud-game/v3/pkg/encoder/yuv/libyuv" + _ "github.com/giongto35/cloud-game/v3/test" +) func TestYuvPredefined(t *testing.T) { im := []uint8{101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 
78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 
18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 101, 0, 106, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 
78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 
18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 
78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255, 18, 226, 78, 255} should := []byte{ - 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 52, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, - 142, 142, 142, 
142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 126, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, - 126, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 126, 94, 94, 94, 94, - 94, 
94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 126, 94, 94, 94, 94, 94, 94, 94, 94, 94, - 94, 94, 94, 94, 94, 94, 126, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, - 94, 126, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 126, 94, 94, 94, - 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 126, 94, 94, 94, 94, 94, 94, 94, 94, - 94, 94, 94, 94, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, - 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, - 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, - 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, - 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, - 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, - 94, 94, 94, 94, 94, 94, 94, 94, 94, 106, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, - 47, 47, 47, 47, 106, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 106, - 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 106, 47, 47, 47, 47, 47, - 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 106, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, - 47, 47, 47, 47, 47, 106, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, - 106, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 106, 47, 47, 47, 47, - 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, - 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, - 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, - 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, - 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, - 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, - 47, 47, 47, 47, 47, 47, 47, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 
142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 52, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 52, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 
142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 110, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, + 110, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, + 94, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, 110, 94, 94, 94, 110, 94, 94, 94, + 94, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 110, 94, 94, 94, + 94, 94, 94, 94, 110, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, 110, 94, 94, 94, + 94, 94, 94, 94, 94, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, 110, 94, 94, 94, + 110, 94, 94, 94, 94, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, + 110, 94, 94, 94, 94, 94, 94, 94, 110, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, + 110, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, + 110, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, + 94, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, 110, 94, 94, 94, 110, 94, 94, 94, + 94, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 110, 94, 94, 94, + 94, 94, 94, 94, 110, 94, 94, 94, 110, 94, 94, 94, 94, 94, 94, 94, 76, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, 47, 76, + 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 76, 47, + 47, 47, 47, 47, 47, 47, 76, 47, 47, 47, 76, 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, 47, + 76, 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 76, + 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, 47, 76, 47, 47, 47, 47, 47, 47, 47, 76, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, + 47, 76, 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 76, 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, 47, 76, 47, 47, 47, 47, 47, 47, 47, 76, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, 47, 47, 47, 47, 47, 76, 47, + 47, 47, 76, 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 76, 47, 47, 47, 47, 47, 47, 47, 76, 47, 47, 47, 76, 47, 47, 47, 47, 47, 47, 47, } - pc := NewYuvImgProcessor(32, 32, new(Options)) - pct := NewYuvImgProcessor(32, 32, &Options{Threads: runtime.NumCPU()}) - - img := image.NewRGBA(image.Rect(0, 0, 32, 32)) - img.Pix = im + pc := NewYuvConv(32, 32, 1) + frame := RawFrame{Data: im, Stride: 32, W: 32, H: 32} + a := pc.Process(frame, 0, PixFmt(libyuv.FourccAbgr)) - a := pc.Process(img) - b := pct.Process(img) - - if len(a) != len(b) || len(a) != len(should) || len(b) != len(should) { - t.Fatalf("diffrent size a: %v, b: %v, o: %v", len(a), len(b), len(should)) + if len(a) != len(should) { + t.Fatalf("diffrent size a: %v, o: %v", len(a), len(should)) } for i := 0; i < len(a); i++ { - if a[i] != b[i] || a[i] != should[i] || b[i] != should[i] { - t.Fatalf("diff in %vth, %v != %v != %v \n%v\n%v", i, a[i], b[i], should[i], im, should) + if a[i] != should[i] { + t.Fatalf("diff in %vth, %v != %v \n%v\n%v", i, a[i], should[i], im, should) } } } -func generateImage(w, h int, color color.RGBA) *image.RGBA { - img := image.NewRGBA(image.Rect(0, 0, w, h)) - for x := 0; x < w; x++ { - for y := 0; y < h; y++ { - img.Set(x, y, color) - } +func TestYuvScale(t *testing.T) { + name := "001_alsa_ABGR_256_240_1024.raw" + path := filepath.Join("./test/testdata/raw/", name) + + data, err := ReadZip(path + ".zip") + if err != nil { + t.Error(err) } - return img -} -func randomColor() color.RGBA 
{ - rnd := rand.New(rand.NewSource(time.Now().Unix())) - return color.RGBA{ - R: uint8(rnd.Intn(255)), - G: uint8(rnd.Intn(255)), - B: uint8(rnd.Intn(255)), - A: 255, + pf, w, h, stride := PixFmt(libyuv.FourccArgb), 256, 240, 1024 + scale := 2 + + conv := NewYuvConv(w, h, float64(scale)) + frame := RawFrame{Data: data, Stride: stride, W: w, H: h} + out := conv.Process(frame, 0, pf) + + d := float64(len(out)) / float64(len(data)) + if d != 1.5 { + t.Errorf("Scaled not by factor %v, %v", scale, d) } + + // save as RGBA + //sw, sh := w*scale, h*scale + //yuv := ToYCbCr(out, sw, sh) + //if f, err := os.Create(filepath.Join("./", name+".png")); err == nil { + // if err = png.Encode(f, yuv); err != nil { + // t.Logf("Couldn't encode the image, %v", err) + // } + // _ = f.Close() + //} } -func BenchmarkYUV(b *testing.B) { - cpu := runtime.NumCPU() +func BenchmarkYuv(b *testing.B) { tests := []struct { - cpu int - w int - h int + w int + h int }{ - {cpu: cpu * 0, w: 1920, h: 1080}, - {cpu: cpu * 2, w: 1920, h: 1080}, - {cpu: cpu * 4, w: 1920, h: 1080}, - {cpu: cpu * 0, w: 320, h: 240}, - {cpu: cpu * 2, w: 320, h: 240}, - {cpu: cpu * 4, w: 320, h: 240}, + {w: 1920, h: 1080}, + {w: 320, h: 240}, } - for _, bn := range tests { - b.Run(fmt.Sprintf("%d-%vx%v", bn.cpu, bn.w, bn.h), func(b *testing.B) { - _processYUV(bn.w, bn.h, bn.cpu, b) - }) - } -} - -func BenchmarkYUVReference(b *testing.B) { _processYUV(1920, 1080, 0, b) } - -func _processYUV(w, h, cpu int, b *testing.B) { - b.StopTimer() - r1 := rand.New(rand.NewSource(int64(1))).Float32() - r2 := rand.New(rand.NewSource(int64(2))).Float32() - pc := NewYuvImgProcessor(w, h, &Options{Threads: cpu}) - - image1 := genTestImage(w, h, r1) - image2 := genTestImage(w, h, r2) - - for i := 0; i < b.N; i++ { - im := image1 - if i%2 == 0 { - im = image2 - } - b.StartTimer() - pc.Process(im) - b.StopTimer() - b.SetBytes(int64(len(im.Pix))) + for _, test := range tests { + w, h := test.w, test.h + frame := genFrame(w, h, r1) + b.Run(fmt.Sprintf("%vx%v YUV", w, h), func(b *testing.B) { + pc := NewYuvConv(w, h, 1) + for i := 0; i < b.N; i++ { + pc.Process(frame, 0, PixFmt(libyuv.FourccAbgr)) + b.SetBytes(int64(len(frame.Data))) + } + b.ReportAllocs() + }) } - b.ReportAllocs() } -func genTestImage(w, h int, seed float32) *image.RGBA { +func genFrame(w, h int, seed float32) RawFrame { img := image.NewRGBA(image.Rectangle{Max: image.Point{X: w, Y: h}}) for x := 0; x < w; x++ { for y := 0; y < h; y++ { @@ -215,7 +190,12 @@ func genTestImage(w, h int, seed float32) *image.RGBA { img.Set(x, y, col) } } - return img + return RawFrame{ + Data: img.Pix, + Stride: img.Stride, + W: img.Bounds().Dx(), + H: img.Bounds().Dy(), + } } func TestGen24bitFull(t *testing.T) { @@ -282,3 +262,19 @@ func hsb2rgb(hue, s, bri float64) (r, g, b int) { } return } + +func ReadZip(path string) ([]byte, error) { + zf, err := zip.OpenReader(path) + if err != nil { + return nil, err + } + defer func() { _ = zf.Close() }() + + f, err := zf.File[0].Open() + if err != nil { + return nil, err + } + defer func() { _ = f.Close() }() + + return io.ReadAll(f) +} diff --git a/pkg/worker/caged/app/app.go b/pkg/worker/caged/app/app.go index 82fff885e..a1917b4d5 100644 --- a/pkg/worker/caged/app/app.go +++ b/pkg/worker/caged/app/app.go @@ -1,7 +1,5 @@ package app -import "image" - type App interface { AudioSampleRate() int Init() error @@ -20,6 +18,12 @@ type Audio struct { } type Video struct { - Frame image.RGBA + Frame RawFrame Duration int32 } + +type RawFrame struct { + Data []byte + Stride int + W, H 
int +} diff --git a/pkg/worker/caged/libretro/caged.go b/pkg/worker/caged/libretro/caged.go index 57ea82d8d..de0ba038d 100644 --- a/pkg/worker/caged/libretro/caged.go +++ b/pkg/worker/caged/libretro/caged.go @@ -50,8 +50,7 @@ func (c *Caged) Load(game games.GameMetadata, path string) error { return err } w, h := c.ViewportCalc() - c.SetViewport(w, h, c.conf.Emulator.Scale) - + c.SetViewport(w, h) return nil } @@ -75,8 +74,11 @@ func (c *Caged) EnableCloudStorage(uid string, storage cloud.Storage) { } } +func (c *Caged) PixFormat() uint32 { return c.Emulator.PixFormat() } +func (c *Caged) Rotation() uint { return c.Emulator.Rotation() } func (c *Caged) AudioSampleRate() int { return c.Emulator.AudioSampleRate() } func (c *Caged) ViewportSize() (int, int) { return c.Emulator.ViewportSize() } +func (c *Caged) Scale() float64 { return c.Emulator.Scale() } func (c *Caged) SendControl(port int, data []byte) { c.base.Input(port, data) } func (c *Caged) Start() { go c.Emulator.Start() } func (c *Caged) SetSaveOnClose(v bool) { c.base.SaveOnClose = v } diff --git a/pkg/worker/caged/libretro/frontend.go b/pkg/worker/caged/libretro/frontend.go index bf7651b4c..63d323201 100644 --- a/pkg/worker/caged/libretro/frontend.go +++ b/pkg/worker/caged/libretro/frontend.go @@ -14,7 +14,6 @@ import ( "github.com/giongto35/cloud-game/v3/pkg/logger" "github.com/giongto35/cloud-game/v3/pkg/os" "github.com/giongto35/cloud-game/v3/pkg/worker/caged/app" - "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/image" "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/nanoarch" ) @@ -25,12 +24,14 @@ type Emulator interface { LoadGame(path string) error FPS() int Flipped() bool + Rotation() uint + PixFormat() uint32 AudioSampleRate() int IsPortrait() bool // Start is called after LoadGame Start() // SetViewport sets viewport size - SetViewport(width int, height int, scale int) + SetViewport(width int, height int) // ViewportCalc calculates the viewport size with the aspect ratio and scale ViewportCalc() (nw int, nh int) ViewportSize() (w, h int) @@ -48,10 +49,11 @@ type Emulator interface { ToggleMultitap() // Input passes input to the emulator Input(player int, data []byte) + // Scale returns set video scale factor + Scale() float64 } type Frontend struct { - canvas *image.Canvas conf config.Emulator done chan struct{} input InputState @@ -60,6 +62,7 @@ type Frontend struct { onAudio func(app.Audio) onVideo func(app.Video) storage Storage + scale float64 th int // draw threads vw, vh int // out frame size @@ -151,6 +154,12 @@ func (f *Frontend) LoadCore(emu string) { UsesLibCo: conf.UsesLibCo, } f.mu.Lock() + scale := 1.0 + if conf.Scale > 1 { + scale = conf.Scale + f.log.Debug().Msgf("Scale: x%v", scale) + } + f.scale = scale f.nano.CoreLoad(meta) f.mu.Unlock() } @@ -169,30 +178,27 @@ func (f *Frontend) handleAudio(audio unsafe.Pointer, samples int) { } func (f *Frontend) handleVideo(data []byte, delta int32, fi nanoarch.FrameInfo) { - pixFmt := f.nano.Video.PixFmt - bpp := int(f.nano.Video.BPP) - drawn := f.canvas.Draw(pixFmt, f.nano.Rot, int(fi.W), int(fi.H), int(fi.Packed), bpp, data, f.th) - + // !to merge both pools fr, _ := videoPool.Get().(*app.Video) if fr == nil { fr = new(app.Video) } - fr.Frame = drawn.Unwrap() + fr.Frame.Data = data + fr.Frame.W = int(fi.W) + fr.Frame.H = int(fi.H) + fr.Frame.Stride = int(fi.Stride) fr.Duration = delta f.onVideo(*fr) - f.canvas.Put(drawn) videoPool.Put(fr) } func (f *Frontend) Shutdown() { - f.log.Debug().Msgf("run loop cleanup") f.mu.Lock() 
f.nano.Shutdown() - f.canvas.Clear() f.SetAudioCb(noAudio) f.SetVideoCb(noVideo) f.mu.Unlock() - f.log.Debug().Msgf("run loop finished") + f.log.Debug().Msgf("frontend closed") } func (f *Frontend) linkNano(nano *nanoarch.Nanoarch) { @@ -240,6 +246,8 @@ func (f *Frontend) Start() { } } +func (f *Frontend) PixFormat() uint32 { return f.nano.Video.PixFmt.C } +func (f *Frontend) Rotation() uint { return f.nano.Rot } func (f *Frontend) Flipped() bool { return f.nano.IsGL() } func (f *Frontend) FrameSize() (int, int) { return f.nano.GeometryBase() } func (f *Frontend) FPS() int { return f.nano.VideoFramerate() } @@ -250,21 +258,15 @@ func (f *Frontend) AudioSampleRate() int { return f.nano.AudioSampleRat func (f *Frontend) Input(player int, data []byte) { f.input.setInput(player, data) } func (f *Frontend) LoadGame(path string) error { return f.nano.LoadGame(path) } func (f *Frontend) RestoreGameState() error { return f.Load() } +func (f *Frontend) Scale() float64 { return f.scale } func (f *Frontend) IsPortrait() bool { return f.nano.IsPortrait() } func (f *Frontend) SaveGameState() error { return f.Save() } -func (f *Frontend) Scale(factor int) { w, h := f.ViewportSize(); f.SetViewport(w, h, factor) } func (f *Frontend) SetAudioCb(cb func(app.Audio)) { f.onAudio = cb } func (f *Frontend) SetSessionId(name string) { f.storage.SetMainSaveName(name) } func (f *Frontend) SetVideoCb(ff func(app.Video)) { f.onVideo = ff } -func (f *Frontend) SetViewport(width int, height int, scale int) { +func (f *Frontend) SetViewport(width int, height int) { f.mu.Lock() f.vw, f.vh = width, height - mw, mh := f.nano.GeometryMax() - size := mw * scale * mh * scale - f.canvas = image.NewCanvas(width, height, size) - if f.DisableCanvasPool { - f.canvas.SetEnabled(false) - } f.mu.Unlock() } @@ -292,14 +294,9 @@ func (f *Frontend) ViewportCalc() (nw int, nh int) { nw, nh = w, h } - if f.conf.Scale > 1 { - nw, nh = nw*f.conf.Scale, nh*f.conf.Scale - f.log.Debug().Msgf("Viewport size scaled: %dx%d", nw, nh) - } - if f.IsPortrait() { nw, nh = nh, nw - f.log.Debug().Msgf("Viewport was flipped") + f.log.Debug().Msgf("Set portrait mode") } f.log.Info().Msgf("Viewport final size: %dx%d", nw, nh) diff --git a/pkg/worker/caged/libretro/frontend_test.go b/pkg/worker/caged/libretro/frontend_test.go index afedbd539..60a08dee8 100644 --- a/pkg/worker/caged/libretro/frontend_test.go +++ b/pkg/worker/caged/libretro/frontend_test.go @@ -7,18 +7,27 @@ import ( "log" "math/rand" "os" - "path" "path/filepath" "sync" "testing" - "unsafe" "github.com/giongto35/cloud-game/v3/pkg/config" "github.com/giongto35/cloud-game/v3/pkg/logger" "github.com/giongto35/cloud-game/v3/pkg/worker/caged/app" + "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/manager" "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/nanoarch" + "github.com/giongto35/cloud-game/v3/pkg/worker/thread" + + _ "github.com/giongto35/cloud-game/v3/test" ) +type TestFrontend struct { + *Frontend + + corePath string + gamePath string +} + type testRun struct { room string system string @@ -26,46 +35,48 @@ type testRun struct { emulationTicks int } -// EmulatorMock contains Frontend mocking data. -type EmulatorMock struct { - *Frontend - - // Libretro compiled lib core name - core string - // shared core paths (can't be changed) - paths EmulatorPaths +type game struct { + rom string + system string } -// EmulatorPaths defines various emulator file paths. 
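// A minimal usage sketch of the raw-frame path introduced above: the frontend now
// forwards the core buffer as-is (Data/Stride/W/H) plus PixFormat(), Rotation() and
// Scale(), and a libyuv-based converter does the pixel work downstream. Names follow
// the conversion test earlier in this diff; imports and package qualifiers are
// omitted, and the second Process argument is assumed to be the rotation.
func toYuvSketch() {
	conv := NewYuvConv(256, 240, 1)                               // viewport w, h and the scale factor
	buf := make([]byte, 256*240*4)                                // a 4-byte-per-pixel source frame
	frame := RawFrame{Data: buf, Stride: 256 * 4, W: 256, H: 240} // stride in bytes, not pixels
	// Rotation 0, ABGR input; the output holds YUV 4:2:0 planes, i.e. w*h*3/2 bytes at
	// scale 1, or (w*scale)*(h*scale)*3/2 when scale > 1 (cf. the 1.5x size check in TestYuvScale).
	out := conv.Process(frame, 0, PixFmt(libyuv.FourccAbgr))
	_ = out // handed off to the video encoder in the real pipeline
}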
-type EmulatorPaths struct { - assets string - cores string - games string - save string +var ( + alwa = game{system: "nes", rom: "Alwa's Awakening (Demo).nes"} + sushi = game{system: "gba", rom: "Sushi The Cat.gba"} + angua = game{system: "gba", rom: "anguna.gba"} +) + +// TestMain runs all tests in the main thread in macOS. +func TestMain(m *testing.M) { + thread.Wrap(func() { os.Exit(m.Run()) }) } -// GetEmulatorMock returns a properly stubbed emulator instance. +// EmulatorMock returns a properly stubbed emulator instance. // Due to extensive use of globals -- one mock instance is allowed per a test run. // Don't forget to init one image channel consumer, it will lock-out otherwise. // Make sure you call Shutdown(). -func GetEmulatorMock(room string, system string) *EmulatorMock { - rootPath := getRootPath() - +func EmulatorMock(room string, system string) *TestFrontend { var conf config.WorkerConfig if _, err := config.LoadConfig(&conf, ""); err != nil { panic(err) } - meta := conf.Emulator.GetLibretroCoreConfig(system) - - nano := nanoarch.NewNano(cleanPath(conf.Emulator.LocalPath)) + conf.Emulator.Libretro.Cores.Repo.ExtLock = expand("tests", ".cr", "cloud-game.lock") + conf.Emulator.LocalPath = expand("tests", conf.Emulator.LocalPath) + conf.Emulator.Storage = expand("tests", "storage") l := logger.Default() l2 := l.Extend(l.Level(logger.ErrorLevel).With()) + + if err := manager.CheckCores(conf.Emulator, l); err != nil { + panic(err) + } + + nano := nanoarch.NewNano(conf.Emulator.LocalPath) nano.SetLogger(l2) // an emu - emu := &EmulatorMock{ + emu := &TestFrontend{ Frontend: &Frontend{ conf: conf.Emulator, storage: &StateStorage{ @@ -78,27 +89,19 @@ func GetEmulatorMock(room string, system string) *EmulatorMock { log: l2, SaveOnClose: false, }, - - core: path.Base(meta.Lib), - - paths: EmulatorPaths{ - assets: cleanPath(rootPath), - cores: cleanPath(rootPath + "assets/cores/"), - games: cleanPath(rootPath + "assets/games/"), - }, + corePath: expand(conf.Emulator.GetLibretroCoreConfig(system).Lib), + gamePath: expand(conf.Worker.Library.BasePath), } emu.linkNano(nano) - emu.paths.save = cleanPath(emu.HashPath()) - return emu } -// GetDefaultFrontend returns initialized emulator mock with default params. +// DefaultFrontend returns initialized emulator mock with default params. // Spawns audio/image channels consumers. // Don't forget to close emulator mock with Shutdown(). -func GetDefaultFrontend(room string, system string, rom string) *EmulatorMock { - mock := GetEmulatorMock(room, system) +func DefaultFrontend(room string, system string, rom string) *TestFrontend { + mock := EmulatorMock(room, system) mock.loadRom(rom) mock.SetVideoCb(func(app.Video) {}) mock.SetAudioCb(func(app.Audio) {}) @@ -107,25 +110,30 @@ func GetDefaultFrontend(room string, system string, rom string) *EmulatorMock { // loadRom loads a ROM into the emulator. // The rom will be loaded from emulators' games path. 
-func (emu *EmulatorMock) loadRom(game string) { - fmt.Printf("%v %v\n", emu.paths.cores, emu.core) - emu.nano.CoreLoad(nanoarch.Metadata{LibPath: emu.paths.cores + emu.core}) - err := emu.nano.LoadGame(emu.paths.games + game) +func (emu *TestFrontend) loadRom(game string) { + emu.nano.CoreLoad(nanoarch.Metadata{LibPath: emu.corePath}) + + gamePath := expand(emu.gamePath, game) + + conf := emu.conf.GetLibretroCoreConfig(gamePath) + scale := 1.0 + if conf.Scale > 1 { + scale = conf.Scale + } + emu.scale = scale + + err := emu.nano.LoadGame(gamePath) if err != nil { log.Fatal(err) } w, h := emu.FrameSize() - if emu.conf.Scale == 0 { - emu.conf.Scale = 1 - } - emu.SetViewport(w, h, emu.conf.Scale) + emu.SetViewport(w, h) } // Shutdown closes the emulator and cleans its resources. -func (emu *EmulatorMock) Shutdown() { +func (emu *TestFrontend) Shutdown() { _ = os.Remove(emu.HashPath()) _ = os.Remove(emu.SRAMPath()) - emu.Frontend.Close() emu.Frontend.Shutdown() } @@ -133,97 +141,56 @@ func (emu *EmulatorMock) Shutdown() { // dumpState returns the current emulator state and // the latest saved state for its session. // Locks the emulator. -func (emu *EmulatorMock) dumpState() (string, string) { +func (emu *TestFrontend) dumpState() (string, string) { emu.mu.Lock() - bytes, _ := os.ReadFile(emu.paths.save) - persistedStateHash := getHash(bytes) + bytes, _ := os.ReadFile(emu.HashPath()) + lastStateHash := hash(bytes) emu.mu.Unlock() - stateHash := emu.getStateHash() - fmt.Printf("mem: %v, dat: %v\n", stateHash, persistedStateHash) - return stateHash, persistedStateHash -} - -// getStateHash returns the current emulator state hash. -// Locks the emulator. -func (emu *EmulatorMock) getStateHash() string { emu.mu.Lock() state, _ := nanoarch.SaveState() emu.mu.Unlock() + stateHash := hash(state) - return getHash(state) + fmt.Printf("mem: %v, dat: %v\n", stateHash, lastStateHash) + return stateHash, lastStateHash } -// getRootPath returns absolute path to the root directory. -func getRootPath() string { - p, _ := filepath.Abs("../../../../") - return p + string(filepath.Separator) -} - -// getHash returns MD5 hash. -func getHash(bytes []byte) string { return fmt.Sprintf("%x", md5.Sum(bytes)) } - -// cleanPath returns a proper file path for current OS. -func cleanPath(path string) string { return filepath.FromSlash(path) } - -// benchmarkEmulator is a generic function for -// measuring emulator performance for one emulation frame. 
-func benchmarkEmulator(system string, rom string, b *testing.B) { - b.StopTimer() +func BenchmarkEmulators(b *testing.B) { log.SetOutput(io.Discard) os.Stdout, _ = os.Open(os.DevNull) - s := GetDefaultFrontend("bench_"+system+"_performance", system, rom) - - b.StartTimer() - for i := 0; i < b.N; i++ { - s.nano.Run() + benchmarks := []struct { + name string + system string + rom string + }{ + {name: "GBA Sushi", system: sushi.system, rom: sushi.rom}, + {name: "NES Alwa", system: alwa.system, rom: alwa.rom}, } - s.Shutdown() -} - -func BenchmarkEmulatorGba(b *testing.B) { - benchmarkEmulator("gba", "Sushi The Cat.gba", b) -} -func BenchmarkEmulatorNes(b *testing.B) { - benchmarkEmulator("nes", "Alwa's Awakening (Demo).nes", b) -} - -func TestSwap(t *testing.T) { - data := []byte{1, 254, 255, 32} - pixel := *(*uint32)(unsafe.Pointer(&data[0])) - // 0 1 2 3 - // 2 1 0 3 - ll := ((pixel >> 16) & 0xff) | (pixel & 0xff00) | ((pixel << 16) & 0xff0000) | 0xff000000 - - rez := []byte{0, 0, 0, 0} - *(*uint32)(unsafe.Pointer(&rez[0])) = ll - - log.Printf("%v\n%v", data, rez) + for _, bench := range benchmarks { + b.Run(bench.name, func(b *testing.B) { + s := DefaultFrontend("bench_"+bench.system+"_performance", bench.system, bench.rom) + for i := 0; i < b.N; i++ { + s.nano.Run() + } + s.Shutdown() + }) + } } // Tests a successful emulator state save. func TestSave(t *testing.T) { tests := []testRun{ - { - room: "test_save_ok_00", - system: "gba", - rom: "Sushi The Cat.gba", - emulationTicks: 100, - }, - { - room: "test_save_ok_01", - system: "gba", - rom: "anguna.gba", - emulationTicks: 10, - }, + {room: "test_save_ok_00", system: sushi.system, rom: sushi.rom, emulationTicks: 100}, + {room: "test_save_ok_01", system: angua.system, rom: angua.rom, emulationTicks: 10}, } for _, test := range tests { t.Logf("Testing [%v] save with [%v]\n", test.system, test.rom) - front := GetDefaultFrontend(test.room, test.system, test.rom) + front := DefaultFrontend(test.room, test.system, test.rom) for test.emulationTicks > 0 { front.Tick() @@ -255,30 +222,15 @@ func TestSave(t *testing.T) { // Compare states (a) and (b), should be =. 
func TestLoad(t *testing.T) { tests := []testRun{ - { - room: "test_load_00", - system: "nes", - rom: "Alwa's Awakening (Demo).nes", - emulationTicks: 100, - }, - { - room: "test_load_01", - system: "gba", - rom: "Sushi The Cat.gba", - emulationTicks: 1000, - }, - { - room: "test_load_02", - system: "gba", - rom: "anguna.gba", - emulationTicks: 100, - }, + {room: "test_load_00", system: alwa.system, rom: alwa.rom, emulationTicks: 100}, + {room: "test_load_01", system: sushi.system, rom: sushi.rom, emulationTicks: 1000}, + {room: "test_load_02", system: angua.system, rom: angua.rom, emulationTicks: 100}, } for _, test := range tests { t.Logf("Testing [%v] load with [%v]\n", test.system, test.rom) - mock := GetDefaultFrontend(test.room, test.system, test.rom) + mock := DefaultFrontend(test.room, test.system, test.rom) fmt.Printf("[%-14v] ", "initial") mock.dumpState() @@ -317,26 +269,15 @@ func TestLoad(t *testing.T) { func TestStateConcurrency(t *testing.T) { tests := []struct { - run testRun - // determine random + run testRun seed int }{ { - run: testRun{ - room: "test_concurrency_00", - system: "gba", - rom: "Sushi The Cat.gba", - emulationTicks: 120, - }, + run: testRun{room: "test_concurrency_00", system: sushi.system, rom: sushi.rom, emulationTicks: 120}, seed: 42, }, { - run: testRun{ - room: "test_concurrency_01", - system: "gba", - rom: "anguna.gba", - emulationTicks: 300, - }, + run: testRun{room: "test_concurrency_01", system: angua.system, rom: angua.rom, emulationTicks: 300}, seed: 42 + 42, }, } @@ -344,7 +285,7 @@ func TestStateConcurrency(t *testing.T) { for _, test := range tests { t.Logf("Testing [%v] concurrency with [%v]\n", test.run.system, test.run.rom) - mock := GetEmulatorMock(test.run.room, test.run.system) + mock := EmulatorMock(test.run.room, test.run.system) ops := &sync.WaitGroup{} // quantum lock @@ -352,14 +293,14 @@ func TestStateConcurrency(t *testing.T) { mock.loadRom(test.run.rom) mock.SetVideoCb(func(v app.Video) { - if len(v.Frame.Pix) == 0 { + if len(v.Frame.Data) == 0 { t.Errorf("It seems that rom video frame was empty, which is strange!") } }) mock.SetAudioCb(func(app.Audio) {}) t.Logf("Random seed is [%v]\n", test.seed) - t.Logf("Save path is [%v]\n", mock.paths.save) + t.Logf("Save path is [%v]\n", mock.HashPath()) _ = mock.Save() @@ -404,36 +345,28 @@ func TestStateConcurrency(t *testing.T) { } } -// lucky returns random boolean. -func lucky() bool { return rand.Intn(2) == 1 } - func TestConcurrentInput(t *testing.T) { - players := NewGameSessionInput() - - events := 1000 var wg sync.WaitGroup + state := NewGameSessionInput() + events := 1000 + wg.Add(2 * events) - wg.Add(events * 2) - - go func() { - for i := 0; i < events; i++ { - player := rand.Intn(maxPort) - go func() { - players.setInput(player, []byte{0, 1}) - wg.Done() - }() - } - }() - - go func() { - for i := 0; i < events; i++ { - player := rand.Intn(maxPort) - go func() { - players.isKeyPressed(uint(player), 100) - wg.Done() - }() - } - }() - + for i := 0; i < events; i++ { + player := rand.Intn(maxPort) + go func() { state.setInput(player, []byte{0, 1}); wg.Done() }() + go func() { state.isKeyPressed(uint(player), 100); wg.Done() }() + } wg.Wait() } + +// expand joins a list of file path elements. +func expand(p ...string) string { + ph, _ := filepath.Abs(filepath.FromSlash(filepath.Join(p...))) + return ph +} + +// hash returns MD5 hash. +func hash(bytes []byte) string { return fmt.Sprintf("%x", md5.Sum(bytes)) } + +// lucky returns random boolean. 
+func lucky() bool { return rand.Intn(2) == 1 } diff --git a/pkg/worker/caged/libretro/image/canvas.c b/pkg/worker/caged/libretro/image/canvas.c deleted file mode 100644 index 037530146..000000000 --- a/pkg/worker/caged/libretro/image/canvas.c +++ /dev/null @@ -1,88 +0,0 @@ -#include "canvas.h" - -__inline xy rotate(int t, int x, int y, int w, int h) { - xy p = {x, y}; - switch (t) { - // 90° CCW or 270° CW - case A90: - p.x = y; - p.y = w - 1 - x; - break; - // 180° CCW - case A180: - p.x = w - 1 - x; - p.y = h - 1 - y; - break; - // 270° CCW or 90° CW - case A270: - p.x = h - 1 - y; - p.y = x; - break; - // flip Y - case F180: - //p.x = x; - p.y = h - 1 - y; - break; - } - return p; -} - -__inline uint32_t _565(uint32_t x) { - return ((x >> 8 & 0xf8) | ((x >> 3 & 0xfc) << 8) | ((x << 3 & 0xfc) << 16)); // | 0xff000000 -} - -__inline uint32_t _8888rev(uint32_t px) { - return (((px >> 16) & 0xff) | (px & 0xff00) | ((px << 16) & 0xff0000)); // | 0xff000000 -} - -void RGBA(int pix, uint32_t *__restrict dst, const void *__restrict source, int y, int h, int w, int hh, int dw, int pad, int rot) { - int x; - xy rxy; - const uint16_t *src16; - const uint32_t *src32; - - switch (pix) { - //case BIT_SHORT5551: - // break; - case BIT_INT_8888REV: - src32 = (const uint32_t *)source; - int pad32 = pad >> 2; - if (rot == NO_ROT) { - for (; y < h; ++y) { - for (x = 0; x < w; ++x) { - *dst++ = _8888rev(*src32++); - } - src32 += pad32; - } - } else { - for (; y < h; ++y) { - for (x = 0; x < w; ++x) { - rxy = rotate(rot, x, y, w, hh); - dst[rxy.x+rxy.y*dw] = _8888rev(*src32++); - } - src32 += pad32; - } - } - break; - case BIT_SHORT565: - src16 = (const uint16_t *)source; - int pad16 = pad >> 1; - if (rot == NO_ROT) { - for (; y < h; ++y) { - for (x = 0; x < w; ++x) { - *dst++ = _565(*src16++); - } - src16 += pad16; - } - } else { - for (; y < h; ++y) { - for (x = 0; x < w; ++x) { - rxy = rotate(rot, x, y, w, hh); - dst[rxy.x+rxy.y*dw] = _565(*src16++); - } - src16 += pad16; - } - } - break; - } -} diff --git a/pkg/worker/caged/libretro/image/canvas.go b/pkg/worker/caged/libretro/image/canvas.go deleted file mode 100644 index 31d9750b8..000000000 --- a/pkg/worker/caged/libretro/image/canvas.go +++ /dev/null @@ -1,159 +0,0 @@ -package image - -import ( - "image" - "sync" - "unsafe" - - "golang.org/x/image/draw" -) - -/* -#cgo CFLAGS: -Wall -#include "canvas.h" -*/ -import "C" - -// Canvas is a stateful drawing surface, i.e. 
image.RGBA -type Canvas struct { - enabled bool - w, h int - vertical bool - pool sync.Pool - wg sync.WaitGroup -} - -type Frame struct{ image.RGBA } - -func (f *Frame) Unwrap() image.RGBA { return f.RGBA } -func (f *Frame) Opaque() bool { return true } -func (f *Frame) Copy() Frame { - return Frame{image.RGBA{Pix: append([]uint8{}, f.Pix...), Stride: f.Stride, Rect: f.Rect}} -} - -const ( - BitFormatShort5551 = iota // BIT_FORMAT_SHORT_5_5_5_1 has 5 bits R, 5 bits G, 5 bits B, 1 bit alpha - BitFormatInt8888Rev // BIT_FORMAT_INT_8_8_8_8_REV has 8 bits R, 8 bits G, 8 bits B, 8 bit alpha - BitFormatShort565 // BIT_FORMAT_SHORT_5_6_5 has 5 bits R, 6 bits G, 5 bits -) - -const ( - ScaleNot = iota // skips image interpolation - ScaleNearestNeighbour // nearest neighbour interpolation - ScaleBilinear // bilinear interpolation -) - -func Resize(scaleType int, src *image.RGBA, out *image.RGBA) { - // !to do set it once instead switching on each iteration - switch scaleType { - case ScaleBilinear: - draw.ApproxBiLinear.Scale(out, out.Bounds(), src, src.Bounds(), draw.Src, nil) - case ScaleNot: - fallthrough - case ScaleNearestNeighbour: - fallthrough - default: - draw.NearestNeighbor.Scale(out, out.Bounds(), src, src.Bounds(), draw.Src, nil) - } -} - -type Rotation uint - -const ( - A90 Rotation = iota + 1 - A180 - A270 - F180 // F180 is flipped Y -) - -func NewCanvas(w, h, size int) *Canvas { - return &Canvas{ - enabled: true, - w: w, - h: h, - vertical: h > w, // input is inverted - pool: sync.Pool{New: func() any { - i := Frame{image.RGBA{ - Pix: make([]uint8, size<<2), - Rect: image.Rectangle{Max: image.Point{X: w, Y: h}}, - }} - return &i - }}, - } -} - -func (c *Canvas) Get(w, h int) *Frame { - i := c.pool.Get().(*Frame) - if c.vertical { - w, h = h, w - } - i.Stride = w << 2 - i.Pix = i.Pix[:i.Stride*h] - i.Rect.Max.X = w - i.Rect.Max.Y = h - return i -} - -func (c *Canvas) Put(i *Frame) { - if c.enabled { - c.pool.Put(i) - } -} -func (c *Canvas) Clear() { c.wg = sync.WaitGroup{} } -func (c *Canvas) SetEnabled(enabled bool) { c.enabled = enabled } - -func (c *Canvas) Draw(encoding uint32, rot Rotation, w, h, packedW, bpp int, data []byte, th int) *Frame { - dst := c.Get(w, h) - if th == 0 { - frame(encoding, dst, data, 0, h, w, h, packedW, bpp, rot) - } else { - hn := h / th - c.wg.Add(th) - for i := 0; i < th; i++ { - xx := hn * i - go func() { - frame(encoding, dst, data, xx, hn, w, h, packedW, bpp, rot) - c.wg.Done() - }() - } - c.wg.Wait() - } - - // rescale - if dst.Rect.Dx() != c.w || dst.Rect.Dy() != c.h { - ww := c.w - hh := c.h - // w, h supposedly have been swapped before - if c.vertical { - ww, hh = c.h, c.w - } - out := c.Get(ww, hh) - Resize(ScaleNearestNeighbour, &dst.RGBA, &out.RGBA) - c.Put(dst) - return out - } - - return dst -} - -func frame(encoding uint32, dst *Frame, data []byte, yy int, hn int, w int, h int, pwb int, bpp int, rot Rotation) { - sPtr := unsafe.Pointer(&data[yy*pwb]) - // some cores can zero-right-pad rows to the packed width value - pad := pwb - w*bpp - if pad < 0 { - pad = 0 - } - ds := 0 - if rot == 0 { - ds = yy * dst.Stride - } - dPtr := (*C.uint32_t)(unsafe.Pointer(&dst.Pix[ds])) - C.RGBA(C.int(encoding), dPtr, sPtr, C.int(yy), C.int(yy+hn), C.int(w), C.int(h), C.int(dst.Stride>>2), C.int(pad), C.int(rot)) -} - -func _8888rev(px uint32) uint32 { return uint32(C._8888rev(C.uint32_t(px))) } - -func rotate(t int, x int, y int, w int, h int) (int, int) { - var rot C.xy = C.rotate(C.int(t), C.int(x), C.int(y), C.int(w), C.int(h)) - return int(rot.x), 
int(rot.y) -} diff --git a/pkg/worker/caged/libretro/image/canvas.h b/pkg/worker/caged/libretro/image/canvas.h deleted file mode 100644 index 5ee04a86b..000000000 --- a/pkg/worker/caged/libretro/image/canvas.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef CANVAS_H__ -#define CANVAS_H__ - -#include - -#define BIT_SHORT5551 0 -#define BIT_INT_8888REV 1 -#define BIT_SHORT565 2 - -#define NO_ROT 0 -#define A90 1 -#define A180 2 -#define A270 3 -#define F180 4 - -typedef struct XY { - int x, y; -} xy; - -xy rotate(int t, int x, int y, int w, int h); - -void RGBA(int pix, uint32_t *dst, const void *source, int y, int h, int w, int hh, int dw, int pad, int rot); - -uint32_t _565(uint32_t x); -uint32_t _8888rev(uint32_t px); - -#endif diff --git a/pkg/worker/caged/libretro/image/canvas_test.go b/pkg/worker/caged/libretro/image/canvas_test.go deleted file mode 100644 index b1def658f..000000000 --- a/pkg/worker/caged/libretro/image/canvas_test.go +++ /dev/null @@ -1,340 +0,0 @@ -package image - -import ( - "bytes" - "fmt" - "testing" -) - -func BenchmarkDraw(b *testing.B) { - w1, h1 := 256, 240 - w2, h2 := 640, 480 - - type args struct { - encoding uint32 - rot Rotation - scaleType int - w int - h int - packedW int - bpp int - data []byte - dw int - dh int - th int - } - tests := []struct { - name string - args args - }{ - { - name: "565_0th", - args: args{ - encoding: BitFormatShort565, scaleType: ScaleNearestNeighbour, - w: w1, h: h1, packedW: w1, bpp: 2, data: make([]uint8, w1*h1*2), dw: w1, dh: h1, th: 0, - }, - }, - { - name: "565_0th_90", - args: args{ - encoding: BitFormatShort565, rot: A90, scaleType: ScaleNearestNeighbour, - w: h1, h: w1, packedW: h1, bpp: 2, data: make([]uint8, w1*h1*2), dw: w1, dh: h1, th: 0, - }, - }, - { - name: "565_0th", - args: args{ - encoding: BitFormatShort565, scaleType: ScaleNearestNeighbour, - w: w2, h: h2, packedW: w1, bpp: 2, data: make([]uint8, w2*h2*2), dw: w2, dh: h2, th: 0, - }, - }, - { - name: "565_4th", - args: args{ - encoding: BitFormatShort565, scaleType: ScaleNearestNeighbour, - w: w1, h: h1, packedW: w1, bpp: 2, data: make([]uint8, w1*h1*2), dw: w1, dh: h1, th: 4, - }, - }, - { - name: "565_4th", - args: args{ - encoding: BitFormatShort565, scaleType: ScaleNearestNeighbour, - w: w2, h: h2, packedW: w2, bpp: 2, data: make([]uint8, w2*h2*2), dw: w2, dh: h2, th: 4, - }, - }, - { - name: "8888 - 0th", - args: args{ - encoding: BitFormatInt8888Rev, scaleType: ScaleNearestNeighbour, - w: w1, h: h1, packedW: w1, bpp: 4, data: make([]uint8, w1*h1*4), dw: w1, dh: h1, th: 0, - }, - }, - { - name: "8888 - 4th", - args: args{ - encoding: BitFormatInt8888Rev, scaleType: ScaleNearestNeighbour, - w: w1, h: h1, packedW: w1, bpp: 4, data: make([]uint8, w1*h1*4), dw: w1, dh: h1, th: 4, - }, - }, - } - - for _, bn := range tests { - c := NewCanvas(bn.args.dw, bn.args.dh, bn.args.dw*bn.args.dh) - img := c.Get(bn.args.dw, bn.args.dh) - c.Put(img) - img2 := c.Get(bn.args.dw, bn.args.dh) - c.Put(img2) - b.ResetTimer() - b.Run(fmt.Sprintf("%vx%v_%v", bn.args.w, bn.args.h, bn.name), func(b *testing.B) { - for i := 0; i < b.N; i++ { - p := c.Draw(bn.args.encoding, bn.args.rot, bn.args.w, bn.args.h, bn.args.packedW, bn.args.bpp, bn.args.data, bn.args.th) - c.Put(p) - } - b.ReportAllocs() - }) - } -} - -func Test_ix8888(t *testing.T) { - type args struct { - dst *uint32 - px uint32 - expect uint32 - } - tests := []struct { - name string - args args - }{ - { - name: "", - args: args{ - dst: new(uint32), - px: 0x11223344, - expect: 0x00443322, - }, - }, - } - for _, tt := range 
tests { - t.Run(tt.name, func(t *testing.T) { - *tt.args.dst = _8888rev(tt.args.px) - if *tt.args.dst != tt.args.expect { - t.Errorf("nope, %x %x", *tt.args.dst, tt.args.expect) - } - }) - } -} - -type dimensions struct { - w int - h int -} - -func TestRotate(t *testing.T) { - tests := []struct { - // packed bytes from a 2D matrix - input []byte - // original matrix's width - w int - // original matrix's height - h int - // rotation algorithm - rotateHow []Rotation - expected [][]byte - }{ - { - // a cross - []byte{ - 0, 1, 0, - 1, 1, 1, - 0, 1, 0, - }, - 3, 3, []Rotation{0, A90, A180, A270}, - [][]byte{ - { - 0, 1, 0, - 1, 1, 1, - 0, 1, 0, - }, - { - 0, 1, 0, - 1, 1, 1, - 0, 1, 0, - }, - { - 0, 1, 0, - 1, 1, 1, - 0, 1, 0, - }, - { - 0, 1, 0, - 1, 1, 1, - 0, 1, 0, - }, - }, - }, - { - []byte{ - 1, 2, - 3, 4, - 5, 6, - 7, 8, - }, - 2, 4, []Rotation{0, A90, A180, A270}, - [][]byte{ - { - 1, 2, - 3, 4, - 5, 6, - 7, 8, - }, - { - 2, 4, 6, 8, - 1, 3, 5, 7, - }, - { - 8, 7, - 6, 5, - 4, 3, - 2, 1, - }, - { - 7, 5, 3, 1, - 8, 6, 4, 2, - }, - }, - }, - { - // a square - []byte{ - 1, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 1, 0, 0, 0, 0, 1, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 1, - }, - 8, 6, []Rotation{0, A90, A180, A270}, - [][]byte{ - { - // L // R - 1, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 1, 0, 0, 0, 0, 1, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 1, - }, - { - 0, 0, 0, 0, 0, 1, - 0, 1, 1, 1, 1, 0, - 0, 1, 1, 0, 1, 0, - 0, 1, 1, 0, 1, 0, - 0, 1, 1, 0, 1, 0, - 0, 1, 1, 0, 1, 0, - 0, 1, 1, 1, 1, 0, - 1, 0, 0, 0, 0, 0, - }, - - { - 1, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 1, 0, 0, 0, 0, 1, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 1, 1, 1, 1, 1, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 1, - }, - { - 0, 0, 0, 0, 0, 1, - 0, 1, 1, 1, 1, 0, - 0, 1, 0, 1, 1, 0, - 0, 1, 0, 1, 1, 0, - 0, 1, 0, 1, 1, 0, - 0, 1, 0, 1, 1, 0, - 0, 1, 1, 1, 1, 0, - 1, 0, 0, 0, 0, 0, - }, - }, - }, - } - - for _, test := range tests { - for i, rot := range test.rotateHow { - if output := exampleRotate(test.input, test.w, test.h, rot); !bytes.Equal(output, test.expected[i]) { - t.Errorf( - "Test fail for angle %v with %v that should be \n%v but it's \n%v", - rot, test.input, test.expected[i], output) - } - } - } -} - -func TestBoundsAfterRotation(t *testing.T) { - tests := []struct { - dim []dimensions - rotateHow []Rotation - }{ - { - // a combinatorics lib would be nice instead - []dimensions{ - // square - {w: 100, h: 100}, - // even w/h - {w: 100, h: 50}, - // even h/w - {w: 50, h: 100}, - // odd even w/h - {w: 77, h: 32}, - // even odd h/w - {w: 32, h: 77}, - // just odd - {w: 13, h: 19}, - }, - []Rotation{0, A90, A180, A270}, - }, - } - - for _, test := range tests { - for _, rot := range test.rotateHow { - for _, dim := range test.dim { - - for y := 0; y < dim.h; y++ { - for x := 0; x < dim.w; x++ { - - xx, yy := rotate(int(rot), x, y, dim.w, dim.h) - - if rot == A90 || rot == A270 { // is even - yy, xx = xx, yy - } - - if xx < 0 || xx > dim.w { - t.Errorf("Rot %v, coordinate x should be in range [0; %v]: %v", rot, dim.w-1, xx) - } - - if yy < 0 || yy > dim.h { - t.Errorf("Rot %v, coordinate y should be in range [0; %v]: %v", rot, dim.h-1, yy) - } - } - } - } - } - } -} - -// exampleRotate is an example of rotation usage. 
-// -// [1 2 3 4 5 6 7 8 9] -// [7 4 1 8 5 2 9 6 3] -func exampleRotate(data []uint8, w int, h int, rot Rotation) []uint8 { - dest := make([]uint8, len(data)) - for y := 0; y < h; y++ { - for x := 0; x < w; x++ { - nx, ny := rotate(int(rot), x, y, w, h) - stride := w - if rot == A90 || rot == A270 { // is even - stride = h - } - dest[nx+ny*stride] = data[x+y*w] - } - } - return dest -} diff --git a/pkg/worker/caged/libretro/manager/http.go b/pkg/worker/caged/libretro/manager/http.go index 8609314e9..1f2dbc881 100644 --- a/pkg/worker/caged/libretro/manager/http.go +++ b/pkg/worker/caged/libretro/manager/http.go @@ -2,6 +2,7 @@ package manager import ( "os" + "path/filepath" "github.com/giongto35/cloud-game/v3/pkg/config" "github.com/giongto35/cloud-game/v3/pkg/logger" @@ -31,6 +32,15 @@ func NewRemoteHttpManager(conf config.LibretroConfig, log *logger.Logger) Manage } log.Debug().Msgf("Using .lock file: %v", fileLock) + if err := os.MkdirAll(filepath.Dir(fileLock), 0770); err != nil { + log.Error().Err(err).Msgf("couldn't create lock") + } else { + f, err := os.Create(fileLock) + if err != nil { + log.Error().Err(err).Msgf("couldn't create lock") + } + _ = f.Close() + } ar, err := arch.Guess() if err != nil { log.Error().Err(err).Msg("couldn't get Libretro core file extension") @@ -73,8 +83,16 @@ func CheckCores(conf config.Emulator, log *logger.Logger) error { func (m *Manager) Sync() error { // IPC lock if multiple worker processes on the same machine - m.fmu.Lock() - defer m.fmu.Unlock() + err := m.fmu.Lock() + if err != nil { + m.log.Error().Err(err).Msg("file lock fail") + } + defer func() { + err := m.fmu.Unlock() + if err != nil { + m.log.Error().Err(err).Msg("file unlock fail") + } + }() installed, err := m.GetInstalled(m.arch.LibExt) if err != nil { diff --git a/pkg/worker/caged/libretro/nanoarch/nanoarch.go b/pkg/worker/caged/libretro/nanoarch/nanoarch.go index 841ae8c45..b601aadea 100644 --- a/pkg/worker/caged/libretro/nanoarch/nanoarch.go +++ b/pkg/worker/caged/libretro/nanoarch/nanoarch.go @@ -12,7 +12,6 @@ import ( "github.com/giongto35/cloud-game/v3/pkg/logger" "github.com/giongto35/cloud-game/v3/pkg/os" "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/graphics" - "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/image" "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/repo/arch" "github.com/giongto35/cloud-game/v3/pkg/worker/thread" ) @@ -33,6 +32,12 @@ const KeyReleased = 0 const MaxPort int = 4 +var ( + RGBA5551 = PixFmt{C: 0, BPP: 2} // BIT_FORMAT_SHORT_5_5_5_1 has 5 bits R, 5 bits G, 5 bits B, 1 bit alpha + RGBA8888Rev = PixFmt{C: 1, BPP: 4} // BIT_FORMAT_INT_8_8_8_8_REV has 8 bits R, 8 bits G, 8 bits B, 8 bit alpha + RGB565 = PixFmt{C: 2, BPP: 2} // BIT_FORMAT_SHORT_5_6_5 has 5 bits R, 6 bits G, 5 bits +) + type Nanoarch struct { Handlers LastFrameTime int64 @@ -44,7 +49,7 @@ type Nanoarch struct { } options *map[string]string reserved chan struct{} // limits concurrent use - Rot image.Rotation + Rot uint serializeSize C.size_t stopped atomic.Bool sysAvInfo C.struct_retro_system_av_info @@ -58,9 +63,8 @@ type Nanoarch struct { enabled bool autoCtx bool } - BPP uint hw *C.struct_retro_hw_render_callback - PixFmt uint32 + PixFmt PixFmt } vfr bool sdlCtx *graphics.SDL @@ -78,7 +82,7 @@ type Handlers struct { type FrameInfo struct { W uint H uint - Packed uint + Stride uint } type Metadata struct { @@ -92,6 +96,24 @@ type Metadata struct { Hacks []string } +type PixFmt struct { + C uint32 + BPP uint +} + +func (p PixFmt) String() string 
{ + switch p.C { + case 0: + return "RGBA5551/2" + case 1: + return "RGBA8888Rev/4" + case 2: + return "RGB565/2" + default: + return fmt.Sprintf("Unknown (%v/%v)", p.C, p.BPP) + } +} + // Nan0 is a global link for C callbacks to Go var Nan0 = Nanoarch{ reserved: make(chan struct{}, 1), // this thing forbids concurrent use of the emulator @@ -118,7 +140,7 @@ func NewNano(localPath string) *Nanoarch { func (n *Nanoarch) AudioSampleRate() int { return int(n.sysAvInfo.timing.sample_rate) } func (n *Nanoarch) VideoFramerate() int { return int(n.sysAvInfo.timing.fps) } -func (n *Nanoarch) IsPortrait() bool { return n.Rot == image.A90 || n.Rot == image.A270 } +func (n *Nanoarch) IsPortrait() bool { return n.Rot == 90 || n.Rot == 270 } func (n *Nanoarch) GeometryBase() (int, int) { return int(n.sysAvInfo.geometry.base_width), int(n.sysAvInfo.geometry.base_height) } @@ -252,7 +274,7 @@ func (n *Nanoarch) LoadGame(path string) error { if n.Video.gl.enabled { //setRotation(image.F180) // flip Y coordinates of OpenGL - bufS := uint(n.sysAvInfo.geometry.max_width*n.sysAvInfo.geometry.max_height) * n.Video.BPP + bufS := uint(n.sysAvInfo.geometry.max_width*n.sysAvInfo.geometry.max_height) * n.Video.PixFmt.BPP graphics.SetBuffer(int(bufS)) n.log.Info().Msgf("Set buffer: %v", byteCountBinary(int64(bufS))) if n.LibCo { @@ -357,34 +379,33 @@ func (n *Nanoarch) IsStopped() bool { return n.stopped.Load() } func videoSetPixelFormat(format uint32) (C.bool, error) { switch format { case C.RETRO_PIXEL_FORMAT_0RGB1555: - Nan0.Video.PixFmt = image.BitFormatShort5551 + Nan0.Video.PixFmt = RGBA5551 if err := graphics.SetPixelFormat(graphics.UnsignedShort5551); err != nil { return false, fmt.Errorf("unknown pixel format %v", Nan0.Video.PixFmt) } - Nan0.Video.BPP = 2 // format is not implemented return false, fmt.Errorf("unsupported pixel type %v converter", format) case C.RETRO_PIXEL_FORMAT_XRGB8888: - Nan0.Video.PixFmt = image.BitFormatInt8888Rev + Nan0.Video.PixFmt = RGBA8888Rev if err := graphics.SetPixelFormat(graphics.UnsignedInt8888Rev); err != nil { return false, fmt.Errorf("unknown pixel format %v", Nan0.Video.PixFmt) } - Nan0.Video.BPP = 4 case C.RETRO_PIXEL_FORMAT_RGB565: - Nan0.Video.PixFmt = image.BitFormatShort565 + Nan0.Video.PixFmt = RGB565 if err := graphics.SetPixelFormat(graphics.UnsignedShort565); err != nil { return false, fmt.Errorf("unknown pixel format %v", Nan0.Video.PixFmt) } - Nan0.Video.BPP = 2 default: return false, fmt.Errorf("unknown pixel type %v", format) } + Nan0.log.Info().Msgf("Pixel format: %v", Nan0.Video.PixFmt) + return true, nil } -func setRotation(rotation image.Rotation) { - Nan0.Rot = rotation - Nan0.log.Debug().Msgf("Image rotated %v°", map[uint]uint{0: 0, 1: 90, 2: 180, 3: 270}[uint(rotation)]) +func setRotation(rot uint) { + Nan0.Rot = rot + Nan0.log.Debug().Msgf("Image rotated %v°", rot) } func printOpenGLDriverInfo() { @@ -557,7 +578,7 @@ func coreVideoRefresh(data unsafe.Pointer, width, height uint, packed uint) { // calculate real frame width in pixels from packed data (realWidth >= width) // some cores or games output zero pitch, i.e. 
N64 Mupen if packed == 0 { - packed = width * Nan0.Video.BPP + packed = width * Nan0.Video.PixFmt.BPP } // calculate space for the video frame bytes := packed * height @@ -575,7 +596,7 @@ func coreVideoRefresh(data unsafe.Pointer, width, height uint, packed uint) { // also we have an option of xN output frame magnification // so, it may be rescaled - Nan0.Handlers.OnVideo(data_, int32(dt), FrameInfo{W: width, H: height, Packed: packed}) + Nan0.Handlers.OnVideo(data_, int32(dt), FrameInfo{W: width, H: height, Stride: packed}) } //export coreInputPoll @@ -665,8 +686,16 @@ func coreEnvironment(cmd C.unsigned, data unsafe.Pointer) C.bool { } switch cmd { + case C.RETRO_ENVIRONMENT_SET_SYSTEM_AV_INFO: + av := *(*C.struct_retro_system_av_info)(data) + Nan0.log.Info().Msgf(">>> SET SYS AV INFO: %v", av) + return true + case C.RETRO_ENVIRONMENT_SET_GEOMETRY: + geom := *(*C.struct_retro_game_geometry)(data) + Nan0.log.Info().Msgf(">>> GEOMETRY: %v", geom) + return true case C.RETRO_ENVIRONMENT_SET_ROTATION: - setRotation(image.Rotation(*(*uint)(data) % 4)) + setRotation((*(*uint)(data) % 4) * 90) return true case C.RETRO_ENVIRONMENT_GET_CAN_DUPE: *(*C.bool)(data) = C.bool(true) diff --git a/pkg/worker/caged/libretro/recording.go b/pkg/worker/caged/libretro/recording.go index 7c128aeab..cc4cdcdde 100644 --- a/pkg/worker/caged/libretro/recording.go +++ b/pkg/worker/caged/libretro/recording.go @@ -1,7 +1,6 @@ package libretro import ( - "image" "time" "github.com/giongto35/cloud-game/v3/pkg/config" @@ -15,23 +14,29 @@ type RecordingFrontend struct { rec *recorder.Recording } -// !to fix opaque image save - -type opaque struct{ image.RGBA } +func WithRecording(fe Emulator, rec bool, user string, game string, conf config.Recording, log *logger.Logger) *RecordingFrontend { -func (o *opaque) Opaque() bool { return true } + pix := "" + switch fe.PixFormat() { + case 0: + pix = "rgb1555" + case 1: + pix = "brga" + case 2: + pix = "rgb565" + } -func WithRecording(fe Emulator, rec bool, user string, game string, conf config.Recording, log *logger.Logger) *RecordingFrontend { rr := &RecordingFrontend{Emulator: fe, rec: recorder.NewRecording( recorder.Meta{UserName: user}, log, recorder.Options{ - Dir: conf.Folder, - Game: game, - ImageCompressionLevel: conf.CompressLevel, - Name: conf.Name, - Zip: conf.Zip, - Vsync: true, + Dir: conf.Folder, + Game: game, + Name: conf.Name, + Zip: conf.Zip, + Vsync: true, + Flip: fe.Flipped(), + Pix: pix, })} rr.ToggleRecording(rec, user) return rr @@ -52,7 +57,7 @@ func (r *RecordingFrontend) SetAudioCb(fn func(app.Audio)) { func (r *RecordingFrontend) SetVideoCb(fn func(app.Video)) { r.Emulator.SetVideoCb(func(v app.Video) { if r.IsRecording() { - r.rec.WriteVideo(recorder.Video{Image: &opaque{v.Frame}, Duration: time.Duration(v.Duration)}) + r.rec.WriteVideo(recorder.Video{Frame: recorder.Frame(v.Frame), Duration: time.Duration(v.Duration)}) } fn(v) }) diff --git a/pkg/worker/coordinatorhandlers.go b/pkg/worker/coordinatorhandlers.go index ebce60622..2a791c1b6 100644 --- a/pkg/worker/coordinatorhandlers.go +++ b/pkg/worker/coordinatorhandlers.go @@ -121,6 +121,7 @@ func (c *coordinator) HandleGameStart(rq api.StartGameRequest[com.Uid], w *Worke m.AudioSrcHz = app.AudioSampleRate() m.AudioFrame = w.conf.Encoder.Audio.Frame m.VideoW, m.VideoH = app.ViewportSize() + m.VideoScale = app.Scale() r.SetMedia(m) diff --git a/pkg/worker/media/media.go b/pkg/worker/media/media.go index 5f53324a3..5b33404e6 100644 --- a/pkg/worker/media/media.go +++ b/pkg/worker/media/media.go @@ 
-103,11 +103,11 @@ func (s samples) stretch(size int) []int16 { } type WebrtcMediaPipe struct { + a *opus.Encoder + v *encoder.Video onAudio func([]byte) - opus *opus.Encoder audioBuf buffer log *logger.Logger - enc *encoder.VideoEncoder aConf config.Audio vConf config.Video @@ -115,6 +115,7 @@ type WebrtcMediaPipe struct { AudioSrcHz int AudioFrame int VideoW, VideoH int + VideoScale float64 } func NewWebRtcMediaPipe(ac config.Audio, vc config.Video, log *logger.Logger) *WebrtcMediaPipe { @@ -126,8 +127,8 @@ func (wmp *WebrtcMediaPipe) SetAudioCb(cb func([]byte, int32)) { wmp.onAudio = func(bytes []byte) { cb(bytes, fr) } } func (wmp *WebrtcMediaPipe) Destroy() { - if wmp.enc != nil { - wmp.enc.Stop() + if wmp.v != nil { + wmp.v.Stop() } } func (wmp *WebrtcMediaPipe) PushAudio(audio []int16) { wmp.audioBuf.write(audio, wmp.encodeAudio) } @@ -136,7 +137,7 @@ func (wmp *WebrtcMediaPipe) Init() error { if err := wmp.initAudio(wmp.AudioSrcHz, wmp.AudioFrame); err != nil { return err } - if err := wmp.initVideo(wmp.VideoW, wmp.VideoH, wmp.vConf); err != nil { + if err := wmp.initVideo(wmp.VideoW, wmp.VideoH, wmp.VideoScale, wmp.vConf); err != nil { return err } return nil @@ -148,7 +149,7 @@ func (wmp *WebrtcMediaPipe) initAudio(srcHz int, frameSize int) error { return fmt.Errorf("opus fail: %w", err) } wmp.log.Debug().Msgf("Opus: %v", au.GetInfo()) - wmp.opus = au + wmp.a = au buf := newBuffer(frame(srcHz, frameSize)) dstHz, _ := au.SampleRate() if srcHz != dstHz { @@ -160,7 +161,7 @@ func (wmp *WebrtcMediaPipe) initAudio(srcHz int, frameSize int) error { } func (wmp *WebrtcMediaPipe) encodeAudio(pcm samples) { - data, err := wmp.opus.Encode(pcm) + data, err := wmp.a.Encode(pcm) audioPool.Put((*[]int16)(&pcm)) if err != nil { wmp.log.Error().Err(err).Msgf("opus encode fail") @@ -169,25 +170,36 @@ func (wmp *WebrtcMediaPipe) encodeAudio(pcm samples) { wmp.onAudio(data) } -func (wmp *WebrtcMediaPipe) initVideo(w, h int, conf config.Video) error { +func (wmp *WebrtcMediaPipe) initVideo(w, h int, scale float64, conf config.Video) error { var enc encoder.Encoder var err error + + sw, sh := round(w, scale), round(h, scale) + + wmp.log.Debug().Msgf("Scale: %vx%v -> %vx%v", w, h, sw, sh) + wmp.log.Info().Msgf("Video codec: %v", conf.Codec) if conf.Codec == string(encoder.H264) { wmp.log.Debug().Msgf("x264: build v%v", h264.LibVersion()) opts := h264.Options(conf.H264) - enc, err = h264.NewEncoder(w, h, &opts) + enc, err = h264.NewEncoder(sw, sh, &opts) } else { opts := vpx.Options(conf.Vpx) - enc, err = vpx.NewEncoder(w, h, &opts) + enc, err = vpx.NewEncoder(sw, sh, &opts) } if err != nil { return fmt.Errorf("couldn't create a video encoder: %w", err) } - wmp.enc = encoder.NewVideoEncoder(enc, w, h, conf.Concurrency, wmp.log) + wmp.v = encoder.NewVideoEncoder(enc, w, h, scale, wmp.log) + wmp.log.Debug().Msgf("%v", wmp.v.Info()) return nil } -func (wmp *WebrtcMediaPipe) ProcessVideo(v app.Video) []byte { return wmp.enc.Encode(&v.Frame) } +func round(x int, scale float64) int { return (int(float64(x)*scale) + 1) & ^1 } -func (wmp *WebrtcMediaPipe) SetVideoFlip(b bool) { wmp.enc.SetFlip(b) } +func (wmp *WebrtcMediaPipe) ProcessVideo(v app.Video) []byte { + return wmp.v.Encode(encoder.InFrame(v.Frame)) +} +func (wmp *WebrtcMediaPipe) SetPixFmt(f uint32) { wmp.v.SetPixFormat(f) } +func (wmp *WebrtcMediaPipe) SetVideoFlip(b bool) { wmp.v.SetFlip(b) } +func (wmp *WebrtcMediaPipe) SetRot(r uint) { wmp.v.SetRot(r) } diff --git a/pkg/worker/media/media_test.go b/pkg/worker/media/media_test.go index 
612be7a2a..e99522efa 100644 --- a/pkg/worker/media/media_test.go +++ b/pkg/worker/media/media_test.go @@ -46,7 +46,7 @@ func run(w, h int, cod encoder.VideoCodec, count int, a *image.RGBA, b *image.RG } logger.SetGlobalLevel(logger.Disabled) - ve := encoder.NewVideoEncoder(enc, w, h, 8, l) + ve := encoder.NewVideoEncoder(enc, w, h, 1, l) defer ve.Stop() if a == nil { @@ -61,7 +61,12 @@ func run(w, h int, cod encoder.VideoCodec, count int, a *image.RGBA, b *image.RG if i%2 == 0 { im = b } - out := ve.Encode(im) + out := ve.Encode(encoder.InFrame{ + Data: im.Pix, + Stride: im.Stride, + W: im.Bounds().Dx(), + H: im.Bounds().Dy(), + }) if out == nil { backend.Fatalf("encoder closed abnormally") } diff --git a/pkg/worker/recorder/ffmpegmux.go b/pkg/worker/recorder/ffmpegmux.go index 4869ef712..37c9df6aa 100644 --- a/pkg/worker/recorder/ffmpegmux.go +++ b/pkg/worker/recorder/ffmpegmux.go @@ -15,6 +15,8 @@ const demuxFile = "input.txt" // ffmpeg concat demuxer, see: https://ffmpeg.org/ffmpeg-formats.html#concat // example: // +// !to change +// // ffmpeg -f concat -i input.txt \ // -ac 2 -channel_layout stereo -i audio.wav \ // -b:a 192K -crf 23 -vf fps=30 -pix_fmt yuv420p \ @@ -25,9 +27,17 @@ func createFfmpegMuxFile(dir string, fPattern string, frameTimes []time.Duration return err } defer func() { er = demux.Close() }() - _, err = demux.WriteString( - fmt.Sprintf("ffconcat version 1.0\n# v: 1\n# date: %v\n# game: %v\n# fps: %v\n# freq (hz): %v\n\n", - time.Now().Format("20060102"), opts.Game, opts.Fps, opts.Frequency)) + + b := strings.Builder{} + + b.WriteString("ffconcat version 1.0\n") + b.WriteString(meta("v", "1")) + b.WriteString(meta("date", time.Now().Format("20060102"))) + b.WriteString(meta("game", opts.Game)) + b.WriteString(meta("fps", opts.Fps)) + b.WriteString(meta("freq", opts.Frequency)) + b.WriteString(meta("pix", opts.Pix)) + _, err = demux.WriteString(fmt.Sprintf("%s\n", b.String())) if err != nil { return err } @@ -51,7 +61,9 @@ func createFfmpegMuxFile(dir string, fPattern string, frameTimes []time.Duration } i++ } - inf := fmt.Sprintf("file %v\nduration %f\n", name, dur) + w, h, s := ExtractFileInfo(file.Name()) + inf := fmt.Sprintf("file %v\nduration %f\n%s%s%s", name, dur, + metaf("width", w), metaf("height", h), metaf("stride", s)) if _, err := demux.WriteString(inf); err != nil { er = err } @@ -61,3 +73,11 @@ func createFfmpegMuxFile(dir string, fPattern string, frameTimes []time.Duration } return er } + +// meta adds stream_meta key value line. +func meta(key string, value any) string { return fmt.Sprintf("stream_meta %s '%v'\n", key, value) } + +// metaf adds file_packet_meta key value line. 
+func metaf(key string, value any) string { + return fmt.Sprintf("file_packet_meta %s '%v'\n", key, value) +} diff --git a/pkg/worker/recorder/options.go b/pkg/worker/recorder/options.go index 9707e1711..fe4ca7ce4 100644 --- a/pkg/worker/recorder/options.go +++ b/pkg/worker/recorder/options.go @@ -1,14 +1,18 @@ package recorder type Options struct { - Dir string - Fps float64 - Frequency int - Game string - ImageCompressionLevel int - Name string - Zip bool - Vsync bool + Dir string + Fps float64 + W int + H int + Stride int + Flip bool + Frequency int + Pix string + Game string + Name string + Zip bool + Vsync bool } type Meta struct { diff --git a/pkg/worker/recorder/pngstream.go b/pkg/worker/recorder/pngstream.go deleted file mode 100644 index 1cb0cf884..000000000 --- a/pkg/worker/recorder/pngstream.go +++ /dev/null @@ -1,72 +0,0 @@ -package recorder - -import ( - "bytes" - "fmt" - "image" - "image/png" - "log" - "os" - "path/filepath" - "sync" - "sync/atomic" -) - -type pngStream struct { - dir string - e *png.Encoder - id uint32 - wg sync.WaitGroup -} - -const videoFile = "f%07d.png" - -type pool struct{ sync.Pool } - -func pngBuf() *pool { return &pool{sync.Pool{New: func() any { return &png.EncoderBuffer{} }}} } -func (p *pool) Get() *png.EncoderBuffer { return p.Pool.Get().(*png.EncoderBuffer) } -func (p *pool) Put(b *png.EncoderBuffer) { p.Pool.Put(b) } - -func newPngStream(dir string, opts Options) (*pngStream, error) { - return &pngStream{ - dir: dir, - e: &png.Encoder{ - CompressionLevel: png.CompressionLevel(opts.ImageCompressionLevel), - BufferPool: pngBuf(), - }, - }, nil -} - -func (p *pngStream) Close() error { - atomic.StoreUint32(&p.id, 0) - p.wg.Wait() - return nil -} - -func (p *pngStream) Write(data Video) { - fileName := fmt.Sprintf(videoFile, atomic.AddUint32(&p.id, 1)) - p.wg.Add(1) - go p.saveImage(fileName, data.Image) -} - -func (p *pngStream) saveImage(fileName string, img image.Image) { - var buf bytes.Buffer - x, y := (img).Bounds().Dx(), (img).Bounds().Dy() - buf.Grow(x * y * 4) - - if err := p.e.Encode(&buf, img); err != nil { - log.Printf("p err: %v", err) - } else { - file, err := os.Create(filepath.Join(p.dir, fileName)) - if err != nil { - log.Printf("c err: %v", err) - } - if _, err = file.Write(buf.Bytes()); err != nil { - log.Printf("f err: %v", err) - } - if err = file.Close(); err != nil { - log.Printf("fc err: %v", err) - } - } - p.wg.Done() -} diff --git a/pkg/worker/recorder/rawstream.go b/pkg/worker/recorder/rawstream.go new file mode 100644 index 000000000..26b8875c3 --- /dev/null +++ b/pkg/worker/recorder/rawstream.go @@ -0,0 +1,66 @@ +package recorder + +import ( + "fmt" + "log" + "os" + "path/filepath" + "strings" + "sync" + "sync/atomic" +) + +type rawStream struct { + dir string + id uint32 + wg sync.WaitGroup +} + +const videoFile = "f%07d__%dx%d__%d.raw" + +func newRawStream(dir string) (*rawStream, error) { + return &rawStream{dir: dir}, nil +} + +func (p *rawStream) Close() error { + atomic.StoreUint32(&p.id, 0) + p.wg.Wait() + return nil +} + +func (p *rawStream) Write(data Video) { + i := atomic.AddUint32(&p.id, 1) + fileName := fmt.Sprintf(videoFile, i, data.Frame.W, data.Frame.H, data.Frame.Stride) + p.wg.Add(1) + go p.saveFrame(fileName, data.Frame) +} + +func (p *rawStream) saveFrame(fileName string, frame Frame) { + file, err := os.Create(filepath.Join(p.dir, fileName)) + if err != nil { + log.Printf("c err: %v", err) + } + if _, err = file.Write(frame.Data); err != nil { + log.Printf("f err: %v", err) + } + + if err = 
file.Close(); err != nil { + log.Printf("fc err: %v", err) + } + p.wg.Done() +} + +func ExtractFileInfo(name string) (w, h, st string) { + s1 := strings.Split(name, "__") + if len(s1) > 1 { + s12 := strings.Split(s1[1], "x") + if len(s12) > 1 { + w, h = s12[0], s12[1] + } + s21 := strings.TrimSuffix(s1[2], filepath.Ext(s1[2])) + if s21 != "" { + st = s21 + } + } + return +} diff --git a/pkg/worker/recorder/recorder.go b/pkg/worker/recorder/recorder.go index 24edc4c8d..4c3d12078 100644 --- a/pkg/worker/recorder/recorder.go +++ b/pkg/worker/recorder/recorder.go @@ -1,7 +1,6 @@ package recorder import ( - "image" "io" "math/rand" "os" @@ -60,9 +59,14 @@ type ( Duration time.Duration } Video struct { - Image image.Image + Frame Frame Duration time.Duration } + Frame struct { + Data []byte + Stride int + W, H int + } ) // NewRecording creates new media recorder for the emulator. @@ -96,7 +100,7 @@ func (r *Recording) Start() { r.log.Fatal().Err(err) } r.audio = audio - video, err := newPngStream(path, r.opts) + video, err := newRawStream(path) if err != nil { r.log.Fatal().Err(err) } diff --git a/pkg/worker/recorder/recorder_test.go b/pkg/worker/recorder/recorder_test.go index e44417491..8d2fa998f 100644 --- a/pkg/worker/recorder/recorder_test.go +++ b/pkg/worker/recorder/recorder_test.go @@ -30,13 +30,12 @@ func TestName(t *testing.T) { Meta{UserName: "test"}, logger.Default(), Options{ - Dir: dir, - Fps: 60, - Frequency: 10, - Game: fmt.Sprintf("test_game_%v", rand.Int()), - ImageCompressionLevel: 0, - Name: "test", - Zip: false, + Dir: dir, + Fps: 60, + Frequency: 10, + Game: fmt.Sprintf("test_game_%v", rand.Int()), + Name: "test", + Zip: false, }) recorder.Set(true, "test_user") @@ -45,11 +44,11 @@ func TestName(t *testing.T) { var imgWg, audioWg sync.WaitGroup imgWg.Add(iterations) audioWg.Add(iterations) - img := generateImage(100, 100) + frame := genFrame(100, 100) for i := 0; i < 222; i++ { go func() { - recorder.WriteVideo(Video{Image: img, Duration: 16 * time.Millisecond}) + recorder.WriteVideo(Video{Frame: frame, Duration: 16 * time.Millisecond}) imgWg.Done() }() go func() { @@ -66,17 +65,14 @@ func TestName(t *testing.T) { } func BenchmarkNewRecording100x100(b *testing.B) { - benchmarkRecorder(100, 100, 0, b) + benchmarkRecorder(100, 100, b) } -func BenchmarkNewRecording320x240_compressed(b *testing.B) { - benchmarkRecorder(320, 240, 0, b) -} -func BenchmarkNewRecording320x240_nocompress(b *testing.B) { - benchmarkRecorder(320, 240, -1, b) +func BenchmarkNewRecording320x240(b *testing.B) { + benchmarkRecorder(320, 240, b) } -func benchmarkRecorder(w, h int, comp int, b *testing.B) { +func benchmarkRecorder(w, h int, b *testing.B) { b.StopTimer() dir, err := os.MkdirTemp("", "rec_bench_") @@ -89,8 +85,8 @@ func benchmarkRecorder(w, h int, comp int, b *testing.B) { } }() - image1 := generateImage(w, h) - image2 := generateImage(w, h) + frame1 := genFrame(w, h) + frame2 := genFrame(w, h) var bytes int64 = 0 @@ -103,25 +99,24 @@ func benchmarkRecorder(w, h int, comp int, b *testing.B) { Meta{UserName: "test"}, logger.Default(), Options{ - Dir: dir, - Fps: 60, - Frequency: 10, - Game: fmt.Sprintf("test_game_%v", rand.Int()), - ImageCompressionLevel: comp, - Name: "", - Zip: false, + Dir: dir, + Fps: 60, + Frequency: 10, + Game: fmt.Sprintf("test_game_%v", rand.Int()), + Name: "", + Zip: false, }) recorder.Set(true, "test_user") samples := []int16{0, 0, 0, 0, 0, 1, 11, 11, 11, 1} for i := 0; i < b.N; i++ { - im := image1 + f := frame1 if i%2 == 0 { - im = image2 + f = frame2 } go func() { 
- recorder.WriteVideo(Video{Image: im, Duration: 16 * time.Millisecond}) - atomic.AddInt64(&bytes, int64(len(im.(*image.RGBA).Pix))) + recorder.WriteVideo(Video{Frame: f, Duration: 16 * time.Millisecond}) + atomic.AddInt64(&bytes, int64(len(f.Data))) ticks.Done() }() go func() { @@ -137,14 +132,19 @@ func benchmarkRecorder(w, h int, comp int, b *testing.B) { } } -func generateImage(w, h int) image.Image { +func genFrame(w, h int) Frame { img := image.NewRGBA(image.Rect(0, 0, w, h)) for x := 0; x < w; x++ { for y := 0; y < h; y++ { img.Set(x, y, randomColor()) } } - return img + return Frame{ + Data: img.Pix, + Stride: img.Stride, + W: img.Bounds().Dx(), + H: img.Bounds().Dy(), + } } var rnd = rand.New(rand.NewSource(time.Now().Unix())) diff --git a/pkg/worker/room/room_test.go b/pkg/worker/room/room_test.go index 36bd25d44..ed67e48cb 100644 --- a/pkg/worker/room/room_test.go +++ b/pkg/worker/room/room_test.go @@ -19,16 +19,20 @@ import ( "github.com/giongto35/cloud-game/v3/pkg/com" "github.com/giongto35/cloud-game/v3/pkg/config" "github.com/giongto35/cloud-game/v3/pkg/encoder" + "github.com/giongto35/cloud-game/v3/pkg/encoder/color/bgra" + "github.com/giongto35/cloud-game/v3/pkg/encoder/color/rgb565" + "github.com/giongto35/cloud-game/v3/pkg/encoder/color/rgba" "github.com/giongto35/cloud-game/v3/pkg/games" "github.com/giongto35/cloud-game/v3/pkg/logger" "github.com/giongto35/cloud-game/v3/pkg/worker/caged" "github.com/giongto35/cloud-game/v3/pkg/worker/caged/app" - canvas "github.com/giongto35/cloud-game/v3/pkg/worker/caged/libretro/image" "github.com/giongto35/cloud-game/v3/pkg/worker/media" "github.com/giongto35/cloud-game/v3/pkg/worker/thread" "golang.org/x/image/font" "golang.org/x/image/font/basicfont" "golang.org/x/image/math/fixed" + + _ "github.com/giongto35/cloud-game/v3/test" ) var ( @@ -58,13 +62,15 @@ func (r testRoom) Close() { time.Sleep(2 * time.Second) // hack: wait room destruction (atm impossible to tell) } -func (r testRoom) WaitFrames(n int) canvas.Frame { - var frame canvas.Frame +func (r testRoom) WaitFrame(n int) app.RawFrame { var wg sync.WaitGroup - wg.Add(n) + wg.Add(1) + target := app.RawFrame{} WithEmulator(r.app).SetVideoCb(func(v app.Video) { - if n > 0 { - frame = (&canvas.Frame{RGBA: v.Frame}).Copy() + if n == 1 { + target = v.Frame + target.Data = make([]byte, len(v.Frame.Data)) + copy(target.Data, v.Frame.Data) wg.Done() } n-- @@ -73,7 +79,7 @@ func (r testRoom) WaitFrames(n int) canvas.Frame { r.StartApp() } wg.Wait() - return frame + return target } type testParams struct { @@ -81,11 +87,11 @@ type testParams struct { game games.GameMetadata codecs []codec frames int + color int } // Store absolute path to test games var testTempDir = filepath.Join(os.TempDir(), "cloud-game-core-tests") -var root = "" // games var ( @@ -94,12 +100,6 @@ var ( fd = games.GameMetadata{Name: "Florian Demo", Type: "n64", Path: "Sample Demo by Florian (PD).z64", System: "n64"} ) -func init() { - runtime.LockOSThread() - p, _ := filepath.Abs("../../../") - root = p + string(filepath.Separator) -} - func TestMain(m *testing.M) { flag.BoolVar(&renderFrames, "renderFrames", false, "Render frames for eye testing purposes") flag.StringVar(&outputPath, "outputPath", "./", "Output path for generated files") @@ -115,40 +115,51 @@ func TestRoom(t *testing.T) { for _, test := range tests { room := room(conf{codec: test.codecs[0], game: test.game}) - room.WaitFrames(test.frames) + room.WaitFrame(test.frames) room.Close() } } func TestAll(t *testing.T) { tests := []testParams{ - {game: 
sushi, frames: 150}, - {game: alwas, frames: 50}, - {game: fd, frames: 50, system: "main-thread"}, + {game: sushi, frames: 150, color: 2}, + {game: alwas, frames: 50, color: 1}, + {game: fd, frames: 50, system: "gl", color: 1}, } crc32q := crc32.MakeTable(0xD5828281) for _, test := range tests { + var frame app.RawFrame room := room(conf{game: test.game, codec: encoder.VP8, autoGlContext: autoGlContext, autoAppStart: false}) - var frame canvas.Frame - if test.system == "main-thread" { - thread.Main(func() { - frame = room.WaitFrames(test.frames) - room.Close() - }) - } else { - frame = room.WaitFrames(test.frames) - room.Close() - } + flip := test.system == "gl" + thread.Main(func() { frame = room.WaitFrame(test.frames) }) + room.Close() + if renderFrames { - tag := fmt.Sprintf("%v-%v-0x%08x", runtime.GOOS, test.game.Type, crc32.Checksum(frame.Pix, crc32q)) - dumpCanvas(&frame, tag, fmt.Sprintf("%v [%v]", tag, test.frames), outputPath) + rect := image.Rect(0, 0, frame.W, frame.H) + var src image.Image + if test.color == 1 { + src1 := bgra.NewBGRA(rect) + src1.Pix = frame.Data + src1.Stride = frame.Stride + src = src1 + } else { + if test.color == 2 { + src1 := rgb565.NewRGB565(rect) + src1.Pix = frame.Data + src1.Stride = frame.Stride + src = src1 + } + } + dst := rgba.ToRGBA(src, flip) + tag := fmt.Sprintf("%v-%v-0x%08x", runtime.GOOS, test.game.Type, crc32.Checksum(frame.Data, crc32q)) + dumpCanvas(dst, tag, fmt.Sprintf("%v [%v]", tag, test.frames), outputPath) } } } -func dumpCanvas(frame *canvas.Frame, name string, caption string, path string) { +func dumpCanvas(frame *image.RGBA, name string, caption string, path string) { // slap 'em caption if caption != "" { draw.Draw(frame, image.Rect(8, 8, 8+len(caption)*7+3, 24), &image.Uniform{C: color.RGBA{}}, image.Point{}, draw.Src) @@ -187,16 +198,17 @@ func room(cfg conf) testRoom { panic(err) } - conf.Worker.Library.BasePath = filepath.FromSlash(root + "/assets/games") + conf.Emulator.Libretro.Cores.Repo.ExtLock = expand("tests", ".cr", "cloud-game.lock") + conf.Emulator.LocalPath = expand("tests", conf.Emulator.LocalPath) + conf.Emulator.Storage = expand("tests", "storage") + + conf.Encoder.Video.Codec = string(cfg.codec) - fixEmulators(&conf, cfg.autoGlContext) l := logger.NewConsole(conf.Worker.Debug, "w", false) if cfg.noLog { logger.SetGlobalLevel(logger.Disabled) } - conf.Encoder.Video.Codec = string(cfg.codec) - id := cfg.roomName if id == "" { id = games.GenerateRoomID(cfg.game.Name) @@ -218,6 +230,7 @@ func room(cfg conf) testRoom { m.AudioSrcHz = emu.AudioSampleRate() m.AudioFrame = conf.Encoder.Audio.Frame m.VideoW, m.VideoH = emu.ViewportSize() + m.VideoScale = emu.Scale() if err := m.Init(); err != nil { l.Fatal().Err(err).Msgf("no init") } @@ -230,22 +243,6 @@ func room(cfg conf) testRoom { return testRoom{Room: room, started: cfg.autoAppStart} } -// fixEmulators makes absolute game paths in global GameList and passes GL context config. -// hack: emulator paths should be absolute and visible to the tests. 
-func fixEmulators(config *config.WorkerConfig, autoGlContext bool) { - config.Emulator.Libretro.Cores.Paths.Libs = - filepath.FromSlash(root + config.Emulator.Libretro.Cores.Paths.Libs) - config.Emulator.LocalPath = filepath.FromSlash(filepath.Join(root, "tests", config.Emulator.LocalPath)) - config.Emulator.Storage = filepath.FromSlash(filepath.Join(root, "tests", "storage")) - - for k, conf := range config.Emulator.Libretro.Cores.List { - if conf.IsGlAllowed && autoGlContext { - conf.AutoGlContext = true - } - config.Emulator.Libretro.Cores.List[k] = conf - } -} - // Measures emulation performance of various // emulators and encoding options. func BenchmarkRoom(b *testing.B) { @@ -263,7 +260,7 @@ func BenchmarkRoom(b *testing.B) { b.StopTimer() room := room(conf{game: bench.game, codec: cod, noLog: true}) b.StartTimer() - room.WaitFrames(bench.frames) + room.WaitFrame(bench.frames) b.StopTimer() room.Room.Close() } @@ -299,3 +296,9 @@ func TestRouter(t *testing.T) { router.SetRoom(nil) router.Close() } + +// expand joins a list of file path elements. +func expand(p ...string) string { + ph, _ := filepath.Abs(filepath.FromSlash(filepath.Join(p...))) + return ph +} diff --git a/pkg/worker/thread/mainthread_darwin_test.go b/pkg/worker/thread/mainthread_darwin_test.go index bab4a92c3..15ce9328f 100644 --- a/pkg/worker/thread/mainthread_darwin_test.go +++ b/pkg/worker/thread/mainthread_darwin_test.go @@ -1,16 +1,14 @@ package thread -import "testing" +import ( + "os" + "testing" +) -func init() { - runtime.LockOSThread() +func TestMain(m *testing.M) { + Wrap(func() { os.Exit(m.Run()) }) } func TestMainThread(t *testing.T) { - value := 0 - fn := func() { value = 1 } - Main(fn) - if value != 1 { - t.Errorf("wrong value %v", value) - } + _ = 10 } diff --git a/test/test.go b/test/test.go new file mode 100644 index 000000000..b80b425ae --- /dev/null +++ b/test/test.go @@ -0,0 +1,17 @@ +package test + +import ( + "os" + "path" + "runtime" +) + +// runs tests from the root dir when imported + +func init() { + _, filename, _, _ := runtime.Caller(0) + dir := path.Join(path.Dir(filename), "..") + if err := os.Chdir(dir); err != nil { + panic(err) + } +} diff --git a/test/testdata/raw/000_name_fourcc_width_height_stride b/test/testdata/raw/000_name_fourcc_width_height_stride new file mode 100644 index 000000000..e69de29bb diff --git a/test/testdata/raw/001_alsa_ABGR_256_240_1024.raw.zip b/test/testdata/raw/001_alsa_ABGR_256_240_1024.raw.zip new file mode 100644 index 000000000..a85e7d7bf Binary files /dev/null and b/test/testdata/raw/001_alsa_ABGR_256_240_1024.raw.zip differ
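
The new scaling path in pkg/worker/media/media.go derives the encoder dimensions from `VideoScale`: `initVideo` scales the source width and height before the x264/VPX encoder is created, and the `round` helper both applies the factor and forces the result to an even number (odd values are rounded up), which encoders working on 4:2:0 frames generally require. A small standalone sketch of that arithmetic; `round` is copied from the patch, the sample sizes are arbitrary.

```go
package main

import "fmt"

// round mirrors the helper added to media.go: apply the scale factor,
// then force the result to an even number (odd values round up).
func round(x int, scale float64) int { return (int(float64(x)*scale) + 1) & ^1 }

func main() {
	fmt.Println(round(256, 1))   // 256 (already even, unchanged)
	fmt.Println(round(239, 1))   // 240 (odd height bumped up to even)
	fmt.Println(round(320, 1.5)) // 480
	fmt.Println(round(199, 2))   // 398
}
```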
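
On the recorder side, pngstream.go is replaced by rawstream.go, so the frame geometry now travels in the file name: the `videoFile` pattern `f%07d__%dx%d__%d.raw` packs the frame index, width, height, and row stride, and `ExtractFileInfo` splits them back out when the concat list is written. A minimal round-trip sketch, assuming a 256x240 XRGB8888 frame with a 1024-byte stride; the `parse` helper and the sample values are illustrative only, while the pattern and the parsing logic come from the patch.

```go
package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// Same pattern as videoFile in rawstream.go: frame index, width, height, stride.
const videoFile = "f%07d__%dx%d__%d.raw"

// parse recovers width, height, and stride from a recorded file name,
// following the same split-on-"__" logic as recorder.ExtractFileInfo.
func parse(name string) (w, h, stride string) {
	parts := strings.Split(name, "__")
	if len(parts) > 2 {
		if wh := strings.Split(parts[1], "x"); len(wh) > 1 {
			w, h = wh[0], wh[1]
		}
		stride = strings.TrimSuffix(parts[2], filepath.Ext(parts[2]))
	}
	return
}

func main() {
	name := fmt.Sprintf(videoFile, 1, 256, 240, 1024)
	fmt.Println(name) // f0000001__256x240__1024.raw

	w, h, s := parse(name)
	fmt.Println(w, h, s) // 256 240 1024
}
```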
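
`createFfmpegMuxFile` then emits the recording metadata as concat-demuxer directives rather than plain comments: `meta` writes a `stream_meta key 'value'` line per stream-level field and `metaf` writes `file_packet_meta` lines per frame, filled from the geometry recovered above. A hedged sketch of the text this produces; the two helpers are copied from the patch, while the game name, fps, and the single frame entry are invented for the example.

```go
package main

import (
	"fmt"
	"strings"
)

// meta and metaf mirror the helpers added in ffmpegmux.go.
func meta(key string, value any) string  { return fmt.Sprintf("stream_meta %s '%v'\n", key, value) }
func metaf(key string, value any) string { return fmt.Sprintf("file_packet_meta %s '%v'\n", key, value) }

func main() {
	b := strings.Builder{}
	b.WriteString("ffconcat version 1.0\n")
	b.WriteString(meta("game", "Some Game (example)")) // example values, not from the patch
	b.WriteString(meta("fps", 60.0))
	b.WriteString(meta("pix", "rgb565"))
	b.WriteString("\n")
	// one entry per recorded raw frame, geometry taken from the file name
	b.WriteString(fmt.Sprintf("file %v\nduration %f\n", "f0000001__256x240__512.raw", 1.0/60))
	b.WriteString(metaf("width", "256") + metaf("height", "240") + metaf("stride", "512"))
	fmt.Print(b.String())
}
```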